1 //===- Construction of pass pipelines -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 ///
10 /// This file provides the implementation of the PassBuilder based on our
11 /// static pass registry as well as related functionality. It also provides
12 /// helpers to aid in analyzing, debugging, and testing passes and pass
13 /// pipelines.
14 ///
15 //===----------------------------------------------------------------------===//
16
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/BasicAliasAnalysis.h"
20 #include "llvm/Analysis/CGSCCPassManager.h"
21 #include "llvm/Analysis/GlobalsModRef.h"
22 #include "llvm/Analysis/InlineAdvisor.h"
23 #include "llvm/Analysis/ProfileSummaryInfo.h"
24 #include "llvm/Analysis/ScopedNoAliasAA.h"
25 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
26 #include "llvm/IR/PassManager.h"
27 #include "llvm/Passes/OptimizationLevel.h"
28 #include "llvm/Passes/PassBuilder.h"
29 #include "llvm/Support/CommandLine.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/PGOOptions.h"
32 #include "llvm/Support/VirtualFileSystem.h"
33 #include "llvm/Target/TargetMachine.h"
34 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
35 #include "llvm/Transforms/Coroutines/CoroCleanup.h"
36 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h"
37 #include "llvm/Transforms/Coroutines/CoroEarly.h"
38 #include "llvm/Transforms/Coroutines/CoroElide.h"
39 #include "llvm/Transforms/Coroutines/CoroSplit.h"
40 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
41 #include "llvm/Transforms/IPO/AlwaysInliner.h"
42 #include "llvm/Transforms/IPO/Annotation2Metadata.h"
43 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
44 #include "llvm/Transforms/IPO/Attributor.h"
45 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
46 #include "llvm/Transforms/IPO/ConstantMerge.h"
47 #include "llvm/Transforms/IPO/CrossDSOCFI.h"
48 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
49 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
50 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
51 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
52 #include "llvm/Transforms/IPO/FunctionAttrs.h"
53 #include "llvm/Transforms/IPO/GlobalDCE.h"
54 #include "llvm/Transforms/IPO/GlobalOpt.h"
55 #include "llvm/Transforms/IPO/GlobalSplit.h"
56 #include "llvm/Transforms/IPO/HotColdSplitting.h"
57 #include "llvm/Transforms/IPO/IROutliner.h"
58 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
59 #include "llvm/Transforms/IPO/Inliner.h"
60 #include "llvm/Transforms/IPO/LowerTypeTests.h"
61 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
62 #include "llvm/Transforms/IPO/MergeFunctions.h"
63 #include "llvm/Transforms/IPO/ModuleInliner.h"
64 #include "llvm/Transforms/IPO/OpenMPOpt.h"
65 #include "llvm/Transforms/IPO/PartialInlining.h"
66 #include "llvm/Transforms/IPO/SCCP.h"
67 #include "llvm/Transforms/IPO/SampleProfile.h"
68 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
69 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
70 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
71 #include "llvm/Transforms/InstCombine/InstCombine.h"
72 #include "llvm/Transforms/Instrumentation/CGProfile.h"
73 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
74 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
75 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
76 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
77 #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
78 #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
79 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
80 #include "llvm/Transforms/Scalar/ADCE.h"
81 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
82 #include "llvm/Transforms/Scalar/AnnotationRemarks.h"
83 #include "llvm/Transforms/Scalar/BDCE.h"
84 #include "llvm/Transforms/Scalar/CallSiteSplitting.h"
85 #include "llvm/Transforms/Scalar/ConstraintElimination.h"
86 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
87 #include "llvm/Transforms/Scalar/DFAJumpThreading.h"
88 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
89 #include "llvm/Transforms/Scalar/DivRemPairs.h"
90 #include "llvm/Transforms/Scalar/EarlyCSE.h"
91 #include "llvm/Transforms/Scalar/Float2Int.h"
92 #include "llvm/Transforms/Scalar/GVN.h"
93 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
94 #include "llvm/Transforms/Scalar/InferAlignment.h"
95 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
96 #include "llvm/Transforms/Scalar/JumpTableToSwitch.h"
97 #include "llvm/Transforms/Scalar/JumpThreading.h"
98 #include "llvm/Transforms/Scalar/LICM.h"
99 #include "llvm/Transforms/Scalar/LoopDeletion.h"
100 #include "llvm/Transforms/Scalar/LoopDistribute.h"
101 #include "llvm/Transforms/Scalar/LoopFlatten.h"
102 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
103 #include "llvm/Transforms/Scalar/LoopInstSimplify.h"
104 #include "llvm/Transforms/Scalar/LoopInterchange.h"
105 #include "llvm/Transforms/Scalar/LoopLoadElimination.h"
106 #include "llvm/Transforms/Scalar/LoopPassManager.h"
107 #include "llvm/Transforms/Scalar/LoopRotation.h"
108 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
109 #include "llvm/Transforms/Scalar/LoopSink.h"
110 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
111 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
112 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
113 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
114 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
115 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
116 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
117 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
118 #include "llvm/Transforms/Scalar/NewGVN.h"
119 #include "llvm/Transforms/Scalar/Reassociate.h"
120 #include "llvm/Transforms/Scalar/SCCP.h"
121 #include "llvm/Transforms/Scalar/SROA.h"
122 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
123 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
124 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
125 #include "llvm/Transforms/Scalar/TailRecursionElimination.h"
126 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
127 #include "llvm/Transforms/Utils/AddDiscriminators.h"
128 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
129 #include "llvm/Transforms/Utils/CanonicalizeAliases.h"
130 #include "llvm/Transforms/Utils/CountVisits.h"
131 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
132 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
133 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
134 #include "llvm/Transforms/Utils/Mem2Reg.h"
135 #include "llvm/Transforms/Utils/MoveAutoInit.h"
136 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
137 #include "llvm/Transforms/Utils/RelLookupTableConverter.h"
138 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
139 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
140 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
141 #include "llvm/Transforms/Vectorize/VectorCombine.h"
142
143 using namespace llvm;
144
// Selects the inline advisor mode. Defaults to the heuristics-based advisor;
// the "development" and "release" values select the ML policy variants
// described by the value help strings below.
static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
    "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
    cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
    cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
                          "Heuristics-based inliner version"),
               clEnumValN(InliningAdvisorMode::Development, "development",
                          "Use development mode (runtime-loadable model)"),
               clEnumValN(InliningAdvisorMode::Release, "release",
                          "Use release mode (AOT-compiled model)")));

// Requests the synthetic function entry count generation pass.
static cl::opt<bool> EnableSyntheticCounts(
    "enable-npm-synthetic-counts", cl::Hidden,
    cl::desc("Run synthetic function entry count generation "
             "pass"));

/// Flag to enable inline deferral during PGO.
static cl::opt<bool>
    EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
                            cl::Hidden,
                            cl::desc("Enable inline deferral during PGO"));

// Enables the module inliner (initialized to false).
static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
                                         cl::init(false), cl::Hidden,
                                         cl::desc("Enable module inliner"));

// Requests that mandatory inlinings are performed module-wide before the
// regular inlining (initialized to false).
static cl::opt<bool> PerformMandatoryInliningsFirst(
    "mandatory-inlining-first", cl::init(false), cl::Hidden,
    cl::desc("Perform mandatory inlinings module-wide, before performing "
             "inlining"));

// Default for PipelineTuningOptions::EagerlyInvalidateAnalyses (read by the
// PipelineTuningOptions constructor below).
static cl::opt<bool> EnableEagerlyInvalidateAnalyses(
    "eagerly-invalidate-analyses", cl::init(true), cl::Hidden,
    cl::desc("Eagerly invalidate more analyses in default pipelines"));

// Default for PipelineTuningOptions::MergeFunctions (read by the
// PipelineTuningOptions constructor below).
static cl::opt<bool> EnableMergeFunctions(
    "enable-merge-functions", cl::init(false), cl::Hidden,
    cl::desc("Enable function merging as part of the optimization pipeline"));

// Controls running loop rotation after PGO instrumentation (initialized to
// true).
static cl::opt<bool> EnablePostPGOLoopRotation(
    "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
    cl::desc("Run the loop rotation transformation after PGO instrumentation"));

// Controls the inter-procedural analyses (initialized to true).
static cl::opt<bool> EnableGlobalAnalyses(
    "enable-global-analyses", cl::init(true), cl::Hidden,
    cl::desc("Enable inter-procedural analyses"));
190
191 static cl::opt<bool>
192 RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
193 cl::desc("Run Partial inlinining pass"));
194
// Requests cleanup optimization passes after vectorization (initialized to
// false).
static cl::opt<bool> ExtraVectorizerPasses(
    "extra-vectorizer-passes", cl::init(false), cl::Hidden,
    cl::desc("Run cleanup optimization passes after vectorization"));

// When set, buildFunctionSimplificationPipeline schedules NewGVNPass instead
// of GVNPass.
static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
                               cl::desc("Run the NewGVN pass"));

// When set, the function simplification pipelines add LoopInterchangePass to
// their second loop pass manager.
static cl::opt<bool> EnableLoopInterchange(
    "enable-loopinterchange", cl::init(false), cl::Hidden,
    cl::desc("Enable the experimental LoopInterchange Pass"));

// Enables the unroll-and-jam pass (initialized to false).
static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
                                        cl::init(false), cl::Hidden,
                                        cl::desc("Enable Unroll And Jam Pass"));

// When set, the function simplification pipelines add LoopFlattenPass to
// their first loop pass manager.
static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
                                       cl::Hidden,
                                       cl::desc("Enable the LoopFlatten Pass"));

// Experimentally allow loop header duplication. This should allow for better
// optimization at Oz, since loop-idiom recognition can then recognize things
// like memcpy. If this ends up being useful for many targets, we should drop
// this flag and make a code generation option that can be controlled
// independent of the opt level and exposed through the frontend.
static cl::opt<bool> EnableLoopHeaderDuplication(
    "enable-loop-header-duplication", cl::init(false), cl::Hidden,
    cl::desc("Enable loop header duplication at any optimization level"));

// When set, buildFunctionSimplificationPipeline schedules
// DFAJumpThreadingPass before its second JumpThreadingPass.
static cl::opt<bool>
    EnableDFAJumpThreading("enable-dfa-jump-thread",
                           cl::desc("Enable DFA jump threading"),
                           cl::init(false), cl::Hidden);

// TODO: turn on and remove flag
static cl::opt<bool> EnablePGOForceFunctionAttrs(
    "enable-pgo-force-function-attrs",
    cl::desc("Enable pass to set function attributes based on PGO profiles"),
    cl::init(false));

// Enables the hot-cold splitting pass. Note: declared without cl::init and
// without cl::Hidden, unlike most options in this file.
static cl::opt<bool>
    EnableHotColdSplit("hot-cold-split",
                       cl::desc("Enable hot-cold splitting pass"));

// Enables the IR outliner pass (initialized to false).
static cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false),
                                      cl::Hidden,
                                      cl::desc("Enable ir outliner pass"));

// Disables the pre-instrumentation inliner (initialized to false, i.e. the
// pre-inliner is not disabled).
static cl::opt<bool>
    DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
                      cl::desc("Disable pre-instrumentation inliner"));

// Inlining threshold for the pre-instrumentation inliner (initialized to 75).
static cl::opt<int> PreInlineThreshold(
    "preinline-threshold", cl::Hidden, cl::init(75),
    cl::desc("Control the amount of inlining in pre-instrumentation inliner "
             "(default = 75)"));

// When set, buildFunctionSimplificationPipeline schedules GVNHoistPass.
static cl::opt<bool>
    EnableGVNHoist("enable-gvn-hoist",
                   cl::desc("Enable the GVN hoisting pass (default = off)"));

// When set, buildFunctionSimplificationPipeline schedules GVNSinkPass
// followed by a SimplifyCFGPass.
static cl::opt<bool>
    EnableGVNSink("enable-gvn-sink",
                  cl::desc("Enable the GVN sinking pass (default = off)"));

// When set, buildFunctionSimplificationPipeline schedules
// JumpTableToSwitchPass.
static cl::opt<bool> EnableJumpTableToSwitch(
    "enable-jump-table-to-switch",
    cl::desc("Enable JumpTableToSwitch pass (default = off)"));

// This option is used in simplifying testing SampleFDO optimizations for
// profile loading.
// NOTE(review): the sentence above does not match this option's own
// description (control height reduction); it may be a leftover from a
// different option -- confirm before relying on it.
static cl::opt<bool>
    EnableCHR("enable-chr", cl::init(true), cl::Hidden,
              cl::desc("Enable control height reduction optimization (CHR)"));
268
269 static cl::opt<bool> FlattenedProfileUsed(
270 "flattened-profile-used", cl::init(false), cl::Hidden,
271 cl::desc("Indicate the sample profile being used is flattened, i.e., "
272 "no inline hierachy exists in the profile"));
273
// Enables order file instrumentation (initialized to false).
static cl::opt<bool> EnableOrderFileInstrumentation(
    "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
    cl::desc("Enable order file instrumentation (default = off)"));

// Enables lowering of the matrix intrinsics (initialized to false).
static cl::opt<bool>
    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
                 cl::desc("Enable lowering of the matrix intrinsics"));

// When set (the initial value), buildFunctionSimplificationPipeline schedules
// ConstraintEliminationPass after reassociation.
static cl::opt<bool> EnableConstraintElimination(
    "enable-constraint-elimination", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable pass to eliminate conditions based on linear constraints"));

// Selects which attributor runs are enabled: all, module-wide only, call
// graph SCC only, or none (the initial value).
static cl::opt<AttributorRunOption> AttributorRun(
    "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
    cl::desc("Enable the attributor inter-procedural deduction pass"),
    cl::values(clEnumValN(AttributorRunOption::ALL, "all",
                          "enable all attributor runs"),
               clEnumValN(AttributorRunOption::MODULE, "module",
                          "enable module-wide attributor runs"),
               clEnumValN(AttributorRunOption::CGSCC, "cgscc",
                          "enable call graph SCC attributor runs"),
               clEnumValN(AttributorRunOption::NONE, "none",
                          "disable attributor runs")));

// Enables profile instrumentation sampling (initialized to false).
static cl::opt<bool> EnableSampledInstr(
    "enable-sampled-instrumentation", cl::init(false), cl::Hidden,
    cl::desc("Enable profile instrumentation sampling (default = off)"));

// Enables the experimental Loop Versioning LICM pass (initialized to false).
static cl::opt<bool> UseLoopVersioningLICM(
    "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
    cl::desc("Enable the experimental Loop Versioning LICM pass"));

// Options that are only declared here (extern); their definitions are not in
// this part of the file.
namespace llvm {
extern cl::opt<bool> EnableMemProfContextDisambiguation;

extern cl::opt<bool> EnableInferAlignmentPass;
} // namespace llvm
311
PipelineTuningOptions()312 PipelineTuningOptions::PipelineTuningOptions() {
313 LoopInterleaving = true;
314 LoopVectorization = true;
315 SLPVectorization = false;
316 LoopUnrolling = true;
317 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
318 LicmMssaOptCap = SetLicmMssaOptCap;
319 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
320 CallGraphProfile = true;
321 UnifiedLTO = false;
322 MergeFunctions = EnableMergeFunctions;
323 InlinerThreshold = -1;
324 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses;
325 }
326
// Declared here only (extern); the definition is not in this part of the file.
namespace llvm {
extern cl::opt<unsigned> MaxDevirtIterations;
} // namespace llvm
330
invokePeepholeEPCallbacks(FunctionPassManager & FPM,OptimizationLevel Level)331 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
332 OptimizationLevel Level) {
333 for (auto &C : PeepholeEPCallbacks)
334 C(FPM, Level);
335 }
invokeLateLoopOptimizationsEPCallbacks(LoopPassManager & LPM,OptimizationLevel Level)336 void PassBuilder::invokeLateLoopOptimizationsEPCallbacks(
337 LoopPassManager &LPM, OptimizationLevel Level) {
338 for (auto &C : LateLoopOptimizationsEPCallbacks)
339 C(LPM, Level);
340 }
invokeLoopOptimizerEndEPCallbacks(LoopPassManager & LPM,OptimizationLevel Level)341 void PassBuilder::invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM,
342 OptimizationLevel Level) {
343 for (auto &C : LoopOptimizerEndEPCallbacks)
344 C(LPM, Level);
345 }
invokeScalarOptimizerLateEPCallbacks(FunctionPassManager & FPM,OptimizationLevel Level)346 void PassBuilder::invokeScalarOptimizerLateEPCallbacks(
347 FunctionPassManager &FPM, OptimizationLevel Level) {
348 for (auto &C : ScalarOptimizerLateEPCallbacks)
349 C(FPM, Level);
350 }
invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager & CGPM,OptimizationLevel Level)351 void PassBuilder::invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM,
352 OptimizationLevel Level) {
353 for (auto &C : CGSCCOptimizerLateEPCallbacks)
354 C(CGPM, Level);
355 }
invokeVectorizerStartEPCallbacks(FunctionPassManager & FPM,OptimizationLevel Level)356 void PassBuilder::invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM,
357 OptimizationLevel Level) {
358 for (auto &C : VectorizerStartEPCallbacks)
359 C(FPM, Level);
360 }
invokeOptimizerEarlyEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)361 void PassBuilder::invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM,
362 OptimizationLevel Level) {
363 for (auto &C : OptimizerEarlyEPCallbacks)
364 C(MPM, Level);
365 }
invokeOptimizerLastEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)366 void PassBuilder::invokeOptimizerLastEPCallbacks(ModulePassManager &MPM,
367 OptimizationLevel Level) {
368 for (auto &C : OptimizerLastEPCallbacks)
369 C(MPM, Level);
370 }
invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)371 void PassBuilder::invokeFullLinkTimeOptimizationEarlyEPCallbacks(
372 ModulePassManager &MPM, OptimizationLevel Level) {
373 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks)
374 C(MPM, Level);
375 }
invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)376 void PassBuilder::invokeFullLinkTimeOptimizationLastEPCallbacks(
377 ModulePassManager &MPM, OptimizationLevel Level) {
378 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks)
379 C(MPM, Level);
380 }
invokePipelineStartEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)381 void PassBuilder::invokePipelineStartEPCallbacks(ModulePassManager &MPM,
382 OptimizationLevel Level) {
383 for (auto &C : PipelineStartEPCallbacks)
384 C(MPM, Level);
385 }
invokePipelineEarlySimplificationEPCallbacks(ModulePassManager & MPM,OptimizationLevel Level)386 void PassBuilder::invokePipelineEarlySimplificationEPCallbacks(
387 ModulePassManager &MPM, OptimizationLevel Level) {
388 for (auto &C : PipelineEarlySimplificationEPCallbacks)
389 C(MPM, Level);
390 }
391
392 // Helper to add AnnotationRemarksPass.
addAnnotationRemarksPass(ModulePassManager & MPM)393 static void addAnnotationRemarksPass(ModulePassManager &MPM) {
394 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
395 }
396
397 // Helper to check if the current compilation phase is preparing for LTO
isLTOPreLink(ThinOrFullLTOPhase Phase)398 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
399 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
400 Phase == ThinOrFullLTOPhase::FullLTOPreLink;
401 }
402
// Builds the function simplification pipeline that
// buildFunctionSimplificationPipeline dispatches to when the speedup level
// is 1.
//
// \p Level is forwarded to every extension-point callback and supplies the
// speedup level for LoopFullUnrollPass. \p Phase is passed through
// isLTOPreLink() to LoopRotatePass and, together with PGOOpt, decides whether
// full unrolling is scheduled. Returns the populated FunctionPassManager.
//
// TODO: Investigate the cost/benefit of tail call elimination on debugging.
FunctionPassManager
PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                                   ThinOrFullLTOPhase Phase) {

  FunctionPassManager FPM;

  if (AreStatisticsEnabled())
    FPM.addPass(CountVisitsPass());

  // Form SSA out of local memory accesses after breaking apart aggregates into
  // scalars.
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // Catch trivial redundancies
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));

  // Simplify the CFG and combine instructions. (This spot used to carry the
  // comment "Hoisting of scalars and load expressions", but no hoisting pass
  // is scheduled here in this pipeline.)
  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  FPM.addPass(InstCombinePass());

  FPM.addPass(LibCallsShrinkWrapPass());

  invokePeepholeEPCallbacks(FPM, Level);

  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));

  // Form canonically associated expression trees, and simplify the trees using
  // basic mathematical properties. For example, this will form (nearly)
  // minimal multiplication trees.
  FPM.addPass(ReassociatePass());

  // Add the primary loop simplification pipeline.
  // FIXME: Currently this is split into two loop pass pipelines because we run
  // some function passes in between them. These can and should be removed
  // and/or replaced by scheduling the loop pass equivalents in the correct
  // positions. But those equivalent passes aren't powerful enough yet.
  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
  // used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet to
  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
  // `LoopInstSimplify`.
  LoopPassManager LPM1, LPM2;

  // Simplify the loop body. We do this initially to clean up after other loop
  // passes run, either when iterating on a loop or on inner loops with
  // implications on the outer loop.
  LPM1.addPass(LoopInstSimplifyPass());
  LPM1.addPass(LoopSimplifyCFGPass());

  // Try to remove as much code from the loop header as possible,
  // to reduce amount of IR that will have to be duplicated. However,
  // do not perform speculative hoisting the first time as LICM
  // will destroy metadata that may not need to be destroyed if run
  // after loop rotation.
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                        /*AllowSpeculation=*/false));

  // NOTE(review): the inline comment on the first argument used to read
  // "Disable header duplication", but the value passed is `true`. Judging by
  // the -Oz handling in buildFunctionSimplificationPipeline (where the
  // argument evaluates to false to disable duplication), `true` presumably
  // *enables* header duplication here -- confirm against LoopRotatePass.
  LPM1.addPass(LoopRotatePass(/*EnableHeaderDuplication=*/true,
                              isLTOPreLink(Phase)));
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                        /*AllowSpeculation=*/true));
  LPM1.addPass(SimpleLoopUnswitchPass());
  if (EnableLoopFlatten)
    LPM1.addPass(LoopFlattenPass());

  LPM2.addPass(LoopIdiomRecognizePass());
  LPM2.addPass(IndVarSimplifyPass());

  invokeLateLoopOptimizationsEPCallbacks(LPM2, Level);

  LPM2.addPass(LoopDeletionPass());

  if (EnableLoopInterchange)
    LPM2.addPass(LoopInterchangePass());

  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
  // because it changes the IR in a way that makes profile annotation in the
  // back-end compile inaccurate. The normal unroller doesn't pay attention to
  // forced full unroll attributes so we need to make sure and allow the full
  // unroll pass to pay attention to it.
  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
      PGOOpt->Action != PGOOptions::SampleUse)
    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                    PTO.ForgetAllSCEVInLoopUnroll));

  invokeLoopOptimizerEndEPCallbacks(LPM2, Level);

  // Both loop pass managers are complete; hand them to the function pipeline
  // with SimplifyCFG + InstCombine scheduled in between.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
                                              /*UseMemorySSA=*/true,
                                              /*UseBlockFrequencyInfo=*/true));
  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  FPM.addPass(InstCombinePass());
  // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                              /*UseMemorySSA=*/false,
                                              /*UseBlockFrequencyInfo=*/false));

  // Delete small arrays after loop unrolling.
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
  FPM.addPass(MemCpyOptPass());

  // Sparse conditional constant propagation.
  // FIXME: It isn't clear why we do this *after* loop passes rather than
  // before...
  FPM.addPass(SCCPPass());

  // Delete dead bit computations (instcombine runs after to fold away the dead
  // computations, and then ADCE will run later to exploit any new DCE
  // opportunities that creates).
  FPM.addPass(BDCEPass());

  // Run instcombine after redundancy and dead bit elimination to exploit
  // opportunities opened up by them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  FPM.addPass(CoroElidePass());

  invokeScalarOptimizerLateEPCallbacks(FPM, Level);

  // Finally, do an expensive DCE pass to catch all the dead code exposed by
  // the simplifications and basic cleanup after all the simplifications.
  // TODO: Investigate if this is too expensive.
  FPM.addPass(ADCEPass());
  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  return FPM;
}
543
// Builds the function simplification pipeline for \p Level.
//
// \p Level must not be O0 (asserted). When its speedup level is 1 the work is
// delegated to buildO1FunctionSimplificationPipeline; otherwise the pipeline
// is assembled here. \p Level is also forwarded to every extension-point
// callback and gates the size-sensitive and O3-only choices below. \p Phase is
// passed through isLTOPreLink() to LoopRotatePass and, together with PGOOpt,
// decides whether full unrolling is scheduled. Returns the populated
// FunctionPassManager.
FunctionPassManager
PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
                                                 ThinOrFullLTOPhase Phase) {
  assert(Level != OptimizationLevel::O0 && "Must request optimizations!");

  // The O1 pipeline has a separate pipeline creation function to simplify
  // construction readability.
  if (Level.getSpeedupLevel() == 1)
    return buildO1FunctionSimplificationPipeline(Level, Phase);

  FunctionPassManager FPM;

  if (AreStatisticsEnabled())
    FPM.addPass(CountVisitsPass());

  // Form SSA out of local memory accesses after breaking apart aggregates into
  // scalars.
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // Catch trivial redundancies
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
  if (EnableKnowledgeRetention)
    FPM.addPass(AssumeSimplifyPass());

  // Hoisting of scalars and load expressions.
  if (EnableGVNHoist)
    FPM.addPass(GVNHoistPass());

  // Global value numbering based sinking.
  if (EnableGVNSink) {
    FPM.addPass(GVNSinkPass());
    FPM.addPass(
        SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  }

  // Speculative execution if the target has divergent branches; otherwise nop.
  FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));

  // Optimize based on known information about branches, and cleanup afterward.
  FPM.addPass(JumpThreadingPass());
  FPM.addPass(CorrelatedValuePropagationPass());

  // Jump table to switch conversion.
  if (EnableJumpTableToSwitch)
    FPM.addPass(JumpTableToSwitchPass());

  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  FPM.addPass(InstCombinePass());
  FPM.addPass(AggressiveInstCombinePass());

  // Library call shrink-wrapping is skipped when optimizing for size.
  if (!Level.isOptimizingForSize())
    FPM.addPass(LibCallsShrinkWrapPass());

  invokePeepholeEPCallbacks(FPM, Level);

  // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
  // using the size value profile. Don't perform this when optimizing for size.
  if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
      !Level.isOptimizingForSize())
    FPM.addPass(PGOMemOPSizeOpt());

  FPM.addPass(TailCallElimPass());
  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));

  // Form canonically associated expression trees, and simplify the trees using
  // basic mathematical properties. For example, this will form (nearly)
  // minimal multiplication trees.
  FPM.addPass(ReassociatePass());

  if (EnableConstraintElimination)
    FPM.addPass(ConstraintEliminationPass());

  // Add the primary loop simplification pipeline.
  // FIXME: Currently this is split into two loop pass pipelines because we run
  // some function passes in between them. These can and should be removed
  // and/or replaced by scheduling the loop pass equivalents in the correct
  // positions. But those equivalent passes aren't powerful enough yet.
  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
  // used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet to
  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
  // `LoopInstSimplify`.
  LoopPassManager LPM1, LPM2;

  // Simplify the loop body. We do this initially to clean up after other loop
  // passes run, either when iterating on a loop or on inner loops with
  // implications on the outer loop.
  LPM1.addPass(LoopInstSimplifyPass());
  LPM1.addPass(LoopSimplifyCFGPass());

  // Try to remove as much code from the loop header as possible,
  // to reduce amount of IR that will have to be duplicated. However,
  // do not perform speculative hoisting the first time as LICM
  // will destroy metadata that may not need to be destroyed if run
  // after loop rotation.
  // TODO: Investigate promotion cap for O1.
  // NOTE(review): this function returns early for speedup level 1 (see
  // above), so the "for O1" TODO looks carried over from the O1 pipeline.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                        /*AllowSpeculation=*/false));

  // Disable header duplication in loop rotation at -Oz, unless
  // EnableLoopHeaderDuplication is set.
  LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
                                  Level != OptimizationLevel::Oz,
                              isLTOPreLink(Phase)));
  // TODO: Investigate promotion cap for O1.
  // NOTE(review): same as above -- this code is not reached for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
                        /*AllowSpeculation=*/true));
  // Non-trivial unswitching is only requested at O3.
  LPM1.addPass(
      SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3));
  if (EnableLoopFlatten)
    LPM1.addPass(LoopFlattenPass());

  LPM2.addPass(LoopIdiomRecognizePass());
  LPM2.addPass(IndVarSimplifyPass());

  // Schedule a second unswitch (same NonTrivial setting as the one in LPM1),
  // wrapped in ExtraSimpleLoopUnswitchPassManager.
  {
    ExtraSimpleLoopUnswitchPassManager ExtraPasses;
    ExtraPasses.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
                                               OptimizationLevel::O3));
    LPM2.addPass(std::move(ExtraPasses));
  }

  invokeLateLoopOptimizationsEPCallbacks(LPM2, Level);

  LPM2.addPass(LoopDeletionPass());

  if (EnableLoopInterchange)
    LPM2.addPass(LoopInterchangePass());

  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
  // because it changes the IR in a way that makes profile annotation in the
  // back-end compile inaccurate. The normal unroller doesn't pay attention to
  // forced full unroll attributes so we need to make sure and allow the full
  // unroll pass to pay attention to it.
  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
      PGOOpt->Action != PGOOptions::SampleUse)
    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                    PTO.ForgetAllSCEVInLoopUnroll));

  invokeLoopOptimizerEndEPCallbacks(LPM2, Level);

  // Both loop pass managers are complete; hand them to the function pipeline
  // with SimplifyCFG + InstCombine scheduled in between.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
                                              /*UseMemorySSA=*/true,
                                              /*UseBlockFrequencyInfo=*/true));
  FPM.addPass(
      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
  FPM.addPass(InstCombinePass());
  // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
  // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                              /*UseMemorySSA=*/false,
                                              /*UseBlockFrequencyInfo=*/false));

  // Delete small arrays after loop unrolling.
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // Try vectorization/scalarization transforms that are both improvements
  // themselves and can allow further folds with GVN and InstCombine.
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));

  // Eliminate redundancies.
  FPM.addPass(MergedLoadStoreMotionPass());
  if (RunNewGVN)
    FPM.addPass(NewGVNPass());
  else
    FPM.addPass(GVNPass());

  // Sparse conditional constant propagation.
  // FIXME: It isn't clear why we do this *after* loop passes rather than
  // before...
  FPM.addPass(SCCPPass());

  // Delete dead bit computations (instcombine runs after to fold away the dead
  // computations, and then ADCE will run later to exploit any new DCE
  // opportunities that creates).
  FPM.addPass(BDCEPass());

  // Run instcombine after redundancy and dead bit elimination to exploit
  // opportunities opened up by them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  // Re-consider control flow based optimizations after redundancy elimination,
  // redo DCE, etc.
  if (EnableDFAJumpThreading)
    FPM.addPass(DFAJumpThreadingPass());

  FPM.addPass(JumpThreadingPass());
  FPM.addPass(CorrelatedValuePropagationPass());

  // Finally, do an expensive DCE pass to catch all the dead code exposed by
  // the simplifications and basic cleanup after all the simplifications.
  // TODO: Investigate if this is too expensive.
  FPM.addPass(ADCEPass());

  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
  FPM.addPass(MemCpyOptPass());

  FPM.addPass(DSEPass());
  FPM.addPass(MoveAutoInitPass());

  // One more LICM run, with speculation allowed, using MemorySSA but not
  // block frequency info.
  FPM.addPass(createFunctionToLoopPassAdaptor(
      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
               /*AllowSpeculation=*/true),
      /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));

  FPM.addPass(CoroElidePass());

  invokeScalarOptimizerLateEPCallbacks(FPM, Level);

  // Final CFG cleanup; unlike the earlier SimplifyCFG runs in this function,
  // this one also enables hoisting and sinking of common instructions.
  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                  .convertSwitchRangeToICmp(true)
                                  .hoistCommonInsts(true)
                                  .sinkCommonInsts(true)));
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  return FPM;
}
765
addRequiredLTOPreLinkPasses(ModulePassManager & MPM)766 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
767 MPM.addPass(CanonicalizeAliasesPass());
768 MPM.addPass(NameAnonGlobalPass());
769 }
770
addPreInlinerPasses(ModulePassManager & MPM,OptimizationLevel Level,ThinOrFullLTOPhase LTOPhase)771 void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM,
772 OptimizationLevel Level,
773 ThinOrFullLTOPhase LTOPhase) {
774 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
775 if (DisablePreInliner)
776 return;
777 InlineParams IP;
778
779 IP.DefaultThreshold = PreInlineThreshold;
780
781 // FIXME: The hint threshold has the same value used by the regular inliner
782 // when not optimzing for size. This should probably be lowered after
783 // performance testing.
784 // FIXME: this comment is cargo culted from the old pass manager, revisit).
785 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
786 ModuleInlinerWrapperPass MIWP(
787 IP, /* MandatoryFirst */ true,
788 InlineContext{LTOPhase, InlinePass::EarlyInliner});
789 CGSCCPassManager &CGPipeline = MIWP.getPM();
790
791 FunctionPassManager FPM;
792 FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
793 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
794 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(
795 true))); // Merge & remove basic blocks.
796 FPM.addPass(InstCombinePass()); // Combine silly sequences.
797 invokePeepholeEPCallbacks(FPM, Level);
798
799 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
800 std::move(FPM), PTO.EagerlyInvalidateAnalyses));
801
802 MPM.addPass(std::move(MIWP));
803
804 // Delete anything that is now dead to make sure that we don't instrument
805 // dead code. Instrumentation can end up keeping dead code around and
806 // dramatically increase code size.
807 MPM.addPass(GlobalDCEPass());
808 }
809
addPostPGOLoopRotation(ModulePassManager & MPM,OptimizationLevel Level)810 void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM,
811 OptimizationLevel Level) {
812 if (EnablePostPGOLoopRotation) {
813 // Disable header duplication in loop rotation at -Oz.
814 MPM.addPass(createModuleToFunctionPassAdaptor(
815 createFunctionToLoopPassAdaptor(
816 LoopRotatePass(EnableLoopHeaderDuplication ||
817 Level != OptimizationLevel::Oz),
818 /*UseMemorySSA=*/false,
819 /*UseBlockFrequencyInfo=*/false),
820 PTO.EagerlyInvalidateAnalyses));
821 }
822 }
823
addPGOInstrPasses(ModulePassManager & MPM,OptimizationLevel Level,bool RunProfileGen,bool IsCS,bool AtomicCounterUpdate,std::string ProfileFile,std::string ProfileRemappingFile,IntrusiveRefCntPtr<vfs::FileSystem> FS)824 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
825 OptimizationLevel Level, bool RunProfileGen,
826 bool IsCS, bool AtomicCounterUpdate,
827 std::string ProfileFile,
828 std::string ProfileRemappingFile,
829 IntrusiveRefCntPtr<vfs::FileSystem> FS) {
830 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
831
832 if (!RunProfileGen) {
833 assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
834 MPM.addPass(
835 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS));
836 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
837 // RequireAnalysisPass for PSI before subsequent non-module passes.
838 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
839 return;
840 }
841
842 // Perform PGO instrumentation.
843 MPM.addPass(PGOInstrumentationGen(IsCS));
844
845 addPostPGOLoopRotation(MPM, Level);
846 // Add the profile lowering pass.
847 InstrProfOptions Options;
848 if (!ProfileFile.empty())
849 Options.InstrProfileOutput = ProfileFile;
850 // Do counter promotion at Level greater than O0.
851 Options.DoCounterPromotion = true;
852 Options.UseBFIInPromotion = IsCS;
853 if (EnableSampledInstr) {
854 Options.Sampling = true;
855 // With sampling, there is little beneifit to enable counter promotion.
856 // But note that sampling does work with counter promotion.
857 Options.DoCounterPromotion = false;
858 }
859 Options.Atomic = AtomicCounterUpdate;
860 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
861 }
862
addPGOInstrPassesForO0(ModulePassManager & MPM,bool RunProfileGen,bool IsCS,bool AtomicCounterUpdate,std::string ProfileFile,std::string ProfileRemappingFile,IntrusiveRefCntPtr<vfs::FileSystem> FS)863 void PassBuilder::addPGOInstrPassesForO0(
864 ModulePassManager &MPM, bool RunProfileGen, bool IsCS,
865 bool AtomicCounterUpdate, std::string ProfileFile,
866 std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) {
867 if (!RunProfileGen) {
868 assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
869 MPM.addPass(
870 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS));
871 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
872 // RequireAnalysisPass for PSI before subsequent non-module passes.
873 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
874 return;
875 }
876
877 // Perform PGO instrumentation.
878 MPM.addPass(PGOInstrumentationGen(IsCS));
879 // Add the profile lowering pass.
880 InstrProfOptions Options;
881 if (!ProfileFile.empty())
882 Options.InstrProfileOutput = ProfileFile;
883 // Do not do counter promotion at O0.
884 Options.DoCounterPromotion = false;
885 Options.UseBFIInPromotion = IsCS;
886 Options.Atomic = AtomicCounterUpdate;
887 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS));
888 }
889
getInlineParamsFromOptLevel(OptimizationLevel Level)890 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) {
891 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
892 }
893
894 ModuleInlinerWrapperPass
buildInlinerPipeline(OptimizationLevel Level,ThinOrFullLTOPhase Phase)895 PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
896 ThinOrFullLTOPhase Phase) {
897 InlineParams IP;
898 if (PTO.InlinerThreshold == -1)
899 IP = getInlineParamsFromOptLevel(Level);
900 else
901 IP = getInlineParams(PTO.InlinerThreshold);
902 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
903 // disable hot callsite inline (as much as possible [1]) because it makes
904 // profile annotation in the backend inaccurate.
905 //
906 // [1] Note the cost of a function could be below zero due to erased
907 // prologue / epilogue.
908 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
909 PGOOpt->Action == PGOOptions::SampleUse)
910 IP.HotCallSiteThreshold = 0;
911
912 if (PGOOpt)
913 IP.EnableDeferral = EnablePGOInlineDeferral;
914
915 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst,
916 InlineContext{Phase, InlinePass::CGSCCInliner},
917 UseInlineAdvisor, MaxDevirtIterations);
918
919 // Require the GlobalsAA analysis for the module so we can query it within
920 // the CGSCC pipeline.
921 if (EnableGlobalAnalyses) {
922 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>());
923 // Invalidate AAManager so it can be recreated and pick up the newly
924 // available GlobalsAA.
925 MIWP.addModulePass(
926 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
927 }
928
929 // Require the ProfileSummaryAnalysis for the module so we can query it within
930 // the inliner pass.
931 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
932
933 // Now begin the main postorder CGSCC pipeline.
934 // FIXME: The current CGSCC pipeline has its origins in the legacy pass
935 // manager and trying to emulate its precise behavior. Much of this doesn't
936 // make a lot of sense and we should revisit the core CGSCC structure.
937 CGSCCPassManager &MainCGPipeline = MIWP.getPM();
938
939 // Note: historically, the PruneEH pass was run first to deduce nounwind and
940 // generally clean up exception handling overhead. It isn't clear this is
941 // valuable as the inliner doesn't currently care whether it is inlining an
942 // invoke or a call.
943
944 if (AttributorRun & AttributorRunOption::CGSCC)
945 MainCGPipeline.addPass(AttributorCGSCCPass());
946
947 // Deduce function attributes. We do another run of this after the function
948 // simplification pipeline, so this only needs to run when it could affect the
949 // function simplification pipeline, which is only the case with recursive
950 // functions.
951 MainCGPipeline.addPass(PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true));
952
953 // When at O3 add argument promotion to the pass pipeline.
954 // FIXME: It isn't at all clear why this should be limited to O3.
955 if (Level == OptimizationLevel::O3)
956 MainCGPipeline.addPass(ArgumentPromotionPass());
957
958 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
959 // there are no OpenMP runtime calls present in the module.
960 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
961 MainCGPipeline.addPass(OpenMPOptCGSCCPass());
962
963 invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level);
964
965 // Add the core function simplification pipeline nested inside the
966 // CGSCC walk.
967 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
968 buildFunctionSimplificationPipeline(Level, Phase),
969 PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true));
970
971 // Finally, deduce any function attributes based on the fully simplified
972 // function.
973 MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
974
975 // Mark that the function is fully simplified and that it shouldn't be
976 // simplified again if we somehow revisit it due to CGSCC mutations unless
977 // it's been modified since.
978 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
979 RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
980
981 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
982
983 // Make sure we don't affect potential future NoRerun CGSCC adaptors.
984 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
985 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>()));
986
987 return MIWP;
988 }
989
990 ModulePassManager
buildModuleInlinerPipeline(OptimizationLevel Level,ThinOrFullLTOPhase Phase)991 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
992 ThinOrFullLTOPhase Phase) {
993 ModulePassManager MPM;
994
995 InlineParams IP = getInlineParamsFromOptLevel(Level);
996 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
997 // disable hot callsite inline (as much as possible [1]) because it makes
998 // profile annotation in the backend inaccurate.
999 //
1000 // [1] Note the cost of a function could be below zero due to erased
1001 // prologue / epilogue.
1002 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
1003 PGOOpt->Action == PGOOptions::SampleUse)
1004 IP.HotCallSiteThreshold = 0;
1005
1006 if (PGOOpt)
1007 IP.EnableDeferral = EnablePGOInlineDeferral;
1008
1009 // The inline deferral logic is used to avoid losing some
1010 // inlining chance in future. It is helpful in SCC inliner, in which
1011 // inlining is processed in bottom-up order.
1012 // While in module inliner, the inlining order is a priority-based order
1013 // by default. The inline deferral is unnecessary there. So we disable the
1014 // inline deferral logic in module inliner.
1015 IP.EnableDeferral = false;
1016
1017 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase));
1018
1019 MPM.addPass(createModuleToFunctionPassAdaptor(
1020 buildFunctionSimplificationPipeline(Level, Phase),
1021 PTO.EagerlyInvalidateAnalyses));
1022
1023 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
1024 CoroSplitPass(Level != OptimizationLevel::O0)));
1025
1026 return MPM;
1027 }
1028
1029 ModulePassManager
buildModuleSimplificationPipeline(OptimizationLevel Level,ThinOrFullLTOPhase Phase)1030 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
1031 ThinOrFullLTOPhase Phase) {
1032 assert(Level != OptimizationLevel::O0 &&
1033 "Should not be used for O0 pipeline");
1034
1035 assert(Phase != ThinOrFullLTOPhase::FullLTOPostLink &&
1036 "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!");
1037
1038 ModulePassManager MPM;
1039
1040 // Place pseudo probe instrumentation as the first pass of the pipeline to
1041 // minimize the impact of optimization changes.
1042 if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1043 Phase != ThinOrFullLTOPhase::ThinLTOPostLink)
1044 MPM.addPass(SampleProfileProbePass(TM));
1045
1046 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);
1047
1048 // In ThinLTO mode, when flattened profile is used, all the available
1049 // profile information will be annotated in PreLink phase so there is
1050 // no need to load the profile again in PostLink.
1051 bool LoadSampleProfile =
1052 HasSampleProfile &&
1053 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink);
1054
1055 // During the ThinLTO backend phase we perform early indirect call promotion
1056 // here, before globalopt. Otherwise imported available_externally functions
1057 // look unreferenced and are removed. If we are going to load the sample
1058 // profile then defer until later.
1059 // TODO: See if we can move later and consolidate with the location where
1060 // we perform ICP when we are loading a sample profile.
1061 // TODO: We pass HasSampleProfile (whether there was a sample profile file
1062 // passed to the compile) to the SamplePGO flag of ICP. This is used to
1063 // determine whether the new direct calls are annotated with prof metadata.
1064 // Ideally this should be determined from whether the IR is annotated with
1065 // sample profile, and not whether the a sample profile was provided on the
1066 // command line. E.g. for flattened profiles where we will not be reloading
1067 // the sample profile in the ThinLTO backend, we ideally shouldn't have to
1068 // provide the sample profile file.
1069 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile)
1070 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile));
1071
1072 // Create an early function pass manager to cleanup the output of the
1073 // frontend. Not necessary with LTO post link pipelines since the pre link
1074 // pipeline already cleaned up the frontend output.
1075 if (Phase != ThinOrFullLTOPhase::ThinLTOPostLink) {
1076 // Do basic inference of function attributes from known properties of system
1077 // libraries and other oracles.
1078 MPM.addPass(InferFunctionAttrsPass());
1079 MPM.addPass(CoroEarlyPass());
1080
1081 FunctionPassManager EarlyFPM;
1082 EarlyFPM.addPass(EntryExitInstrumenterPass(/*PostInlining=*/false));
1083 // Lower llvm.expect to metadata before attempting transforms.
1084 // Compare/branch metadata may alter the behavior of passes like
1085 // SimplifyCFG.
1086 EarlyFPM.addPass(LowerExpectIntrinsicPass());
1087 EarlyFPM.addPass(SimplifyCFGPass());
1088 EarlyFPM.addPass(SROAPass(SROAOptions::ModifyCFG));
1089 EarlyFPM.addPass(EarlyCSEPass());
1090 if (Level == OptimizationLevel::O3)
1091 EarlyFPM.addPass(CallSiteSplittingPass());
1092 MPM.addPass(createModuleToFunctionPassAdaptor(
1093 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
1094 }
1095
1096 if (LoadSampleProfile) {
1097 // Annotate sample profile right after early FPM to ensure freshness of
1098 // the debug info.
1099 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
1100 PGOOpt->ProfileRemappingFile, Phase));
1101 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
1102 // RequireAnalysisPass for PSI before subsequent non-module passes.
1103 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
1104 // Do not invoke ICP in the LTOPrelink phase as it makes it hard
1105 // for the profile annotation to be accurate in the LTO backend.
1106 if (!isLTOPreLink(Phase))
1107 // We perform early indirect call promotion here, before globalopt.
1108 // This is important for the ThinLTO backend phase because otherwise
1109 // imported available_externally functions look unreferenced and are
1110 // removed.
1111 MPM.addPass(
1112 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
1113 }
1114
1115 // Try to perform OpenMP specific optimizations on the module. This is a
1116 // (quick!) no-op if there are no OpenMP runtime calls present in the module.
1117 MPM.addPass(OpenMPOptPass());
1118
1119 if (AttributorRun & AttributorRunOption::MODULE)
1120 MPM.addPass(AttributorPass());
1121
1122 // Lower type metadata and the type.test intrinsic in the ThinLTO
1123 // post link pipeline after ICP. This is to enable usage of the type
1124 // tests in ICP sequences.
1125 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink)
1126 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1127
1128 invokePipelineEarlySimplificationEPCallbacks(MPM, Level);
1129
1130 // Interprocedural constant propagation now that basic cleanup has occurred
1131 // and prior to optimizing globals.
1132 // FIXME: This position in the pipeline hasn't been carefully considered in
1133 // years, it should be re-analyzed.
1134 MPM.addPass(IPSCCPPass(
1135 IPSCCPOptions(/*AllowFuncSpec=*/
1136 Level != OptimizationLevel::Os &&
1137 Level != OptimizationLevel::Oz &&
1138 !isLTOPreLink(Phase))));
1139
1140 // Attach metadata to indirect call sites indicating the set of functions
1141 // they may target at run-time. This should follow IPSCCP.
1142 MPM.addPass(CalledValuePropagationPass());
1143
1144 // Optimize globals to try and fold them into constants.
1145 MPM.addPass(GlobalOptPass());
1146
1147 // Create a small function pass pipeline to cleanup after all the global
1148 // optimizations.
1149 FunctionPassManager GlobalCleanupPM;
1150 // FIXME: Should this instead by a run of SROA?
1151 GlobalCleanupPM.addPass(PromotePass());
1152 GlobalCleanupPM.addPass(InstCombinePass());
1153 invokePeepholeEPCallbacks(GlobalCleanupPM, Level);
1154 GlobalCleanupPM.addPass(
1155 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
1156 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM),
1157 PTO.EagerlyInvalidateAnalyses));
1158
1159 // We already asserted this happens in non-FullLTOPostLink earlier.
1160 const bool IsPreLink = Phase != ThinOrFullLTOPhase::ThinLTOPostLink;
1161 const bool IsPGOPreLink = PGOOpt && IsPreLink;
1162 const bool IsPGOInstrGen =
1163 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRInstr;
1164 const bool IsPGOInstrUse =
1165 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRUse;
1166 const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty();
1167 // We don't want to mix pgo ctx gen and pgo gen; we also don't currently
1168 // enable ctx profiling from the frontend.
1169 assert(
1170 !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) &&
1171 "Enabling both instrumented FDO and contextual instrumentation is not "
1172 "supported.");
1173 // Enable contextual profiling instrumentation.
1174 const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink &&
1175 PGOCtxProfLoweringPass::isContextualIRPGOEnabled();
1176
1177 if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen)
1178 addPreInlinerPasses(MPM, Level, Phase);
1179
1180 // Add all the requested passes for instrumentation PGO, if requested.
1181 if (IsPGOInstrGen || IsPGOInstrUse) {
1182 addPGOInstrPasses(MPM, Level,
1183 /*RunProfileGen=*/IsPGOInstrGen,
1184 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate,
1185 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1186 PGOOpt->FS);
1187 } else if (IsCtxProfGen) {
1188 MPM.addPass(PGOInstrumentationGen(false));
1189 addPostPGOLoopRotation(MPM, Level);
1190 MPM.addPass(PGOCtxProfLoweringPass());
1191 }
1192
1193 if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen)
1194 MPM.addPass(PGOIndirectCallPromotion(false, false));
1195
1196 if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
1197 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
1198 EnableSampledInstr));
1199
1200 if (IsMemprofUse)
1201 MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
1202
1203 // Synthesize function entry counts for non-PGO compilation.
1204 if (EnableSyntheticCounts && !PGOOpt)
1205 MPM.addPass(SyntheticCountsPropagation());
1206
1207 if (EnablePGOForceFunctionAttrs && PGOOpt)
1208 MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
1209
1210 MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
1211
1212 if (EnableModuleInliner)
1213 MPM.addPass(buildModuleInlinerPipeline(Level, Phase));
1214 else
1215 MPM.addPass(buildInlinerPipeline(Level, Phase));
1216
1217 // Remove any dead arguments exposed by cleanups, constant folding globals,
1218 // and argument promotion.
1219 MPM.addPass(DeadArgumentEliminationPass());
1220
1221 MPM.addPass(CoroCleanupPass());
1222
1223 // Optimize globals now that functions are fully simplified.
1224 MPM.addPass(GlobalOptPass());
1225 MPM.addPass(GlobalDCEPass());
1226
1227 return MPM;
1228 }
1229
1230 /// TODO: Should LTO cause any differences to this set of passes?
addVectorPasses(OptimizationLevel Level,FunctionPassManager & FPM,bool IsFullLTO)1231 void PassBuilder::addVectorPasses(OptimizationLevel Level,
1232 FunctionPassManager &FPM, bool IsFullLTO) {
1233 FPM.addPass(LoopVectorizePass(
1234 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
1235
1236 if (EnableInferAlignmentPass)
1237 FPM.addPass(InferAlignmentPass());
1238 if (IsFullLTO) {
1239 // The vectorizer may have significantly shortened a loop body; unroll
1240 // again. Unroll small loops to hide loop backedge latency and saturate any
1241 // parallel execution resources of an out-of-order processor. We also then
1242 // need to clean up redundancies and loop invariant code.
1243 // FIXME: It would be really good to use a loop-integrated instruction
1244 // combiner for cleanup here so that the unrolling and LICM can be pipelined
1245 // across the loop nests.
1246 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
1247 if (EnableUnrollAndJam && PTO.LoopUnrolling)
1248 FPM.addPass(createFunctionToLoopPassAdaptor(
1249 LoopUnrollAndJamPass(Level.getSpeedupLevel())));
1250 FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
1251 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
1252 PTO.ForgetAllSCEVInLoopUnroll)));
1253 FPM.addPass(WarnMissedTransformationsPass());
1254 // Now that we are done with loop unrolling, be it either by LoopVectorizer,
1255 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have
1256 // become constant-offset, thus enabling SROA and alloca promotion. Do so.
1257 // NOTE: we are very late in the pipeline, and we don't have any LICM
1258 // or SimplifyCFG passes scheduled after us, that would cleanup
1259 // the CFG mess this may created if allowed to modify CFG, so forbid that.
1260 FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
1261 }
1262
1263 if (!IsFullLTO) {
1264 // Eliminate loads by forwarding stores from the previous iteration to loads
1265 // of the current iteration.
1266 FPM.addPass(LoopLoadEliminationPass());
1267 }
1268 // Cleanup after the loop optimization passes.
1269 FPM.addPass(InstCombinePass());
1270
1271 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
1272 ExtraVectorPassManager ExtraPasses;
1273 // At higher optimization levels, try to clean up any runtime overlap and
1274 // alignment checks inserted by the vectorizer. We want to track correlated
1275 // runtime checks for two inner loops in the same outer loop, fold any
1276 // common computations, hoist loop-invariant aspects out of any outer loop,
1277 // and unswitch the runtime checks if possible. Once hoisted, we may have
1278 // dead (or speculatable) control flows or more combining opportunities.
1279 ExtraPasses.addPass(EarlyCSEPass());
1280 ExtraPasses.addPass(CorrelatedValuePropagationPass());
1281 ExtraPasses.addPass(InstCombinePass());
1282 LoopPassManager LPM;
1283 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
1284 /*AllowSpeculation=*/true));
1285 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
1286 OptimizationLevel::O3));
1287 ExtraPasses.addPass(
1288 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
1289 /*UseBlockFrequencyInfo=*/true));
1290 ExtraPasses.addPass(
1291 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
1292 ExtraPasses.addPass(InstCombinePass());
1293 FPM.addPass(std::move(ExtraPasses));
1294 }
1295
1296 // Now that we've formed fast to execute loop structures, we do further
1297 // optimizations. These are run afterward as they might block doing complex
1298 // analyses and transforms such as what are needed for loop vectorization.
1299
1300 // Cleanup after loop vectorization, etc. Simplification passes like CVP and
1301 // GVN, loop transforms, and others have already run, so it's now better to
1302 // convert to more optimized IR using more aggressive simplify CFG options.
1303 // The extra sinking transform can create larger basic blocks, so do this
1304 // before SLP vectorization.
1305 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
1306 .forwardSwitchCondToPhi(true)
1307 .convertSwitchRangeToICmp(true)
1308 .convertSwitchToLookupTable(true)
1309 .needCanonicalLoops(false)
1310 .hoistCommonInsts(true)
1311 .sinkCommonInsts(true)));
1312
1313 if (IsFullLTO) {
1314 FPM.addPass(SCCPPass());
1315 FPM.addPass(InstCombinePass());
1316 FPM.addPass(BDCEPass());
1317 }
1318
1319 // Optimize parallel scalar instruction chains into SIMD instructions.
1320 if (PTO.SLPVectorization) {
1321 FPM.addPass(SLPVectorizerPass());
1322 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
1323 FPM.addPass(EarlyCSEPass());
1324 }
1325 }
1326 // Enhance/cleanup vector code.
1327 FPM.addPass(VectorCombinePass());
1328
1329 if (!IsFullLTO) {
1330 FPM.addPass(InstCombinePass());
1331 // Unroll small loops to hide loop backedge latency and saturate any
1332 // parallel execution resources of an out-of-order processor. We also then
1333 // need to clean up redundancies and loop invariant code.
1334 // FIXME: It would be really good to use a loop-integrated instruction
1335 // combiner for cleanup here so that the unrolling and LICM can be pipelined
1336 // across the loop nests.
1337 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
1338 if (EnableUnrollAndJam && PTO.LoopUnrolling) {
1339 FPM.addPass(createFunctionToLoopPassAdaptor(
1340 LoopUnrollAndJamPass(Level.getSpeedupLevel())));
1341 }
1342 FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
1343 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
1344 PTO.ForgetAllSCEVInLoopUnroll)));
1345 FPM.addPass(WarnMissedTransformationsPass());
1346 // Now that we are done with loop unrolling, be it either by LoopVectorizer,
1347 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have
1348 // become constant-offset, thus enabling SROA and alloca promotion. Do so.
1349 // NOTE: we are very late in the pipeline, and we don't have any LICM
1350 // or SimplifyCFG passes scheduled after us, that would cleanup
1351 // the CFG mess this may created if allowed to modify CFG, so forbid that.
1352 FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
1353 }
1354
1355 if (EnableInferAlignmentPass)
1356 FPM.addPass(InferAlignmentPass());
1357 FPM.addPass(InstCombinePass());
1358
1359 // This is needed for two reasons:
1360 // 1. It works around problems that instcombine introduces, such as sinking
1361 // expensive FP divides into loops containing multiplications using the
1362 // divide result.
1363 // 2. It helps to clean up some loop-invariant code created by the loop
1364 // unroll pass when IsFullLTO=false.
1365 FPM.addPass(createFunctionToLoopPassAdaptor(
1366 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
1367 /*AllowSpeculation=*/true),
1368 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1369
1370 // Now that we've vectorized and unrolled loops, we may have more refined
1371 // alignment information, try to re-derive it here.
1372 FPM.addPass(AlignmentFromAssumptionsPass());
1373 }
1374
1375 ModulePassManager
buildModuleOptimizationPipeline(OptimizationLevel Level,ThinOrFullLTOPhase LTOPhase)1376 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
1377 ThinOrFullLTOPhase LTOPhase) {
1378 const bool LTOPreLink = isLTOPreLink(LTOPhase);
1379 ModulePassManager MPM;
1380
1381 // Run partial inlining pass to partially inline functions that have
1382 // large bodies.
1383 if (RunPartialInlining)
1384 MPM.addPass(PartialInlinerPass());
1385
1386 // Remove avail extern fns and globals definitions since we aren't compiling
1387 // an object file for later LTO. For LTO we want to preserve these so they
1388 // are eligible for inlining at link-time. Note if they are unreferenced they
1389 // will be removed by GlobalDCE later, so this only impacts referenced
1390 // available externally globals. Eventually they will be suppressed during
1391 // codegen, but eliminating here enables more opportunity for GlobalDCE as it
1392 // may make globals referenced by available external functions dead and saves
1393 // running remaining passes on the eliminated functions. These should be
1394 // preserved during prelinking for link-time inlining decisions.
1395 if (!LTOPreLink)
1396 MPM.addPass(EliminateAvailableExternallyPass());
1397
1398 if (EnableOrderFileInstrumentation)
1399 MPM.addPass(InstrOrderFilePass());
1400
1401 // Do RPO function attribute inference across the module to forward-propagate
1402 // attributes where applicable.
1403 // FIXME: Is this really an optimization rather than a canonicalization?
1404 MPM.addPass(ReversePostOrderFunctionAttrsPass());
1405
1406 // Do a post inline PGO instrumentation and use pass. This is a context
1407 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as
1408 // cross-module inline has not been done yet. The context sensitive
1409 // instrumentation is after all the inlines are done.
1410 if (!LTOPreLink && PGOOpt) {
1411 if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
1412 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
1413 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1414 PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1415 PGOOpt->FS);
1416 else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
1417 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
1418 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1419 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1420 PGOOpt->FS);
1421 }
1422
1423 // Re-compute GlobalsAA here prior to function passes. This is particularly
1424 // useful as the above will have inlined, DCE'ed, and function-attr
1425 // propagated everything. We should at this point have a reasonably minimal
1426 // and richly annotated call graph. By computing aliasing and mod/ref
1427 // information for all local globals here, the late loop passes and notably
1428 // the vectorizer will be able to use them to help recognize vectorizable
1429 // memory operations.
1430 if (EnableGlobalAnalyses)
1431 MPM.addPass(RecomputeGlobalsAAPass());
1432
1433 invokeOptimizerEarlyEPCallbacks(MPM, Level);
1434
1435 FunctionPassManager OptimizePM;
1436 // Scheduling LoopVersioningLICM when inlining is over, because after that
1437 // we may see more accurate aliasing. Reason to run this late is that too
1438 // early versioning may prevent further inlining due to increase of code
1439 // size. Other optimizations which runs later might get benefit of no-alias
1440 // assumption in clone loop.
1441 if (UseLoopVersioningLICM) {
1442 OptimizePM.addPass(
1443 createFunctionToLoopPassAdaptor(LoopVersioningLICMPass()));
1444 // LoopVersioningLICM pass might increase new LICM opportunities.
1445 OptimizePM.addPass(createFunctionToLoopPassAdaptor(
1446 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
1447 /*AllowSpeculation=*/true),
1448 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1449 }
1450
1451 OptimizePM.addPass(Float2IntPass());
1452 OptimizePM.addPass(LowerConstantIntrinsicsPass());
1453
1454 if (EnableMatrix) {
1455 OptimizePM.addPass(LowerMatrixIntrinsicsPass());
1456 OptimizePM.addPass(EarlyCSEPass());
1457 }
1458
1459 // CHR pass should only be applied with the profile information.
1460 // The check is to check the profile summary information in CHR.
1461 if (EnableCHR && Level == OptimizationLevel::O3)
1462 OptimizePM.addPass(ControlHeightReductionPass());
1463
1464 // FIXME: We need to run some loop optimizations to re-rotate loops after
1465 // simplifycfg and others undo their rotation.
1466
1467 // Optimize the loop execution. These passes operate on entire loop nests
1468 // rather than on each loop in an inside-out manner, and so they are actually
1469 // function passes.
1470
1471 invokeVectorizerStartEPCallbacks(OptimizePM, Level);
1472
1473 LoopPassManager LPM;
1474 // First rotate loops that may have been un-rotated by prior passes.
1475 // Disable header duplication at -Oz.
1476 LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
1477 Level != OptimizationLevel::Oz,
1478 LTOPreLink));
1479 // Some loops may have become dead by now. Try to delete them.
1480 // FIXME: see discussion in https://reviews.llvm.org/D112851,
1481 // this may need to be revisited once we run GVN before loop deletion
1482 // in the simplification pipeline.
1483 LPM.addPass(LoopDeletionPass());
1484 OptimizePM.addPass(createFunctionToLoopPassAdaptor(
1485 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
1486
1487 // Distribute loops to allow partial vectorization. I.e. isolate dependences
1488 // into separate loop that would otherwise inhibit vectorization. This is
1489 // currently only performed for loops marked with the metadata
1490 // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1491 OptimizePM.addPass(LoopDistributePass());
1492
1493 // Populates the VFABI attribute with the scalar-to-vector mappings
1494 // from the TargetLibraryInfo.
1495 OptimizePM.addPass(InjectTLIMappings());
1496
1497 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1498
1499 // LoopSink pass sinks instructions hoisted by LICM, which serves as a
1500 // canonicalization pass that enables other optimizations. As a result,
1501 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
1502 // result too early.
1503 OptimizePM.addPass(LoopSinkPass());
1504
1505 // And finally clean up LCSSA form before generating code.
1506 OptimizePM.addPass(InstSimplifyPass());
1507
1508 // This hoists/decomposes div/rem ops. It should run after other sink/hoist
1509 // passes to avoid re-sinking, but before SimplifyCFG because it can allow
1510 // flattening of blocks.
1511 OptimizePM.addPass(DivRemPairsPass());
1512
1513 // Try to annotate calls that were created during optimization.
1514 OptimizePM.addPass(TailCallElimPass());
1515
1516 // LoopSink (and other loop passes since the last simplifyCFG) might have
1517 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
1518 OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
1519 .convertSwitchRangeToICmp(true)
1520 .speculateUnpredictables(true)));
1521
1522 // Add the core optimizing pipeline.
1523 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
1524 PTO.EagerlyInvalidateAnalyses));
1525
1526 invokeOptimizerLastEPCallbacks(MPM, Level);
1527
1528 // Split out cold code. Splitting is done late to avoid hiding context from
1529 // other optimizations and inadvertently regressing performance. The tradeoff
1530 // is that this has a higher code size cost than splitting early.
1531 if (EnableHotColdSplit && !LTOPreLink)
1532 MPM.addPass(HotColdSplittingPass());
1533
1534 // Search the code for similar regions of code. If enough similar regions can
1535 // be found where extracting the regions into their own function will decrease
1536 // the size of the program, we extract the regions, a deduplicate the
1537 // structurally similar regions.
1538 if (EnableIROutliner)
1539 MPM.addPass(IROutlinerPass());
1540
1541 // Now we need to do some global optimization transforms.
1542 // FIXME: It would seem like these should come first in the optimization
1543 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
1544 // ordering here.
1545 MPM.addPass(GlobalDCEPass());
1546 MPM.addPass(ConstantMergePass());
1547
1548 // Merge functions if requested. It has a better chance to merge functions
1549 // after ConstantMerge folded jump tables.
1550 if (PTO.MergeFunctions)
1551 MPM.addPass(MergeFunctionsPass());
1552
1553 if (PTO.CallGraphProfile && !LTOPreLink)
1554 MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1555 LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink));
1556
1557 // TODO: Relative look table converter pass caused an issue when full lto is
1558 // enabled. See https://reviews.llvm.org/D94355 for more details.
1559 // Until the issue fixed, disable this pass during pre-linking phase.
1560 if (!LTOPreLink)
1561 MPM.addPass(RelLookupTableConverterPass());
1562
1563 return MPM;
1564 }
1565
1566 ModulePassManager
buildPerModuleDefaultPipeline(OptimizationLevel Level,bool LTOPreLink)1567 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
1568 bool LTOPreLink) {
1569 if (Level == OptimizationLevel::O0)
1570 return buildO0DefaultPipeline(Level, LTOPreLink);
1571
1572 ModulePassManager MPM;
1573
1574 // Convert @llvm.global.annotations to !annotation metadata.
1575 MPM.addPass(Annotation2MetadataPass());
1576
1577 // Force any function attributes we want the rest of the pipeline to observe.
1578 MPM.addPass(ForceFunctionAttrsPass());
1579
1580 if (PGOOpt && PGOOpt->DebugInfoForProfiling)
1581 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
1582
1583 // Apply module pipeline start EP callback.
1584 invokePipelineStartEPCallbacks(MPM, Level);
1585
1586 const ThinOrFullLTOPhase LTOPhase = LTOPreLink
1587 ? ThinOrFullLTOPhase::FullLTOPreLink
1588 : ThinOrFullLTOPhase::None;
1589 // Add the core simplification pipeline.
1590 MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase));
1591
1592 // Now add the optimization pipeline.
1593 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPhase));
1594
1595 if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1596 PGOOpt->Action == PGOOptions::SampleUse)
1597 MPM.addPass(PseudoProbeUpdatePass());
1598
1599 // Emit annotation remarks.
1600 addAnnotationRemarksPass(MPM);
1601
1602 if (LTOPreLink)
1603 addRequiredLTOPreLinkPasses(MPM);
1604 return MPM;
1605 }
1606
1607 ModulePassManager
buildFatLTODefaultPipeline(OptimizationLevel Level,bool ThinLTO,bool EmitSummary)1608 PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
1609 bool EmitSummary) {
1610 ModulePassManager MPM;
1611 if (ThinLTO)
1612 MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level));
1613 else
1614 MPM.addPass(buildLTOPreLinkDefaultPipeline(Level));
1615 MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary));
1616
1617 // Use the ThinLTO post-link pipeline with sample profiling
1618 if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
1619 MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr));
1620 else {
1621 // otherwise, just use module optimization
1622 MPM.addPass(
1623 buildModuleOptimizationPipeline(Level, ThinOrFullLTOPhase::None));
1624 // Emit annotation remarks.
1625 addAnnotationRemarksPass(MPM);
1626 }
1627 return MPM;
1628 }
1629
1630 ModulePassManager
buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level)1631 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
1632 if (Level == OptimizationLevel::O0)
1633 return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
1634
1635 ModulePassManager MPM;
1636
1637 // Convert @llvm.global.annotations to !annotation metadata.
1638 MPM.addPass(Annotation2MetadataPass());
1639
1640 // Force any function attributes we want the rest of the pipeline to observe.
1641 MPM.addPass(ForceFunctionAttrsPass());
1642
1643 if (PGOOpt && PGOOpt->DebugInfoForProfiling)
1644 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
1645
1646 // Apply module pipeline start EP callback.
1647 invokePipelineStartEPCallbacks(MPM, Level);
1648
1649 // If we are planning to perform ThinLTO later, we don't bloat the code with
1650 // unrolling/vectorization/... now. Just simplify the module as much as we
1651 // can.
1652 MPM.addPass(buildModuleSimplificationPipeline(
1653 Level, ThinOrFullLTOPhase::ThinLTOPreLink));
1654
1655 // Run partial inlining pass to partially inline functions that have
1656 // large bodies.
1657 // FIXME: It isn't clear whether this is really the right place to run this
1658 // in ThinLTO. Because there is another canonicalization and simplification
1659 // phase that will run after the thin link, running this here ends up with
1660 // less information than will be available later and it may grow functions in
1661 // ways that aren't beneficial.
1662 if (RunPartialInlining)
1663 MPM.addPass(PartialInlinerPass());
1664
1665 if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1666 PGOOpt->Action == PGOOptions::SampleUse)
1667 MPM.addPass(PseudoProbeUpdatePass());
1668
1669 // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual
1670 // optimization is going to be done in PostLink stage, but clang can't add
1671 // callbacks there in case of in-process ThinLTO called by linker.
1672 invokeOptimizerEarlyEPCallbacks(MPM, Level);
1673 invokeOptimizerLastEPCallbacks(MPM, Level);
1674
1675 // Emit annotation remarks.
1676 addAnnotationRemarksPass(MPM);
1677
1678 addRequiredLTOPreLinkPasses(MPM);
1679
1680 return MPM;
1681 }
1682
buildThinLTODefaultPipeline(OptimizationLevel Level,const ModuleSummaryIndex * ImportSummary)1683 ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
1684 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
1685 ModulePassManager MPM;
1686
1687 if (ImportSummary) {
1688 // For ThinLTO we must apply the context disambiguation decisions early, to
1689 // ensure we can correctly match the callsites to summary data.
1690 if (EnableMemProfContextDisambiguation)
1691 MPM.addPass(MemProfContextDisambiguation(ImportSummary));
1692
1693 // These passes import type identifier resolutions for whole-program
1694 // devirtualization and CFI. They must run early because other passes may
1695 // disturb the specific instruction patterns that these passes look for,
1696 // creating dependencies on resolutions that may not appear in the summary.
1697 //
1698 // For example, GVN may transform the pattern assume(type.test) appearing in
1699 // two basic blocks into assume(phi(type.test, type.test)), which would
1700 // transform a dependency on a WPD resolution into a dependency on a type
1701 // identifier resolution for CFI.
1702 //
1703 // Also, WPD has access to more precise information than ICP and can
1704 // devirtualize more effectively, so it should operate on the IR first.
1705 //
1706 // The WPD and LowerTypeTest passes need to run at -O0 to lower type
1707 // metadata and intrinsics.
1708 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
1709 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
1710 }
1711
1712 if (Level == OptimizationLevel::O0) {
1713 // Run a second time to clean up any type tests left behind by WPD for use
1714 // in ICP.
1715 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1716 // Drop available_externally and unreferenced globals. This is necessary
1717 // with ThinLTO in order to avoid leaving undefined references to dead
1718 // globals in the object file.
1719 MPM.addPass(EliminateAvailableExternallyPass());
1720 MPM.addPass(GlobalDCEPass());
1721 return MPM;
1722 }
1723
1724 // Add the core simplification pipeline.
1725 MPM.addPass(buildModuleSimplificationPipeline(
1726 Level, ThinOrFullLTOPhase::ThinLTOPostLink));
1727
1728 // Now add the optimization pipeline.
1729 MPM.addPass(buildModuleOptimizationPipeline(
1730 Level, ThinOrFullLTOPhase::ThinLTOPostLink));
1731
1732 // Emit annotation remarks.
1733 addAnnotationRemarksPass(MPM);
1734
1735 return MPM;
1736 }
1737
1738 ModulePassManager
buildLTOPreLinkDefaultPipeline(OptimizationLevel Level)1739 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
1740 // FIXME: We should use a customized pre-link pipeline!
1741 return buildPerModuleDefaultPipeline(Level,
1742 /* LTOPreLink */ true);
1743 }
1744
1745 ModulePassManager
buildLTODefaultPipeline(OptimizationLevel Level,ModuleSummaryIndex * ExportSummary)1746 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
1747 ModuleSummaryIndex *ExportSummary) {
1748 ModulePassManager MPM;
1749
1750 invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);
1751
1752 // Create a function that performs CFI checks for cross-DSO calls with targets
1753 // in the current module.
1754 MPM.addPass(CrossDSOCFIPass());
1755
1756 if (Level == OptimizationLevel::O0) {
1757 // The WPD and LowerTypeTest passes need to run at -O0 to lower type
1758 // metadata and intrinsics.
1759 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
1760 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
1761 // Run a second time to clean up any type tests left behind by WPD for use
1762 // in ICP.
1763 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1764
1765 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
1766
1767 // Emit annotation remarks.
1768 addAnnotationRemarksPass(MPM);
1769
1770 return MPM;
1771 }
1772
1773 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
1774 // Load sample profile before running the LTO optimization pipeline.
1775 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
1776 PGOOpt->ProfileRemappingFile,
1777 ThinOrFullLTOPhase::FullLTOPostLink));
1778 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
1779 // RequireAnalysisPass for PSI before subsequent non-module passes.
1780 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
1781 }
1782
1783 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
1784 MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
1785
1786 // Remove unused virtual tables to improve the quality of code generated by
1787 // whole-program devirtualization and bitset lowering.
1788 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
1789
1790 // Do basic inference of function attributes from known properties of system
1791 // libraries and other oracles.
1792 MPM.addPass(InferFunctionAttrsPass());
1793
1794 if (Level.getSpeedupLevel() > 1) {
1795 MPM.addPass(createModuleToFunctionPassAdaptor(
1796 CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses));
1797
1798 // Indirect call promotion. This should promote all the targets that are
1799 // left by the earlier promotion pass that promotes intra-module targets.
1800 // This two-step promotion is to save the compile time. For LTO, it should
1801 // produce the same result as if we only do promotion here.
1802 MPM.addPass(PGOIndirectCallPromotion(
1803 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
1804
1805 // Propagate constants at call sites into the functions they call. This
1806 // opens opportunities for globalopt (and inlining) by substituting function
1807 // pointers passed as arguments to direct uses of functions.
1808 MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
1809 Level != OptimizationLevel::Os &&
1810 Level != OptimizationLevel::Oz)));
1811
1812 // Attach metadata to indirect call sites indicating the set of functions
1813 // they may target at run-time. This should follow IPSCCP.
1814 MPM.addPass(CalledValuePropagationPass());
1815 }
1816
1817 // Now deduce any function attributes based in the current code.
1818 MPM.addPass(
1819 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
1820
1821 // Do RPO function attribute inference across the module to forward-propagate
1822 // attributes where applicable.
1823 // FIXME: Is this really an optimization rather than a canonicalization?
1824 MPM.addPass(ReversePostOrderFunctionAttrsPass());
1825
1826 // Use in-range annotations on GEP indices to split globals where beneficial.
1827 MPM.addPass(GlobalSplitPass());
1828
1829 // Run whole program optimization of virtual call when the list of callees
1830 // is fixed.
1831 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
1832
1833 // Stop here at -O1.
1834 if (Level == OptimizationLevel::O1) {
1835 // The LowerTypeTestsPass needs to run to lower type metadata and the
1836 // type.test intrinsics. The pass does nothing if CFI is disabled.
1837 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
1838 // Run a second time to clean up any type tests left behind by WPD for use
1839 // in ICP (which is performed earlier than this in the regular LTO
1840 // pipeline).
1841 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1842
1843 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
1844
1845 // Emit annotation remarks.
1846 addAnnotationRemarksPass(MPM);
1847
1848 return MPM;
1849 }
1850
1851 // Optimize globals to try and fold them into constants.
1852 MPM.addPass(GlobalOptPass());
1853
1854 // Promote any localized globals to SSA registers.
1855 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
1856
1857 // Linking modules together can lead to duplicate global constant, only
1858 // keep one copy of each constant.
1859 MPM.addPass(ConstantMergePass());
1860
1861 // Remove unused arguments from functions.
1862 MPM.addPass(DeadArgumentEliminationPass());
1863
1864 // Reduce the code after globalopt and ipsccp. Both can open up significant
1865 // simplification opportunities, and both can propagate functions through
1866 // function pointers. When this happens, we often have to resolve varargs
1867 // calls, etc, so let instcombine do this.
1868 FunctionPassManager PeepholeFPM;
1869 PeepholeFPM.addPass(InstCombinePass());
1870 if (Level.getSpeedupLevel() > 1)
1871 PeepholeFPM.addPass(AggressiveInstCombinePass());
1872 invokePeepholeEPCallbacks(PeepholeFPM, Level);
1873
1874 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
1875 PTO.EagerlyInvalidateAnalyses));
1876
1877 // Note: historically, the PruneEH pass was run first to deduce nounwind and
1878 // generally clean up exception handling overhead. It isn't clear this is
1879 // valuable as the inliner doesn't currently care whether it is inlining an
1880 // invoke or a call.
1881 // Run the inliner now.
1882 if (EnableModuleInliner) {
1883 MPM.addPass(ModuleInlinerPass(getInlineParamsFromOptLevel(Level),
1884 UseInlineAdvisor,
1885 ThinOrFullLTOPhase::FullLTOPostLink));
1886 } else {
1887 MPM.addPass(ModuleInlinerWrapperPass(
1888 getInlineParamsFromOptLevel(Level),
1889 /* MandatoryFirst */ true,
1890 InlineContext{ThinOrFullLTOPhase::FullLTOPostLink,
1891 InlinePass::CGSCCInliner}));
1892 }
1893
1894 // Perform context disambiguation after inlining, since that would reduce the
1895 // amount of additional cloning required to distinguish the allocation
1896 // contexts.
1897 if (EnableMemProfContextDisambiguation)
1898 MPM.addPass(MemProfContextDisambiguation());
1899
1900 // Optimize globals again after we ran the inliner.
1901 MPM.addPass(GlobalOptPass());
1902
1903 // Run the OpenMPOpt pass again after global optimizations.
1904 MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
1905
1906 // Garbage collect dead functions.
1907 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
1908
1909 // If we didn't decide to inline a function, check to see if we can
1910 // transform it to pass arguments by value instead of by reference.
1911 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
1912
1913 FunctionPassManager FPM;
1914 // The IPO Passes may leave cruft around. Clean up after them.
1915 FPM.addPass(InstCombinePass());
1916 invokePeepholeEPCallbacks(FPM, Level);
1917
1918 if (EnableConstraintElimination)
1919 FPM.addPass(ConstraintEliminationPass());
1920
1921 FPM.addPass(JumpThreadingPass());
1922
1923 // Do a post inline PGO instrumentation and use pass. This is a context
1924 // sensitive PGO pass.
1925 if (PGOOpt) {
1926 if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
1927 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
1928 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1929 PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1930 PGOOpt->FS);
1931 else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
1932 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
1933 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1934 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1935 PGOOpt->FS);
1936 }
1937
1938 // Break up allocas
1939 FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
1940
1941 // LTO provides additional opportunities for tailcall elimination due to
1942 // link-time inlining, and visibility of nocapture attribute.
1943 FPM.addPass(TailCallElimPass());
1944
1945 // Run a few AA driver optimizations here and now to cleanup the code.
1946 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
1947 PTO.EagerlyInvalidateAnalyses));
1948
1949 MPM.addPass(
1950 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
1951
1952 // Require the GlobalsAA analysis for the module so we can query it within
1953 // MainFPM.
1954 if (EnableGlobalAnalyses) {
1955 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
1956 // Invalidate AAManager so it can be recreated and pick up the newly
1957 // available GlobalsAA.
1958 MPM.addPass(
1959 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
1960 }
1961
1962 FunctionPassManager MainFPM;
1963 MainFPM.addPass(createFunctionToLoopPassAdaptor(
1964 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
1965 /*AllowSpeculation=*/true),
1966 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1967
1968 if (RunNewGVN)
1969 MainFPM.addPass(NewGVNPass());
1970 else
1971 MainFPM.addPass(GVNPass());
1972
1973 // Remove dead memcpy()'s.
1974 MainFPM.addPass(MemCpyOptPass());
1975
1976 // Nuke dead stores.
1977 MainFPM.addPass(DSEPass());
1978 MainFPM.addPass(MoveAutoInitPass());
1979 MainFPM.addPass(MergedLoadStoreMotionPass());
1980
1981 LoopPassManager LPM;
1982 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
1983 LPM.addPass(LoopFlattenPass());
1984 LPM.addPass(IndVarSimplifyPass());
1985 LPM.addPass(LoopDeletionPass());
1986 // FIXME: Add loop interchange.
1987
1988 // Unroll small loops and perform peeling.
1989 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
1990 /* OnlyWhenForced= */ !PTO.LoopUnrolling,
1991 PTO.ForgetAllSCEVInLoopUnroll));
1992 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
1993 // *All* loop passes must preserve it, in order to be able to use it.
1994 MainFPM.addPass(createFunctionToLoopPassAdaptor(
1995 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
1996
1997 MainFPM.addPass(LoopDistributePass());
1998
1999 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
2000
2001 // Run the OpenMPOpt CGSCC pass again late.
2002 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
2003 OpenMPOptCGSCCPass(ThinOrFullLTOPhase::FullLTOPostLink)));
2004
2005 invokePeepholeEPCallbacks(MainFPM, Level);
2006 MainFPM.addPass(JumpThreadingPass());
2007 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
2008 PTO.EagerlyInvalidateAnalyses));
2009
2010 // Lower type metadata and the type.test intrinsic. This pass supports
2011 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
2012 // to be run at link time if CFI is enabled. This pass does nothing if
2013 // CFI is disabled.
2014 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
2015 // Run a second time to clean up any type tests left behind by WPD for use
2016 // in ICP (which is performed earlier than this in the regular LTO pipeline).
2017 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
2018
2019 // Enable splitting late in the FullLTO post-link pipeline.
2020 if (EnableHotColdSplit)
2021 MPM.addPass(HotColdSplittingPass());
2022
2023 // Add late LTO optimization passes.
2024 FunctionPassManager LateFPM;
2025
2026 // LoopSink pass sinks instructions hoisted by LICM, which serves as a
2027 // canonicalization pass that enables other optimizations. As a result,
2028 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
2029 // result too early.
2030 LateFPM.addPass(LoopSinkPass());
2031
2032 // This hoists/decomposes div/rem ops. It should run after other sink/hoist
2033 // passes to avoid re-sinking, but before SimplifyCFG because it can allow
2034 // flattening of blocks.
2035 LateFPM.addPass(DivRemPairsPass());
2036
2037 // Delete basic blocks, which optimization passes may have killed.
2038 LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
2039 .convertSwitchRangeToICmp(true)
2040 .hoistCommonInsts(true)
2041 .speculateUnpredictables(true)));
2042 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
2043
2044 // Drop bodies of available eternally objects to improve GlobalDCE.
2045 MPM.addPass(EliminateAvailableExternallyPass());
2046
2047 // Now that we have optimized the program, discard unreachable functions.
2048 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
2049
2050 if (PTO.MergeFunctions)
2051 MPM.addPass(MergeFunctionsPass());
2052
2053 if (PTO.CallGraphProfile)
2054 MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));
2055
2056 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);
2057
2058 // Emit annotation remarks.
2059 addAnnotationRemarksPass(MPM);
2060
2061 return MPM;
2062 }
2063
buildO0DefaultPipeline(OptimizationLevel Level,bool LTOPreLink)2064 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
2065 bool LTOPreLink) {
2066 assert(Level == OptimizationLevel::O0 &&
2067 "buildO0DefaultPipeline should only be used with O0");
2068
2069 ModulePassManager MPM;
2070
2071 // Perform pseudo probe instrumentation in O0 mode. This is for the
2072 // consistency between different build modes. For example, a LTO build can be
2073 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
2074 // the postlink will require pseudo probe instrumentation in the prelink.
2075 if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
2076 MPM.addPass(SampleProfileProbePass(TM));
2077
2078 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
2079 PGOOpt->Action == PGOOptions::IRUse))
2080 addPGOInstrPassesForO0(
2081 MPM,
2082 /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr),
2083 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile,
2084 PGOOpt->ProfileRemappingFile, PGOOpt->FS);
2085
2086 // Instrument function entry and exit before all inlining.
2087 MPM.addPass(createModuleToFunctionPassAdaptor(
2088 EntryExitInstrumenterPass(/*PostInlining=*/false)));
2089
2090 invokePipelineStartEPCallbacks(MPM, Level);
2091
2092 if (PGOOpt && PGOOpt->DebugInfoForProfiling)
2093 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
2094
2095 invokePipelineEarlySimplificationEPCallbacks(MPM, Level);
2096
2097 // Build a minimal pipeline based on the semantics required by LLVM,
2098 // which is just that always inlining occurs. Further, disable generating
2099 // lifetime intrinsics to avoid enabling further optimizations during
2100 // code generation.
2101 MPM.addPass(AlwaysInlinerPass(
2102 /*InsertLifetimeIntrinsics=*/false));
2103
2104 if (PTO.MergeFunctions)
2105 MPM.addPass(MergeFunctionsPass());
2106
2107 if (EnableMatrix)
2108 MPM.addPass(
2109 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));
2110
2111 if (!CGSCCOptimizerLateEPCallbacks.empty()) {
2112 CGSCCPassManager CGPM;
2113 invokeCGSCCOptimizerLateEPCallbacks(CGPM, Level);
2114 if (!CGPM.isEmpty())
2115 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
2116 }
2117 if (!LateLoopOptimizationsEPCallbacks.empty()) {
2118 LoopPassManager LPM;
2119 invokeLateLoopOptimizationsEPCallbacks(LPM, Level);
2120 if (!LPM.isEmpty()) {
2121 MPM.addPass(createModuleToFunctionPassAdaptor(
2122 createFunctionToLoopPassAdaptor(std::move(LPM))));
2123 }
2124 }
2125 if (!LoopOptimizerEndEPCallbacks.empty()) {
2126 LoopPassManager LPM;
2127 invokeLoopOptimizerEndEPCallbacks(LPM, Level);
2128 if (!LPM.isEmpty()) {
2129 MPM.addPass(createModuleToFunctionPassAdaptor(
2130 createFunctionToLoopPassAdaptor(std::move(LPM))));
2131 }
2132 }
2133 if (!ScalarOptimizerLateEPCallbacks.empty()) {
2134 FunctionPassManager FPM;
2135 invokeScalarOptimizerLateEPCallbacks(FPM, Level);
2136 if (!FPM.isEmpty())
2137 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
2138 }
2139
2140 invokeOptimizerEarlyEPCallbacks(MPM, Level);
2141
2142 if (!VectorizerStartEPCallbacks.empty()) {
2143 FunctionPassManager FPM;
2144 invokeVectorizerStartEPCallbacks(FPM, Level);
2145 if (!FPM.isEmpty())
2146 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
2147 }
2148
2149 ModulePassManager CoroPM;
2150 CoroPM.addPass(CoroEarlyPass());
2151 CGSCCPassManager CGPM;
2152 CGPM.addPass(CoroSplitPass());
2153 CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
2154 CoroPM.addPass(CoroCleanupPass());
2155 CoroPM.addPass(GlobalDCEPass());
2156 MPM.addPass(CoroConditionalWrapper(std::move(CoroPM)));
2157
2158 invokeOptimizerLastEPCallbacks(MPM, Level);
2159
2160 if (LTOPreLink)
2161 addRequiredLTOPreLinkPasses(MPM);
2162
2163 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
2164
2165 return MPM;
2166 }
2167
buildDefaultAAPipeline()2168 AAManager PassBuilder::buildDefaultAAPipeline() {
2169 AAManager AA;
2170
2171 // The order in which these are registered determines their priority when
2172 // being queried.
2173
2174 // First we register the basic alias analysis that provides the majority of
2175 // per-function local AA logic. This is a stateless, on-demand local set of
2176 // AA techniques.
2177 AA.registerFunctionAnalysis<BasicAA>();
2178
2179 // Next we query fast, specialized alias analyses that wrap IR-embedded
2180 // information about aliasing.
2181 AA.registerFunctionAnalysis<ScopedNoAliasAA>();
2182 AA.registerFunctionAnalysis<TypeBasedAA>();
2183
2184 // Add support for querying global aliasing information when available.
2185 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
2186 // analysis, all that the `AAManager` can do is query for any *cached*
2187 // results from `GlobalsAA` through a readonly proxy.
2188 if (EnableGlobalAnalyses)
2189 AA.registerModuleAnalysis<GlobalsAA>();
2190
2191 // Add target-specific alias analyses.
2192 if (TM)
2193 TM->registerDefaultAliasAnalyses(AA);
2194
2195 return AA;
2196 }
2197