xref: /freebsd/contrib/llvm-project/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //== ArrayBoundChecker.cpp -------------------------------------------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines security.ArrayBound, which is a path-sensitive checker
10 // that looks for out of bounds access of memory regions.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
15 #include "clang/AST/ParentMapContext.h"
16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17 #include "clang/StaticAnalyzer/Checkers/Taint.h"
18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19 #include "clang/StaticAnalyzer/Core/Checker.h"
20 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25 #include "llvm/ADT/APSInt.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29 
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34 
35 namespace {
36 /// If `E` is an array subscript expression with a base that is "clean" (= not
37 /// modified by pointer arithmetic = the beginning of a memory region), return
38 /// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
39 /// This helper function is used by two separate heuristics that are only valid
40 /// in these "clean" cases.
41 static const ArraySubscriptExpr *
42 getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) {
43   const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44   if (!ASE)
45     return nullptr;
46 
47   const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48   if (!SubscriptBaseReg)
49     return nullptr;
50 
51   // The base of the subscript expression is affected by pointer arithmetics,
52   // so we want to report byte offsets instead of indices and we don't want to
53   // activate the "index is unsigned -> cannot be negative" shortcut.
54   if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
55     return nullptr;
56 
57   return ASE;
58 }
59 
60 /// If `E` is a "clean" array subscript expression, return the type of the
61 /// accessed element; otherwise return std::nullopt because that's the best (or
62 /// least bad) option for the diagnostic generation that relies on this.
63 static std::optional<QualType> determineElementType(const Expr *E,
64                                                     const CheckerContext &C) {
65   const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
66   if (!ASE)
67     return std::nullopt;
68 
69   return ASE->getType();
70 }
71 
72 static std::optional<int64_t>
73 determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
74   if (!T)
75     return std::nullopt;
76   return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
77 }
78 
79 class StateUpdateReporter {
80   const MemSpaceRegion *Space;
81   const SubRegion *Reg;
82   const NonLoc ByteOffsetVal;
83   const std::optional<QualType> ElementType;
84   const std::optional<int64_t> ElementSize;
85   bool AssumedNonNegative = false;
86   std::optional<NonLoc> AssumedUpperBound = std::nullopt;
87 
88 public:
89   StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
90                       CheckerContext &C)
91       : Space(R->getMemorySpace(C.getState())), Reg(R),
92         ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)),
93         ElementSize(determineElementSize(ElementType, C)) {}
94 
95   void recordNonNegativeAssumption() { AssumedNonNegative = true; }
96   void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
97     AssumedUpperBound = UpperBoundVal;
98   }
99 
100   bool assumedNonNegative() { return AssumedNonNegative; }
101 
102   const NoteTag *createNoteTag(CheckerContext &C) const;
103 
104 private:
105   std::string getMessage(PathSensitiveBugReport &BR) const;
106 
107   /// Return true if information about the value of `Sym` can put constraints
108   /// on some symbol which is interesting within the bug report `BR`.
109   /// In particular, this returns true when `Sym` is interesting within `BR`;
110   /// but it also returns true if `Sym` is an expression that contains integer
111   /// constants and a single symbolic operand which is interesting (in `BR`).
112   /// We need to use this instead of plain `BR.isInteresting()` because if we
113   /// are analyzing code like
114   ///   int array[10];
115   ///   int f(int arg) {
116   ///     return array[arg] && array[arg + 10];
117   ///   }
118   /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
119   /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
120   /// to detect this out of bounds access).
121   static bool providesInformationAboutInteresting(SymbolRef Sym,
122                                                   PathSensitiveBugReport &BR);
123   static bool providesInformationAboutInteresting(SVal SV,
124                                                   PathSensitiveBugReport &BR) {
125     return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
126   }
127 };
128 
129 struct Messages {
130   std::string Short, Full;
131 };
132 
133 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
134 // instead of `PreStmt` because the current implementation passes the whole
135 // expression to `CheckerContext::getSVal()` which only works after the
136 // symbolic evaluation of the expression. (To turn them into `PreStmt`
137 // callbacks, we'd need to duplicate the logic that evaluates these
138 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
139 // defined as `PostStmt` for the sake of consistency with the other callbacks.
140 class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
141                                          check::PostStmt<UnaryOperator>,
142                                          check::PostStmt<MemberExpr>> {
143   BugType BT{this, "Out-of-bound access"};
144   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
145 
146   void performCheck(const Expr *E, CheckerContext &C) const;
147 
148   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
149                  NonLoc Offset, std::optional<NonLoc> Extent,
150                  bool IsTaintBug = false) const;
151 
152   static void markPartsInteresting(PathSensitiveBugReport &BR,
153                                    ProgramStateRef ErrorState, NonLoc Val,
154                                    bool MarkTaint);
155 
156   static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
157 
158   static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
159 
160   static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
161                                        NonLoc Offset, NonLoc Limit,
162                                        CheckerContext &C);
163   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
164 
165 public:
166   void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
167     performCheck(E, C);
168   }
169   void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
170     if (E->getOpcode() == UO_Deref)
171       performCheck(E, C);
172   }
173   void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
174     if (E->isArrow())
175       performCheck(E->getBase(), C);
176   }
177 };
178 
179 } // anonymous namespace
180 
181 /// For a given Location that can be represented as a symbolic expression
182 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
183 /// Arr and the distance of Location from the beginning of Arr (expressed in a
184 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
185 /// cannot be determined.
186 static std::optional<std::pair<const SubRegion *, NonLoc>>
187 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
188   QualType T = SVB.getArrayIndexType();
189   auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
190     // We will use this utility to add and multiply values.
191     return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
192   };
193 
194   const SubRegion *OwnerRegion = nullptr;
195   std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
196 
197   const ElementRegion *CurRegion =
198       dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
199 
200   while (CurRegion) {
201     const auto Index = CurRegion->getIndex().getAs<NonLoc>();
202     if (!Index)
203       return std::nullopt;
204 
205     QualType ElemType = CurRegion->getElementType();
206 
207     // FIXME: The following early return was presumably added to safeguard the
208     // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
209     // it seems that `ElemType` cannot be incomplete at this point.
210     if (ElemType->isIncompleteType())
211       return std::nullopt;
212 
213     // Calculate Delta = Index * sizeof(ElemType).
214     NonLoc Size = SVB.makeArrayIndex(
215         SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
216     auto Delta = EvalBinOp(BO_Mul, *Index, Size);
217     if (!Delta)
218       return std::nullopt;
219 
220     // Perform Offset += Delta.
221     Offset = EvalBinOp(BO_Add, *Offset, *Delta);
222     if (!Offset)
223       return std::nullopt;
224 
225     OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
226     // When this is just another ElementRegion layer, we need to continue the
227     // offset calculations:
228     CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
229   }
230 
231   if (OwnerRegion)
232     return std::make_pair(OwnerRegion, *Offset);
233 
234   return std::nullopt;
235 }
236 
237 // NOTE: This function is the "heart" of this checker. It simplifies
238 // inequalities with transformations that are valid (and very elementary) in
239 // pure mathematics, but become invalid if we use them in C++ number model
240 // where the calculations may overflow.
241 // Due to the overflow issues I think it's impossible (or at least not
242 // practical) to integrate this kind of simplification into the resolution of
243 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
244 // produces valid results when the calculations are handling memory offsets
245 // and every value is well below SIZE_MAX.
246 // TODO: This algorithm should be moved to a central location where it's
247 // available for other checkers that need to compare memory offsets.
248 // NOTE: the simplification preserves the order of the two operands in a
249 // mathematical sense, but it may change the result produced by a C++
250 // comparison operator (and the automatic type conversions).
251 // For example, consider a comparison "X+1 < 0", where the LHS is stored as a
252 // size_t and the RHS is stored in an int. (As size_t is unsigned, this
253 // comparison is false for all values of "X".) However, the simplification may
254 // turn it into "X < -1", which is still always false in a mathematical sense,
255 // but can produce a true result when evaluated by `evalBinOp` (which follows
256 // the rules of C++ and casts -1 to SIZE_MAX).
257 static std::pair<NonLoc, nonloc::ConcreteInt>
258 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
259                      SValBuilder &svalBuilder) {
260   const llvm::APSInt &extentVal = extent.getValue();
261   std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
262   if (SymVal && SymVal->isExpression()) {
263     if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
264       llvm::APSInt constant = APSIntType(extentVal).convert(SIE->getRHS());
265       switch (SIE->getOpcode()) {
266       case BO_Mul:
267         // The constant should never be 0 here, becasue multiplication by zero
268         // is simplified by the engine.
269         if ((extentVal % constant) != 0)
270           return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271         else
272           return getSimplifiedOffsets(
273               nonloc::SymbolVal(SIE->getLHS()),
274               svalBuilder.makeIntVal(extentVal / constant), svalBuilder);
275       case BO_Add:
276         return getSimplifiedOffsets(
277             nonloc::SymbolVal(SIE->getLHS()),
278             svalBuilder.makeIntVal(extentVal - constant), svalBuilder);
279       default:
280         break;
281       }
282     }
283   }
284 
285   return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
286 }
287 
288 static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) {
289   const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
290   return MaxV && MaxV->isNegative();
291 }
292 
293 static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
294   QualType T = Value.getType(SVB.getContext());
295   return T->isUnsignedIntegerType();
296 }
297 
298 // Evaluate the comparison Value < Threshold with the help of the custom
299 // simplification algorithm defined for this checker. Return a pair of states,
300 // where the first one corresponds to "value below threshold" and the second
301 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
302 // the case when the evaluation fails.
303 // If the optional argument CheckEquality is true, then use BO_EQ instead of
304 // the default BO_LT after consistently applying the same simplification steps.
305 static std::pair<ProgramStateRef, ProgramStateRef>
306 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
307                         SValBuilder &SVB, bool CheckEquality = false) {
308   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
309     std::tie(Value, Threshold) =
310         getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
311   }
312 
313   // We want to perform a _mathematical_ comparison between the numbers `Value`
314   // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
315   // perform automatic conversions. For example the number -1 is less than the
316   // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
317   // -1 is converted to ULONGLONG_MAX.
318   // To avoid automatic conversions, we evaluate the "obvious" cases without
319   // calling `evalBinOpNN`:
320   if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
321     if (CheckEquality) {
322       // negative_value == unsigned_threshold is always false
323       return {nullptr, State};
324     }
325     // negative_value < unsigned_threshold is always true
326     return {State, nullptr};
327   }
328   if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
329     // unsigned_value == negative_threshold and
330     // unsigned_value < negative_threshold are both always false
331     return {nullptr, State};
332   }
333   // FIXME: These special cases are sufficient for handling real-world
334   // comparisons, but in theory there could be contrived situations where
335   // automatic conversion of a symbolic value (which can be negative and can be
336   // positive) leads to incorrect results.
337   // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
338   // we want to ensure that assumptions coming from this precondition and
339   // assumptions coming from regular C/C++ operator calls are represented by
340   // constraints on the same symbolic expression. A solution that would
341   // evaluate these "mathematical" comparisons through a separate pathway would
342   // be a step backwards in this sense.
343 
344   const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
345   auto BelowThreshold =
346       SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
347           .getAs<NonLoc>();
348 
349   if (BelowThreshold)
350     return State->assume(*BelowThreshold);
351 
352   return {nullptr, nullptr};
353 }
354 
355 static std::string getRegionName(const MemSpaceRegion *Space,
356                                  const SubRegion *Region) {
357   if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
358     return RegName;
359 
360   // Field regions only have descriptive names when their parent has a
361   // descriptive name; so we provide a fallback representation for them:
362   if (const auto *FR = Region->getAs<FieldRegion>()) {
363     if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
364       return formatv("the field '{0}'", Name);
365     return "the unnamed field";
366   }
367 
368   if (isa<AllocaRegion>(Region))
369     return "the memory returned by 'alloca'";
370 
371   if (isa<SymbolicRegion>(Region) && isa<HeapSpaceRegion>(Space))
372     return "the heap area";
373 
374   if (isa<StringRegion>(Region))
375     return "the string literal";
376 
377   return "the region";
378 }
379 
380 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
381   if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
382     return ConcreteVal->getValue()->tryExtValue();
383   }
384   return std::nullopt;
385 }
386 
387 static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
388   return SV ? getConcreteValue(*SV) : std::nullopt;
389 }
390 
391 static Messages getPrecedesMsgs(const MemSpaceRegion *Space,
392                                 const SubRegion *Region, NonLoc Offset) {
393   std::string RegName = getRegionName(Space, Region), OffsetStr = "";
394 
395   if (auto ConcreteOffset = getConcreteValue(Offset))
396     OffsetStr = formatv(" {0}", ConcreteOffset);
397 
398   return {
399       formatv("Out of bound access to memory preceding {0}", RegName),
400       formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
401 }
402 
403 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
404 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
405 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
406 /// division is considered to be successful.
407 static bool tryDividePair(std::optional<int64_t> &Val1,
408                           std::optional<int64_t> &Val2, int64_t Divisor) {
409   if (!Divisor)
410     return false;
411   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
412   const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
413   if (Val1HasRemainder || Val2HasRemainder)
414     return false;
415   if (Val1)
416     *Val1 /= Divisor;
417   if (Val2)
418     *Val2 /= Divisor;
419   return true;
420 }
421 
422 static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space,
423                                const SubRegion *Region, NonLoc Offset,
424                                NonLoc Extent, SVal Location,
425                                bool AlsoMentionUnderflow) {
426   std::string RegName = getRegionName(Space, Region);
427   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
428   assert(EReg && "this checker only handles element access");
429   QualType ElemType = EReg->getElementType();
430 
431   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
432   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
433 
434   int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
435 
436   bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
437   const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
438 
439   SmallString<256> Buf;
440   llvm::raw_svector_ostream Out(Buf);
441   Out << "Access of ";
442   if (!ExtentN && !UseByteOffsets)
443     Out << "'" << ElemType.getAsString() << "' element in ";
444   Out << RegName << " at ";
445   if (AlsoMentionUnderflow) {
446     Out << "a negative or overflowing " << OffsetOrIndex;
447   } else if (OffsetN) {
448     Out << OffsetOrIndex << " " << *OffsetN;
449   } else {
450     Out << "an overflowing " << OffsetOrIndex;
451   }
452   if (ExtentN) {
453     Out << ", while it holds only ";
454     if (*ExtentN != 1)
455       Out << *ExtentN;
456     else
457       Out << "a single";
458     if (UseByteOffsets)
459       Out << " byte";
460     else
461       Out << " '" << ElemType.getAsString() << "' element";
462 
463     if (*ExtentN > 1)
464       Out << "s";
465   }
466 
467   return {formatv("Out of bound access to memory {0} {1}",
468                   AlsoMentionUnderflow ? "around" : "after the end of",
469                   RegName),
470           std::string(Buf)};
471 }
472 
473 static Messages getTaintMsgs(const MemSpaceRegion *Space,
474                              const SubRegion *Region, const char *OffsetName,
475                              bool AlsoMentionUnderflow) {
476   std::string RegName = getRegionName(Space, Region);
477   return {formatv("Potential out of bound access to {0} with tainted {1}",
478                   RegName, OffsetName),
479           formatv("Access of {0} with a tainted {1} that may be {2}too large",
480                   RegName, OffsetName,
481                   AlsoMentionUnderflow ? "negative or " : "")};
482 }
483 
484 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
485   // Don't create a note tag if we didn't assume anything:
486   if (!AssumedNonNegative && !AssumedUpperBound)
487     return nullptr;
488 
489   return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
490     return getMessage(BR);
491   });
492 }
493 
494 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
495   bool ShouldReportNonNegative = AssumedNonNegative;
496   if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
497     if (AssumedUpperBound &&
498         providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
499       // Even if the byte offset isn't interesting (e.g. it's a constant value),
500       // the assumption can still be interesting if it provides information
501       // about an interesting symbolic upper bound.
502       ShouldReportNonNegative = false;
503     } else {
504       // We don't have anything interesting, don't report the assumption.
505       return "";
506     }
507   }
508 
509   std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
510   std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
511 
512   const bool UseIndex =
513       ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
514 
515   SmallString<256> Buf;
516   llvm::raw_svector_ostream Out(Buf);
517   Out << "Assuming ";
518   if (UseIndex) {
519     Out << "index ";
520     if (OffsetN)
521       Out << "'" << OffsetN << "' ";
522   } else if (AssumedUpperBound) {
523     Out << "byte offset ";
524     if (OffsetN)
525       Out << "'" << OffsetN << "' ";
526   } else {
527     Out << "offset ";
528   }
529 
530   Out << "is";
531   if (ShouldReportNonNegative) {
532     Out << " non-negative";
533   }
534   if (AssumedUpperBound) {
535     if (ShouldReportNonNegative)
536       Out << " and";
537     Out << " less than ";
538     if (ExtentN)
539       Out << *ExtentN << ", ";
540     if (UseIndex && ElementType)
541       Out << "the number of '" << ElementType->getAsString()
542           << "' elements in ";
543     else
544       Out << "the extent of ";
545     Out << getRegionName(Space, Reg);
546   }
547   return std::string(Out.str());
548 }
549 
550 bool StateUpdateReporter::providesInformationAboutInteresting(
551     SymbolRef Sym, PathSensitiveBugReport &BR) {
552   if (!Sym)
553     return false;
554   for (SymbolRef PartSym : Sym->symbols()) {
555     // The interestingess mark may appear on any layer as we're stripping off
556     // the SymIntExpr, UnarySymExpr etc. layers...
557     if (BR.isInteresting(PartSym))
558       return true;
559     // ...but if both sides of the expression are symbolic, then there is no
560     // practical algorithm to produce separate constraints for the two
561     // operands (from the single combined result).
562     if (isa<SymSymExpr>(PartSym))
563       return false;
564   }
565   return false;
566 }
567 
568 void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
569   const SVal Location = C.getSVal(E);
570 
571   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
572   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
573   // and incomplete analysis of these leads to false positives. As even
574   // accurate reports would be confusing for the users, just disable reports
575   // from these macros:
576   if (isFromCtypeMacro(E, C.getASTContext()))
577     return;
578 
579   ProgramStateRef State = C.getState();
580   SValBuilder &SVB = C.getSValBuilder();
581 
582   const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
583       computeOffset(State, SVB, Location);
584 
585   if (!RawOffset)
586     return;
587 
588   auto [Reg, ByteOffset] = *RawOffset;
589 
590   // The state updates will be reported as a single note tag, which will be
591   // composed by this helper class.
592   StateUpdateReporter SUR(Reg, ByteOffset, E, C);
593 
594   // CHECK LOWER BOUND
595   const MemSpaceRegion *Space = Reg->getMemorySpace(State);
596   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
597     // A symbolic region in unknown space represents an unknown pointer that
598     // may point into the middle of an array, so we don't look for underflows.
599     // Both conditions are significant because we want to check underflows in
600     // symbolic regions on the heap (which may be introduced by checkers like
601     // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
602     // non-symbolic regions (e.g. a field subregion of a symbolic region) in
603     // unknown space.
604     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
605         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
606 
607     if (PrecedesLowerBound) {
608       // The analyzer thinks that the offset may be invalid (negative)...
609 
610       if (isOffsetObviouslyNonnegative(E, C)) {
611         // ...but the offset is obviously non-negative (clear array subscript
612         // with an unsigned index), so we're in a buggy situation.
613 
614         // TODO: Currently the analyzer ignores many casts (e.g. signed ->
615         // unsigned casts), so it can easily reach states where it will load a
616         // signed (and negative) value from an unsigned variable. This sanity
617         // check is a duct tape "solution" that silences most of the ugly false
618         // positives that are caused by this buggy behavior. Note that this is
619         // not a complete solution: this cannot silence reports where pointer
620         // arithmetic complicates the picture and cannot ensure modeling of the
621         // "unsigned index is positive with highest bit set" cases which are
622         // "usurped" by the nonsense "unsigned index is negative" case.
623         // For more information about this topic, see the umbrella ticket
624         // https://github.com/llvm/llvm-project/issues/39492
625         // TODO: Remove this hack once 'SymbolCast's are modeled properly.
626 
627         if (!WithinLowerBound) {
628           // The state is completely nonsense -- let's just sink it!
629           C.addSink();
630           return;
631         }
632         // Otherwise continue on the 'WithinLowerBound' branch where the
633         // unsigned index _is_ non-negative. Don't mention this assumption as a
634         // note tag, because it would just confuse the users!
635       } else {
636         if (!WithinLowerBound) {
637           // ...and it cannot be valid (>= 0), so report an error.
638           Messages Msgs = getPrecedesMsgs(Space, Reg, ByteOffset);
639           reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
640           return;
641         }
642         // ...but it can be valid as well, so the checker will (optimistically)
643         // assume that it's valid and mention this in the note tag.
644         SUR.recordNonNegativeAssumption();
645       }
646     }
647 
648     // Actually update the state. The "if" only fails in the extremely unlikely
649     // case when compareValueToThreshold returns {nullptr, nullptr} because
650     // evalBinOpNN fails to evaluate the less-than operator.
651     if (WithinLowerBound)
652       State = WithinLowerBound;
653   }
654 
655   // CHECK UPPER BOUND
656   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
657   if (auto KnownSize = Size.getAs<NonLoc>()) {
658     // In a situation where both underflow and overflow are possible (but the
659     // index is either tainted or known to be invalid), the logic of this
660     // checker will first assume that the offset is non-negative, and then
661     // (with this additional assumption) it will detect an overflow error.
662     // In this situation the warning message should mention both possibilities.
663     bool AlsoMentionUnderflow = SUR.assumedNonNegative();
664 
665     auto [WithinUpperBound, ExceedsUpperBound] =
666         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
667 
668     if (ExceedsUpperBound) {
669       // The offset may be invalid (>= Size)...
670       if (!WithinUpperBound) {
671         // ...and it cannot be within bounds, so report an error, unless we can
672         // definitely determine that this is an idiomatic `&array[size]`
673         // expression that calculates the past-the-end pointer.
674         if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
675                                      *KnownSize, C)) {
676           C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
677           return;
678         }
679 
680         Messages Msgs =
681             getExceedsMsgs(C.getASTContext(), Space, Reg, ByteOffset,
682                            *KnownSize, Location, AlsoMentionUnderflow);
683         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
684         return;
685       }
686       // ...and it can be valid as well...
687       if (isTainted(State, ByteOffset)) {
688         // ...but it's tainted, so report an error.
689 
690         // Diagnostic detail: saying "tainted offset" is always correct, but
691         // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
692         // nicer to say "tainted index".
693         const char *OffsetName = "offset";
694         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
695           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
696             OffsetName = "index";
697 
698         Messages Msgs =
699             getTaintMsgs(Space, Reg, OffsetName, AlsoMentionUnderflow);
700         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
701                   /*IsTaintBug=*/true);
702         return;
703       }
704       // ...and it isn't tainted, so the checker will (optimistically) assume
705       // that the offset is in bounds and mention this in the note tag.
706       SUR.recordUpperBoundAssumption(*KnownSize);
707     }
708 
709     // Actually update the state. The "if" only fails in the extremely unlikely
710     // case when compareValueToThreshold returns {nullptr, nullptr} because
711     // evalBinOpNN fails to evaluate the less-than operator.
712     if (WithinUpperBound)
713       State = WithinUpperBound;
714   }
715 
716   // Add a transition, reporting the state updates that we accumulated.
717   C.addTransition(State, SUR.createNoteTag(C));
718 }
719 
720 void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR,
721                                              ProgramStateRef ErrorState,
722                                              NonLoc Val, bool MarkTaint) {
723   if (SymbolRef Sym = Val.getAsSymbol()) {
724     // If the offset is a symbolic value, iterate over its "parts" with
725     // `SymExpr::symbols()` and mark each of them as interesting.
726     // For example, if the offset is `x*4 + y` then we put interestingness onto
727     // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
728     // `x` and `y`.
729     for (SymbolRef PartSym : Sym->symbols())
730       BR.markInteresting(PartSym);
731   }
732 
733   if (MarkTaint) {
734     // If the issue that we're reporting depends on the taintedness of the
735     // offset, then put interestingness onto symbols that could be the origin
736     // of the taint. Note that this may find symbols that did not appear in
737     // `Sym->symbols()` (because they're only loosely connected to `Val`).
738     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
739       BR.markInteresting(Sym);
740   }
741 }
742 
743 void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
744                                   Messages Msgs, NonLoc Offset,
745                                   std::optional<NonLoc> Extent,
746                                   bool IsTaintBug /*=false*/) const {
747 
748   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
749   if (!ErrorNode)
750     return;
751 
752   auto BR = std::make_unique<PathSensitiveBugReport>(
753       IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
754 
755   // FIXME: ideally we would just call trackExpressionValue() and that would
756   // "do the right thing": mark the relevant symbols as interesting, track the
757   // control dependencies and statements storing the relevant values and add
758   // helpful diagnostic pieces. However, right now trackExpressionValue() is
759   // a heap of unreliable heuristics, so it would cause several issues:
760   // - Interestingness is not applied consistently, e.g. if `array[x+10]`
761   //   causes an overflow, then `x` is not marked as interesting.
762   // - We get irrelevant diagnostic pieces, e.g. in the code
763   //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
764   //   it places a "Storing uninitialized value" note on the `malloc` call
765   //   (which is technically true, but irrelevant).
766   // If trackExpressionValue() becomes reliable, it should be applied instead
767   // of this custom markPartsInteresting().
768   markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
769   if (Extent)
770     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
771 
772   C.emitReport(std::move(BR));
773 }
774 
775 bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
776   SourceLocation Loc = E->getBeginLoc();
777   if (!Loc.isMacroID())
778     return false;
779 
780   StringRef MacroName = Lexer::getImmediateMacroName(
781       Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
782 
783   if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
784     return false;
785 
786   return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
787           (MacroName == "isblank") || (MacroName == "isdigit") ||
788           (MacroName == "isgraph") || (MacroName == "islower") ||
789           (MacroName == "isnctrl") || (MacroName == "isprint") ||
790           (MacroName == "ispunct") || (MacroName == "isspace") ||
791           (MacroName == "isupper") || (MacroName == "isxdigit"));
792 }
793 
794 bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
795                                                      CheckerContext &C) {
796   const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
797   if (!ASE)
798     return false;
799   return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType();
800 }
801 
802 bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
803   ParentMapContext &ParentCtx = ACtx.getParentMapContext();
804   do {
805     const DynTypedNodeList Parents = ParentCtx.getParents(*S);
806     if (Parents.empty())
807       return false;
808     S = Parents[0].get<Stmt>();
809   } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
810   const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
811   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
812 }
813 
814 bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E,
815                                                  ProgramStateRef State,
816                                                  NonLoc Offset, NonLoc Limit,
817                                                  CheckerContext &C) {
818   if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
819     auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
820         State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
821     return EqualsToThreshold && !NotEqualToThreshold;
822   }
823   return false;
824 }
825 
826 void ento::registerArrayBoundChecker(CheckerManager &mgr) {
827   mgr.registerChecker<ArrayBoundChecker>();
828 }
829 
830 bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) {
831   return true;
832 }
833