xref: /freebsd/contrib/llvm-project/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
15 #include "clang/AST/ParentMapContext.h"
16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17 #include "clang/StaticAnalyzer/Checkers/Taint.h"
18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19 #include "clang/StaticAnalyzer/Core/Checker.h"
20 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29 
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34 
35 namespace {
36 /// If `E` is a "clean" array subscript expression, return the type of the
37 /// accessed element. If the base of the subscript expression is modified by
38 /// pointer arithmetic (and not the beginning of a "full" memory region), this
39 /// always returns nullopt because that's the right (or the least bad) thing to
40 /// do for the diagnostic output that's relying on this.
41 static std::optional<QualType> determineElementType(const Expr *E,
42                                                     const CheckerContext &C) {
43   const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44   if (!ASE)
45     return std::nullopt;
46 
47   const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48   if (!SubscriptBaseReg)
49     return std::nullopt;
50 
51   // The base of the subscript expression is affected by pointer arithmetics,
52   // so we want to report byte offsets instead of indices.
53   if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
54     return std::nullopt;
55 
56   return ASE->getType();
57 }
58 
59 static std::optional<int64_t>
60 determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
61   if (!T)
62     return std::nullopt;
63   return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
64 }
65 
66 class StateUpdateReporter {
67   const SubRegion *Reg;
68   const NonLoc ByteOffsetVal;
69   const std::optional<QualType> ElementType;
70   const std::optional<int64_t> ElementSize;
71   bool AssumedNonNegative = false;
72   std::optional<NonLoc> AssumedUpperBound = std::nullopt;
73 
74 public:
75   StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
76                       CheckerContext &C)
77       : Reg(R), ByteOffsetVal(ByteOffsVal),
78         ElementType(determineElementType(E, C)),
79         ElementSize(determineElementSize(ElementType, C)) {}
80 
81   void recordNonNegativeAssumption() { AssumedNonNegative = true; }
82   void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
83     AssumedUpperBound = UpperBoundVal;
84   }
85 
86   bool assumedNonNegative() { return AssumedNonNegative; }
87 
88   const NoteTag *createNoteTag(CheckerContext &C) const;
89 
90 private:
91   std::string getMessage(PathSensitiveBugReport &BR) const;
92 
93   /// Return true if information about the value of `Sym` can put constraints
94   /// on some symbol which is interesting within the bug report `BR`.
95   /// In particular, this returns true when `Sym` is interesting within `BR`;
96   /// but it also returns true if `Sym` is an expression that contains integer
97   /// constants and a single symbolic operand which is interesting (in `BR`).
98   /// We need to use this instead of plain `BR.isInteresting()` because if we
99   /// are analyzing code like
100   ///   int array[10];
101   ///   int f(int arg) {
102   ///     return array[arg] && array[arg + 10];
103   ///   }
104   /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
105   /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
106   /// to detect this out of bounds access).
107   static bool providesInformationAboutInteresting(SymbolRef Sym,
108                                                   PathSensitiveBugReport &BR);
109   static bool providesInformationAboutInteresting(SVal SV,
110                                                   PathSensitiveBugReport &BR) {
111     return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
112   }
113 };
114 
115 struct Messages {
116   std::string Short, Full;
117 };
118 
119 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
120 // instead of `PreStmt` because the current implementation passes the whole
121 // expression to `CheckerContext::getSVal()` which only works after the
122 // symbolic evaluation of the expression. (To turn them into `PreStmt`
123 // callbacks, we'd need to duplicate the logic that evaluates these
124 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
125 // defined as `PostStmt` for the sake of consistency with the other callbacks.
126 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
127                                            check::PostStmt<UnaryOperator>,
128                                            check::PostStmt<MemberExpr>> {
129   BugType BT{this, "Out-of-bound access"};
130   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
131 
132   void performCheck(const Expr *E, CheckerContext &C) const;
133 
134   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
135                  NonLoc Offset, std::optional<NonLoc> Extent,
136                  bool IsTaintBug = false) const;
137 
138   static void markPartsInteresting(PathSensitiveBugReport &BR,
139                                    ProgramStateRef ErrorState, NonLoc Val,
140                                    bool MarkTaint);
141 
142   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
143 
144   static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
145                                        NonLoc Offset, NonLoc Limit,
146                                        CheckerContext &C);
147   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
148 
149 public:
150   void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
151     performCheck(E, C);
152   }
153   void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
154     if (E->getOpcode() == UO_Deref)
155       performCheck(E, C);
156   }
157   void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
158     if (E->isArrow())
159       performCheck(E->getBase(), C);
160   }
161 };
162 
163 } // anonymous namespace
164 
165 /// For a given Location that can be represented as a symbolic expression
166 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
167 /// Arr and the distance of Location from the beginning of Arr (expressed in a
168 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
169 /// cannot be determined.
170 static std::optional<std::pair<const SubRegion *, NonLoc>>
171 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
172   QualType T = SVB.getArrayIndexType();
173   auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
174     // We will use this utility to add and multiply values.
175     return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
176   };
177 
178   const SubRegion *OwnerRegion = nullptr;
179   std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
180 
181   const ElementRegion *CurRegion =
182       dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
183 
184   while (CurRegion) {
185     const auto Index = CurRegion->getIndex().getAs<NonLoc>();
186     if (!Index)
187       return std::nullopt;
188 
189     QualType ElemType = CurRegion->getElementType();
190 
191     // FIXME: The following early return was presumably added to safeguard the
192     // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
193     // it seems that `ElemType` cannot be incomplete at this point.
194     if (ElemType->isIncompleteType())
195       return std::nullopt;
196 
197     // Calculate Delta = Index * sizeof(ElemType).
198     NonLoc Size = SVB.makeArrayIndex(
199         SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
200     auto Delta = EvalBinOp(BO_Mul, *Index, Size);
201     if (!Delta)
202       return std::nullopt;
203 
204     // Perform Offset += Delta.
205     Offset = EvalBinOp(BO_Add, *Offset, *Delta);
206     if (!Offset)
207       return std::nullopt;
208 
209     OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
210     // When this is just another ElementRegion layer, we need to continue the
211     // offset calculations:
212     CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
213   }
214 
215   if (OwnerRegion)
216     return std::make_pair(OwnerRegion, *Offset);
217 
218   return std::nullopt;
219 }
220 
221 // NOTE: This function is the "heart" of this checker. It simplifies
222 // inequalities with transformations that are valid (and very elementary) in
223 // pure mathematics, but become invalid if we use them in C++ number model
224 // where the calculations may overflow.
225 // Due to the overflow issues I think it's impossible (or at least not
226 // practical) to integrate this kind of simplification into the resolution of
227 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
228 // produces valid results when the calculations are handling memory offsets
229 // and every value is well below SIZE_MAX.
230 // TODO: This algorithm should be moved to a central location where it's
231 // available for other checkers that need to compare memory offsets.
232 // NOTE: the simplification preserves the order of the two operands in a
233 // mathematical sense, but it may change the result produced by a C++
234 // comparison operator (and the automatic type conversions).
235 // For example, consider a comparison "X+1 < 0", where the LHS is stored as a
236 // size_t and the RHS is stored in an int. (As size_t is unsigned, this
237 // comparison is false for all values of "X".) However, the simplification may
238 // turn it into "X < -1", which is still always false in a mathematical sense,
239 // but can produce a true result when evaluated by `evalBinOp` (which follows
240 // the rules of C++ and casts -1 to SIZE_MAX).
241 static std::pair<NonLoc, nonloc::ConcreteInt>
242 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
243                      SValBuilder &svalBuilder) {
244   std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
245   if (SymVal && SymVal->isExpression()) {
246     if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
247       llvm::APSInt constant =
248           APSIntType(extent.getValue()).convert(SIE->getRHS());
249       switch (SIE->getOpcode()) {
250       case BO_Mul:
251         // The constant should never be 0 here, becasue multiplication by zero
252         // is simplified by the engine.
253         if ((extent.getValue() % constant) != 0)
254           return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
255         else
256           return getSimplifiedOffsets(
257               nonloc::SymbolVal(SIE->getLHS()),
258               svalBuilder.makeIntVal(extent.getValue() / constant),
259               svalBuilder);
260       case BO_Add:
261         return getSimplifiedOffsets(
262             nonloc::SymbolVal(SIE->getLHS()),
263             svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
264       default:
265         break;
266       }
267     }
268   }
269 
270   return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271 }
272 
273 static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) {
274   const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
275   return MaxV && MaxV->isNegative();
276 }
277 
278 static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
279   QualType T = Value.getType(SVB.getContext());
280   return T->isUnsignedIntegerType();
281 }
282 
283 // Evaluate the comparison Value < Threshold with the help of the custom
284 // simplification algorithm defined for this checker. Return a pair of states,
285 // where the first one corresponds to "value below threshold" and the second
286 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
287 // the case when the evaluation fails.
288 // If the optional argument CheckEquality is true, then use BO_EQ instead of
289 // the default BO_LT after consistently applying the same simplification steps.
290 static std::pair<ProgramStateRef, ProgramStateRef>
291 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
292                         SValBuilder &SVB, bool CheckEquality = false) {
293   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
294     std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
295   }
296 
297   // We want to perform a _mathematical_ comparison between the numbers `Value`
298   // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
299   // perform automatic conversions. For example the number -1 is less than the
300   // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
301   // -1 is converted to ULONGLONG_MAX.
302   // To avoid automatic conversions, we evaluate the "obvious" cases without
303   // calling `evalBinOpNN`:
304   if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
305     if (CheckEquality) {
306       // negative_value == unsigned_threshold is always false
307       return {nullptr, State};
308     }
309     // negative_value < unsigned_threshold is always true
310     return {State, nullptr};
311   }
312   if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
313     // unsigned_value == negative_threshold and
314     // unsigned_value < negative_threshold are both always false
315     return {nullptr, State};
316   }
317   // FIXME: These special cases are sufficient for handling real-world
318   // comparisons, but in theory there could be contrived situations where
319   // automatic conversion of a symbolic value (which can be negative and can be
320   // positive) leads to incorrect results.
321   // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
322   // we want to ensure that assumptions coming from this precondition and
323   // assumptions coming from regular C/C++ operator calls are represented by
324   // constraints on the same symbolic expression. A solution that would
325   // evaluate these "mathematical" compariosns through a separate pathway would
326   // be a step backwards in this sense.
327 
328   const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
329   auto BelowThreshold =
330       SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
331           .getAs<NonLoc>();
332 
333   if (BelowThreshold)
334     return State->assume(*BelowThreshold);
335 
336   return {nullptr, nullptr};
337 }
338 
339 static std::string getRegionName(const SubRegion *Region) {
340   if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
341     return RegName;
342 
343   // Field regions only have descriptive names when their parent has a
344   // descriptive name; so we provide a fallback representation for them:
345   if (const auto *FR = Region->getAs<FieldRegion>()) {
346     if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
347       return formatv("the field '{0}'", Name);
348     return "the unnamed field";
349   }
350 
351   if (isa<AllocaRegion>(Region))
352     return "the memory returned by 'alloca'";
353 
354   if (isa<SymbolicRegion>(Region) &&
355       isa<HeapSpaceRegion>(Region->getMemorySpace()))
356     return "the heap area";
357 
358   if (isa<StringRegion>(Region))
359     return "the string literal";
360 
361   return "the region";
362 }
363 
364 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
365   if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
366     return ConcreteVal->getValue().tryExtValue();
367   }
368   return std::nullopt;
369 }
370 
371 static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
372   return SV ? getConcreteValue(*SV) : std::nullopt;
373 }
374 
375 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
376   std::string RegName = getRegionName(Region), OffsetStr = "";
377 
378   if (auto ConcreteOffset = getConcreteValue(Offset))
379     OffsetStr = formatv(" {0}", ConcreteOffset);
380 
381   return {
382       formatv("Out of bound access to memory preceding {0}", RegName),
383       formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
384 }
385 
386 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
387 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
388 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
389 /// division is considered to be successful.
390 static bool tryDividePair(std::optional<int64_t> &Val1,
391                           std::optional<int64_t> &Val2, int64_t Divisor) {
392   if (!Divisor)
393     return false;
394   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
395   const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
396   if (!Val1HasRemainder && !Val2HasRemainder) {
397     if (Val1)
398       *Val1 /= Divisor;
399     if (Val2)
400       *Val2 /= Divisor;
401     return true;
402   }
403   return false;
404 }
405 
406 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
407                                NonLoc Offset, NonLoc Extent, SVal Location,
408                                bool AlsoMentionUnderflow) {
409   std::string RegName = getRegionName(Region);
410   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
411   assert(EReg && "this checker only handles element access");
412   QualType ElemType = EReg->getElementType();
413 
414   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
415   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
416 
417   int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
418 
419   bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
420   const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
421 
422   SmallString<256> Buf;
423   llvm::raw_svector_ostream Out(Buf);
424   Out << "Access of ";
425   if (!ExtentN && !UseByteOffsets)
426     Out << "'" << ElemType.getAsString() << "' element in ";
427   Out << RegName << " at ";
428   if (AlsoMentionUnderflow) {
429     Out << "a negative or overflowing " << OffsetOrIndex;
430   } else if (OffsetN) {
431     Out << OffsetOrIndex << " " << *OffsetN;
432   } else {
433     Out << "an overflowing " << OffsetOrIndex;
434   }
435   if (ExtentN) {
436     Out << ", while it holds only ";
437     if (*ExtentN != 1)
438       Out << *ExtentN;
439     else
440       Out << "a single";
441     if (UseByteOffsets)
442       Out << " byte";
443     else
444       Out << " '" << ElemType.getAsString() << "' element";
445 
446     if (*ExtentN > 1)
447       Out << "s";
448   }
449 
450   return {formatv("Out of bound access to memory {0} {1}",
451                   AlsoMentionUnderflow ? "around" : "after the end of",
452                   RegName),
453           std::string(Buf)};
454 }
455 
456 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
457                              bool AlsoMentionUnderflow) {
458   std::string RegName = getRegionName(Region);
459   return {formatv("Potential out of bound access to {0} with tainted {1}",
460                   RegName, OffsetName),
461           formatv("Access of {0} with a tainted {1} that may be {2}too large",
462                   RegName, OffsetName,
463                   AlsoMentionUnderflow ? "negative or " : "")};
464 }
465 
466 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
467   // Don't create a note tag if we didn't assume anything:
468   if (!AssumedNonNegative && !AssumedUpperBound)
469     return nullptr;
470 
471   return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
472     return getMessage(BR);
473   });
474 }
475 
476 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
477   bool ShouldReportNonNegative = AssumedNonNegative;
478   if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
479     if (AssumedUpperBound &&
480         providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
481       // Even if the byte offset isn't interesting (e.g. it's a constant value),
482       // the assumption can still be interesting if it provides information
483       // about an interesting symbolic upper bound.
484       ShouldReportNonNegative = false;
485     } else {
486       // We don't have anything interesting, don't report the assumption.
487       return "";
488     }
489   }
490 
491   std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
492   std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
493 
494   const bool UseIndex =
495       ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
496 
497   SmallString<256> Buf;
498   llvm::raw_svector_ostream Out(Buf);
499   Out << "Assuming ";
500   if (UseIndex) {
501     Out << "index ";
502     if (OffsetN)
503       Out << "'" << OffsetN << "' ";
504   } else if (AssumedUpperBound) {
505     Out << "byte offset ";
506     if (OffsetN)
507       Out << "'" << OffsetN << "' ";
508   } else {
509     Out << "offset ";
510   }
511 
512   Out << "is";
513   if (ShouldReportNonNegative) {
514     Out << " non-negative";
515   }
516   if (AssumedUpperBound) {
517     if (ShouldReportNonNegative)
518       Out << " and";
519     Out << " less than ";
520     if (ExtentN)
521       Out << *ExtentN << ", ";
522     if (UseIndex && ElementType)
523       Out << "the number of '" << ElementType->getAsString()
524           << "' elements in ";
525     else
526       Out << "the extent of ";
527     Out << getRegionName(Reg);
528   }
529   return std::string(Out.str());
530 }
531 
532 bool StateUpdateReporter::providesInformationAboutInteresting(
533     SymbolRef Sym, PathSensitiveBugReport &BR) {
534   if (!Sym)
535     return false;
536   for (SymbolRef PartSym : Sym->symbols()) {
537     // The interestingess mark may appear on any layer as we're stripping off
538     // the SymIntExpr, UnarySymExpr etc. layers...
539     if (BR.isInteresting(PartSym))
540       return true;
541     // ...but if both sides of the expression are symbolic, then there is no
542     // practical algorithm to produce separate constraints for the two
543     // operands (from the single combined result).
544     if (isa<SymSymExpr>(PartSym))
545       return false;
546   }
547   return false;
548 }
549 
550 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
551   const SVal Location = C.getSVal(E);
552 
553   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
554   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
555   // and incomplete analysis of these leads to false positives. As even
556   // accurate reports would be confusing for the users, just disable reports
557   // from these macros:
558   if (isFromCtypeMacro(E, C.getASTContext()))
559     return;
560 
561   ProgramStateRef State = C.getState();
562   SValBuilder &SVB = C.getSValBuilder();
563 
564   const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
565       computeOffset(State, SVB, Location);
566 
567   if (!RawOffset)
568     return;
569 
570   auto [Reg, ByteOffset] = *RawOffset;
571 
572   // The state updates will be reported as a single note tag, which will be
573   // composed by this helper class.
574   StateUpdateReporter SUR(Reg, ByteOffset, E, C);
575 
576   // CHECK LOWER BOUND
577   const MemSpaceRegion *Space = Reg->getMemorySpace();
578   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
579     // A symbolic region in unknown space represents an unknown pointer that
580     // may point into the middle of an array, so we don't look for underflows.
581     // Both conditions are significant because we want to check underflows in
582     // symbolic regions on the heap (which may be introduced by checkers like
583     // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
584     // non-symbolic regions (e.g. a field subregion of a symbolic region) in
585     // unknown space.
586     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
587         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
588 
589     if (PrecedesLowerBound) {
590       // The offset may be invalid (negative)...
591       if (!WithinLowerBound) {
592         // ...and it cannot be valid (>= 0), so report an error.
593         Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
594         reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
595         return;
596       }
597       // ...but it can be valid as well, so the checker will (optimistically)
598       // assume that it's valid and mention this in the note tag.
599       SUR.recordNonNegativeAssumption();
600     }
601 
602     // Actually update the state. The "if" only fails in the extremely unlikely
603     // case when compareValueToThreshold returns {nullptr, nullptr} becasue
604     // evalBinOpNN fails to evaluate the less-than operator.
605     if (WithinLowerBound)
606       State = WithinLowerBound;
607   }
608 
609   // CHECK UPPER BOUND
610   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
611   if (auto KnownSize = Size.getAs<NonLoc>()) {
612     // In a situation where both underflow and overflow are possible (but the
613     // index is either tainted or known to be invalid), the logic of this
614     // checker will first assume that the offset is non-negative, and then
615     // (with this additional assumption) it will detect an overflow error.
616     // In this situation the warning message should mention both possibilities.
617     bool AlsoMentionUnderflow = SUR.assumedNonNegative();
618 
619     auto [WithinUpperBound, ExceedsUpperBound] =
620         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
621 
622     if (ExceedsUpperBound) {
623       // The offset may be invalid (>= Size)...
624       if (!WithinUpperBound) {
625         // ...and it cannot be within bounds, so report an error, unless we can
626         // definitely determine that this is an idiomatic `&array[size]`
627         // expression that calculates the past-the-end pointer.
628         if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
629                                      *KnownSize, C)) {
630           C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
631           return;
632         }
633 
634         Messages Msgs =
635             getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
636                            Location, AlsoMentionUnderflow);
637         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
638         return;
639       }
640       // ...and it can be valid as well...
641       if (isTainted(State, ByteOffset)) {
642         // ...but it's tainted, so report an error.
643 
644         // Diagnostic detail: saying "tainted offset" is always correct, but
645         // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
646         // nicer to say "tainted index".
647         const char *OffsetName = "offset";
648         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
649           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
650             OffsetName = "index";
651 
652         Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
653         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
654                   /*IsTaintBug=*/true);
655         return;
656       }
657       // ...and it isn't tainted, so the checker will (optimistically) assume
658       // that the offset is in bounds and mention this in the note tag.
659       SUR.recordUpperBoundAssumption(*KnownSize);
660     }
661 
662     // Actually update the state. The "if" only fails in the extremely unlikely
663     // case when compareValueToThreshold returns {nullptr, nullptr} becasue
664     // evalBinOpNN fails to evaluate the less-than operator.
665     if (WithinUpperBound)
666       State = WithinUpperBound;
667   }
668 
669   // Add a transition, reporting the state updates that we accumulated.
670   C.addTransition(State, SUR.createNoteTag(C));
671 }
672 
673 void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
674                                                ProgramStateRef ErrorState,
675                                                NonLoc Val, bool MarkTaint) {
676   if (SymbolRef Sym = Val.getAsSymbol()) {
677     // If the offset is a symbolic value, iterate over its "parts" with
678     // `SymExpr::symbols()` and mark each of them as interesting.
679     // For example, if the offset is `x*4 + y` then we put interestingness onto
680     // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
681     // `x` and `y`.
682     for (SymbolRef PartSym : Sym->symbols())
683       BR.markInteresting(PartSym);
684   }
685 
686   if (MarkTaint) {
687     // If the issue that we're reporting depends on the taintedness of the
688     // offset, then put interestingness onto symbols that could be the origin
689     // of the taint. Note that this may find symbols that did not appear in
690     // `Sym->symbols()` (because they're only loosely connected to `Val`).
691     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
692       BR.markInteresting(Sym);
693   }
694 }
695 
696 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
697                                     ProgramStateRef ErrorState, Messages Msgs,
698                                     NonLoc Offset, std::optional<NonLoc> Extent,
699                                     bool IsTaintBug /*=false*/) const {
700 
701   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
702   if (!ErrorNode)
703     return;
704 
705   auto BR = std::make_unique<PathSensitiveBugReport>(
706       IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
707 
708   // FIXME: ideally we would just call trackExpressionValue() and that would
709   // "do the right thing": mark the relevant symbols as interesting, track the
710   // control dependencies and statements storing the relevant values and add
711   // helpful diagnostic pieces. However, right now trackExpressionValue() is
712   // a heap of unreliable heuristics, so it would cause several issues:
713   // - Interestingness is not applied consistently, e.g. if `array[x+10]`
714   //   causes an overflow, then `x` is not marked as interesting.
715   // - We get irrelevant diagnostic pieces, e.g. in the code
716   //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
717   //   it places a "Storing uninitialized value" note on the `malloc` call
718   //   (which is technically true, but irrelevant).
719   // If trackExpressionValue() becomes reliable, it should be applied instead
720   // of this custom markPartsInteresting().
721   markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
722   if (Extent)
723     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
724 
725   C.emitReport(std::move(BR));
726 }
727 
728 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
729   SourceLocation Loc = S->getBeginLoc();
730   if (!Loc.isMacroID())
731     return false;
732 
733   StringRef MacroName = Lexer::getImmediateMacroName(
734       Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
735 
736   if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
737     return false;
738 
739   return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
740           (MacroName == "isblank") || (MacroName == "isdigit") ||
741           (MacroName == "isgraph") || (MacroName == "islower") ||
742           (MacroName == "isnctrl") || (MacroName == "isprint") ||
743           (MacroName == "ispunct") || (MacroName == "isspace") ||
744           (MacroName == "isupper") || (MacroName == "isxdigit"));
745 }
746 
747 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
748   ParentMapContext &ParentCtx = ACtx.getParentMapContext();
749   do {
750     const DynTypedNodeList Parents = ParentCtx.getParents(*S);
751     if (Parents.empty())
752       return false;
753     S = Parents[0].get<Stmt>();
754   } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
755   const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
756   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
757 }
758 
759 bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
760                                                    ProgramStateRef State,
761                                                    NonLoc Offset, NonLoc Limit,
762                                                    CheckerContext &C) {
763   if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
764     auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
765         State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
766     return EqualsToThreshold && !NotEqualToThreshold;
767   }
768   return false;
769 }
770 
771 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
772   mgr.registerChecker<ArrayBoundCheckerV2>();
773 }
774 
775 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
776   return true;
777 }
778