1 //== ArrayBoundChecker.cpp -------------------------------------------------==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines security.ArrayBound, which is a path-sensitive checker 10 // that looks for out of bounds access of memory regions. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/AST/CharUnits.h" 15 #include "clang/AST/ParentMapContext.h" 16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" 17 #include "clang/StaticAnalyzer/Checkers/Taint.h" 18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" 19 #include "clang/StaticAnalyzer/Core/Checker.h" 20 #include "clang/StaticAnalyzer/Core/CheckerManager.h" 21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h" 22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" 23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" 24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" 25 #include "llvm/ADT/APSInt.h" 26 #include "llvm/Support/FormatVariadic.h" 27 #include "llvm/Support/raw_ostream.h" 28 #include <optional> 29 30 using namespace clang; 31 using namespace ento; 32 using namespace taint; 33 using llvm::formatv; 34 35 namespace { 36 /// If `E` is an array subscript expression with a base that is "clean" (= not 37 /// modified by pointer arithmetic = the beginning of a memory region), return 38 /// it as a pointer to ArraySubscriptExpr; otherwise return nullptr. 39 /// This helper function is used by two separate heuristics that are only valid 40 /// in these "clean" cases. 41 static const ArraySubscriptExpr * 42 getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) { 43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(E); 44 if (!ASE) 45 return nullptr; 46 47 const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); 48 if (!SubscriptBaseReg) 49 return nullptr; 50 51 // The base of the subscript expression is affected by pointer arithmetics, 52 // so we want to report byte offsets instead of indices and we don't want to 53 // activate the "index is unsigned -> cannot be negative" shortcut. 54 if (isa<ElementRegion>(SubscriptBaseReg->StripCasts())) 55 return nullptr; 56 57 return ASE; 58 } 59 60 /// If `E` is a "clean" array subscript expression, return the type of the 61 /// accessed element; otherwise return std::nullopt because that's the best (or 62 /// least bad) option for the diagnostic generation that relies on this. 63 static std::optional<QualType> determineElementType(const Expr *E, 64 const CheckerContext &C) { 65 const auto *ASE = getAsCleanArraySubscriptExpr(E, C); 66 if (!ASE) 67 return std::nullopt; 68 69 return ASE->getType(); 70 } 71 72 static std::optional<int64_t> 73 determineElementSize(const std::optional<QualType> T, const CheckerContext &C) { 74 if (!T) 75 return std::nullopt; 76 return C.getASTContext().getTypeSizeInChars(*T).getQuantity(); 77 } 78 79 class StateUpdateReporter { 80 const MemSpaceRegion *Space; 81 const SubRegion *Reg; 82 const NonLoc ByteOffsetVal; 83 const std::optional<QualType> ElementType; 84 const std::optional<int64_t> ElementSize; 85 bool AssumedNonNegative = false; 86 std::optional<NonLoc> AssumedUpperBound = std::nullopt; 87 88 public: 89 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E, 90 CheckerContext &C) 91 : Space(R->getMemorySpace(C.getState())), Reg(R), 92 ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)), 93 ElementSize(determineElementSize(ElementType, C)) {} 94 95 void recordNonNegativeAssumption() { AssumedNonNegative = true; } 96 void recordUpperBoundAssumption(NonLoc UpperBoundVal) { 97 AssumedUpperBound = UpperBoundVal; 98 } 99 100 bool assumedNonNegative() { return AssumedNonNegative; } 101 102 const NoteTag *createNoteTag(CheckerContext &C) const; 103 104 private: 105 std::string getMessage(PathSensitiveBugReport &BR) const; 106 107 /// Return true if information about the value of `Sym` can put constraints 108 /// on some symbol which is interesting within the bug report `BR`. 109 /// In particular, this returns true when `Sym` is interesting within `BR`; 110 /// but it also returns true if `Sym` is an expression that contains integer 111 /// constants and a single symbolic operand which is interesting (in `BR`). 112 /// We need to use this instead of plain `BR.isInteresting()` because if we 113 /// are analyzing code like 114 /// int array[10]; 115 /// int f(int arg) { 116 /// return array[arg] && array[arg + 10]; 117 /// } 118 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not 119 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough 120 /// to detect this out of bounds access). 121 static bool providesInformationAboutInteresting(SymbolRef Sym, 122 PathSensitiveBugReport &BR); 123 static bool providesInformationAboutInteresting(SVal SV, 124 PathSensitiveBugReport &BR) { 125 return providesInformationAboutInteresting(SV.getAsSymbol(), BR); 126 } 127 }; 128 129 struct Messages { 130 std::string Short, Full; 131 }; 132 133 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt` 134 // instead of `PreStmt` because the current implementation passes the whole 135 // expression to `CheckerContext::getSVal()` which only works after the 136 // symbolic evaluation of the expression. (To turn them into `PreStmt` 137 // callbacks, we'd need to duplicate the logic that evaluates these 138 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's 139 // defined as `PostStmt` for the sake of consistency with the other callbacks. 140 class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>, 141 check::PostStmt<UnaryOperator>, 142 check::PostStmt<MemberExpr>> { 143 BugType BT{this, "Out-of-bound access"}; 144 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData}; 145 146 void performCheck(const Expr *E, CheckerContext &C) const; 147 148 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs, 149 NonLoc Offset, std::optional<NonLoc> Extent, 150 bool IsTaintBug = false) const; 151 152 static void markPartsInteresting(PathSensitiveBugReport &BR, 153 ProgramStateRef ErrorState, NonLoc Val, 154 bool MarkTaint); 155 156 static bool isFromCtypeMacro(const Expr *E, ASTContext &AC); 157 158 static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C); 159 160 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, 161 NonLoc Offset, NonLoc Limit, 162 CheckerContext &C); 163 static bool isInAddressOf(const Stmt *S, ASTContext &AC); 164 165 public: 166 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const { 167 performCheck(E, C); 168 } 169 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const { 170 if (E->getOpcode() == UO_Deref) 171 performCheck(E, C); 172 } 173 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const { 174 if (E->isArrow()) 175 performCheck(E->getBase(), C); 176 } 177 }; 178 179 } // anonymous namespace 180 181 /// For a given Location that can be represented as a symbolic expression 182 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block 183 /// Arr and the distance of Location from the beginning of Arr (expressed in a 184 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these 185 /// cannot be determined. 186 static std::optional<std::pair<const SubRegion *, NonLoc>> 187 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { 188 QualType T = SVB.getArrayIndexType(); 189 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) { 190 // We will use this utility to add and multiply values. 191 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>(); 192 }; 193 194 const SubRegion *OwnerRegion = nullptr; 195 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex(); 196 197 const ElementRegion *CurRegion = 198 dyn_cast_or_null<ElementRegion>(Location.getAsRegion()); 199 200 while (CurRegion) { 201 const auto Index = CurRegion->getIndex().getAs<NonLoc>(); 202 if (!Index) 203 return std::nullopt; 204 205 QualType ElemType = CurRegion->getElementType(); 206 207 // FIXME: The following early return was presumably added to safeguard the 208 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but 209 // it seems that `ElemType` cannot be incomplete at this point. 210 if (ElemType->isIncompleteType()) 211 return std::nullopt; 212 213 // Calculate Delta = Index * sizeof(ElemType). 214 NonLoc Size = SVB.makeArrayIndex( 215 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity()); 216 auto Delta = EvalBinOp(BO_Mul, *Index, Size); 217 if (!Delta) 218 return std::nullopt; 219 220 // Perform Offset += Delta. 221 Offset = EvalBinOp(BO_Add, *Offset, *Delta); 222 if (!Offset) 223 return std::nullopt; 224 225 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>(); 226 // When this is just another ElementRegion layer, we need to continue the 227 // offset calculations: 228 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion); 229 } 230 231 if (OwnerRegion) 232 return std::make_pair(OwnerRegion, *Offset); 233 234 return std::nullopt; 235 } 236 237 // NOTE: This function is the "heart" of this checker. It simplifies 238 // inequalities with transformations that are valid (and very elementary) in 239 // pure mathematics, but become invalid if we use them in C++ number model 240 // where the calculations may overflow. 241 // Due to the overflow issues I think it's impossible (or at least not 242 // practical) to integrate this kind of simplification into the resolution of 243 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function 244 // produces valid results when the calculations are handling memory offsets 245 // and every value is well below SIZE_MAX. 246 // TODO: This algorithm should be moved to a central location where it's 247 // available for other checkers that need to compare memory offsets. 248 // NOTE: the simplification preserves the order of the two operands in a 249 // mathematical sense, but it may change the result produced by a C++ 250 // comparison operator (and the automatic type conversions). 251 // For example, consider a comparison "X+1 < 0", where the LHS is stored as a 252 // size_t and the RHS is stored in an int. (As size_t is unsigned, this 253 // comparison is false for all values of "X".) However, the simplification may 254 // turn it into "X < -1", which is still always false in a mathematical sense, 255 // but can produce a true result when evaluated by `evalBinOp` (which follows 256 // the rules of C++ and casts -1 to SIZE_MAX). 257 static std::pair<NonLoc, nonloc::ConcreteInt> 258 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, 259 SValBuilder &svalBuilder) { 260 const llvm::APSInt &extentVal = extent.getValue(); 261 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>(); 262 if (SymVal && SymVal->isExpression()) { 263 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) { 264 llvm::APSInt constant = APSIntType(extentVal).convert(SIE->getRHS()); 265 switch (SIE->getOpcode()) { 266 case BO_Mul: 267 // The constant should never be 0 here, becasue multiplication by zero 268 // is simplified by the engine. 269 if ((extentVal % constant) != 0) 270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 271 else 272 return getSimplifiedOffsets( 273 nonloc::SymbolVal(SIE->getLHS()), 274 svalBuilder.makeIntVal(extentVal / constant), svalBuilder); 275 case BO_Add: 276 return getSimplifiedOffsets( 277 nonloc::SymbolVal(SIE->getLHS()), 278 svalBuilder.makeIntVal(extentVal - constant), svalBuilder); 279 default: 280 break; 281 } 282 } 283 } 284 285 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 286 } 287 288 static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { 289 const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value); 290 return MaxV && MaxV->isNegative(); 291 } 292 293 static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { 294 QualType T = Value.getType(SVB.getContext()); 295 return T->isUnsignedIntegerType(); 296 } 297 298 // Evaluate the comparison Value < Threshold with the help of the custom 299 // simplification algorithm defined for this checker. Return a pair of states, 300 // where the first one corresponds to "value below threshold" and the second 301 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in 302 // the case when the evaluation fails. 303 // If the optional argument CheckEquality is true, then use BO_EQ instead of 304 // the default BO_LT after consistently applying the same simplification steps. 305 static std::pair<ProgramStateRef, ProgramStateRef> 306 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, 307 SValBuilder &SVB, bool CheckEquality = false) { 308 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { 309 std::tie(Value, Threshold) = 310 getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); 311 } 312 313 // We want to perform a _mathematical_ comparison between the numbers `Value` 314 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may 315 // perform automatic conversions. For example the number -1 is less than the 316 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` 317 // -1 is converted to ULONGLONG_MAX. 318 // To avoid automatic conversions, we evaluate the "obvious" cases without 319 // calling `evalBinOpNN`: 320 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) { 321 if (CheckEquality) { 322 // negative_value == unsigned_threshold is always false 323 return {nullptr, State}; 324 } 325 // negative_value < unsigned_threshold is always true 326 return {State, nullptr}; 327 } 328 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) { 329 // unsigned_value == negative_threshold and 330 // unsigned_value < negative_threshold are both always false 331 return {nullptr, State}; 332 } 333 // FIXME: These special cases are sufficient for handling real-world 334 // comparisons, but in theory there could be contrived situations where 335 // automatic conversion of a symbolic value (which can be negative and can be 336 // positive) leads to incorrect results. 337 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because 338 // we want to ensure that assumptions coming from this precondition and 339 // assumptions coming from regular C/C++ operator calls are represented by 340 // constraints on the same symbolic expression. A solution that would 341 // evaluate these "mathematical" comparisons through a separate pathway would 342 // be a step backwards in this sense. 343 344 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT; 345 auto BelowThreshold = 346 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType()) 347 .getAs<NonLoc>(); 348 349 if (BelowThreshold) 350 return State->assume(*BelowThreshold); 351 352 return {nullptr, nullptr}; 353 } 354 355 static std::string getRegionName(const MemSpaceRegion *Space, 356 const SubRegion *Region) { 357 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) 358 return RegName; 359 360 // Field regions only have descriptive names when their parent has a 361 // descriptive name; so we provide a fallback representation for them: 362 if (const auto *FR = Region->getAs<FieldRegion>()) { 363 if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) 364 return formatv("the field '{0}'", Name); 365 return "the unnamed field"; 366 } 367 368 if (isa<AllocaRegion>(Region)) 369 return "the memory returned by 'alloca'"; 370 371 if (isa<SymbolicRegion>(Region) && isa<HeapSpaceRegion>(Space)) 372 return "the heap area"; 373 374 if (isa<StringRegion>(Region)) 375 return "the string literal"; 376 377 return "the region"; 378 } 379 380 static std::optional<int64_t> getConcreteValue(NonLoc SV) { 381 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { 382 return ConcreteVal->getValue()->tryExtValue(); 383 } 384 return std::nullopt; 385 } 386 387 static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { 388 return SV ? getConcreteValue(*SV) : std::nullopt; 389 } 390 391 static Messages getPrecedesMsgs(const MemSpaceRegion *Space, 392 const SubRegion *Region, NonLoc Offset) { 393 std::string RegName = getRegionName(Space, Region), OffsetStr = ""; 394 395 if (auto ConcreteOffset = getConcreteValue(Offset)) 396 OffsetStr = formatv(" {0}", ConcreteOffset); 397 398 return { 399 formatv("Out of bound access to memory preceding {0}", RegName), 400 formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)}; 401 } 402 403 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if 404 /// it can be performed (`Divisor` is nonzero and there is no remainder). The 405 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding 406 /// division is considered to be successful. 407 static bool tryDividePair(std::optional<int64_t> &Val1, 408 std::optional<int64_t> &Val2, int64_t Divisor) { 409 if (!Divisor) 410 return false; 411 const bool Val1HasRemainder = Val1 && *Val1 % Divisor; 412 const bool Val2HasRemainder = Val2 && *Val2 % Divisor; 413 if (Val1HasRemainder || Val2HasRemainder) 414 return false; 415 if (Val1) 416 *Val1 /= Divisor; 417 if (Val2) 418 *Val2 /= Divisor; 419 return true; 420 } 421 422 static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, 423 const SubRegion *Region, NonLoc Offset, 424 NonLoc Extent, SVal Location, 425 bool AlsoMentionUnderflow) { 426 std::string RegName = getRegionName(Space, Region); 427 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>(); 428 assert(EReg && "this checker only handles element access"); 429 QualType ElemType = EReg->getElementType(); 430 431 std::optional<int64_t> OffsetN = getConcreteValue(Offset); 432 std::optional<int64_t> ExtentN = getConcreteValue(Extent); 433 434 int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity(); 435 436 bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize); 437 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index"; 438 439 SmallString<256> Buf; 440 llvm::raw_svector_ostream Out(Buf); 441 Out << "Access of "; 442 if (!ExtentN && !UseByteOffsets) 443 Out << "'" << ElemType.getAsString() << "' element in "; 444 Out << RegName << " at "; 445 if (AlsoMentionUnderflow) { 446 Out << "a negative or overflowing " << OffsetOrIndex; 447 } else if (OffsetN) { 448 Out << OffsetOrIndex << " " << *OffsetN; 449 } else { 450 Out << "an overflowing " << OffsetOrIndex; 451 } 452 if (ExtentN) { 453 Out << ", while it holds only "; 454 if (*ExtentN != 1) 455 Out << *ExtentN; 456 else 457 Out << "a single"; 458 if (UseByteOffsets) 459 Out << " byte"; 460 else 461 Out << " '" << ElemType.getAsString() << "' element"; 462 463 if (*ExtentN > 1) 464 Out << "s"; 465 } 466 467 return {formatv("Out of bound access to memory {0} {1}", 468 AlsoMentionUnderflow ? "around" : "after the end of", 469 RegName), 470 std::string(Buf)}; 471 } 472 473 static Messages getTaintMsgs(const MemSpaceRegion *Space, 474 const SubRegion *Region, const char *OffsetName, 475 bool AlsoMentionUnderflow) { 476 std::string RegName = getRegionName(Space, Region); 477 return {formatv("Potential out of bound access to {0} with tainted {1}", 478 RegName, OffsetName), 479 formatv("Access of {0} with a tainted {1} that may be {2}too large", 480 RegName, OffsetName, 481 AlsoMentionUnderflow ? "negative or " : "")}; 482 } 483 484 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const { 485 // Don't create a note tag if we didn't assume anything: 486 if (!AssumedNonNegative && !AssumedUpperBound) 487 return nullptr; 488 489 return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string { 490 return getMessage(BR); 491 }); 492 } 493 494 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const { 495 bool ShouldReportNonNegative = AssumedNonNegative; 496 if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) { 497 if (AssumedUpperBound && 498 providesInformationAboutInteresting(*AssumedUpperBound, BR)) { 499 // Even if the byte offset isn't interesting (e.g. it's a constant value), 500 // the assumption can still be interesting if it provides information 501 // about an interesting symbolic upper bound. 502 ShouldReportNonNegative = false; 503 } else { 504 // We don't have anything interesting, don't report the assumption. 505 return ""; 506 } 507 } 508 509 std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal); 510 std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound); 511 512 const bool UseIndex = 513 ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize); 514 515 SmallString<256> Buf; 516 llvm::raw_svector_ostream Out(Buf); 517 Out << "Assuming "; 518 if (UseIndex) { 519 Out << "index "; 520 if (OffsetN) 521 Out << "'" << OffsetN << "' "; 522 } else if (AssumedUpperBound) { 523 Out << "byte offset "; 524 if (OffsetN) 525 Out << "'" << OffsetN << "' "; 526 } else { 527 Out << "offset "; 528 } 529 530 Out << "is"; 531 if (ShouldReportNonNegative) { 532 Out << " non-negative"; 533 } 534 if (AssumedUpperBound) { 535 if (ShouldReportNonNegative) 536 Out << " and"; 537 Out << " less than "; 538 if (ExtentN) 539 Out << *ExtentN << ", "; 540 if (UseIndex && ElementType) 541 Out << "the number of '" << ElementType->getAsString() 542 << "' elements in "; 543 else 544 Out << "the extent of "; 545 Out << getRegionName(Space, Reg); 546 } 547 return std::string(Out.str()); 548 } 549 550 bool StateUpdateReporter::providesInformationAboutInteresting( 551 SymbolRef Sym, PathSensitiveBugReport &BR) { 552 if (!Sym) 553 return false; 554 for (SymbolRef PartSym : Sym->symbols()) { 555 // The interestingess mark may appear on any layer as we're stripping off 556 // the SymIntExpr, UnarySymExpr etc. layers... 557 if (BR.isInteresting(PartSym)) 558 return true; 559 // ...but if both sides of the expression are symbolic, then there is no 560 // practical algorithm to produce separate constraints for the two 561 // operands (from the single combined result). 562 if (isa<SymSymExpr>(PartSym)) 563 return false; 564 } 565 return false; 566 } 567 568 void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { 569 const SVal Location = C.getSVal(E); 570 571 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as 572 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX) 573 // and incomplete analysis of these leads to false positives. As even 574 // accurate reports would be confusing for the users, just disable reports 575 // from these macros: 576 if (isFromCtypeMacro(E, C.getASTContext())) 577 return; 578 579 ProgramStateRef State = C.getState(); 580 SValBuilder &SVB = C.getSValBuilder(); 581 582 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset = 583 computeOffset(State, SVB, Location); 584 585 if (!RawOffset) 586 return; 587 588 auto [Reg, ByteOffset] = *RawOffset; 589 590 // The state updates will be reported as a single note tag, which will be 591 // composed by this helper class. 592 StateUpdateReporter SUR(Reg, ByteOffset, E, C); 593 594 // CHECK LOWER BOUND 595 const MemSpaceRegion *Space = Reg->getMemorySpace(State); 596 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) { 597 // A symbolic region in unknown space represents an unknown pointer that 598 // may point into the middle of an array, so we don't look for underflows. 599 // Both conditions are significant because we want to check underflows in 600 // symbolic regions on the heap (which may be introduced by checkers like 601 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and 602 // non-symbolic regions (e.g. a field subregion of a symbolic region) in 603 // unknown space. 604 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( 605 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); 606 607 if (PrecedesLowerBound) { 608 // The analyzer thinks that the offset may be invalid (negative)... 609 610 if (isOffsetObviouslyNonnegative(E, C)) { 611 // ...but the offset is obviously non-negative (clear array subscript 612 // with an unsigned index), so we're in a buggy situation. 613 614 // TODO: Currently the analyzer ignores many casts (e.g. signed -> 615 // unsigned casts), so it can easily reach states where it will load a 616 // signed (and negative) value from an unsigned variable. This sanity 617 // check is a duct tape "solution" that silences most of the ugly false 618 // positives that are caused by this buggy behavior. Note that this is 619 // not a complete solution: this cannot silence reports where pointer 620 // arithmetic complicates the picture and cannot ensure modeling of the 621 // "unsigned index is positive with highest bit set" cases which are 622 // "usurped" by the nonsense "unsigned index is negative" case. 623 // For more information about this topic, see the umbrella ticket 624 // https://github.com/llvm/llvm-project/issues/39492 625 // TODO: Remove this hack once 'SymbolCast's are modeled properly. 626 627 if (!WithinLowerBound) { 628 // The state is completely nonsense -- let's just sink it! 629 C.addSink(); 630 return; 631 } 632 // Otherwise continue on the 'WithinLowerBound' branch where the 633 // unsigned index _is_ non-negative. Don't mention this assumption as a 634 // note tag, because it would just confuse the users! 635 } else { 636 if (!WithinLowerBound) { 637 // ...and it cannot be valid (>= 0), so report an error. 638 Messages Msgs = getPrecedesMsgs(Space, Reg, ByteOffset); 639 reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); 640 return; 641 } 642 // ...but it can be valid as well, so the checker will (optimistically) 643 // assume that it's valid and mention this in the note tag. 644 SUR.recordNonNegativeAssumption(); 645 } 646 } 647 648 // Actually update the state. The "if" only fails in the extremely unlikely 649 // case when compareValueToThreshold returns {nullptr, nullptr} because 650 // evalBinOpNN fails to evaluate the less-than operator. 651 if (WithinLowerBound) 652 State = WithinLowerBound; 653 } 654 655 // CHECK UPPER BOUND 656 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB); 657 if (auto KnownSize = Size.getAs<NonLoc>()) { 658 // In a situation where both underflow and overflow are possible (but the 659 // index is either tainted or known to be invalid), the logic of this 660 // checker will first assume that the offset is non-negative, and then 661 // (with this additional assumption) it will detect an overflow error. 662 // In this situation the warning message should mention both possibilities. 663 bool AlsoMentionUnderflow = SUR.assumedNonNegative(); 664 665 auto [WithinUpperBound, ExceedsUpperBound] = 666 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); 667 668 if (ExceedsUpperBound) { 669 // The offset may be invalid (>= Size)... 670 if (!WithinUpperBound) { 671 // ...and it cannot be within bounds, so report an error, unless we can 672 // definitely determine that this is an idiomatic `&array[size]` 673 // expression that calculates the past-the-end pointer. 674 if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset, 675 *KnownSize, C)) { 676 C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C)); 677 return; 678 } 679 680 Messages Msgs = 681 getExceedsMsgs(C.getASTContext(), Space, Reg, ByteOffset, 682 *KnownSize, Location, AlsoMentionUnderflow); 683 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize); 684 return; 685 } 686 // ...and it can be valid as well... 687 if (isTainted(State, ByteOffset)) { 688 // ...but it's tainted, so report an error. 689 690 // Diagnostic detail: saying "tainted offset" is always correct, but 691 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's 692 // nicer to say "tainted index". 693 const char *OffsetName = "offset"; 694 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) 695 if (isTainted(State, ASE->getIdx(), C.getLocationContext())) 696 OffsetName = "index"; 697 698 Messages Msgs = 699 getTaintMsgs(Space, Reg, OffsetName, AlsoMentionUnderflow); 700 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize, 701 /*IsTaintBug=*/true); 702 return; 703 } 704 // ...and it isn't tainted, so the checker will (optimistically) assume 705 // that the offset is in bounds and mention this in the note tag. 706 SUR.recordUpperBoundAssumption(*KnownSize); 707 } 708 709 // Actually update the state. The "if" only fails in the extremely unlikely 710 // case when compareValueToThreshold returns {nullptr, nullptr} because 711 // evalBinOpNN fails to evaluate the less-than operator. 712 if (WithinUpperBound) 713 State = WithinUpperBound; 714 } 715 716 // Add a transition, reporting the state updates that we accumulated. 717 C.addTransition(State, SUR.createNoteTag(C)); 718 } 719 720 void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR, 721 ProgramStateRef ErrorState, 722 NonLoc Val, bool MarkTaint) { 723 if (SymbolRef Sym = Val.getAsSymbol()) { 724 // If the offset is a symbolic value, iterate over its "parts" with 725 // `SymExpr::symbols()` and mark each of them as interesting. 726 // For example, if the offset is `x*4 + y` then we put interestingness onto 727 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols 728 // `x` and `y`. 729 for (SymbolRef PartSym : Sym->symbols()) 730 BR.markInteresting(PartSym); 731 } 732 733 if (MarkTaint) { 734 // If the issue that we're reporting depends on the taintedness of the 735 // offset, then put interestingness onto symbols that could be the origin 736 // of the taint. Note that this may find symbols that did not appear in 737 // `Sym->symbols()` (because they're only loosely connected to `Val`). 738 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val)) 739 BR.markInteresting(Sym); 740 } 741 } 742 743 void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState, 744 Messages Msgs, NonLoc Offset, 745 std::optional<NonLoc> Extent, 746 bool IsTaintBug /*=false*/) const { 747 748 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState); 749 if (!ErrorNode) 750 return; 751 752 auto BR = std::make_unique<PathSensitiveBugReport>( 753 IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); 754 755 // FIXME: ideally we would just call trackExpressionValue() and that would 756 // "do the right thing": mark the relevant symbols as interesting, track the 757 // control dependencies and statements storing the relevant values and add 758 // helpful diagnostic pieces. However, right now trackExpressionValue() is 759 // a heap of unreliable heuristics, so it would cause several issues: 760 // - Interestingness is not applied consistently, e.g. if `array[x+10]` 761 // causes an overflow, then `x` is not marked as interesting. 762 // - We get irrelevant diagnostic pieces, e.g. in the code 763 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;` 764 // it places a "Storing uninitialized value" note on the `malloc` call 765 // (which is technically true, but irrelevant). 766 // If trackExpressionValue() becomes reliable, it should be applied instead 767 // of this custom markPartsInteresting(). 768 markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug); 769 if (Extent) 770 markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug); 771 772 C.emitReport(std::move(BR)); 773 } 774 775 bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) { 776 SourceLocation Loc = E->getBeginLoc(); 777 if (!Loc.isMacroID()) 778 return false; 779 780 StringRef MacroName = Lexer::getImmediateMacroName( 781 Loc, ACtx.getSourceManager(), ACtx.getLangOpts()); 782 783 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's') 784 return false; 785 786 return ((MacroName == "isalnum") || (MacroName == "isalpha") || 787 (MacroName == "isblank") || (MacroName == "isdigit") || 788 (MacroName == "isgraph") || (MacroName == "islower") || 789 (MacroName == "isnctrl") || (MacroName == "isprint") || 790 (MacroName == "ispunct") || (MacroName == "isspace") || 791 (MacroName == "isupper") || (MacroName == "isxdigit")); 792 } 793 794 bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E, 795 CheckerContext &C) { 796 const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C); 797 if (!ASE) 798 return false; 799 return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType(); 800 } 801 802 bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) { 803 ParentMapContext &ParentCtx = ACtx.getParentMapContext(); 804 do { 805 const DynTypedNodeList Parents = ParentCtx.getParents(*S); 806 if (Parents.empty()) 807 return false; 808 S = Parents[0].get<Stmt>(); 809 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S)); 810 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S); 811 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; 812 } 813 814 bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E, 815 ProgramStateRef State, 816 NonLoc Offset, NonLoc Limit, 817 CheckerContext &C) { 818 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) { 819 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold( 820 State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true); 821 return EqualsToThreshold && !NotEqualToThreshold; 822 } 823 return false; 824 } 825 826 void ento::registerArrayBoundChecker(CheckerManager &mgr) { 827 mgr.registerChecker<ArrayBoundChecker>(); 828 } 829 830 bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) { 831 return true; 832 } 833