1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check 10 // which looks for an out-of-bound array element access. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/AST/CharUnits.h" 15 #include "clang/AST/ParentMapContext.h" 16 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" 17 #include "clang/StaticAnalyzer/Checkers/Taint.h" 18 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" 19 #include "clang/StaticAnalyzer/Core/Checker.h" 20 #include "clang/StaticAnalyzer/Core/CheckerManager.h" 21 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h" 22 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" 23 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" 24 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" 25 #include "llvm/ADT/SmallString.h" 26 #include "llvm/Support/FormatVariadic.h" 27 #include "llvm/Support/raw_ostream.h" 28 #include <optional> 29 30 using namespace clang; 31 using namespace ento; 32 using namespace taint; 33 using llvm::formatv; 34 35 namespace { 36 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint }; 37 38 struct Messages { 39 std::string Short, Full; 40 }; 41 42 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt` 43 // instead of `PreStmt` because the current implementation passes the whole 44 // expression to `CheckerContext::getSVal()` which only works after the 45 // symbolic evaluation of the expression. (To turn them into `PreStmt` 46 // callbacks, we'd need to duplicate the logic that evaluates these 47 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's 48 // defined as `PostStmt` for the sake of consistency with the other callbacks. 49 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>, 50 check::PostStmt<UnaryOperator>, 51 check::PostStmt<MemberExpr>> { 52 BugType BT{this, "Out-of-bound access"}; 53 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData}; 54 55 void performCheck(const Expr *E, CheckerContext &C) const; 56 57 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind, 58 NonLoc Offset, Messages Msgs) const; 59 60 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); 61 62 static bool isInAddressOf(const Stmt *S, ASTContext &AC); 63 64 public: 65 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const { 66 performCheck(E, C); 67 } 68 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const { 69 if (E->getOpcode() == UO_Deref) 70 performCheck(E, C); 71 } 72 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const { 73 if (E->isArrow()) 74 performCheck(E->getBase(), C); 75 } 76 }; 77 78 } // anonymous namespace 79 80 /// For a given Location that can be represented as a symbolic expression 81 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block 82 /// Arr and the distance of Location from the beginning of Arr (expressed in a 83 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these 84 /// cannot be determined. 85 static std::optional<std::pair<const SubRegion *, NonLoc>> 86 computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { 87 QualType T = SVB.getArrayIndexType(); 88 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) { 89 // We will use this utility to add and multiply values. 90 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>(); 91 }; 92 93 const SubRegion *OwnerRegion = nullptr; 94 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex(); 95 96 const ElementRegion *CurRegion = 97 dyn_cast_or_null<ElementRegion>(Location.getAsRegion()); 98 99 while (CurRegion) { 100 const auto Index = CurRegion->getIndex().getAs<NonLoc>(); 101 if (!Index) 102 return std::nullopt; 103 104 QualType ElemType = CurRegion->getElementType(); 105 106 // FIXME: The following early return was presumably added to safeguard the 107 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but 108 // it seems that `ElemType` cannot be incomplete at this point. 109 if (ElemType->isIncompleteType()) 110 return std::nullopt; 111 112 // Calculate Delta = Index * sizeof(ElemType). 113 NonLoc Size = SVB.makeArrayIndex( 114 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity()); 115 auto Delta = EvalBinOp(BO_Mul, *Index, Size); 116 if (!Delta) 117 return std::nullopt; 118 119 // Perform Offset += Delta. 120 Offset = EvalBinOp(BO_Add, *Offset, *Delta); 121 if (!Offset) 122 return std::nullopt; 123 124 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>(); 125 // When this is just another ElementRegion layer, we need to continue the 126 // offset calculations: 127 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion); 128 } 129 130 if (OwnerRegion) 131 return std::make_pair(OwnerRegion, *Offset); 132 133 return std::nullopt; 134 } 135 136 // TODO: once the constraint manager is smart enough to handle non simplified 137 // symbolic expressions remove this function. Note that this can not be used in 138 // the constraint manager as is, since this does not handle overflows. It is 139 // safe to assume, however, that memory offsets will not overflow. 140 // NOTE: callers of this function need to be aware of the effects of overflows 141 // and signed<->unsigned conversions! 142 static std::pair<NonLoc, nonloc::ConcreteInt> 143 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, 144 SValBuilder &svalBuilder) { 145 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>(); 146 if (SymVal && SymVal->isExpression()) { 147 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) { 148 llvm::APSInt constant = 149 APSIntType(extent.getValue()).convert(SIE->getRHS()); 150 switch (SIE->getOpcode()) { 151 case BO_Mul: 152 // The constant should never be 0 here, becasue multiplication by zero 153 // is simplified by the engine. 154 if ((extent.getValue() % constant) != 0) 155 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 156 else 157 return getSimplifiedOffsets( 158 nonloc::SymbolVal(SIE->getLHS()), 159 svalBuilder.makeIntVal(extent.getValue() / constant), 160 svalBuilder); 161 case BO_Add: 162 return getSimplifiedOffsets( 163 nonloc::SymbolVal(SIE->getLHS()), 164 svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder); 165 default: 166 break; 167 } 168 } 169 } 170 171 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); 172 } 173 174 // Evaluate the comparison Value < Threshold with the help of the custom 175 // simplification algorithm defined for this checker. Return a pair of states, 176 // where the first one corresponds to "value below threshold" and the second 177 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in 178 // the case when the evaluation fails. 179 // If the optional argument CheckEquality is true, then use BO_EQ instead of 180 // the default BO_LT after consistently applying the same simplification steps. 181 static std::pair<ProgramStateRef, ProgramStateRef> 182 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, 183 SValBuilder &SVB, bool CheckEquality = false) { 184 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { 185 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); 186 } 187 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { 188 QualType T = Value.getType(SVB.getContext()); 189 if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) { 190 // In this case we reduced the bound check to a comparison of the form 191 // (symbol or value with unsigned type) < (negative number) 192 // which is always false. We are handling these cases separately because 193 // evalBinOpNN can perform a signed->unsigned conversion that turns the 194 // negative number into a huge positive value and leads to wildly 195 // inaccurate conclusions. 196 return {nullptr, State}; 197 } 198 } 199 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT; 200 auto BelowThreshold = 201 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType()) 202 .getAs<NonLoc>(); 203 204 if (BelowThreshold) 205 return State->assume(*BelowThreshold); 206 207 return {nullptr, nullptr}; 208 } 209 210 static std::string getRegionName(const SubRegion *Region) { 211 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) 212 return RegName; 213 214 // Field regions only have descriptive names when their parent has a 215 // descriptive name; so we provide a fallback representation for them: 216 if (const auto *FR = Region->getAs<FieldRegion>()) { 217 if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) 218 return formatv("the field '{0}'", Name); 219 return "the unnamed field"; 220 } 221 222 if (isa<AllocaRegion>(Region)) 223 return "the memory returned by 'alloca'"; 224 225 if (isa<SymbolicRegion>(Region) && 226 isa<HeapSpaceRegion>(Region->getMemorySpace())) 227 return "the heap area"; 228 229 if (isa<StringRegion>(Region)) 230 return "the string literal"; 231 232 return "the region"; 233 } 234 235 static std::optional<int64_t> getConcreteValue(NonLoc SV) { 236 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { 237 return ConcreteVal->getValue().tryExtValue(); 238 } 239 return std::nullopt; 240 } 241 242 static std::string getShortMsg(OOB_Kind Kind, std::string RegName) { 243 static const char *ShortMsgTemplates[] = { 244 "Out of bound access to memory preceding {0}", 245 "Out of bound access to memory after the end of {0}", 246 "Potential out of bound access to {0} with tainted offset"}; 247 248 return formatv(ShortMsgTemplates[Kind], RegName); 249 } 250 251 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) { 252 std::string RegName = getRegionName(Region); 253 SmallString<128> Buf; 254 llvm::raw_svector_ostream Out(Buf); 255 Out << "Access of " << RegName << " at negative byte offset"; 256 if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>()) 257 Out << ' ' << ConcreteIdx->getValue(); 258 return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)}; 259 } 260 261 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, 262 NonLoc Offset, NonLoc Extent, SVal Location) { 263 std::string RegName = getRegionName(Region); 264 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>(); 265 assert(EReg && "this checker only handles element access"); 266 QualType ElemType = EReg->getElementType(); 267 268 std::optional<int64_t> OffsetN = getConcreteValue(Offset); 269 std::optional<int64_t> ExtentN = getConcreteValue(Extent); 270 271 bool UseByteOffsets = true; 272 if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) { 273 const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize; 274 const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize; 275 if (!OffsetHasRemainder && !ExtentHasRemainder) { 276 UseByteOffsets = false; 277 if (OffsetN) 278 *OffsetN /= ElemSize; 279 if (ExtentN) 280 *ExtentN /= ElemSize; 281 } 282 } 283 284 SmallString<256> Buf; 285 llvm::raw_svector_ostream Out(Buf); 286 Out << "Access of "; 287 if (!ExtentN && !UseByteOffsets) 288 Out << "'" << ElemType.getAsString() << "' element in "; 289 Out << RegName << " at "; 290 if (OffsetN) { 291 Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN; 292 } else { 293 Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index"); 294 } 295 if (ExtentN) { 296 Out << ", while it holds only "; 297 if (*ExtentN != 1) 298 Out << *ExtentN; 299 else 300 Out << "a single"; 301 if (UseByteOffsets) 302 Out << " byte"; 303 else 304 Out << " '" << ElemType.getAsString() << "' element"; 305 306 if (*ExtentN > 1) 307 Out << "s"; 308 } 309 310 return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)}; 311 } 312 313 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) { 314 std::string RegName = getRegionName(Region); 315 return {formatv("Potential out of bound access to {0} with tainted {1}", 316 RegName, OffsetName), 317 formatv("Access of {0} with a tainted {1} that may be too large", 318 RegName, OffsetName)}; 319 } 320 321 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const { 322 // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping 323 // some new logic here that reasons directly about memory region extents. 324 // Once that logic is more mature, we can bring it back to assumeInBound() 325 // for all clients to use. 326 // 327 // The algorithm we are using here for bounds checking is to see if the 328 // memory access is within the extent of the base region. Since we 329 // have some flexibility in defining the base region, we can achieve 330 // various levels of conservatism in our buffer overflow checking. 331 332 const SVal Location = C.getSVal(E); 333 334 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as 335 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX) 336 // and incomplete analysis of these leads to false positives. As even 337 // accurate reports would be confusing for the users, just disable reports 338 // from these macros: 339 if (isFromCtypeMacro(E, C.getASTContext())) 340 return; 341 342 ProgramStateRef State = C.getState(); 343 SValBuilder &SVB = C.getSValBuilder(); 344 345 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset = 346 computeOffset(State, SVB, Location); 347 348 if (!RawOffset) 349 return; 350 351 auto [Reg, ByteOffset] = *RawOffset; 352 353 // CHECK LOWER BOUND 354 const MemSpaceRegion *Space = Reg->getMemorySpace(); 355 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) { 356 // A symbolic region in unknown space represents an unknown pointer that 357 // may point into the middle of an array, so we don't look for underflows. 358 // Both conditions are significant because we want to check underflows in 359 // symbolic regions on the heap (which may be introduced by checkers like 360 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and 361 // non-symbolic regions (e.g. a field subregion of a symbolic region) in 362 // unknown space. 363 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( 364 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); 365 366 if (PrecedesLowerBound && !WithinLowerBound) { 367 // We know that the index definitely precedes the lower bound. 368 Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); 369 reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs); 370 return; 371 } 372 373 if (WithinLowerBound) 374 State = WithinLowerBound; 375 } 376 377 // CHECK UPPER BOUND 378 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB); 379 if (auto KnownSize = Size.getAs<NonLoc>()) { 380 auto [WithinUpperBound, ExceedsUpperBound] = 381 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB); 382 383 if (ExceedsUpperBound) { 384 if (!WithinUpperBound) { 385 // We know that the index definitely exceeds the upper bound. 386 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) { 387 // ...but this is within an addressof expression, so we need to check 388 // for the exceptional case that `&array[size]` is valid. 389 auto [EqualsToThreshold, NotEqualToThreshold] = 390 compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize, 391 SVB, /*CheckEquality=*/true); 392 if (EqualsToThreshold && !NotEqualToThreshold) { 393 // We are definitely in the exceptional case, so return early 394 // instead of reporting a bug. 395 C.addTransition(EqualsToThreshold); 396 return; 397 } 398 } 399 Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, 400 *KnownSize, Location); 401 reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs); 402 return; 403 } 404 if (isTainted(State, ByteOffset)) { 405 // Both cases are possible, but the offset is tainted, so report. 406 std::string RegName = getRegionName(Reg); 407 408 // Diagnostic detail: "tainted offset" is always correct, but the 409 // common case is that 'idx' is tainted in 'arr[idx]' and then it's 410 // nicer to say "tainted index". 411 const char *OffsetName = "offset"; 412 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) 413 if (isTainted(State, ASE->getIdx(), C.getLocationContext())) 414 OffsetName = "index"; 415 416 Messages Msgs = getTaintMsgs(Reg, OffsetName); 417 reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs); 418 return; 419 } 420 } 421 422 if (WithinUpperBound) 423 State = WithinUpperBound; 424 } 425 426 C.addTransition(State); 427 } 428 429 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C, 430 ProgramStateRef ErrorState, OOB_Kind Kind, 431 NonLoc Offset, Messages Msgs) const { 432 433 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState); 434 if (!ErrorNode) 435 return; 436 437 auto BR = std::make_unique<PathSensitiveBugReport>( 438 Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode); 439 440 // Track back the propagation of taintedness. 441 if (Kind == OOB_Taint) 442 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset)) 443 BR->markInteresting(Sym); 444 445 C.emitReport(std::move(BR)); 446 } 447 448 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { 449 SourceLocation Loc = S->getBeginLoc(); 450 if (!Loc.isMacroID()) 451 return false; 452 453 StringRef MacroName = Lexer::getImmediateMacroName( 454 Loc, ACtx.getSourceManager(), ACtx.getLangOpts()); 455 456 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's') 457 return false; 458 459 return ((MacroName == "isalnum") || (MacroName == "isalpha") || 460 (MacroName == "isblank") || (MacroName == "isdigit") || 461 (MacroName == "isgraph") || (MacroName == "islower") || 462 (MacroName == "isnctrl") || (MacroName == "isprint") || 463 (MacroName == "ispunct") || (MacroName == "isspace") || 464 (MacroName == "isupper") || (MacroName == "isxdigit")); 465 } 466 467 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) { 468 ParentMapContext &ParentCtx = ACtx.getParentMapContext(); 469 do { 470 const DynTypedNodeList Parents = ParentCtx.getParents(*S); 471 if (Parents.empty()) 472 return false; 473 S = Parents[0].get<Stmt>(); 474 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S)); 475 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S); 476 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; 477 } 478 479 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) { 480 mgr.registerChecker<ArrayBoundCheckerV2>(); 481 } 482 483 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) { 484 return true; 485 } 486