1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <chrono> 22 #include <numeric> 23 #include <random> 24 #include <unordered_set> 25 26 namespace fuzzer { 27 28 struct InputInfo { 29 Unit U; // The actual input data. 30 std::chrono::microseconds TimeOfUnit; 31 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 32 // Number of features that this input has and no smaller input has. 33 size_t NumFeatures = 0; 34 size_t Tmp = 0; // Used by ValidateFeatureSet. 35 // Stats. 36 size_t NumExecutedMutations = 0; 37 size_t NumSuccessfullMutations = 0; 38 bool NeverReduce = false; 39 bool MayDeleteFile = false; 40 bool Reduced = false; 41 bool HasFocusFunction = false; 42 std::vector<uint32_t> UniqFeatureSet; 43 std::vector<uint8_t> DataFlowTraceForFocusFunction; 44 // Power schedule. 45 bool NeedsEnergyUpdate = false; 46 double Energy = 0.0; 47 double SumIncidence = 0.0; 48 std::vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 49 50 // Delete feature Idx and its frequency from FeatureFreqs. 51 bool DeleteFeatureFreq(uint32_t Idx) { 52 if (FeatureFreqs.empty()) 53 return false; 54 55 // Binary search over local feature frequencies sorted by index. 56 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 57 std::pair<uint32_t, uint16_t>(Idx, 0)); 58 59 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 60 FeatureFreqs.erase(Lower); 61 return true; 62 } 63 return false; 64 } 65 66 // Assign more energy to a high-entropy seed, i.e., that reveals more 67 // information about the globally rare features in the neighborhood of the 68 // seed. Since we do not know the entropy of a seed that has never been 69 // executed we assign fresh seeds maximum entropy and let II->Energy approach 70 // the true entropy from above. If ScalePerExecTime is true, the computed 71 // entropy is scaled based on how fast this input executes compared to the 72 // average execution time of inputs. The faster an input executes, the more 73 // energy gets assigned to the input. 74 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime, 75 std::chrono::microseconds AverageUnitExecutionTime) { 76 Energy = 0.0; 77 SumIncidence = 0.0; 78 79 // Apply add-one smoothing to locally discovered features. 80 for (const auto &F : FeatureFreqs) { 81 double LocalIncidence = F.second + 1; 82 Energy -= LocalIncidence * log(LocalIncidence); 83 SumIncidence += LocalIncidence; 84 } 85 86 // Apply add-one smoothing to locally undiscovered features. 87 // PreciseEnergy -= 0; // since log(1.0) == 0) 88 SumIncidence += 89 static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size()); 90 91 // Add a single locally abundant feature apply add-one smoothing. 92 double AbdIncidence = static_cast<double>(NumExecutedMutations + 1); 93 Energy -= AbdIncidence * log(AbdIncidence); 94 SumIncidence += AbdIncidence; 95 96 // Normalize. 97 if (SumIncidence != 0) 98 Energy = Energy / SumIncidence + log(SumIncidence); 99 100 if (ScalePerExecTime) { 101 // Scaling to favor inputs with lower execution time. 102 uint32_t PerfScore = 100; 103 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10) 104 PerfScore = 10; 105 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4) 106 PerfScore = 25; 107 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2) 108 PerfScore = 50; 109 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4) 110 PerfScore = 75; 111 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count()) 112 PerfScore = 300; 113 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count()) 114 PerfScore = 200; 115 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count()) 116 PerfScore = 150; 117 118 Energy *= PerfScore; 119 } 120 } 121 122 // Increment the frequency of the feature Idx. 123 void UpdateFeatureFrequency(uint32_t Idx) { 124 NeedsEnergyUpdate = true; 125 126 // The local feature frequencies is an ordered vector of pairs. 127 // If there are no local feature frequencies, push_back preserves order. 128 // Set the feature frequency for feature Idx32 to 1. 129 if (FeatureFreqs.empty()) { 130 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 131 return; 132 } 133 134 // Binary search over local feature frequencies sorted by index. 135 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 136 std::pair<uint32_t, uint16_t>(Idx, 0)); 137 138 // If feature Idx32 already exists, increment its frequency. 139 // Otherwise, insert a new pair right after the next lower index. 140 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 141 Lower->second++; 142 } else { 143 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 144 } 145 } 146 }; 147 148 struct EntropicOptions { 149 bool Enabled; 150 size_t NumberOfRarestFeatures; 151 size_t FeatureFrequencyThreshold; 152 bool ScalePerExecTime; 153 }; 154 155 class InputCorpus { 156 static const uint32_t kFeatureSetSize = 1 << 21; 157 static const uint8_t kMaxMutationFactor = 20; 158 static const size_t kSparseEnergyUpdates = 100; 159 160 size_t NumExecutedMutations = 0; 161 162 EntropicOptions Entropic; 163 164 public: 165 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 166 : Entropic(Entropic), OutputCorpus(OutputCorpus) { 167 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 168 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 169 } 170 ~InputCorpus() { 171 for (auto II : Inputs) 172 delete II; 173 } 174 size_t size() const { return Inputs.size(); } 175 size_t SizeInBytes() const { 176 size_t Res = 0; 177 for (auto II : Inputs) 178 Res += II->U.size(); 179 return Res; 180 } 181 size_t NumActiveUnits() const { 182 size_t Res = 0; 183 for (auto II : Inputs) 184 Res += !II->U.empty(); 185 return Res; 186 } 187 size_t MaxInputSize() const { 188 size_t Res = 0; 189 for (auto II : Inputs) 190 Res = std::max(Res, II->U.size()); 191 return Res; 192 } 193 void IncrementNumExecutedMutations() { NumExecutedMutations++; } 194 195 size_t NumInputsThatTouchFocusFunction() { 196 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 197 return II->HasFocusFunction; 198 }); 199 } 200 201 size_t NumInputsWithDataFlowTrace() { 202 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 203 return !II->DataFlowTraceForFocusFunction.empty(); 204 }); 205 } 206 207 bool empty() const { return Inputs.empty(); } 208 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 209 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 210 bool HasFocusFunction, bool NeverReduce, 211 std::chrono::microseconds TimeOfUnit, 212 const std::vector<uint32_t> &FeatureSet, 213 const DataFlowTrace &DFT, const InputInfo *BaseII) { 214 assert(!U.empty()); 215 if (FeatureDebug) 216 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 217 // Inputs.size() is cast to uint32_t below. 218 assert(Inputs.size() < std::numeric_limits<uint32_t>::max()); 219 Inputs.push_back(new InputInfo()); 220 InputInfo &II = *Inputs.back(); 221 II.U = U; 222 II.NumFeatures = NumFeatures; 223 II.NeverReduce = NeverReduce; 224 II.TimeOfUnit = TimeOfUnit; 225 II.MayDeleteFile = MayDeleteFile; 226 II.UniqFeatureSet = FeatureSet; 227 II.HasFocusFunction = HasFocusFunction; 228 // Assign maximal energy to the new seed. 229 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 230 II.SumIncidence = static_cast<double>(RareFeatures.size()); 231 II.NeedsEnergyUpdate = false; 232 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 233 ComputeSHA1(U.data(), U.size(), II.Sha1); 234 auto Sha1Str = Sha1ToString(II.Sha1); 235 Hashes.insert(Sha1Str); 236 if (HasFocusFunction) 237 if (auto V = DFT.Get(Sha1Str)) 238 II.DataFlowTraceForFocusFunction = *V; 239 // This is a gross heuristic. 240 // Ideally, when we add an element to a corpus we need to know its DFT. 241 // But if we don't, we'll use the DFT of its base input. 242 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 243 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 244 DistributionNeedsUpdate = true; 245 PrintCorpus(); 246 // ValidateFeatureSet(); 247 return &II; 248 } 249 250 // Debug-only 251 void PrintUnit(const Unit &U) { 252 if (!FeatureDebug) return; 253 for (uint8_t C : U) { 254 if (C != 'F' && C != 'U' && C != 'Z') 255 C = '.'; 256 Printf("%c", C); 257 } 258 } 259 260 // Debug-only 261 void PrintFeatureSet(const std::vector<uint32_t> &FeatureSet) { 262 if (!FeatureDebug) return; 263 Printf("{"); 264 for (uint32_t Feature: FeatureSet) 265 Printf("%u,", Feature); 266 Printf("}"); 267 } 268 269 // Debug-only 270 void PrintCorpus() { 271 if (!FeatureDebug) return; 272 Printf("======= CORPUS:\n"); 273 int i = 0; 274 for (auto II : Inputs) { 275 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 276 Printf("[%2d] ", i); 277 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 278 PrintUnit(II->U); 279 Printf(" "); 280 PrintFeatureSet(II->UniqFeatureSet); 281 Printf("\n"); 282 } 283 i++; 284 } 285 } 286 287 void Replace(InputInfo *II, const Unit &U, 288 std::chrono::microseconds TimeOfUnit) { 289 assert(II->U.size() > U.size()); 290 Hashes.erase(Sha1ToString(II->Sha1)); 291 DeleteFile(*II); 292 ComputeSHA1(U.data(), U.size(), II->Sha1); 293 Hashes.insert(Sha1ToString(II->Sha1)); 294 II->U = U; 295 II->Reduced = true; 296 II->TimeOfUnit = TimeOfUnit; 297 DistributionNeedsUpdate = true; 298 } 299 300 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 301 bool HasUnit(const std::string &H) { return Hashes.count(H); } 302 InputInfo &ChooseUnitToMutate(Random &Rand) { 303 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 304 assert(!II.U.empty()); 305 return II; 306 } 307 308 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) { 309 if (!UniformDist) { 310 return ChooseUnitToMutate(Rand); 311 } 312 InputInfo &II = *Inputs[Rand(Inputs.size())]; 313 assert(!II.U.empty()); 314 return II; 315 } 316 317 // Returns an index of random unit from the corpus to mutate. 318 size_t ChooseUnitIdxToMutate(Random &Rand) { 319 UpdateCorpusDistribution(Rand); 320 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 321 assert(Idx < Inputs.size()); 322 return Idx; 323 } 324 325 void PrintStats() { 326 for (size_t i = 0; i < Inputs.size(); i++) { 327 const auto &II = *Inputs[i]; 328 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 329 Sha1ToString(II.Sha1).c_str(), II.U.size(), 330 II.NumExecutedMutations, II.NumSuccessfullMutations, 331 II.HasFocusFunction); 332 } 333 } 334 335 void PrintFeatureSet() { 336 for (size_t i = 0; i < kFeatureSetSize; i++) { 337 if(size_t Sz = GetFeature(i)) 338 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 339 } 340 Printf("\n\t"); 341 for (size_t i = 0; i < Inputs.size(); i++) 342 if (size_t N = Inputs[i]->NumFeatures) 343 Printf(" %zd=>%zd ", i, N); 344 Printf("\n"); 345 } 346 347 void DeleteFile(const InputInfo &II) { 348 if (!OutputCorpus.empty() && II.MayDeleteFile) 349 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 350 } 351 352 void DeleteInput(size_t Idx) { 353 InputInfo &II = *Inputs[Idx]; 354 DeleteFile(II); 355 Unit().swap(II.U); 356 II.Energy = 0.0; 357 II.NeedsEnergyUpdate = false; 358 DistributionNeedsUpdate = true; 359 if (FeatureDebug) 360 Printf("EVICTED %zd\n", Idx); 361 } 362 363 void AddRareFeature(uint32_t Idx) { 364 // Maintain *at least* TopXRarestFeatures many rare features 365 // and all features with a frequency below ConsideredRare. 366 // Remove all other features. 367 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 368 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 369 370 // Find most and second most abbundant feature. 371 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 372 RareFeatures[0]}; 373 size_t Delete = 0; 374 for (size_t i = 0; i < RareFeatures.size(); i++) { 375 uint32_t Idx2 = RareFeatures[i]; 376 if (GlobalFeatureFreqs[Idx2] >= 377 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 378 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 379 MostAbundantRareFeatureIndices[0] = Idx2; 380 Delete = i; 381 } 382 } 383 384 // Remove most abundant rare feature. 385 RareFeatures[Delete] = RareFeatures.back(); 386 RareFeatures.pop_back(); 387 388 for (auto II : Inputs) { 389 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 390 II->NeedsEnergyUpdate = true; 391 } 392 393 // Set 2nd most abundant as the new most abundant feature count. 394 FreqOfMostAbundantRareFeature = 395 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 396 } 397 398 // Add rare feature, handle collisions, and update energy. 399 RareFeatures.push_back(Idx); 400 GlobalFeatureFreqs[Idx] = 0; 401 for (auto II : Inputs) { 402 II->DeleteFeatureFreq(Idx); 403 404 // Apply add-one smoothing to this locally undiscovered feature. 405 // Zero energy seeds will never be fuzzed and remain zero energy. 406 if (II->Energy > 0.0) { 407 II->SumIncidence += 1; 408 II->Energy += log(II->SumIncidence) / II->SumIncidence; 409 } 410 } 411 412 DistributionNeedsUpdate = true; 413 } 414 415 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 416 assert(NewSize); 417 Idx = Idx % kFeatureSetSize; 418 uint32_t OldSize = GetFeature(Idx); 419 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 420 if (OldSize > 0) { 421 size_t OldIdx = SmallestElementPerFeature[Idx]; 422 InputInfo &II = *Inputs[OldIdx]; 423 assert(II.NumFeatures > 0); 424 II.NumFeatures--; 425 if (II.NumFeatures == 0) 426 DeleteInput(OldIdx); 427 } else { 428 NumAddedFeatures++; 429 if (Entropic.Enabled) 430 AddRareFeature((uint32_t)Idx); 431 } 432 NumUpdatedFeatures++; 433 if (FeatureDebug) 434 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 435 // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus. 436 SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size()); 437 InputSizesPerFeature[Idx] = NewSize; 438 return true; 439 } 440 return false; 441 } 442 443 // Increment frequency of feature Idx globally and locally. 444 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 445 uint32_t Idx32 = Idx % kFeatureSetSize; 446 447 // Saturated increment. 448 if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 449 return; 450 uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 451 452 // Skip if abundant. 453 if (Freq > FreqOfMostAbundantRareFeature || 454 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) == 455 RareFeatures.end()) 456 return; 457 458 // Update global frequencies. 459 if (Freq == FreqOfMostAbundantRareFeature) 460 FreqOfMostAbundantRareFeature++; 461 462 // Update local frequencies. 463 if (II) 464 II->UpdateFeatureFrequency(Idx32); 465 } 466 467 size_t NumFeatures() const { return NumAddedFeatures; } 468 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 469 470 private: 471 472 static const bool FeatureDebug = false; 473 474 uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 475 476 void ValidateFeatureSet() { 477 if (FeatureDebug) 478 PrintFeatureSet(); 479 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 480 if (GetFeature(Idx)) 481 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 482 for (auto II: Inputs) { 483 if (II->Tmp != II->NumFeatures) 484 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 485 assert(II->Tmp == II->NumFeatures); 486 II->Tmp = 0; 487 } 488 } 489 490 // Updates the probability distribution for the units in the corpus. 491 // Must be called whenever the corpus or unit weights are changed. 492 // 493 // Hypothesis: inputs that maximize information about globally rare features 494 // are interesting. 495 void UpdateCorpusDistribution(Random &Rand) { 496 // Skip update if no seeds or rare features were added/deleted. 497 // Sparse updates for local change of feature frequencies, 498 // i.e., randomly do not skip. 499 if (!DistributionNeedsUpdate && 500 (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 501 return; 502 503 DistributionNeedsUpdate = false; 504 505 size_t N = Inputs.size(); 506 assert(N); 507 Intervals.resize(N + 1); 508 Weights.resize(N); 509 std::iota(Intervals.begin(), Intervals.end(), 0); 510 511 std::chrono::microseconds AverageUnitExecutionTime(0); 512 for (auto II : Inputs) { 513 AverageUnitExecutionTime += II->TimeOfUnit; 514 } 515 AverageUnitExecutionTime /= N; 516 517 bool VanillaSchedule = true; 518 if (Entropic.Enabled) { 519 for (auto II : Inputs) { 520 if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 521 II->NeedsEnergyUpdate = false; 522 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime, 523 AverageUnitExecutionTime); 524 } 525 } 526 527 for (size_t i = 0; i < N; i++) { 528 529 if (Inputs[i]->NumFeatures == 0) { 530 // If the seed doesn't represent any features, assign zero energy. 531 Weights[i] = 0.; 532 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 533 NumExecutedMutations / Inputs.size()) { 534 // If the seed was fuzzed a lot more than average, assign zero energy. 535 Weights[i] = 0.; 536 } else { 537 // Otherwise, simply assign the computed energy. 538 Weights[i] = Inputs[i]->Energy; 539 } 540 541 // If energy for all seeds is zero, fall back to vanilla schedule. 542 if (Weights[i] > 0.0) 543 VanillaSchedule = false; 544 } 545 } 546 547 if (VanillaSchedule) { 548 for (size_t i = 0; i < N; i++) 549 Weights[i] = 550 Inputs[i]->NumFeatures 551 ? static_cast<double>((i + 1) * 552 (Inputs[i]->HasFocusFunction ? 1000 : 1)) 553 : 0.; 554 } 555 556 if (FeatureDebug) { 557 for (size_t i = 0; i < N; i++) 558 Printf("%zd ", Inputs[i]->NumFeatures); 559 Printf("SCORE\n"); 560 for (size_t i = 0; i < N; i++) 561 Printf("%f ", Weights[i]); 562 Printf("Weights\n"); 563 } 564 CorpusDistribution = std::piecewise_constant_distribution<double>( 565 Intervals.begin(), Intervals.end(), Weights.begin()); 566 } 567 std::piecewise_constant_distribution<double> CorpusDistribution; 568 569 std::vector<double> Intervals; 570 std::vector<double> Weights; 571 572 std::unordered_set<std::string> Hashes; 573 std::vector<InputInfo *> Inputs; 574 575 size_t NumAddedFeatures = 0; 576 size_t NumUpdatedFeatures = 0; 577 uint32_t InputSizesPerFeature[kFeatureSetSize]; 578 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 579 580 bool DistributionNeedsUpdate = true; 581 uint16_t FreqOfMostAbundantRareFeature = 0; 582 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 583 std::vector<uint32_t> RareFeatures; 584 585 std::string OutputCorpus; 586 }; 587 588 } // namespace fuzzer 589 590 #endif // LLVM_FUZZER_CORPUS 591