1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <bitset> 22 #include <chrono> 23 #include <numeric> 24 #include <random> 25 #include <unordered_set> 26 27 namespace fuzzer { 28 29 struct InputInfo { 30 Unit U; // The actual input data. 31 std::chrono::microseconds TimeOfUnit; 32 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 33 // Number of features that this input has and no smaller input has. 34 size_t NumFeatures = 0; 35 size_t Tmp = 0; // Used by ValidateFeatureSet. 36 // Stats. 37 size_t NumExecutedMutations = 0; 38 size_t NumSuccessfullMutations = 0; 39 bool NeverReduce = false; 40 bool MayDeleteFile = false; 41 bool Reduced = false; 42 bool HasFocusFunction = false; 43 std::vector<uint32_t> UniqFeatureSet; 44 std::vector<uint8_t> DataFlowTraceForFocusFunction; 45 // Power schedule. 46 bool NeedsEnergyUpdate = false; 47 double Energy = 0.0; 48 double SumIncidence = 0.0; 49 std::vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 50 51 // Delete feature Idx and its frequency from FeatureFreqs. 52 bool DeleteFeatureFreq(uint32_t Idx) { 53 if (FeatureFreqs.empty()) 54 return false; 55 56 // Binary search over local feature frequencies sorted by index. 57 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 58 std::pair<uint32_t, uint16_t>(Idx, 0)); 59 60 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 61 FeatureFreqs.erase(Lower); 62 return true; 63 } 64 return false; 65 } 66 67 // Assign more energy to a high-entropy seed, i.e., that reveals more 68 // information about the globally rare features in the neighborhood of the 69 // seed. Since we do not know the entropy of a seed that has never been 70 // executed we assign fresh seeds maximum entropy and let II->Energy approach 71 // the true entropy from above. If ScalePerExecTime is true, the computed 72 // entropy is scaled based on how fast this input executes compared to the 73 // average execution time of inputs. The faster an input executes, the more 74 // energy gets assigned to the input. 75 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime, 76 std::chrono::microseconds AverageUnitExecutionTime) { 77 Energy = 0.0; 78 SumIncidence = 0.0; 79 80 // Apply add-one smoothing to locally discovered features. 81 for (const auto &F : FeatureFreqs) { 82 double LocalIncidence = F.second + 1; 83 Energy -= LocalIncidence * log(LocalIncidence); 84 SumIncidence += LocalIncidence; 85 } 86 87 // Apply add-one smoothing to locally undiscovered features. 88 // PreciseEnergy -= 0; // since log(1.0) == 0) 89 SumIncidence += 90 static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size()); 91 92 // Add a single locally abundant feature apply add-one smoothing. 93 double AbdIncidence = static_cast<double>(NumExecutedMutations + 1); 94 Energy -= AbdIncidence * log(AbdIncidence); 95 SumIncidence += AbdIncidence; 96 97 // Normalize. 98 if (SumIncidence != 0) 99 Energy = Energy / SumIncidence + log(SumIncidence); 100 101 if (ScalePerExecTime) { 102 // Scaling to favor inputs with lower execution time. 103 uint32_t PerfScore = 100; 104 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10) 105 PerfScore = 10; 106 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4) 107 PerfScore = 25; 108 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2) 109 PerfScore = 50; 110 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4) 111 PerfScore = 75; 112 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count()) 113 PerfScore = 300; 114 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count()) 115 PerfScore = 200; 116 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count()) 117 PerfScore = 150; 118 119 Energy *= PerfScore; 120 } 121 } 122 123 // Increment the frequency of the feature Idx. 124 void UpdateFeatureFrequency(uint32_t Idx) { 125 NeedsEnergyUpdate = true; 126 127 // The local feature frequencies is an ordered vector of pairs. 128 // If there are no local feature frequencies, push_back preserves order. 129 // Set the feature frequency for feature Idx32 to 1. 130 if (FeatureFreqs.empty()) { 131 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 132 return; 133 } 134 135 // Binary search over local feature frequencies sorted by index. 136 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 137 std::pair<uint32_t, uint16_t>(Idx, 0)); 138 139 // If feature Idx32 already exists, increment its frequency. 140 // Otherwise, insert a new pair right after the next lower index. 141 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 142 Lower->second++; 143 } else { 144 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 145 } 146 } 147 }; 148 149 struct EntropicOptions { 150 bool Enabled; 151 size_t NumberOfRarestFeatures; 152 size_t FeatureFrequencyThreshold; 153 bool ScalePerExecTime; 154 }; 155 156 class InputCorpus { 157 static const uint32_t kFeatureSetSize = 1 << 21; 158 static const uint8_t kMaxMutationFactor = 20; 159 static const size_t kSparseEnergyUpdates = 100; 160 161 size_t NumExecutedMutations = 0; 162 163 EntropicOptions Entropic; 164 165 public: 166 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 167 : Entropic(Entropic), OutputCorpus(OutputCorpus) { 168 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 169 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 170 } 171 ~InputCorpus() { 172 for (auto II : Inputs) 173 delete II; 174 } 175 size_t size() const { return Inputs.size(); } 176 size_t SizeInBytes() const { 177 size_t Res = 0; 178 for (auto II : Inputs) 179 Res += II->U.size(); 180 return Res; 181 } 182 size_t NumActiveUnits() const { 183 size_t Res = 0; 184 for (auto II : Inputs) 185 Res += !II->U.empty(); 186 return Res; 187 } 188 size_t MaxInputSize() const { 189 size_t Res = 0; 190 for (auto II : Inputs) 191 Res = std::max(Res, II->U.size()); 192 return Res; 193 } 194 void IncrementNumExecutedMutations() { NumExecutedMutations++; } 195 196 size_t NumInputsThatTouchFocusFunction() { 197 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 198 return II->HasFocusFunction; 199 }); 200 } 201 202 size_t NumInputsWithDataFlowTrace() { 203 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 204 return !II->DataFlowTraceForFocusFunction.empty(); 205 }); 206 } 207 208 bool empty() const { return Inputs.empty(); } 209 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 210 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 211 bool HasFocusFunction, bool NeverReduce, 212 std::chrono::microseconds TimeOfUnit, 213 const std::vector<uint32_t> &FeatureSet, 214 const DataFlowTrace &DFT, const InputInfo *BaseII) { 215 assert(!U.empty()); 216 if (FeatureDebug) 217 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 218 // Inputs.size() is cast to uint32_t below. 219 assert(Inputs.size() < std::numeric_limits<uint32_t>::max()); 220 Inputs.push_back(new InputInfo()); 221 InputInfo &II = *Inputs.back(); 222 II.U = U; 223 II.NumFeatures = NumFeatures; 224 II.NeverReduce = NeverReduce; 225 II.TimeOfUnit = TimeOfUnit; 226 II.MayDeleteFile = MayDeleteFile; 227 II.UniqFeatureSet = FeatureSet; 228 II.HasFocusFunction = HasFocusFunction; 229 // Assign maximal energy to the new seed. 230 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 231 II.SumIncidence = static_cast<double>(RareFeatures.size()); 232 II.NeedsEnergyUpdate = false; 233 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 234 ComputeSHA1(U.data(), U.size(), II.Sha1); 235 auto Sha1Str = Sha1ToString(II.Sha1); 236 Hashes.insert(Sha1Str); 237 if (HasFocusFunction) 238 if (auto V = DFT.Get(Sha1Str)) 239 II.DataFlowTraceForFocusFunction = *V; 240 // This is a gross heuristic. 241 // Ideally, when we add an element to a corpus we need to know its DFT. 242 // But if we don't, we'll use the DFT of its base input. 243 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 244 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 245 DistributionNeedsUpdate = true; 246 PrintCorpus(); 247 // ValidateFeatureSet(); 248 return &II; 249 } 250 251 // Debug-only 252 void PrintUnit(const Unit &U) { 253 if (!FeatureDebug) return; 254 for (uint8_t C : U) { 255 if (C != 'F' && C != 'U' && C != 'Z') 256 C = '.'; 257 Printf("%c", C); 258 } 259 } 260 261 // Debug-only 262 void PrintFeatureSet(const std::vector<uint32_t> &FeatureSet) { 263 if (!FeatureDebug) return; 264 Printf("{"); 265 for (uint32_t Feature: FeatureSet) 266 Printf("%u,", Feature); 267 Printf("}"); 268 } 269 270 // Debug-only 271 void PrintCorpus() { 272 if (!FeatureDebug) return; 273 Printf("======= CORPUS:\n"); 274 int i = 0; 275 for (auto II : Inputs) { 276 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 277 Printf("[%2d] ", i); 278 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 279 PrintUnit(II->U); 280 Printf(" "); 281 PrintFeatureSet(II->UniqFeatureSet); 282 Printf("\n"); 283 } 284 i++; 285 } 286 } 287 288 void Replace(InputInfo *II, const Unit &U, 289 std::chrono::microseconds TimeOfUnit) { 290 assert(II->U.size() > U.size()); 291 Hashes.erase(Sha1ToString(II->Sha1)); 292 DeleteFile(*II); 293 ComputeSHA1(U.data(), U.size(), II->Sha1); 294 Hashes.insert(Sha1ToString(II->Sha1)); 295 II->U = U; 296 II->Reduced = true; 297 II->TimeOfUnit = TimeOfUnit; 298 DistributionNeedsUpdate = true; 299 } 300 301 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 302 bool HasUnit(const std::string &H) { return Hashes.count(H); } 303 InputInfo &ChooseUnitToMutate(Random &Rand) { 304 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 305 assert(!II.U.empty()); 306 return II; 307 } 308 309 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) { 310 if (!UniformDist) { 311 return ChooseUnitToMutate(Rand); 312 } 313 InputInfo &II = *Inputs[Rand(Inputs.size())]; 314 assert(!II.U.empty()); 315 return II; 316 } 317 318 // Returns an index of random unit from the corpus to mutate. 319 size_t ChooseUnitIdxToMutate(Random &Rand) { 320 UpdateCorpusDistribution(Rand); 321 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 322 assert(Idx < Inputs.size()); 323 return Idx; 324 } 325 326 void PrintStats() { 327 for (size_t i = 0; i < Inputs.size(); i++) { 328 const auto &II = *Inputs[i]; 329 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 330 Sha1ToString(II.Sha1).c_str(), II.U.size(), 331 II.NumExecutedMutations, II.NumSuccessfullMutations, 332 II.HasFocusFunction); 333 } 334 } 335 336 void PrintFeatureSet() { 337 for (size_t i = 0; i < kFeatureSetSize; i++) { 338 if(size_t Sz = GetFeature(i)) 339 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 340 } 341 Printf("\n\t"); 342 for (size_t i = 0; i < Inputs.size(); i++) 343 if (size_t N = Inputs[i]->NumFeatures) 344 Printf(" %zd=>%zd ", i, N); 345 Printf("\n"); 346 } 347 348 void DeleteFile(const InputInfo &II) { 349 if (!OutputCorpus.empty() && II.MayDeleteFile) 350 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 351 } 352 353 void DeleteInput(size_t Idx) { 354 InputInfo &II = *Inputs[Idx]; 355 DeleteFile(II); 356 Unit().swap(II.U); 357 II.Energy = 0.0; 358 II.NeedsEnergyUpdate = false; 359 DistributionNeedsUpdate = true; 360 if (FeatureDebug) 361 Printf("EVICTED %zd\n", Idx); 362 } 363 364 void AddRareFeature(uint32_t Idx) { 365 // Maintain *at least* TopXRarestFeatures many rare features 366 // and all features with a frequency below ConsideredRare. 367 // Remove all other features. 368 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 369 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 370 371 // Find most and second most abbundant feature. 372 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 373 RareFeatures[0]}; 374 size_t Delete = 0; 375 for (size_t i = 0; i < RareFeatures.size(); i++) { 376 uint32_t Idx2 = RareFeatures[i]; 377 if (GlobalFeatureFreqs[Idx2] >= 378 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 379 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 380 MostAbundantRareFeatureIndices[0] = Idx2; 381 Delete = i; 382 } 383 } 384 385 // Remove most abundant rare feature. 386 IsRareFeature[Delete] = false; 387 RareFeatures[Delete] = RareFeatures.back(); 388 RareFeatures.pop_back(); 389 390 for (auto II : Inputs) { 391 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 392 II->NeedsEnergyUpdate = true; 393 } 394 395 // Set 2nd most abundant as the new most abundant feature count. 396 FreqOfMostAbundantRareFeature = 397 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 398 } 399 400 // Add rare feature, handle collisions, and update energy. 401 RareFeatures.push_back(Idx); 402 IsRareFeature[Idx] = true; 403 GlobalFeatureFreqs[Idx] = 0; 404 for (auto II : Inputs) { 405 II->DeleteFeatureFreq(Idx); 406 407 // Apply add-one smoothing to this locally undiscovered feature. 408 // Zero energy seeds will never be fuzzed and remain zero energy. 409 if (II->Energy > 0.0) { 410 II->SumIncidence += 1; 411 II->Energy += log(II->SumIncidence) / II->SumIncidence; 412 } 413 } 414 415 DistributionNeedsUpdate = true; 416 } 417 418 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 419 assert(NewSize); 420 Idx = Idx % kFeatureSetSize; 421 uint32_t OldSize = GetFeature(Idx); 422 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 423 if (OldSize > 0) { 424 size_t OldIdx = SmallestElementPerFeature[Idx]; 425 InputInfo &II = *Inputs[OldIdx]; 426 assert(II.NumFeatures > 0); 427 II.NumFeatures--; 428 if (II.NumFeatures == 0) 429 DeleteInput(OldIdx); 430 } else { 431 NumAddedFeatures++; 432 if (Entropic.Enabled) 433 AddRareFeature((uint32_t)Idx); 434 } 435 NumUpdatedFeatures++; 436 if (FeatureDebug) 437 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 438 // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus. 439 SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size()); 440 InputSizesPerFeature[Idx] = NewSize; 441 return true; 442 } 443 return false; 444 } 445 446 // Increment frequency of feature Idx globally and locally. 447 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 448 uint32_t Idx32 = Idx % kFeatureSetSize; 449 450 // Saturated increment. 451 if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 452 return; 453 uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 454 455 // Skip if abundant. 456 if (Freq > FreqOfMostAbundantRareFeature || !IsRareFeature[Idx32]) 457 return; 458 459 // Update global frequencies. 460 if (Freq == FreqOfMostAbundantRareFeature) 461 FreqOfMostAbundantRareFeature++; 462 463 // Update local frequencies. 464 if (II) 465 II->UpdateFeatureFrequency(Idx32); 466 } 467 468 size_t NumFeatures() const { return NumAddedFeatures; } 469 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 470 471 private: 472 473 static const bool FeatureDebug = false; 474 475 uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 476 477 void ValidateFeatureSet() { 478 if (FeatureDebug) 479 PrintFeatureSet(); 480 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 481 if (GetFeature(Idx)) 482 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 483 for (auto II: Inputs) { 484 if (II->Tmp != II->NumFeatures) 485 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 486 assert(II->Tmp == II->NumFeatures); 487 II->Tmp = 0; 488 } 489 } 490 491 // Updates the probability distribution for the units in the corpus. 492 // Must be called whenever the corpus or unit weights are changed. 493 // 494 // Hypothesis: inputs that maximize information about globally rare features 495 // are interesting. 496 void UpdateCorpusDistribution(Random &Rand) { 497 // Skip update if no seeds or rare features were added/deleted. 498 // Sparse updates for local change of feature frequencies, 499 // i.e., randomly do not skip. 500 if (!DistributionNeedsUpdate && 501 (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 502 return; 503 504 DistributionNeedsUpdate = false; 505 506 size_t N = Inputs.size(); 507 assert(N); 508 Intervals.resize(N + 1); 509 Weights.resize(N); 510 std::iota(Intervals.begin(), Intervals.end(), 0); 511 512 std::chrono::microseconds AverageUnitExecutionTime(0); 513 for (auto II : Inputs) { 514 AverageUnitExecutionTime += II->TimeOfUnit; 515 } 516 AverageUnitExecutionTime /= N; 517 518 bool VanillaSchedule = true; 519 if (Entropic.Enabled) { 520 for (auto II : Inputs) { 521 if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 522 II->NeedsEnergyUpdate = false; 523 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime, 524 AverageUnitExecutionTime); 525 } 526 } 527 528 for (size_t i = 0; i < N; i++) { 529 530 if (Inputs[i]->NumFeatures == 0) { 531 // If the seed doesn't represent any features, assign zero energy. 532 Weights[i] = 0.; 533 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 534 NumExecutedMutations / Inputs.size()) { 535 // If the seed was fuzzed a lot more than average, assign zero energy. 536 Weights[i] = 0.; 537 } else { 538 // Otherwise, simply assign the computed energy. 539 Weights[i] = Inputs[i]->Energy; 540 } 541 542 // If energy for all seeds is zero, fall back to vanilla schedule. 543 if (Weights[i] > 0.0) 544 VanillaSchedule = false; 545 } 546 } 547 548 if (VanillaSchedule) { 549 for (size_t i = 0; i < N; i++) 550 Weights[i] = 551 Inputs[i]->NumFeatures 552 ? static_cast<double>((i + 1) * 553 (Inputs[i]->HasFocusFunction ? 1000 : 1)) 554 : 0.; 555 } 556 557 if (FeatureDebug) { 558 for (size_t i = 0; i < N; i++) 559 Printf("%zd ", Inputs[i]->NumFeatures); 560 Printf("SCORE\n"); 561 for (size_t i = 0; i < N; i++) 562 Printf("%f ", Weights[i]); 563 Printf("Weights\n"); 564 } 565 CorpusDistribution = std::piecewise_constant_distribution<double>( 566 Intervals.begin(), Intervals.end(), Weights.begin()); 567 } 568 std::piecewise_constant_distribution<double> CorpusDistribution; 569 570 std::vector<double> Intervals; 571 std::vector<double> Weights; 572 573 std::unordered_set<std::string> Hashes; 574 std::vector<InputInfo *> Inputs; 575 576 size_t NumAddedFeatures = 0; 577 size_t NumUpdatedFeatures = 0; 578 uint32_t InputSizesPerFeature[kFeatureSetSize]; 579 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 580 581 bool DistributionNeedsUpdate = true; 582 uint16_t FreqOfMostAbundantRareFeature = 0; 583 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 584 std::vector<uint32_t> RareFeatures; 585 std::bitset<kFeatureSetSize> IsRareFeature; 586 587 std::string OutputCorpus; 588 }; 589 590 } // namespace fuzzer 591 592 #endif // LLVM_FUZZER_CORPUS 593