1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <chrono> 22 #include <numeric> 23 #include <random> 24 #include <unordered_set> 25 26 namespace fuzzer { 27 28 struct InputInfo { 29 Unit U; // The actual input data. 30 std::chrono::microseconds TimeOfUnit; 31 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 32 // Number of features that this input has and no smaller input has. 33 size_t NumFeatures = 0; 34 size_t Tmp = 0; // Used by ValidateFeatureSet. 35 // Stats. 36 size_t NumExecutedMutations = 0; 37 size_t NumSuccessfullMutations = 0; 38 bool NeverReduce = false; 39 bool MayDeleteFile = false; 40 bool Reduced = false; 41 bool HasFocusFunction = false; 42 Vector<uint32_t> UniqFeatureSet; 43 Vector<uint8_t> DataFlowTraceForFocusFunction; 44 // Power schedule. 45 bool NeedsEnergyUpdate = false; 46 double Energy = 0.0; 47 double SumIncidence = 0.0; 48 Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 49 50 // Delete feature Idx and its frequency from FeatureFreqs. 51 bool DeleteFeatureFreq(uint32_t Idx) { 52 if (FeatureFreqs.empty()) 53 return false; 54 55 // Binary search over local feature frequencies sorted by index. 56 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 57 std::pair<uint32_t, uint16_t>(Idx, 0)); 58 59 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 60 FeatureFreqs.erase(Lower); 61 return true; 62 } 63 return false; 64 } 65 66 // Assign more energy to a high-entropy seed, i.e., that reveals more 67 // information about the globally rare features in the neighborhood of the 68 // seed. Since we do not know the entropy of a seed that has never been 69 // executed we assign fresh seeds maximum entropy and let II->Energy approach 70 // the true entropy from above. If ScalePerExecTime is true, the computed 71 // entropy is scaled based on how fast this input executes compared to the 72 // average execution time of inputs. The faster an input executes, the more 73 // energy gets assigned to the input. 74 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime, 75 std::chrono::microseconds AverageUnitExecutionTime) { 76 Energy = 0.0; 77 SumIncidence = 0.0; 78 79 // Apply add-one smoothing to locally discovered features. 80 for (auto F : FeatureFreqs) { 81 double LocalIncidence = F.second + 1; 82 Energy -= LocalIncidence * log(LocalIncidence); 83 SumIncidence += LocalIncidence; 84 } 85 86 // Apply add-one smoothing to locally undiscovered features. 87 // PreciseEnergy -= 0; // since log(1.0) == 0) 88 SumIncidence += 89 static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size()); 90 91 // Add a single locally abundant feature apply add-one smoothing. 92 double AbdIncidence = static_cast<double>(NumExecutedMutations + 1); 93 Energy -= AbdIncidence * log(AbdIncidence); 94 SumIncidence += AbdIncidence; 95 96 // Normalize. 97 if (SumIncidence != 0) 98 Energy = Energy / SumIncidence + log(SumIncidence); 99 100 if (ScalePerExecTime) { 101 // Scaling to favor inputs with lower execution time. 102 uint32_t PerfScore = 100; 103 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10) 104 PerfScore = 10; 105 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4) 106 PerfScore = 25; 107 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2) 108 PerfScore = 50; 109 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4) 110 PerfScore = 75; 111 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count()) 112 PerfScore = 300; 113 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count()) 114 PerfScore = 200; 115 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count()) 116 PerfScore = 150; 117 118 Energy *= PerfScore; 119 } 120 } 121 122 // Increment the frequency of the feature Idx. 123 void UpdateFeatureFrequency(uint32_t Idx) { 124 NeedsEnergyUpdate = true; 125 126 // The local feature frequencies is an ordered vector of pairs. 127 // If there are no local feature frequencies, push_back preserves order. 128 // Set the feature frequency for feature Idx32 to 1. 129 if (FeatureFreqs.empty()) { 130 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 131 return; 132 } 133 134 // Binary search over local feature frequencies sorted by index. 135 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 136 std::pair<uint32_t, uint16_t>(Idx, 0)); 137 138 // If feature Idx32 already exists, increment its frequency. 139 // Otherwise, insert a new pair right after the next lower index. 140 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 141 Lower->second++; 142 } else { 143 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 144 } 145 } 146 }; 147 148 struct EntropicOptions { 149 bool Enabled; 150 size_t NumberOfRarestFeatures; 151 size_t FeatureFrequencyThreshold; 152 bool ScalePerExecTime; 153 }; 154 155 class InputCorpus { 156 static const uint32_t kFeatureSetSize = 1 << 21; 157 static const uint8_t kMaxMutationFactor = 20; 158 static const size_t kSparseEnergyUpdates = 100; 159 160 size_t NumExecutedMutations = 0; 161 162 EntropicOptions Entropic; 163 164 public: 165 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 166 : Entropic(Entropic), OutputCorpus(OutputCorpus) { 167 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 168 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 169 } 170 ~InputCorpus() { 171 for (auto II : Inputs) 172 delete II; 173 } 174 size_t size() const { return Inputs.size(); } 175 size_t SizeInBytes() const { 176 size_t Res = 0; 177 for (auto II : Inputs) 178 Res += II->U.size(); 179 return Res; 180 } 181 size_t NumActiveUnits() const { 182 size_t Res = 0; 183 for (auto II : Inputs) 184 Res += !II->U.empty(); 185 return Res; 186 } 187 size_t MaxInputSize() const { 188 size_t Res = 0; 189 for (auto II : Inputs) 190 Res = std::max(Res, II->U.size()); 191 return Res; 192 } 193 void IncrementNumExecutedMutations() { NumExecutedMutations++; } 194 195 size_t NumInputsThatTouchFocusFunction() { 196 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 197 return II->HasFocusFunction; 198 }); 199 } 200 201 size_t NumInputsWithDataFlowTrace() { 202 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 203 return !II->DataFlowTraceForFocusFunction.empty(); 204 }); 205 } 206 207 bool empty() const { return Inputs.empty(); } 208 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 209 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 210 bool HasFocusFunction, bool NeverReduce, 211 std::chrono::microseconds TimeOfUnit, 212 const Vector<uint32_t> &FeatureSet, 213 const DataFlowTrace &DFT, const InputInfo *BaseII) { 214 assert(!U.empty()); 215 if (FeatureDebug) 216 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 217 // Inputs.size() is cast to uint32_t below. 218 assert(Inputs.size() < std::numeric_limits<uint32_t>::max()); 219 Inputs.push_back(new InputInfo()); 220 InputInfo &II = *Inputs.back(); 221 II.U = U; 222 II.NumFeatures = NumFeatures; 223 II.NeverReduce = NeverReduce; 224 II.TimeOfUnit = TimeOfUnit; 225 II.MayDeleteFile = MayDeleteFile; 226 II.UniqFeatureSet = FeatureSet; 227 II.HasFocusFunction = HasFocusFunction; 228 // Assign maximal energy to the new seed. 229 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 230 II.SumIncidence = static_cast<double>(RareFeatures.size()); 231 II.NeedsEnergyUpdate = false; 232 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 233 ComputeSHA1(U.data(), U.size(), II.Sha1); 234 auto Sha1Str = Sha1ToString(II.Sha1); 235 Hashes.insert(Sha1Str); 236 if (HasFocusFunction) 237 if (auto V = DFT.Get(Sha1Str)) 238 II.DataFlowTraceForFocusFunction = *V; 239 // This is a gross heuristic. 240 // Ideally, when we add an element to a corpus we need to know its DFT. 241 // But if we don't, we'll use the DFT of its base input. 242 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 243 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 244 DistributionNeedsUpdate = true; 245 PrintCorpus(); 246 // ValidateFeatureSet(); 247 return &II; 248 } 249 250 // Debug-only 251 void PrintUnit(const Unit &U) { 252 if (!FeatureDebug) return; 253 for (uint8_t C : U) { 254 if (C != 'F' && C != 'U' && C != 'Z') 255 C = '.'; 256 Printf("%c", C); 257 } 258 } 259 260 // Debug-only 261 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 262 if (!FeatureDebug) return; 263 Printf("{"); 264 for (uint32_t Feature: FeatureSet) 265 Printf("%u,", Feature); 266 Printf("}"); 267 } 268 269 // Debug-only 270 void PrintCorpus() { 271 if (!FeatureDebug) return; 272 Printf("======= CORPUS:\n"); 273 int i = 0; 274 for (auto II : Inputs) { 275 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 276 Printf("[%2d] ", i); 277 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 278 PrintUnit(II->U); 279 Printf(" "); 280 PrintFeatureSet(II->UniqFeatureSet); 281 Printf("\n"); 282 } 283 i++; 284 } 285 } 286 287 void Replace(InputInfo *II, const Unit &U) { 288 assert(II->U.size() > U.size()); 289 Hashes.erase(Sha1ToString(II->Sha1)); 290 DeleteFile(*II); 291 ComputeSHA1(U.data(), U.size(), II->Sha1); 292 Hashes.insert(Sha1ToString(II->Sha1)); 293 II->U = U; 294 II->Reduced = true; 295 DistributionNeedsUpdate = true; 296 } 297 298 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 299 bool HasUnit(const std::string &H) { return Hashes.count(H); } 300 InputInfo &ChooseUnitToMutate(Random &Rand) { 301 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 302 assert(!II.U.empty()); 303 return II; 304 } 305 306 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) { 307 if (!UniformDist) { 308 return ChooseUnitToMutate(Rand); 309 } 310 InputInfo &II = *Inputs[Rand(Inputs.size())]; 311 assert(!II.U.empty()); 312 return II; 313 } 314 315 // Returns an index of random unit from the corpus to mutate. 316 size_t ChooseUnitIdxToMutate(Random &Rand) { 317 UpdateCorpusDistribution(Rand); 318 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 319 assert(Idx < Inputs.size()); 320 return Idx; 321 } 322 323 void PrintStats() { 324 for (size_t i = 0; i < Inputs.size(); i++) { 325 const auto &II = *Inputs[i]; 326 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 327 Sha1ToString(II.Sha1).c_str(), II.U.size(), 328 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction); 329 } 330 } 331 332 void PrintFeatureSet() { 333 for (size_t i = 0; i < kFeatureSetSize; i++) { 334 if(size_t Sz = GetFeature(i)) 335 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 336 } 337 Printf("\n\t"); 338 for (size_t i = 0; i < Inputs.size(); i++) 339 if (size_t N = Inputs[i]->NumFeatures) 340 Printf(" %zd=>%zd ", i, N); 341 Printf("\n"); 342 } 343 344 void DeleteFile(const InputInfo &II) { 345 if (!OutputCorpus.empty() && II.MayDeleteFile) 346 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 347 } 348 349 void DeleteInput(size_t Idx) { 350 InputInfo &II = *Inputs[Idx]; 351 DeleteFile(II); 352 Unit().swap(II.U); 353 II.Energy = 0.0; 354 II.NeedsEnergyUpdate = false; 355 DistributionNeedsUpdate = true; 356 if (FeatureDebug) 357 Printf("EVICTED %zd\n", Idx); 358 } 359 360 void AddRareFeature(uint32_t Idx) { 361 // Maintain *at least* TopXRarestFeatures many rare features 362 // and all features with a frequency below ConsideredRare. 363 // Remove all other features. 364 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 365 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 366 367 // Find most and second most abbundant feature. 368 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 369 RareFeatures[0]}; 370 size_t Delete = 0; 371 for (size_t i = 0; i < RareFeatures.size(); i++) { 372 uint32_t Idx2 = RareFeatures[i]; 373 if (GlobalFeatureFreqs[Idx2] >= 374 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 375 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 376 MostAbundantRareFeatureIndices[0] = Idx2; 377 Delete = i; 378 } 379 } 380 381 // Remove most abundant rare feature. 382 RareFeatures[Delete] = RareFeatures.back(); 383 RareFeatures.pop_back(); 384 385 for (auto II : Inputs) { 386 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 387 II->NeedsEnergyUpdate = true; 388 } 389 390 // Set 2nd most abundant as the new most abundant feature count. 391 FreqOfMostAbundantRareFeature = 392 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 393 } 394 395 // Add rare feature, handle collisions, and update energy. 396 RareFeatures.push_back(Idx); 397 GlobalFeatureFreqs[Idx] = 0; 398 for (auto II : Inputs) { 399 II->DeleteFeatureFreq(Idx); 400 401 // Apply add-one smoothing to this locally undiscovered feature. 402 // Zero energy seeds will never be fuzzed and remain zero energy. 403 if (II->Energy > 0.0) { 404 II->SumIncidence += 1; 405 II->Energy += log(II->SumIncidence) / II->SumIncidence; 406 } 407 } 408 409 DistributionNeedsUpdate = true; 410 } 411 412 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 413 assert(NewSize); 414 Idx = Idx % kFeatureSetSize; 415 uint32_t OldSize = GetFeature(Idx); 416 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 417 if (OldSize > 0) { 418 size_t OldIdx = SmallestElementPerFeature[Idx]; 419 InputInfo &II = *Inputs[OldIdx]; 420 assert(II.NumFeatures > 0); 421 II.NumFeatures--; 422 if (II.NumFeatures == 0) 423 DeleteInput(OldIdx); 424 } else { 425 NumAddedFeatures++; 426 if (Entropic.Enabled) 427 AddRareFeature((uint32_t)Idx); 428 } 429 NumUpdatedFeatures++; 430 if (FeatureDebug) 431 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 432 // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus. 433 SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size()); 434 InputSizesPerFeature[Idx] = NewSize; 435 return true; 436 } 437 return false; 438 } 439 440 // Increment frequency of feature Idx globally and locally. 441 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 442 uint32_t Idx32 = Idx % kFeatureSetSize; 443 444 // Saturated increment. 445 if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 446 return; 447 uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 448 449 // Skip if abundant. 450 if (Freq > FreqOfMostAbundantRareFeature || 451 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) == 452 RareFeatures.end()) 453 return; 454 455 // Update global frequencies. 456 if (Freq == FreqOfMostAbundantRareFeature) 457 FreqOfMostAbundantRareFeature++; 458 459 // Update local frequencies. 460 if (II) 461 II->UpdateFeatureFrequency(Idx32); 462 } 463 464 size_t NumFeatures() const { return NumAddedFeatures; } 465 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 466 467 private: 468 469 static const bool FeatureDebug = false; 470 471 uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 472 473 void ValidateFeatureSet() { 474 if (FeatureDebug) 475 PrintFeatureSet(); 476 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 477 if (GetFeature(Idx)) 478 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 479 for (auto II: Inputs) { 480 if (II->Tmp != II->NumFeatures) 481 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 482 assert(II->Tmp == II->NumFeatures); 483 II->Tmp = 0; 484 } 485 } 486 487 // Updates the probability distribution for the units in the corpus. 488 // Must be called whenever the corpus or unit weights are changed. 489 // 490 // Hypothesis: inputs that maximize information about globally rare features 491 // are interesting. 492 void UpdateCorpusDistribution(Random &Rand) { 493 // Skip update if no seeds or rare features were added/deleted. 494 // Sparse updates for local change of feature frequencies, 495 // i.e., randomly do not skip. 496 if (!DistributionNeedsUpdate && 497 (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 498 return; 499 500 DistributionNeedsUpdate = false; 501 502 size_t N = Inputs.size(); 503 assert(N); 504 Intervals.resize(N + 1); 505 Weights.resize(N); 506 std::iota(Intervals.begin(), Intervals.end(), 0); 507 508 std::chrono::microseconds AverageUnitExecutionTime(0); 509 for (auto II : Inputs) { 510 AverageUnitExecutionTime += II->TimeOfUnit; 511 } 512 AverageUnitExecutionTime /= N; 513 514 bool VanillaSchedule = true; 515 if (Entropic.Enabled) { 516 for (auto II : Inputs) { 517 if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 518 II->NeedsEnergyUpdate = false; 519 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime, 520 AverageUnitExecutionTime); 521 } 522 } 523 524 for (size_t i = 0; i < N; i++) { 525 526 if (Inputs[i]->NumFeatures == 0) { 527 // If the seed doesn't represent any features, assign zero energy. 528 Weights[i] = 0.; 529 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 530 NumExecutedMutations / Inputs.size()) { 531 // If the seed was fuzzed a lot more than average, assign zero energy. 532 Weights[i] = 0.; 533 } else { 534 // Otherwise, simply assign the computed energy. 535 Weights[i] = Inputs[i]->Energy; 536 } 537 538 // If energy for all seeds is zero, fall back to vanilla schedule. 539 if (Weights[i] > 0.0) 540 VanillaSchedule = false; 541 } 542 } 543 544 if (VanillaSchedule) { 545 for (size_t i = 0; i < N; i++) 546 Weights[i] = 547 Inputs[i]->NumFeatures 548 ? static_cast<double>((i + 1) * 549 (Inputs[i]->HasFocusFunction ? 1000 : 1)) 550 : 0.; 551 } 552 553 if (FeatureDebug) { 554 for (size_t i = 0; i < N; i++) 555 Printf("%zd ", Inputs[i]->NumFeatures); 556 Printf("SCORE\n"); 557 for (size_t i = 0; i < N; i++) 558 Printf("%f ", Weights[i]); 559 Printf("Weights\n"); 560 } 561 CorpusDistribution = std::piecewise_constant_distribution<double>( 562 Intervals.begin(), Intervals.end(), Weights.begin()); 563 } 564 std::piecewise_constant_distribution<double> CorpusDistribution; 565 566 Vector<double> Intervals; 567 Vector<double> Weights; 568 569 std::unordered_set<std::string> Hashes; 570 Vector<InputInfo*> Inputs; 571 572 size_t NumAddedFeatures = 0; 573 size_t NumUpdatedFeatures = 0; 574 uint32_t InputSizesPerFeature[kFeatureSetSize]; 575 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 576 577 bool DistributionNeedsUpdate = true; 578 uint16_t FreqOfMostAbundantRareFeature = 0; 579 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 580 Vector<uint32_t> RareFeatures; 581 582 std::string OutputCorpus; 583 }; 584 585 } // namespace fuzzer 586 587 #endif // LLVM_FUZZER_CORPUS 588