1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <chrono> 22 #include <numeric> 23 #include <random> 24 #include <unordered_set> 25 26 namespace fuzzer { 27 28 struct InputInfo { 29 Unit U; // The actual input data. 30 std::chrono::microseconds TimeOfUnit; 31 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 32 // Number of features that this input has and no smaller input has. 33 size_t NumFeatures = 0; 34 size_t Tmp = 0; // Used by ValidateFeatureSet. 35 // Stats. 36 size_t NumExecutedMutations = 0; 37 size_t NumSuccessfullMutations = 0; 38 bool NeverReduce = false; 39 bool MayDeleteFile = false; 40 bool Reduced = false; 41 bool HasFocusFunction = false; 42 Vector<uint32_t> UniqFeatureSet; 43 Vector<uint8_t> DataFlowTraceForFocusFunction; 44 // Power schedule. 45 bool NeedsEnergyUpdate = false; 46 double Energy = 0.0; 47 size_t SumIncidence = 0; 48 Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 49 50 // Delete feature Idx and its frequency from FeatureFreqs. 51 bool DeleteFeatureFreq(uint32_t Idx) { 52 if (FeatureFreqs.empty()) 53 return false; 54 55 // Binary search over local feature frequencies sorted by index. 56 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 57 std::pair<uint32_t, uint16_t>(Idx, 0)); 58 59 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 60 FeatureFreqs.erase(Lower); 61 return true; 62 } 63 return false; 64 } 65 66 // Assign more energy to a high-entropy seed, i.e., that reveals more 67 // information about the globally rare features in the neighborhood of the 68 // seed. Since we do not know the entropy of a seed that has never been 69 // executed we assign fresh seeds maximum entropy and let II->Energy approach 70 // the true entropy from above. If ScalePerExecTime is true, the computed 71 // entropy is scaled based on how fast this input executes compared to the 72 // average execution time of inputs. The faster an input executes, the more 73 // energy gets assigned to the input. 74 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime, 75 std::chrono::microseconds AverageUnitExecutionTime) { 76 Energy = 0.0; 77 SumIncidence = 0; 78 79 // Apply add-one smoothing to locally discovered features. 80 for (auto F : FeatureFreqs) { 81 size_t LocalIncidence = F.second + 1; 82 Energy -= LocalIncidence * logl(LocalIncidence); 83 SumIncidence += LocalIncidence; 84 } 85 86 // Apply add-one smoothing to locally undiscovered features. 87 // PreciseEnergy -= 0; // since logl(1.0) == 0) 88 SumIncidence += (GlobalNumberOfFeatures - FeatureFreqs.size()); 89 90 // Add a single locally abundant feature apply add-one smoothing. 91 size_t AbdIncidence = NumExecutedMutations + 1; 92 Energy -= AbdIncidence * logl(AbdIncidence); 93 SumIncidence += AbdIncidence; 94 95 // Normalize. 96 if (SumIncidence != 0) 97 Energy = (Energy / SumIncidence) + logl(SumIncidence); 98 99 if (ScalePerExecTime) { 100 // Scaling to favor inputs with lower execution time. 101 uint32_t PerfScore = 100; 102 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10) 103 PerfScore = 10; 104 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4) 105 PerfScore = 25; 106 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2) 107 PerfScore = 50; 108 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4) 109 PerfScore = 75; 110 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count()) 111 PerfScore = 300; 112 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count()) 113 PerfScore = 200; 114 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count()) 115 PerfScore = 150; 116 117 Energy *= PerfScore; 118 } 119 } 120 121 // Increment the frequency of the feature Idx. 122 void UpdateFeatureFrequency(uint32_t Idx) { 123 NeedsEnergyUpdate = true; 124 125 // The local feature frequencies is an ordered vector of pairs. 126 // If there are no local feature frequencies, push_back preserves order. 127 // Set the feature frequency for feature Idx32 to 1. 128 if (FeatureFreqs.empty()) { 129 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 130 return; 131 } 132 133 // Binary search over local feature frequencies sorted by index. 134 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 135 std::pair<uint32_t, uint16_t>(Idx, 0)); 136 137 // If feature Idx32 already exists, increment its frequency. 138 // Otherwise, insert a new pair right after the next lower index. 139 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 140 Lower->second++; 141 } else { 142 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 143 } 144 } 145 }; 146 147 struct EntropicOptions { 148 bool Enabled; 149 size_t NumberOfRarestFeatures; 150 size_t FeatureFrequencyThreshold; 151 bool ScalePerExecTime; 152 }; 153 154 class InputCorpus { 155 static const uint32_t kFeatureSetSize = 1 << 21; 156 static const uint8_t kMaxMutationFactor = 20; 157 static const size_t kSparseEnergyUpdates = 100; 158 159 size_t NumExecutedMutations = 0; 160 161 EntropicOptions Entropic; 162 163 public: 164 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 165 : Entropic(Entropic), OutputCorpus(OutputCorpus) { 166 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 167 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 168 } 169 ~InputCorpus() { 170 for (auto II : Inputs) 171 delete II; 172 } 173 size_t size() const { return Inputs.size(); } 174 size_t SizeInBytes() const { 175 size_t Res = 0; 176 for (auto II : Inputs) 177 Res += II->U.size(); 178 return Res; 179 } 180 size_t NumActiveUnits() const { 181 size_t Res = 0; 182 for (auto II : Inputs) 183 Res += !II->U.empty(); 184 return Res; 185 } 186 size_t MaxInputSize() const { 187 size_t Res = 0; 188 for (auto II : Inputs) 189 Res = std::max(Res, II->U.size()); 190 return Res; 191 } 192 void IncrementNumExecutedMutations() { NumExecutedMutations++; } 193 194 size_t NumInputsThatTouchFocusFunction() { 195 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 196 return II->HasFocusFunction; 197 }); 198 } 199 200 size_t NumInputsWithDataFlowTrace() { 201 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 202 return !II->DataFlowTraceForFocusFunction.empty(); 203 }); 204 } 205 206 bool empty() const { return Inputs.empty(); } 207 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 208 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 209 bool HasFocusFunction, bool NeverReduce, 210 std::chrono::microseconds TimeOfUnit, 211 const Vector<uint32_t> &FeatureSet, 212 const DataFlowTrace &DFT, const InputInfo *BaseII) { 213 assert(!U.empty()); 214 if (FeatureDebug) 215 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 216 Inputs.push_back(new InputInfo()); 217 InputInfo &II = *Inputs.back(); 218 II.U = U; 219 II.NumFeatures = NumFeatures; 220 II.NeverReduce = NeverReduce; 221 II.TimeOfUnit = TimeOfUnit; 222 II.MayDeleteFile = MayDeleteFile; 223 II.UniqFeatureSet = FeatureSet; 224 II.HasFocusFunction = HasFocusFunction; 225 // Assign maximal energy to the new seed. 226 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 227 II.SumIncidence = RareFeatures.size(); 228 II.NeedsEnergyUpdate = false; 229 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 230 ComputeSHA1(U.data(), U.size(), II.Sha1); 231 auto Sha1Str = Sha1ToString(II.Sha1); 232 Hashes.insert(Sha1Str); 233 if (HasFocusFunction) 234 if (auto V = DFT.Get(Sha1Str)) 235 II.DataFlowTraceForFocusFunction = *V; 236 // This is a gross heuristic. 237 // Ideally, when we add an element to a corpus we need to know its DFT. 238 // But if we don't, we'll use the DFT of its base input. 239 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 240 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 241 DistributionNeedsUpdate = true; 242 PrintCorpus(); 243 // ValidateFeatureSet(); 244 return &II; 245 } 246 247 // Debug-only 248 void PrintUnit(const Unit &U) { 249 if (!FeatureDebug) return; 250 for (uint8_t C : U) { 251 if (C != 'F' && C != 'U' && C != 'Z') 252 C = '.'; 253 Printf("%c", C); 254 } 255 } 256 257 // Debug-only 258 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 259 if (!FeatureDebug) return; 260 Printf("{"); 261 for (uint32_t Feature: FeatureSet) 262 Printf("%u,", Feature); 263 Printf("}"); 264 } 265 266 // Debug-only 267 void PrintCorpus() { 268 if (!FeatureDebug) return; 269 Printf("======= CORPUS:\n"); 270 int i = 0; 271 for (auto II : Inputs) { 272 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 273 Printf("[%2d] ", i); 274 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 275 PrintUnit(II->U); 276 Printf(" "); 277 PrintFeatureSet(II->UniqFeatureSet); 278 Printf("\n"); 279 } 280 i++; 281 } 282 } 283 284 void Replace(InputInfo *II, const Unit &U) { 285 assert(II->U.size() > U.size()); 286 Hashes.erase(Sha1ToString(II->Sha1)); 287 DeleteFile(*II); 288 ComputeSHA1(U.data(), U.size(), II->Sha1); 289 Hashes.insert(Sha1ToString(II->Sha1)); 290 II->U = U; 291 II->Reduced = true; 292 DistributionNeedsUpdate = true; 293 } 294 295 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 296 bool HasUnit(const std::string &H) { return Hashes.count(H); } 297 InputInfo &ChooseUnitToMutate(Random &Rand) { 298 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 299 assert(!II.U.empty()); 300 return II; 301 } 302 303 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) { 304 if (!UniformDist) { 305 return ChooseUnitToMutate(Rand); 306 } 307 InputInfo &II = *Inputs[Rand(Inputs.size())]; 308 assert(!II.U.empty()); 309 return II; 310 } 311 312 // Returns an index of random unit from the corpus to mutate. 313 size_t ChooseUnitIdxToMutate(Random &Rand) { 314 UpdateCorpusDistribution(Rand); 315 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 316 assert(Idx < Inputs.size()); 317 return Idx; 318 } 319 320 void PrintStats() { 321 for (size_t i = 0; i < Inputs.size(); i++) { 322 const auto &II = *Inputs[i]; 323 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 324 Sha1ToString(II.Sha1).c_str(), II.U.size(), 325 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction); 326 } 327 } 328 329 void PrintFeatureSet() { 330 for (size_t i = 0; i < kFeatureSetSize; i++) { 331 if(size_t Sz = GetFeature(i)) 332 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 333 } 334 Printf("\n\t"); 335 for (size_t i = 0; i < Inputs.size(); i++) 336 if (size_t N = Inputs[i]->NumFeatures) 337 Printf(" %zd=>%zd ", i, N); 338 Printf("\n"); 339 } 340 341 void DeleteFile(const InputInfo &II) { 342 if (!OutputCorpus.empty() && II.MayDeleteFile) 343 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 344 } 345 346 void DeleteInput(size_t Idx) { 347 InputInfo &II = *Inputs[Idx]; 348 DeleteFile(II); 349 Unit().swap(II.U); 350 II.Energy = 0.0; 351 II.NeedsEnergyUpdate = false; 352 DistributionNeedsUpdate = true; 353 if (FeatureDebug) 354 Printf("EVICTED %zd\n", Idx); 355 } 356 357 void AddRareFeature(uint32_t Idx) { 358 // Maintain *at least* TopXRarestFeatures many rare features 359 // and all features with a frequency below ConsideredRare. 360 // Remove all other features. 361 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 362 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 363 364 // Find most and second most abbundant feature. 365 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 366 RareFeatures[0]}; 367 size_t Delete = 0; 368 for (size_t i = 0; i < RareFeatures.size(); i++) { 369 uint32_t Idx2 = RareFeatures[i]; 370 if (GlobalFeatureFreqs[Idx2] >= 371 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 372 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 373 MostAbundantRareFeatureIndices[0] = Idx2; 374 Delete = i; 375 } 376 } 377 378 // Remove most abundant rare feature. 379 RareFeatures[Delete] = RareFeatures.back(); 380 RareFeatures.pop_back(); 381 382 for (auto II : Inputs) { 383 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 384 II->NeedsEnergyUpdate = true; 385 } 386 387 // Set 2nd most abundant as the new most abundant feature count. 388 FreqOfMostAbundantRareFeature = 389 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 390 } 391 392 // Add rare feature, handle collisions, and update energy. 393 RareFeatures.push_back(Idx); 394 GlobalFeatureFreqs[Idx] = 0; 395 for (auto II : Inputs) { 396 II->DeleteFeatureFreq(Idx); 397 398 // Apply add-one smoothing to this locally undiscovered feature. 399 // Zero energy seeds will never be fuzzed and remain zero energy. 400 if (II->Energy > 0.0) { 401 II->SumIncidence += 1; 402 II->Energy += logl(II->SumIncidence) / II->SumIncidence; 403 } 404 } 405 406 DistributionNeedsUpdate = true; 407 } 408 409 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 410 assert(NewSize); 411 Idx = Idx % kFeatureSetSize; 412 uint32_t OldSize = GetFeature(Idx); 413 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 414 if (OldSize > 0) { 415 size_t OldIdx = SmallestElementPerFeature[Idx]; 416 InputInfo &II = *Inputs[OldIdx]; 417 assert(II.NumFeatures > 0); 418 II.NumFeatures--; 419 if (II.NumFeatures == 0) 420 DeleteInput(OldIdx); 421 } else { 422 NumAddedFeatures++; 423 if (Entropic.Enabled) 424 AddRareFeature((uint32_t)Idx); 425 } 426 NumUpdatedFeatures++; 427 if (FeatureDebug) 428 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 429 SmallestElementPerFeature[Idx] = Inputs.size(); 430 InputSizesPerFeature[Idx] = NewSize; 431 return true; 432 } 433 return false; 434 } 435 436 // Increment frequency of feature Idx globally and locally. 437 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 438 uint32_t Idx32 = Idx % kFeatureSetSize; 439 440 // Saturated increment. 441 if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 442 return; 443 uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 444 445 // Skip if abundant. 446 if (Freq > FreqOfMostAbundantRareFeature || 447 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) == 448 RareFeatures.end()) 449 return; 450 451 // Update global frequencies. 452 if (Freq == FreqOfMostAbundantRareFeature) 453 FreqOfMostAbundantRareFeature++; 454 455 // Update local frequencies. 456 if (II) 457 II->UpdateFeatureFrequency(Idx32); 458 } 459 460 size_t NumFeatures() const { return NumAddedFeatures; } 461 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 462 463 private: 464 465 static const bool FeatureDebug = false; 466 467 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 468 469 void ValidateFeatureSet() { 470 if (FeatureDebug) 471 PrintFeatureSet(); 472 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 473 if (GetFeature(Idx)) 474 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 475 for (auto II: Inputs) { 476 if (II->Tmp != II->NumFeatures) 477 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 478 assert(II->Tmp == II->NumFeatures); 479 II->Tmp = 0; 480 } 481 } 482 483 // Updates the probability distribution for the units in the corpus. 484 // Must be called whenever the corpus or unit weights are changed. 485 // 486 // Hypothesis: inputs that maximize information about globally rare features 487 // are interesting. 488 void UpdateCorpusDistribution(Random &Rand) { 489 // Skip update if no seeds or rare features were added/deleted. 490 // Sparse updates for local change of feature frequencies, 491 // i.e., randomly do not skip. 492 if (!DistributionNeedsUpdate && 493 (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 494 return; 495 496 DistributionNeedsUpdate = false; 497 498 size_t N = Inputs.size(); 499 assert(N); 500 Intervals.resize(N + 1); 501 Weights.resize(N); 502 std::iota(Intervals.begin(), Intervals.end(), 0); 503 504 std::chrono::microseconds AverageUnitExecutionTime(0); 505 for (auto II : Inputs) { 506 AverageUnitExecutionTime += II->TimeOfUnit; 507 } 508 AverageUnitExecutionTime /= N; 509 510 bool VanillaSchedule = true; 511 if (Entropic.Enabled) { 512 for (auto II : Inputs) { 513 if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 514 II->NeedsEnergyUpdate = false; 515 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime, 516 AverageUnitExecutionTime); 517 } 518 } 519 520 for (size_t i = 0; i < N; i++) { 521 522 if (Inputs[i]->NumFeatures == 0) { 523 // If the seed doesn't represent any features, assign zero energy. 524 Weights[i] = 0.; 525 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 526 NumExecutedMutations / Inputs.size()) { 527 // If the seed was fuzzed a lot more than average, assign zero energy. 528 Weights[i] = 0.; 529 } else { 530 // Otherwise, simply assign the computed energy. 531 Weights[i] = Inputs[i]->Energy; 532 } 533 534 // If energy for all seeds is zero, fall back to vanilla schedule. 535 if (Weights[i] > 0.0) 536 VanillaSchedule = false; 537 } 538 } 539 540 if (VanillaSchedule) { 541 for (size_t i = 0; i < N; i++) 542 Weights[i] = Inputs[i]->NumFeatures 543 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1) 544 : 0.; 545 } 546 547 if (FeatureDebug) { 548 for (size_t i = 0; i < N; i++) 549 Printf("%zd ", Inputs[i]->NumFeatures); 550 Printf("SCORE\n"); 551 for (size_t i = 0; i < N; i++) 552 Printf("%f ", Weights[i]); 553 Printf("Weights\n"); 554 } 555 CorpusDistribution = std::piecewise_constant_distribution<double>( 556 Intervals.begin(), Intervals.end(), Weights.begin()); 557 } 558 std::piecewise_constant_distribution<double> CorpusDistribution; 559 560 Vector<double> Intervals; 561 Vector<double> Weights; 562 563 std::unordered_set<std::string> Hashes; 564 Vector<InputInfo*> Inputs; 565 566 size_t NumAddedFeatures = 0; 567 size_t NumUpdatedFeatures = 0; 568 uint32_t InputSizesPerFeature[kFeatureSetSize]; 569 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 570 571 bool DistributionNeedsUpdate = true; 572 uint16_t FreqOfMostAbundantRareFeature = 0; 573 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 574 Vector<uint32_t> RareFeatures; 575 576 std::string OutputCorpus; 577 }; 578 579 } // namespace fuzzer 580 581 #endif // LLVM_FUZZER_CORPUS 582