1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <numeric> 22 #include <random> 23 #include <unordered_set> 24 25 namespace fuzzer { 26 27 struct InputInfo { 28 Unit U; // The actual input data. 29 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 30 // Number of features that this input has and no smaller input has. 31 size_t NumFeatures = 0; 32 size_t Tmp = 0; // Used by ValidateFeatureSet. 33 // Stats. 34 size_t NumExecutedMutations = 0; 35 size_t NumSuccessfullMutations = 0; 36 bool MayDeleteFile = false; 37 bool Reduced = false; 38 bool HasFocusFunction = false; 39 Vector<uint32_t> UniqFeatureSet; 40 Vector<uint8_t> DataFlowTraceForFocusFunction; 41 // Power schedule. 42 bool NeedsEnergyUpdate = false; 43 double Energy = 0.0; 44 size_t SumIncidence = 0; 45 Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs; 46 47 // Delete feature Idx and its frequency from FeatureFreqs. 48 bool DeleteFeatureFreq(uint32_t Idx) { 49 if (FeatureFreqs.empty()) 50 return false; 51 52 // Binary search over local feature frequencies sorted by index. 53 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 54 std::pair<uint32_t, uint16_t>(Idx, 0)); 55 56 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 57 FeatureFreqs.erase(Lower); 58 return true; 59 } 60 return false; 61 } 62 63 // Assign more energy to a high-entropy seed, i.e., that reveals more 64 // information about the globally rare features in the neighborhood 65 // of the seed. Since we do not know the entropy of a seed that has 66 // never been executed we assign fresh seeds maximum entropy and 67 // let II->Energy approach the true entropy from above. 68 void UpdateEnergy(size_t GlobalNumberOfFeatures) { 69 Energy = 0.0; 70 SumIncidence = 0; 71 72 // Apply add-one smoothing to locally discovered features. 73 for (auto F : FeatureFreqs) { 74 size_t LocalIncidence = F.second + 1; 75 Energy -= LocalIncidence * logl(LocalIncidence); 76 SumIncidence += LocalIncidence; 77 } 78 79 // Apply add-one smoothing to locally undiscovered features. 80 // PreciseEnergy -= 0; // since logl(1.0) == 0) 81 SumIncidence += (GlobalNumberOfFeatures - FeatureFreqs.size()); 82 83 // Add a single locally abundant feature apply add-one smoothing. 84 size_t AbdIncidence = NumExecutedMutations + 1; 85 Energy -= AbdIncidence * logl(AbdIncidence); 86 SumIncidence += AbdIncidence; 87 88 // Normalize. 89 if (SumIncidence != 0) 90 Energy = (Energy / SumIncidence) + logl(SumIncidence); 91 } 92 93 // Increment the frequency of the feature Idx. 94 void UpdateFeatureFrequency(uint32_t Idx) { 95 NeedsEnergyUpdate = true; 96 97 // The local feature frequencies is an ordered vector of pairs. 98 // If there are no local feature frequencies, push_back preserves order. 99 // Set the feature frequency for feature Idx32 to 1. 100 if (FeatureFreqs.empty()) { 101 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1)); 102 return; 103 } 104 105 // Binary search over local feature frequencies sorted by index. 106 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(), 107 std::pair<uint32_t, uint16_t>(Idx, 0)); 108 109 // If feature Idx32 already exists, increment its frequency. 110 // Otherwise, insert a new pair right after the next lower index. 111 if (Lower != FeatureFreqs.end() && Lower->first == Idx) { 112 Lower->second++; 113 } else { 114 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1)); 115 } 116 } 117 }; 118 119 struct EntropicOptions { 120 bool Enabled; 121 size_t NumberOfRarestFeatures; 122 size_t FeatureFrequencyThreshold; 123 }; 124 125 class InputCorpus { 126 static const uint32_t kFeatureSetSize = 1 << 21; 127 static const uint8_t kMaxMutationFactor = 20; 128 static const size_t kSparseEnergyUpdates = 100; 129 130 size_t NumExecutedMutations = 0; 131 132 EntropicOptions Entropic; 133 134 public: 135 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic) 136 : Entropic(Entropic), OutputCorpus(OutputCorpus) { 137 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 138 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 139 } 140 ~InputCorpus() { 141 for (auto II : Inputs) 142 delete II; 143 } 144 size_t size() const { return Inputs.size(); } 145 size_t SizeInBytes() const { 146 size_t Res = 0; 147 for (auto II : Inputs) 148 Res += II->U.size(); 149 return Res; 150 } 151 size_t NumActiveUnits() const { 152 size_t Res = 0; 153 for (auto II : Inputs) 154 Res += !II->U.empty(); 155 return Res; 156 } 157 size_t MaxInputSize() const { 158 size_t Res = 0; 159 for (auto II : Inputs) 160 Res = std::max(Res, II->U.size()); 161 return Res; 162 } 163 void IncrementNumExecutedMutations() { NumExecutedMutations++; } 164 165 size_t NumInputsThatTouchFocusFunction() { 166 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 167 return II->HasFocusFunction; 168 }); 169 } 170 171 size_t NumInputsWithDataFlowTrace() { 172 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 173 return !II->DataFlowTraceForFocusFunction.empty(); 174 }); 175 } 176 177 bool empty() const { return Inputs.empty(); } 178 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 179 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 180 bool HasFocusFunction, 181 const Vector<uint32_t> &FeatureSet, 182 const DataFlowTrace &DFT, const InputInfo *BaseII) { 183 assert(!U.empty()); 184 if (FeatureDebug) 185 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 186 Inputs.push_back(new InputInfo()); 187 InputInfo &II = *Inputs.back(); 188 II.U = U; 189 II.NumFeatures = NumFeatures; 190 II.MayDeleteFile = MayDeleteFile; 191 II.UniqFeatureSet = FeatureSet; 192 II.HasFocusFunction = HasFocusFunction; 193 // Assign maximal energy to the new seed. 194 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size()); 195 II.SumIncidence = RareFeatures.size(); 196 II.NeedsEnergyUpdate = false; 197 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 198 ComputeSHA1(U.data(), U.size(), II.Sha1); 199 auto Sha1Str = Sha1ToString(II.Sha1); 200 Hashes.insert(Sha1Str); 201 if (HasFocusFunction) 202 if (auto V = DFT.Get(Sha1Str)) 203 II.DataFlowTraceForFocusFunction = *V; 204 // This is a gross heuristic. 205 // Ideally, when we add an element to a corpus we need to know its DFT. 206 // But if we don't, we'll use the DFT of its base input. 207 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 208 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 209 DistributionNeedsUpdate = true; 210 PrintCorpus(); 211 // ValidateFeatureSet(); 212 return &II; 213 } 214 215 // Debug-only 216 void PrintUnit(const Unit &U) { 217 if (!FeatureDebug) return; 218 for (uint8_t C : U) { 219 if (C != 'F' && C != 'U' && C != 'Z') 220 C = '.'; 221 Printf("%c", C); 222 } 223 } 224 225 // Debug-only 226 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 227 if (!FeatureDebug) return; 228 Printf("{"); 229 for (uint32_t Feature: FeatureSet) 230 Printf("%u,", Feature); 231 Printf("}"); 232 } 233 234 // Debug-only 235 void PrintCorpus() { 236 if (!FeatureDebug) return; 237 Printf("======= CORPUS:\n"); 238 int i = 0; 239 for (auto II : Inputs) { 240 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 241 Printf("[%2d] ", i); 242 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 243 PrintUnit(II->U); 244 Printf(" "); 245 PrintFeatureSet(II->UniqFeatureSet); 246 Printf("\n"); 247 } 248 i++; 249 } 250 } 251 252 void Replace(InputInfo *II, const Unit &U) { 253 assert(II->U.size() > U.size()); 254 Hashes.erase(Sha1ToString(II->Sha1)); 255 DeleteFile(*II); 256 ComputeSHA1(U.data(), U.size(), II->Sha1); 257 Hashes.insert(Sha1ToString(II->Sha1)); 258 II->U = U; 259 II->Reduced = true; 260 DistributionNeedsUpdate = true; 261 } 262 263 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 264 bool HasUnit(const std::string &H) { return Hashes.count(H); } 265 InputInfo &ChooseUnitToMutate(Random &Rand) { 266 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 267 assert(!II.U.empty()); 268 return II; 269 } 270 271 // Returns an index of random unit from the corpus to mutate. 272 size_t ChooseUnitIdxToMutate(Random &Rand) { 273 UpdateCorpusDistribution(Rand); 274 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 275 assert(Idx < Inputs.size()); 276 return Idx; 277 } 278 279 void PrintStats() { 280 for (size_t i = 0; i < Inputs.size(); i++) { 281 const auto &II = *Inputs[i]; 282 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 283 Sha1ToString(II.Sha1).c_str(), II.U.size(), 284 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction); 285 } 286 } 287 288 void PrintFeatureSet() { 289 for (size_t i = 0; i < kFeatureSetSize; i++) { 290 if(size_t Sz = GetFeature(i)) 291 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 292 } 293 Printf("\n\t"); 294 for (size_t i = 0; i < Inputs.size(); i++) 295 if (size_t N = Inputs[i]->NumFeatures) 296 Printf(" %zd=>%zd ", i, N); 297 Printf("\n"); 298 } 299 300 void DeleteFile(const InputInfo &II) { 301 if (!OutputCorpus.empty() && II.MayDeleteFile) 302 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 303 } 304 305 void DeleteInput(size_t Idx) { 306 InputInfo &II = *Inputs[Idx]; 307 DeleteFile(II); 308 Unit().swap(II.U); 309 II.Energy = 0.0; 310 II.NeedsEnergyUpdate = false; 311 DistributionNeedsUpdate = true; 312 if (FeatureDebug) 313 Printf("EVICTED %zd\n", Idx); 314 } 315 316 void AddRareFeature(uint32_t Idx) { 317 // Maintain *at least* TopXRarestFeatures many rare features 318 // and all features with a frequency below ConsideredRare. 319 // Remove all other features. 320 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures && 321 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) { 322 323 // Find most and second most abbundant feature. 324 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0], 325 RareFeatures[0]}; 326 size_t Delete = 0; 327 for (size_t i = 0; i < RareFeatures.size(); i++) { 328 uint32_t Idx2 = RareFeatures[i]; 329 if (GlobalFeatureFreqs[Idx2] >= 330 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) { 331 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0]; 332 MostAbundantRareFeatureIndices[0] = Idx2; 333 Delete = i; 334 } 335 } 336 337 // Remove most abundant rare feature. 338 RareFeatures[Delete] = RareFeatures.back(); 339 RareFeatures.pop_back(); 340 341 for (auto II : Inputs) { 342 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0])) 343 II->NeedsEnergyUpdate = true; 344 } 345 346 // Set 2nd most abundant as the new most abundant feature count. 347 FreqOfMostAbundantRareFeature = 348 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]]; 349 } 350 351 // Add rare feature, handle collisions, and update energy. 352 RareFeatures.push_back(Idx); 353 GlobalFeatureFreqs[Idx] = 0; 354 for (auto II : Inputs) { 355 II->DeleteFeatureFreq(Idx); 356 357 // Apply add-one smoothing to this locally undiscovered feature. 358 // Zero energy seeds will never be fuzzed and remain zero energy. 359 if (II->Energy > 0.0) { 360 II->SumIncidence += 1; 361 II->Energy += logl(II->SumIncidence) / II->SumIncidence; 362 } 363 } 364 365 DistributionNeedsUpdate = true; 366 } 367 368 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 369 assert(NewSize); 370 Idx = Idx % kFeatureSetSize; 371 uint32_t OldSize = GetFeature(Idx); 372 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 373 if (OldSize > 0) { 374 size_t OldIdx = SmallestElementPerFeature[Idx]; 375 InputInfo &II = *Inputs[OldIdx]; 376 assert(II.NumFeatures > 0); 377 II.NumFeatures--; 378 if (II.NumFeatures == 0) 379 DeleteInput(OldIdx); 380 } else { 381 NumAddedFeatures++; 382 if (Entropic.Enabled) 383 AddRareFeature((uint32_t)Idx); 384 } 385 NumUpdatedFeatures++; 386 if (FeatureDebug) 387 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 388 SmallestElementPerFeature[Idx] = Inputs.size(); 389 InputSizesPerFeature[Idx] = NewSize; 390 return true; 391 } 392 return false; 393 } 394 395 // Increment frequency of feature Idx globally and locally. 396 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) { 397 uint32_t Idx32 = Idx % kFeatureSetSize; 398 399 // Saturated increment. 400 if (GlobalFeatureFreqs[Idx32] == 0xFFFF) 401 return; 402 uint16_t Freq = GlobalFeatureFreqs[Idx32]++; 403 404 // Skip if abundant. 405 if (Freq > FreqOfMostAbundantRareFeature || 406 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) == 407 RareFeatures.end()) 408 return; 409 410 // Update global frequencies. 411 if (Freq == FreqOfMostAbundantRareFeature) 412 FreqOfMostAbundantRareFeature++; 413 414 // Update local frequencies. 415 if (II) 416 II->UpdateFeatureFrequency(Idx32); 417 } 418 419 size_t NumFeatures() const { return NumAddedFeatures; } 420 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 421 422 private: 423 424 static const bool FeatureDebug = false; 425 426 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 427 428 void ValidateFeatureSet() { 429 if (FeatureDebug) 430 PrintFeatureSet(); 431 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 432 if (GetFeature(Idx)) 433 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 434 for (auto II: Inputs) { 435 if (II->Tmp != II->NumFeatures) 436 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 437 assert(II->Tmp == II->NumFeatures); 438 II->Tmp = 0; 439 } 440 } 441 442 // Updates the probability distribution for the units in the corpus. 443 // Must be called whenever the corpus or unit weights are changed. 444 // 445 // Hypothesis: inputs that maximize information about globally rare features 446 // are interesting. 447 void UpdateCorpusDistribution(Random &Rand) { 448 // Skip update if no seeds or rare features were added/deleted. 449 // Sparse updates for local change of feature frequencies, 450 // i.e., randomly do not skip. 451 if (!DistributionNeedsUpdate && 452 (!Entropic.Enabled || Rand(kSparseEnergyUpdates))) 453 return; 454 455 DistributionNeedsUpdate = false; 456 457 size_t N = Inputs.size(); 458 assert(N); 459 Intervals.resize(N + 1); 460 Weights.resize(N); 461 std::iota(Intervals.begin(), Intervals.end(), 0); 462 463 bool VanillaSchedule = true; 464 if (Entropic.Enabled) { 465 for (auto II : Inputs) { 466 if (II->NeedsEnergyUpdate && II->Energy != 0.0) { 467 II->NeedsEnergyUpdate = false; 468 II->UpdateEnergy(RareFeatures.size()); 469 } 470 } 471 472 for (size_t i = 0; i < N; i++) { 473 474 if (Inputs[i]->NumFeatures == 0) { 475 // If the seed doesn't represent any features, assign zero energy. 476 Weights[i] = 0.; 477 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor > 478 NumExecutedMutations / Inputs.size()) { 479 // If the seed was fuzzed a lot more than average, assign zero energy. 480 Weights[i] = 0.; 481 } else { 482 // Otherwise, simply assign the computed energy. 483 Weights[i] = Inputs[i]->Energy; 484 } 485 486 // If energy for all seeds is zero, fall back to vanilla schedule. 487 if (Weights[i] > 0.0) 488 VanillaSchedule = false; 489 } 490 } 491 492 if (VanillaSchedule) { 493 for (size_t i = 0; i < N; i++) 494 Weights[i] = Inputs[i]->NumFeatures 495 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1) 496 : 0.; 497 } 498 499 if (FeatureDebug) { 500 for (size_t i = 0; i < N; i++) 501 Printf("%zd ", Inputs[i]->NumFeatures); 502 Printf("SCORE\n"); 503 for (size_t i = 0; i < N; i++) 504 Printf("%f ", Weights[i]); 505 Printf("Weights\n"); 506 } 507 CorpusDistribution = std::piecewise_constant_distribution<double>( 508 Intervals.begin(), Intervals.end(), Weights.begin()); 509 } 510 std::piecewise_constant_distribution<double> CorpusDistribution; 511 512 Vector<double> Intervals; 513 Vector<double> Weights; 514 515 std::unordered_set<std::string> Hashes; 516 Vector<InputInfo*> Inputs; 517 518 size_t NumAddedFeatures = 0; 519 size_t NumUpdatedFeatures = 0; 520 uint32_t InputSizesPerFeature[kFeatureSetSize]; 521 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 522 523 bool DistributionNeedsUpdate = true; 524 uint16_t FreqOfMostAbundantRareFeature = 0; 525 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {}; 526 Vector<uint32_t> RareFeatures; 527 528 std::string OutputCorpus; 529 }; 530 531 } // namespace fuzzer 532 533 #endif // LLVM_FUZZER_CORPUS 534