1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // fuzzer::InputCorpus 9 //===----------------------------------------------------------------------===// 10 11 #ifndef LLVM_FUZZER_CORPUS 12 #define LLVM_FUZZER_CORPUS 13 14 #include "FuzzerDataFlowTrace.h" 15 #include "FuzzerDefs.h" 16 #include "FuzzerIO.h" 17 #include "FuzzerRandom.h" 18 #include "FuzzerSHA1.h" 19 #include "FuzzerTracePC.h" 20 #include <algorithm> 21 #include <numeric> 22 #include <random> 23 #include <unordered_set> 24 25 namespace fuzzer { 26 27 struct InputInfo { 28 Unit U; // The actual input data. 29 uint8_t Sha1[kSHA1NumBytes]; // Checksum. 30 // Number of features that this input has and no smaller input has. 31 size_t NumFeatures = 0; 32 size_t Tmp = 0; // Used by ValidateFeatureSet. 33 // Stats. 34 size_t NumExecutedMutations = 0; 35 size_t NumSuccessfullMutations = 0; 36 bool MayDeleteFile = false; 37 bool Reduced = false; 38 bool HasFocusFunction = false; 39 Vector<uint32_t> UniqFeatureSet; 40 Vector<uint8_t> DataFlowTraceForFocusFunction; 41 }; 42 43 class InputCorpus { 44 static const size_t kFeatureSetSize = 1 << 21; 45 public: 46 InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) { 47 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature)); 48 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature)); 49 } 50 ~InputCorpus() { 51 for (auto II : Inputs) 52 delete II; 53 } 54 size_t size() const { return Inputs.size(); } 55 size_t SizeInBytes() const { 56 size_t Res = 0; 57 for (auto II : Inputs) 58 Res += II->U.size(); 59 return Res; 60 } 61 size_t NumActiveUnits() const { 62 size_t Res = 0; 63 for (auto II : Inputs) 64 Res += !II->U.empty(); 65 return Res; 66 } 67 size_t MaxInputSize() const { 68 size_t Res = 0; 69 for (auto II : Inputs) 70 Res = std::max(Res, II->U.size()); 71 return Res; 72 } 73 74 size_t NumInputsThatTouchFocusFunction() { 75 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 76 return II->HasFocusFunction; 77 }); 78 } 79 80 size_t NumInputsWithDataFlowTrace() { 81 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) { 82 return !II->DataFlowTraceForFocusFunction.empty(); 83 }); 84 } 85 86 bool empty() const { return Inputs.empty(); } 87 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; } 88 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile, 89 bool HasFocusFunction, 90 const Vector<uint32_t> &FeatureSet, 91 const DataFlowTrace &DFT, const InputInfo *BaseII) { 92 assert(!U.empty()); 93 if (FeatureDebug) 94 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures); 95 Inputs.push_back(new InputInfo()); 96 InputInfo &II = *Inputs.back(); 97 II.U = U; 98 II.NumFeatures = NumFeatures; 99 II.MayDeleteFile = MayDeleteFile; 100 II.UniqFeatureSet = FeatureSet; 101 II.HasFocusFunction = HasFocusFunction; 102 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); 103 ComputeSHA1(U.data(), U.size(), II.Sha1); 104 auto Sha1Str = Sha1ToString(II.Sha1); 105 Hashes.insert(Sha1Str); 106 if (HasFocusFunction) 107 if (auto V = DFT.Get(Sha1Str)) 108 II.DataFlowTraceForFocusFunction = *V; 109 // This is a gross heuristic. 110 // Ideally, when we add an element to a corpus we need to know its DFT. 111 // But if we don't, we'll use the DFT of its base input. 112 if (II.DataFlowTraceForFocusFunction.empty() && BaseII) 113 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction; 114 UpdateCorpusDistribution(); 115 PrintCorpus(); 116 // ValidateFeatureSet(); 117 return &II; 118 } 119 120 // Debug-only 121 void PrintUnit(const Unit &U) { 122 if (!FeatureDebug) return; 123 for (uint8_t C : U) { 124 if (C != 'F' && C != 'U' && C != 'Z') 125 C = '.'; 126 Printf("%c", C); 127 } 128 } 129 130 // Debug-only 131 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) { 132 if (!FeatureDebug) return; 133 Printf("{"); 134 for (uint32_t Feature: FeatureSet) 135 Printf("%u,", Feature); 136 Printf("}"); 137 } 138 139 // Debug-only 140 void PrintCorpus() { 141 if (!FeatureDebug) return; 142 Printf("======= CORPUS:\n"); 143 int i = 0; 144 for (auto II : Inputs) { 145 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) { 146 Printf("[%2d] ", i); 147 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); 148 PrintUnit(II->U); 149 Printf(" "); 150 PrintFeatureSet(II->UniqFeatureSet); 151 Printf("\n"); 152 } 153 i++; 154 } 155 } 156 157 void Replace(InputInfo *II, const Unit &U) { 158 assert(II->U.size() > U.size()); 159 Hashes.erase(Sha1ToString(II->Sha1)); 160 DeleteFile(*II); 161 ComputeSHA1(U.data(), U.size(), II->Sha1); 162 Hashes.insert(Sha1ToString(II->Sha1)); 163 II->U = U; 164 II->Reduced = true; 165 UpdateCorpusDistribution(); 166 } 167 168 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } 169 bool HasUnit(const std::string &H) { return Hashes.count(H); } 170 InputInfo &ChooseUnitToMutate(Random &Rand) { 171 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)]; 172 assert(!II.U.empty()); 173 return II; 174 } 175 176 // Returns an index of random unit from the corpus to mutate. 177 size_t ChooseUnitIdxToMutate(Random &Rand) { 178 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand)); 179 assert(Idx < Inputs.size()); 180 return Idx; 181 } 182 183 void PrintStats() { 184 for (size_t i = 0; i < Inputs.size(); i++) { 185 const auto &II = *Inputs[i]; 186 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i, 187 Sha1ToString(II.Sha1).c_str(), II.U.size(), 188 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction); 189 } 190 } 191 192 void PrintFeatureSet() { 193 for (size_t i = 0; i < kFeatureSetSize; i++) { 194 if(size_t Sz = GetFeature(i)) 195 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz); 196 } 197 Printf("\n\t"); 198 for (size_t i = 0; i < Inputs.size(); i++) 199 if (size_t N = Inputs[i]->NumFeatures) 200 Printf(" %zd=>%zd ", i, N); 201 Printf("\n"); 202 } 203 204 void DeleteFile(const InputInfo &II) { 205 if (!OutputCorpus.empty() && II.MayDeleteFile) 206 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1))); 207 } 208 209 void DeleteInput(size_t Idx) { 210 InputInfo &II = *Inputs[Idx]; 211 DeleteFile(II); 212 Unit().swap(II.U); 213 if (FeatureDebug) 214 Printf("EVICTED %zd\n", Idx); 215 } 216 217 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { 218 assert(NewSize); 219 Idx = Idx % kFeatureSetSize; 220 uint32_t OldSize = GetFeature(Idx); 221 if (OldSize == 0 || (Shrink && OldSize > NewSize)) { 222 if (OldSize > 0) { 223 size_t OldIdx = SmallestElementPerFeature[Idx]; 224 InputInfo &II = *Inputs[OldIdx]; 225 assert(II.NumFeatures > 0); 226 II.NumFeatures--; 227 if (II.NumFeatures == 0) 228 DeleteInput(OldIdx); 229 } else { 230 NumAddedFeatures++; 231 } 232 NumUpdatedFeatures++; 233 if (FeatureDebug) 234 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); 235 SmallestElementPerFeature[Idx] = Inputs.size(); 236 InputSizesPerFeature[Idx] = NewSize; 237 return true; 238 } 239 return false; 240 } 241 242 size_t NumFeatures() const { return NumAddedFeatures; } 243 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; } 244 245 private: 246 247 static const bool FeatureDebug = false; 248 249 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } 250 251 void ValidateFeatureSet() { 252 if (FeatureDebug) 253 PrintFeatureSet(); 254 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) 255 if (GetFeature(Idx)) 256 Inputs[SmallestElementPerFeature[Idx]]->Tmp++; 257 for (auto II: Inputs) { 258 if (II->Tmp != II->NumFeatures) 259 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures); 260 assert(II->Tmp == II->NumFeatures); 261 II->Tmp = 0; 262 } 263 } 264 265 // Updates the probability distribution for the units in the corpus. 266 // Must be called whenever the corpus or unit weights are changed. 267 // 268 // Hypothesis: units added to the corpus last are more interesting. 269 // 270 // Hypothesis: inputs with infrequent features are more interesting. 271 void UpdateCorpusDistribution() { 272 size_t N = Inputs.size(); 273 assert(N); 274 Intervals.resize(N + 1); 275 Weights.resize(N); 276 std::iota(Intervals.begin(), Intervals.end(), 0); 277 for (size_t i = 0; i < N; i++) 278 Weights[i] = Inputs[i]->NumFeatures 279 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1) 280 : 0.; 281 if (FeatureDebug) { 282 for (size_t i = 0; i < N; i++) 283 Printf("%zd ", Inputs[i]->NumFeatures); 284 Printf("SCORE\n"); 285 for (size_t i = 0; i < N; i++) 286 Printf("%f ", Weights[i]); 287 Printf("Weights\n"); 288 } 289 CorpusDistribution = std::piecewise_constant_distribution<double>( 290 Intervals.begin(), Intervals.end(), Weights.begin()); 291 } 292 std::piecewise_constant_distribution<double> CorpusDistribution; 293 294 Vector<double> Intervals; 295 Vector<double> Weights; 296 297 std::unordered_set<std::string> Hashes; 298 Vector<InputInfo*> Inputs; 299 300 size_t NumAddedFeatures = 0; 301 size_t NumUpdatedFeatures = 0; 302 uint32_t InputSizesPerFeature[kFeatureSetSize]; 303 uint32_t SmallestElementPerFeature[kFeatureSetSize]; 304 305 std::string OutputCorpus; 306 }; 307 308 } // namespace fuzzer 309 310 #endif // LLVM_FUZZER_CORPUS 311