1*0b57cec5SDimitry Andric //===- FuzzerMerge.h - merging corpa ----------------------------*- C++ -* ===// 2*0b57cec5SDimitry Andric // 3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0b57cec5SDimitry Andric // 7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 8*0b57cec5SDimitry Andric // Merging Corpora. 9*0b57cec5SDimitry Andric // 10*0b57cec5SDimitry Andric // The task: 11*0b57cec5SDimitry Andric // Take the existing corpus (possibly empty) and merge new inputs into 12*0b57cec5SDimitry Andric // it so that only inputs with new coverage ('features') are added. 13*0b57cec5SDimitry Andric // The process should tolerate the crashes, OOMs, leaks, etc. 14*0b57cec5SDimitry Andric // 15*0b57cec5SDimitry Andric // Algorithm: 16*0b57cec5SDimitry Andric // The outter process collects the set of files and writes their names 17*0b57cec5SDimitry Andric // into a temporary "control" file, then repeatedly launches the inner 18*0b57cec5SDimitry Andric // process until all inputs are processed. 19*0b57cec5SDimitry Andric // The outer process does not actually execute the target code. 20*0b57cec5SDimitry Andric // 21*0b57cec5SDimitry Andric // The inner process reads the control file and sees a) list of all the inputs 22*0b57cec5SDimitry Andric // and b) the last processed input. Then it starts processing the inputs one 23*0b57cec5SDimitry Andric // by one. Before processing every input it writes one line to control file: 24*0b57cec5SDimitry Andric // STARTED INPUT_ID INPUT_SIZE 25*0b57cec5SDimitry Andric // After processing an input it write another line: 26*0b57cec5SDimitry Andric // DONE INPUT_ID Feature1 Feature2 Feature3 ... 27*0b57cec5SDimitry Andric // If a crash happens while processing an input the last line in the control 28*0b57cec5SDimitry Andric // file will be "STARTED INPUT_ID" and so the next process will know 29*0b57cec5SDimitry Andric // where to resume. 30*0b57cec5SDimitry Andric // 31*0b57cec5SDimitry Andric // Once all inputs are processed by the innner process(es) the outer process 32*0b57cec5SDimitry Andric // reads the control files and does the merge based entirely on the contents 33*0b57cec5SDimitry Andric // of control file. 34*0b57cec5SDimitry Andric // It uses a single pass greedy algorithm choosing first the smallest inputs 35*0b57cec5SDimitry Andric // within the same size the inputs that have more new features. 36*0b57cec5SDimitry Andric // 37*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 38*0b57cec5SDimitry Andric 39*0b57cec5SDimitry Andric #ifndef LLVM_FUZZER_MERGE_H 40*0b57cec5SDimitry Andric #define LLVM_FUZZER_MERGE_H 41*0b57cec5SDimitry Andric 42*0b57cec5SDimitry Andric #include "FuzzerDefs.h" 43*0b57cec5SDimitry Andric 44*0b57cec5SDimitry Andric #include <istream> 45*0b57cec5SDimitry Andric #include <ostream> 46*0b57cec5SDimitry Andric #include <set> 47*0b57cec5SDimitry Andric #include <vector> 48*0b57cec5SDimitry Andric 49*0b57cec5SDimitry Andric namespace fuzzer { 50*0b57cec5SDimitry Andric 51*0b57cec5SDimitry Andric struct MergeFileInfo { 52*0b57cec5SDimitry Andric std::string Name; 53*0b57cec5SDimitry Andric size_t Size = 0; 54*0b57cec5SDimitry Andric Vector<uint32_t> Features, Cov; 55*0b57cec5SDimitry Andric }; 56*0b57cec5SDimitry Andric 57*0b57cec5SDimitry Andric struct Merger { 58*0b57cec5SDimitry Andric Vector<MergeFileInfo> Files; 59*0b57cec5SDimitry Andric size_t NumFilesInFirstCorpus = 0; 60*0b57cec5SDimitry Andric size_t FirstNotProcessedFile = 0; 61*0b57cec5SDimitry Andric std::string LastFailure; 62*0b57cec5SDimitry Andric 63*0b57cec5SDimitry Andric bool Parse(std::istream &IS, bool ParseCoverage); 64*0b57cec5SDimitry Andric bool Parse(const std::string &Str, bool ParseCoverage); 65*0b57cec5SDimitry Andric void ParseOrExit(std::istream &IS, bool ParseCoverage); 66*0b57cec5SDimitry Andric size_t Merge(const Set<uint32_t> &InitialFeatures, Set<uint32_t> *NewFeatures, 67*0b57cec5SDimitry Andric const Set<uint32_t> &InitialCov, Set<uint32_t> *NewCov, 68*0b57cec5SDimitry Andric Vector<std::string> *NewFiles); 69*0b57cec5SDimitry Andric size_t ApproximateMemoryConsumption() const; 70*0b57cec5SDimitry Andric Set<uint32_t> AllFeatures() const; 71*0b57cec5SDimitry Andric }; 72*0b57cec5SDimitry Andric 73*0b57cec5SDimitry Andric void CrashResistantMerge(const Vector<std::string> &Args, 74*0b57cec5SDimitry Andric const Vector<SizedFile> &OldCorpus, 75*0b57cec5SDimitry Andric const Vector<SizedFile> &NewCorpus, 76*0b57cec5SDimitry Andric Vector<std::string> *NewFiles, 77*0b57cec5SDimitry Andric const Set<uint32_t> &InitialFeatures, 78*0b57cec5SDimitry Andric Set<uint32_t> *NewFeatures, 79*0b57cec5SDimitry Andric const Set<uint32_t> &InitialCov, 80*0b57cec5SDimitry Andric Set<uint32_t> *NewCov, 81*0b57cec5SDimitry Andric const std::string &CFPath, 82*0b57cec5SDimitry Andric bool Verbose); 83*0b57cec5SDimitry Andric 84*0b57cec5SDimitry Andric } // namespace fuzzer 85*0b57cec5SDimitry Andric 86*0b57cec5SDimitry Andric #endif // LLVM_FUZZER_MERGE_H 87