xref: /freebsd/contrib/llvm-project/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
10b57cec5SDimitry Andric //===- FuzzerDataFlowTrace.cpp - DataFlowTrace                ---*- C++ -* ===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric // fuzzer::DataFlowTrace
90b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
100b57cec5SDimitry Andric 
110b57cec5SDimitry Andric #include "FuzzerDataFlowTrace.h"
120b57cec5SDimitry Andric 
130b57cec5SDimitry Andric #include "FuzzerCommand.h"
140b57cec5SDimitry Andric #include "FuzzerIO.h"
150b57cec5SDimitry Andric #include "FuzzerRandom.h"
160b57cec5SDimitry Andric #include "FuzzerSHA1.h"
170b57cec5SDimitry Andric #include "FuzzerUtil.h"
180b57cec5SDimitry Andric 
190b57cec5SDimitry Andric #include <cstdlib>
200b57cec5SDimitry Andric #include <fstream>
210b57cec5SDimitry Andric #include <numeric>
220b57cec5SDimitry Andric #include <queue>
230b57cec5SDimitry Andric #include <sstream>
240b57cec5SDimitry Andric #include <string>
250b57cec5SDimitry Andric #include <unordered_map>
260b57cec5SDimitry Andric #include <unordered_set>
270b57cec5SDimitry Andric #include <vector>
280b57cec5SDimitry Andric 
290b57cec5SDimitry Andric namespace fuzzer {
300b57cec5SDimitry Andric static const char *kFunctionsTxt = "functions.txt";
310b57cec5SDimitry Andric 
AppendCoverage(const std::string & S)320b57cec5SDimitry Andric bool BlockCoverage::AppendCoverage(const std::string &S) {
330b57cec5SDimitry Andric   std::stringstream SS(S);
340b57cec5SDimitry Andric   return AppendCoverage(SS);
350b57cec5SDimitry Andric }
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric // Coverage lines have this form:
380b57cec5SDimitry Andric // CN X Y Z T
390b57cec5SDimitry Andric // where N is the number of the function, T is the total number of instrumented
40349cc55cSDimitry Andric // BBs, and X,Y,Z, if present, are the indices of covered BB.
410b57cec5SDimitry Andric // BB #0, which is the entry block, is not explicitly listed.
AppendCoverage(std::istream & IN)420b57cec5SDimitry Andric bool BlockCoverage::AppendCoverage(std::istream &IN) {
430b57cec5SDimitry Andric   std::string L;
440b57cec5SDimitry Andric   while (std::getline(IN, L, '\n')) {
450b57cec5SDimitry Andric     if (L.empty())
460b57cec5SDimitry Andric       continue;
470b57cec5SDimitry Andric     std::stringstream SS(L.c_str() + 1);
480b57cec5SDimitry Andric     size_t FunctionId  = 0;
490b57cec5SDimitry Andric     SS >> FunctionId;
500b57cec5SDimitry Andric     if (L[0] == 'F') {
510b57cec5SDimitry Andric       FunctionsWithDFT.insert(FunctionId);
520b57cec5SDimitry Andric       continue;
530b57cec5SDimitry Andric     }
540b57cec5SDimitry Andric     if (L[0] != 'C') continue;
55349cc55cSDimitry Andric     std::vector<uint32_t> CoveredBlocks;
560b57cec5SDimitry Andric     while (true) {
570b57cec5SDimitry Andric       uint32_t BB = 0;
580b57cec5SDimitry Andric       SS >> BB;
590b57cec5SDimitry Andric       if (!SS) break;
600b57cec5SDimitry Andric       CoveredBlocks.push_back(BB);
610b57cec5SDimitry Andric     }
620b57cec5SDimitry Andric     if (CoveredBlocks.empty()) return false;
63fe6060f1SDimitry Andric     // Ensures no CoverageVector is longer than UINT32_MAX.
640b57cec5SDimitry Andric     uint32_t NumBlocks = CoveredBlocks.back();
650b57cec5SDimitry Andric     CoveredBlocks.pop_back();
660b57cec5SDimitry Andric     for (auto BB : CoveredBlocks)
670b57cec5SDimitry Andric       if (BB >= NumBlocks) return false;
680b57cec5SDimitry Andric     auto It = Functions.find(FunctionId);
690b57cec5SDimitry Andric     auto &Counters =
700b57cec5SDimitry Andric         It == Functions.end()
71349cc55cSDimitry Andric             ? Functions.insert({FunctionId, std::vector<uint32_t>(NumBlocks)})
720b57cec5SDimitry Andric                   .first->second
730b57cec5SDimitry Andric             : It->second;
740b57cec5SDimitry Andric 
750b57cec5SDimitry Andric     if (Counters.size() != NumBlocks) return false;  // wrong number of blocks.
760b57cec5SDimitry Andric 
770b57cec5SDimitry Andric     Counters[0]++;
780b57cec5SDimitry Andric     for (auto BB : CoveredBlocks)
790b57cec5SDimitry Andric       Counters[BB]++;
800b57cec5SDimitry Andric   }
810b57cec5SDimitry Andric   return true;
820b57cec5SDimitry Andric }
830b57cec5SDimitry Andric 
840b57cec5SDimitry Andric // Assign weights to each function.
850b57cec5SDimitry Andric // General principles:
860b57cec5SDimitry Andric //   * any uncovered function gets weight 0.
870b57cec5SDimitry Andric //   * a function with lots of uncovered blocks gets bigger weight.
880b57cec5SDimitry Andric //   * a function with a less frequently executed code gets bigger weight.
FunctionWeights(size_t NumFunctions) const89349cc55cSDimitry Andric std::vector<double> BlockCoverage::FunctionWeights(size_t NumFunctions) const {
90349cc55cSDimitry Andric   std::vector<double> Res(NumFunctions);
91*06c3fb27SDimitry Andric   for (const auto &It : Functions) {
920b57cec5SDimitry Andric     auto FunctionID = It.first;
930b57cec5SDimitry Andric     auto Counters = It.second;
940b57cec5SDimitry Andric     assert(FunctionID < NumFunctions);
950b57cec5SDimitry Andric     auto &Weight = Res[FunctionID];
960b57cec5SDimitry Andric     // Give higher weight if the function has a DFT.
970b57cec5SDimitry Andric     Weight = FunctionsWithDFT.count(FunctionID) ? 1000. : 1;
980b57cec5SDimitry Andric     // Give higher weight to functions with less frequently seen basic blocks.
990b57cec5SDimitry Andric     Weight /= SmallestNonZeroCounter(Counters);
1000b57cec5SDimitry Andric     // Give higher weight to functions with the most uncovered basic blocks.
1010b57cec5SDimitry Andric     Weight *= NumberOfUncoveredBlocks(Counters) + 1;
1020b57cec5SDimitry Andric   }
1030b57cec5SDimitry Andric   return Res;
1040b57cec5SDimitry Andric }
1050b57cec5SDimitry Andric 
ReadCoverage(const std::string & DirPath)1060b57cec5SDimitry Andric void DataFlowTrace::ReadCoverage(const std::string &DirPath) {
107349cc55cSDimitry Andric   std::vector<SizedFile> Files;
1080b57cec5SDimitry Andric   GetSizedFilesFromDir(DirPath, &Files);
1090b57cec5SDimitry Andric   for (auto &SF : Files) {
1100b57cec5SDimitry Andric     auto Name = Basename(SF.File);
1110b57cec5SDimitry Andric     if (Name == kFunctionsTxt) continue;
1120b57cec5SDimitry Andric     if (!CorporaHashes.count(Name)) continue;
1130b57cec5SDimitry Andric     std::ifstream IF(SF.File);
1140b57cec5SDimitry Andric     Coverage.AppendCoverage(IF);
1150b57cec5SDimitry Andric   }
1160b57cec5SDimitry Andric }
1170b57cec5SDimitry Andric 
DFTStringAppendToVector(std::vector<uint8_t> * DFT,const std::string & DFTString)118349cc55cSDimitry Andric static void DFTStringAppendToVector(std::vector<uint8_t> *DFT,
1190b57cec5SDimitry Andric                                     const std::string &DFTString) {
1200b57cec5SDimitry Andric   assert(DFT->size() == DFTString.size());
1210b57cec5SDimitry Andric   for (size_t I = 0, Len = DFT->size(); I < Len; I++)
1220b57cec5SDimitry Andric     (*DFT)[I] = DFTString[I] == '1';
1230b57cec5SDimitry Andric }
1240b57cec5SDimitry Andric 
125349cc55cSDimitry Andric // converts a string of '0' and '1' into a std::vector<uint8_t>
DFTStringToVector(const std::string & DFTString)126349cc55cSDimitry Andric static std::vector<uint8_t> DFTStringToVector(const std::string &DFTString) {
127349cc55cSDimitry Andric   std::vector<uint8_t> DFT(DFTString.size());
1280b57cec5SDimitry Andric   DFTStringAppendToVector(&DFT, DFTString);
1290b57cec5SDimitry Andric   return DFT;
1300b57cec5SDimitry Andric }
1310b57cec5SDimitry Andric 
ParseError(const char * Err,const std::string & Line)1320b57cec5SDimitry Andric static bool ParseError(const char *Err, const std::string &Line) {
1330b57cec5SDimitry Andric   Printf("DataFlowTrace: parse error: %s: Line: %s\n", Err, Line.c_str());
1340b57cec5SDimitry Andric   return false;
1350b57cec5SDimitry Andric }
1360b57cec5SDimitry Andric 
1370b57cec5SDimitry Andric // TODO(metzman): replace std::string with std::string_view for
1380b57cec5SDimitry Andric // better performance. Need to figure our how to use string_view on Windows.
ParseDFTLine(const std::string & Line,size_t * FunctionNum,std::string * DFTString)1390b57cec5SDimitry Andric static bool ParseDFTLine(const std::string &Line, size_t *FunctionNum,
1400b57cec5SDimitry Andric                          std::string *DFTString) {
1410b57cec5SDimitry Andric   if (!Line.empty() && Line[0] != 'F')
1420b57cec5SDimitry Andric     return false; // Ignore coverage.
1430b57cec5SDimitry Andric   size_t SpacePos = Line.find(' ');
1440b57cec5SDimitry Andric   if (SpacePos == std::string::npos)
1450b57cec5SDimitry Andric     return ParseError("no space in the trace line", Line);
1460b57cec5SDimitry Andric   if (Line.empty() || Line[0] != 'F')
1470b57cec5SDimitry Andric     return ParseError("the trace line doesn't start with 'F'", Line);
1480b57cec5SDimitry Andric   *FunctionNum = std::atol(Line.c_str() + 1);
1490b57cec5SDimitry Andric   const char *Beg = Line.c_str() + SpacePos + 1;
1500b57cec5SDimitry Andric   const char *End = Line.c_str() + Line.size();
1510b57cec5SDimitry Andric   assert(Beg < End);
1520b57cec5SDimitry Andric   size_t Len = End - Beg;
1530b57cec5SDimitry Andric   for (size_t I = 0; I < Len; I++) {
1540b57cec5SDimitry Andric     if (Beg[I] != '0' && Beg[I] != '1')
1550b57cec5SDimitry Andric       return ParseError("the trace should contain only 0 or 1", Line);
1560b57cec5SDimitry Andric   }
1570b57cec5SDimitry Andric   *DFTString = Beg;
1580b57cec5SDimitry Andric   return true;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric 
Init(const std::string & DirPath,std::string * FocusFunction,std::vector<SizedFile> & CorporaFiles,Random & Rand)1610b57cec5SDimitry Andric bool DataFlowTrace::Init(const std::string &DirPath, std::string *FocusFunction,
162349cc55cSDimitry Andric                          std::vector<SizedFile> &CorporaFiles, Random &Rand) {
1630b57cec5SDimitry Andric   if (DirPath.empty()) return false;
1640b57cec5SDimitry Andric   Printf("INFO: DataFlowTrace: reading from '%s'\n", DirPath.c_str());
165349cc55cSDimitry Andric   std::vector<SizedFile> Files;
1660b57cec5SDimitry Andric   GetSizedFilesFromDir(DirPath, &Files);
1670b57cec5SDimitry Andric   std::string L;
1680b57cec5SDimitry Andric   size_t FocusFuncIdx = SIZE_MAX;
169349cc55cSDimitry Andric   std::vector<std::string> FunctionNames;
1700b57cec5SDimitry Andric 
1710b57cec5SDimitry Andric   // Collect the hashes of the corpus files.
1720b57cec5SDimitry Andric   for (auto &SF : CorporaFiles)
1730b57cec5SDimitry Andric     CorporaHashes.insert(Hash(FileToVector(SF.File)));
1740b57cec5SDimitry Andric 
1750b57cec5SDimitry Andric   // Read functions.txt
1760b57cec5SDimitry Andric   std::ifstream IF(DirPlusFile(DirPath, kFunctionsTxt));
1770b57cec5SDimitry Andric   size_t NumFunctions = 0;
1780b57cec5SDimitry Andric   while (std::getline(IF, L, '\n')) {
1790b57cec5SDimitry Andric     FunctionNames.push_back(L);
1800b57cec5SDimitry Andric     NumFunctions++;
1810b57cec5SDimitry Andric     if (*FocusFunction == L)
1820b57cec5SDimitry Andric       FocusFuncIdx = NumFunctions - 1;
1830b57cec5SDimitry Andric   }
1840b57cec5SDimitry Andric   if (!NumFunctions)
1850b57cec5SDimitry Andric     return false;
1860b57cec5SDimitry Andric 
1870b57cec5SDimitry Andric   if (*FocusFunction == "auto") {
1880b57cec5SDimitry Andric     // AUTOFOCUS works like this:
1890b57cec5SDimitry Andric     // * reads the coverage data from the DFT files.
1900b57cec5SDimitry Andric     // * assigns weights to functions based on coverage.
1910b57cec5SDimitry Andric     // * chooses a random function according to the weights.
1920b57cec5SDimitry Andric     ReadCoverage(DirPath);
1930b57cec5SDimitry Andric     auto Weights = Coverage.FunctionWeights(NumFunctions);
194349cc55cSDimitry Andric     std::vector<double> Intervals(NumFunctions + 1);
1950b57cec5SDimitry Andric     std::iota(Intervals.begin(), Intervals.end(), 0);
1960b57cec5SDimitry Andric     auto Distribution = std::piecewise_constant_distribution<double>(
1970b57cec5SDimitry Andric         Intervals.begin(), Intervals.end(), Weights.begin());
1980b57cec5SDimitry Andric     FocusFuncIdx = static_cast<size_t>(Distribution(Rand));
1990b57cec5SDimitry Andric     *FocusFunction = FunctionNames[FocusFuncIdx];
2000b57cec5SDimitry Andric     assert(FocusFuncIdx < NumFunctions);
2010b57cec5SDimitry Andric     Printf("INFO: AUTOFOCUS: %zd %s\n", FocusFuncIdx,
2020b57cec5SDimitry Andric            FunctionNames[FocusFuncIdx].c_str());
2030b57cec5SDimitry Andric     for (size_t i = 0; i < NumFunctions; i++) {
204fe6060f1SDimitry Andric       if (Weights[i] == 0.0)
205fe6060f1SDimitry Andric         continue;
2060b57cec5SDimitry Andric       Printf("  [%zd] W %g\tBB-tot %u\tBB-cov %u\tEntryFreq %u:\t%s\n", i,
2070b57cec5SDimitry Andric              Weights[i], Coverage.GetNumberOfBlocks(i),
2080b57cec5SDimitry Andric              Coverage.GetNumberOfCoveredBlocks(i), Coverage.GetCounter(i, 0),
2090b57cec5SDimitry Andric              FunctionNames[i].c_str());
2100b57cec5SDimitry Andric     }
2110b57cec5SDimitry Andric   }
2120b57cec5SDimitry Andric 
2130b57cec5SDimitry Andric   if (!NumFunctions || FocusFuncIdx == SIZE_MAX || Files.size() <= 1)
2140b57cec5SDimitry Andric     return false;
2150b57cec5SDimitry Andric 
2160b57cec5SDimitry Andric   // Read traces.
2170b57cec5SDimitry Andric   size_t NumTraceFiles = 0;
2180b57cec5SDimitry Andric   size_t NumTracesWithFocusFunction = 0;
2190b57cec5SDimitry Andric   for (auto &SF : Files) {
2200b57cec5SDimitry Andric     auto Name = Basename(SF.File);
2210b57cec5SDimitry Andric     if (Name == kFunctionsTxt) continue;
2220b57cec5SDimitry Andric     if (!CorporaHashes.count(Name)) continue;  // not in the corpus.
2230b57cec5SDimitry Andric     NumTraceFiles++;
2240b57cec5SDimitry Andric     // Printf("=== %s\n", Name.c_str());
2250b57cec5SDimitry Andric     std::ifstream IF(SF.File);
2260b57cec5SDimitry Andric     while (std::getline(IF, L, '\n')) {
2270b57cec5SDimitry Andric       size_t FunctionNum = 0;
2280b57cec5SDimitry Andric       std::string DFTString;
2290b57cec5SDimitry Andric       if (ParseDFTLine(L, &FunctionNum, &DFTString) &&
2300b57cec5SDimitry Andric           FunctionNum == FocusFuncIdx) {
2310b57cec5SDimitry Andric         NumTracesWithFocusFunction++;
2320b57cec5SDimitry Andric 
2330b57cec5SDimitry Andric         if (FunctionNum >= NumFunctions)
2340b57cec5SDimitry Andric           return ParseError("N is greater than the number of functions", L);
2350b57cec5SDimitry Andric         Traces[Name] = DFTStringToVector(DFTString);
2360b57cec5SDimitry Andric         // Print just a few small traces.
2370b57cec5SDimitry Andric         if (NumTracesWithFocusFunction <= 3 && DFTString.size() <= 16)
2380b57cec5SDimitry Andric           Printf("%s => |%s|\n", Name.c_str(), std::string(DFTString).c_str());
2390b57cec5SDimitry Andric         break; // No need to parse the following lines.
2400b57cec5SDimitry Andric       }
2410b57cec5SDimitry Andric     }
2420b57cec5SDimitry Andric   }
2430b57cec5SDimitry Andric   Printf("INFO: DataFlowTrace: %zd trace files, %zd functions, "
2440b57cec5SDimitry Andric          "%zd traces with focus function\n",
2450b57cec5SDimitry Andric          NumTraceFiles, NumFunctions, NumTracesWithFocusFunction);
2460b57cec5SDimitry Andric   return NumTraceFiles > 0;
2470b57cec5SDimitry Andric }
2480b57cec5SDimitry Andric 
CollectDataFlow(const std::string & DFTBinary,const std::string & DirPath,const std::vector<SizedFile> & CorporaFiles)2490b57cec5SDimitry Andric int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath,
250349cc55cSDimitry Andric                     const std::vector<SizedFile> &CorporaFiles) {
2510b57cec5SDimitry Andric   Printf("INFO: collecting data flow: bin: %s dir: %s files: %zd\n",
2520b57cec5SDimitry Andric          DFTBinary.c_str(), DirPath.c_str(), CorporaFiles.size());
2535ffd83dbSDimitry Andric   if (CorporaFiles.empty()) {
2545ffd83dbSDimitry Andric     Printf("ERROR: can't collect data flow without corpus provided.");
2555ffd83dbSDimitry Andric     return 1;
2565ffd83dbSDimitry Andric   }
2575ffd83dbSDimitry Andric 
258e8d8bef9SDimitry Andric   static char DFSanEnv[] = "DFSAN_OPTIONS=warn_unimplemented=0";
2590b57cec5SDimitry Andric   putenv(DFSanEnv);
2600b57cec5SDimitry Andric   MkDir(DirPath);
2610b57cec5SDimitry Andric   for (auto &F : CorporaFiles) {
2620b57cec5SDimitry Andric     // For every input F we need to collect the data flow and the coverage.
2630b57cec5SDimitry Andric     // Data flow collection may fail if we request too many DFSan tags at once.
2640b57cec5SDimitry Andric     // So, we start from requesting all tags in range [0,Size) and if that fails
2650b57cec5SDimitry Andric     // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
2660b57cec5SDimitry Andric     // Function number => DFT.
2670b57cec5SDimitry Andric     auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
268349cc55cSDimitry Andric     std::unordered_map<size_t, std::vector<uint8_t>> DFTMap;
2690b57cec5SDimitry Andric     std::unordered_set<std::string> Cov;
2700b57cec5SDimitry Andric     Command Cmd;
2710b57cec5SDimitry Andric     Cmd.addArgument(DFTBinary);
2720b57cec5SDimitry Andric     Cmd.addArgument(F.File);
2730b57cec5SDimitry Andric     Cmd.addArgument(OutPath);
2740b57cec5SDimitry Andric     Printf("CMD: %s\n", Cmd.toString().c_str());
2750b57cec5SDimitry Andric     ExecuteCommand(Cmd);
2760b57cec5SDimitry Andric   }
2770b57cec5SDimitry Andric   // Write functions.txt if it's currently empty or doesn't exist.
2780b57cec5SDimitry Andric   auto FunctionsTxtPath = DirPlusFile(DirPath, kFunctionsTxt);
2790b57cec5SDimitry Andric   if (FileToString(FunctionsTxtPath).empty()) {
2800b57cec5SDimitry Andric     Command Cmd;
2810b57cec5SDimitry Andric     Cmd.addArgument(DFTBinary);
2820b57cec5SDimitry Andric     Cmd.setOutputFile(FunctionsTxtPath);
2830b57cec5SDimitry Andric     ExecuteCommand(Cmd);
2840b57cec5SDimitry Andric   }
2850b57cec5SDimitry Andric   return 0;
2860b57cec5SDimitry Andric }
2870b57cec5SDimitry Andric 
2880b57cec5SDimitry Andric }  // namespace fuzzer
289