10b57cec5SDimitry Andric //===- FuzzerDataFlowTrace.cpp - DataFlowTrace ---*- C++ -* ===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric // fuzzer::DataFlowTrace
90b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
100b57cec5SDimitry Andric
110b57cec5SDimitry Andric #include "FuzzerDataFlowTrace.h"
120b57cec5SDimitry Andric
130b57cec5SDimitry Andric #include "FuzzerCommand.h"
140b57cec5SDimitry Andric #include "FuzzerIO.h"
150b57cec5SDimitry Andric #include "FuzzerRandom.h"
160b57cec5SDimitry Andric #include "FuzzerSHA1.h"
170b57cec5SDimitry Andric #include "FuzzerUtil.h"
180b57cec5SDimitry Andric
190b57cec5SDimitry Andric #include <cstdlib>
200b57cec5SDimitry Andric #include <fstream>
210b57cec5SDimitry Andric #include <numeric>
220b57cec5SDimitry Andric #include <queue>
230b57cec5SDimitry Andric #include <sstream>
240b57cec5SDimitry Andric #include <string>
250b57cec5SDimitry Andric #include <unordered_map>
260b57cec5SDimitry Andric #include <unordered_set>
270b57cec5SDimitry Andric #include <vector>
280b57cec5SDimitry Andric
290b57cec5SDimitry Andric namespace fuzzer {
300b57cec5SDimitry Andric static const char *kFunctionsTxt = "functions.txt";
310b57cec5SDimitry Andric
AppendCoverage(const std::string & S)320b57cec5SDimitry Andric bool BlockCoverage::AppendCoverage(const std::string &S) {
330b57cec5SDimitry Andric std::stringstream SS(S);
340b57cec5SDimitry Andric return AppendCoverage(SS);
350b57cec5SDimitry Andric }
360b57cec5SDimitry Andric
370b57cec5SDimitry Andric // Coverage lines have this form:
380b57cec5SDimitry Andric // CN X Y Z T
390b57cec5SDimitry Andric // where N is the number of the function, T is the total number of instrumented
40349cc55cSDimitry Andric // BBs, and X,Y,Z, if present, are the indices of covered BB.
410b57cec5SDimitry Andric // BB #0, which is the entry block, is not explicitly listed.
AppendCoverage(std::istream & IN)420b57cec5SDimitry Andric bool BlockCoverage::AppendCoverage(std::istream &IN) {
430b57cec5SDimitry Andric std::string L;
440b57cec5SDimitry Andric while (std::getline(IN, L, '\n')) {
450b57cec5SDimitry Andric if (L.empty())
460b57cec5SDimitry Andric continue;
470b57cec5SDimitry Andric std::stringstream SS(L.c_str() + 1);
480b57cec5SDimitry Andric size_t FunctionId = 0;
490b57cec5SDimitry Andric SS >> FunctionId;
500b57cec5SDimitry Andric if (L[0] == 'F') {
510b57cec5SDimitry Andric FunctionsWithDFT.insert(FunctionId);
520b57cec5SDimitry Andric continue;
530b57cec5SDimitry Andric }
540b57cec5SDimitry Andric if (L[0] != 'C') continue;
55349cc55cSDimitry Andric std::vector<uint32_t> CoveredBlocks;
560b57cec5SDimitry Andric while (true) {
570b57cec5SDimitry Andric uint32_t BB = 0;
580b57cec5SDimitry Andric SS >> BB;
590b57cec5SDimitry Andric if (!SS) break;
600b57cec5SDimitry Andric CoveredBlocks.push_back(BB);
610b57cec5SDimitry Andric }
620b57cec5SDimitry Andric if (CoveredBlocks.empty()) return false;
63fe6060f1SDimitry Andric // Ensures no CoverageVector is longer than UINT32_MAX.
640b57cec5SDimitry Andric uint32_t NumBlocks = CoveredBlocks.back();
650b57cec5SDimitry Andric CoveredBlocks.pop_back();
660b57cec5SDimitry Andric for (auto BB : CoveredBlocks)
670b57cec5SDimitry Andric if (BB >= NumBlocks) return false;
680b57cec5SDimitry Andric auto It = Functions.find(FunctionId);
690b57cec5SDimitry Andric auto &Counters =
700b57cec5SDimitry Andric It == Functions.end()
71349cc55cSDimitry Andric ? Functions.insert({FunctionId, std::vector<uint32_t>(NumBlocks)})
720b57cec5SDimitry Andric .first->second
730b57cec5SDimitry Andric : It->second;
740b57cec5SDimitry Andric
750b57cec5SDimitry Andric if (Counters.size() != NumBlocks) return false; // wrong number of blocks.
760b57cec5SDimitry Andric
770b57cec5SDimitry Andric Counters[0]++;
780b57cec5SDimitry Andric for (auto BB : CoveredBlocks)
790b57cec5SDimitry Andric Counters[BB]++;
800b57cec5SDimitry Andric }
810b57cec5SDimitry Andric return true;
820b57cec5SDimitry Andric }
830b57cec5SDimitry Andric
840b57cec5SDimitry Andric // Assign weights to each function.
850b57cec5SDimitry Andric // General principles:
860b57cec5SDimitry Andric // * any uncovered function gets weight 0.
870b57cec5SDimitry Andric // * a function with lots of uncovered blocks gets bigger weight.
880b57cec5SDimitry Andric // * a function with a less frequently executed code gets bigger weight.
FunctionWeights(size_t NumFunctions) const89349cc55cSDimitry Andric std::vector<double> BlockCoverage::FunctionWeights(size_t NumFunctions) const {
90349cc55cSDimitry Andric std::vector<double> Res(NumFunctions);
91*06c3fb27SDimitry Andric for (const auto &It : Functions) {
920b57cec5SDimitry Andric auto FunctionID = It.first;
930b57cec5SDimitry Andric auto Counters = It.second;
940b57cec5SDimitry Andric assert(FunctionID < NumFunctions);
950b57cec5SDimitry Andric auto &Weight = Res[FunctionID];
960b57cec5SDimitry Andric // Give higher weight if the function has a DFT.
970b57cec5SDimitry Andric Weight = FunctionsWithDFT.count(FunctionID) ? 1000. : 1;
980b57cec5SDimitry Andric // Give higher weight to functions with less frequently seen basic blocks.
990b57cec5SDimitry Andric Weight /= SmallestNonZeroCounter(Counters);
1000b57cec5SDimitry Andric // Give higher weight to functions with the most uncovered basic blocks.
1010b57cec5SDimitry Andric Weight *= NumberOfUncoveredBlocks(Counters) + 1;
1020b57cec5SDimitry Andric }
1030b57cec5SDimitry Andric return Res;
1040b57cec5SDimitry Andric }
1050b57cec5SDimitry Andric
ReadCoverage(const std::string & DirPath)1060b57cec5SDimitry Andric void DataFlowTrace::ReadCoverage(const std::string &DirPath) {
107349cc55cSDimitry Andric std::vector<SizedFile> Files;
1080b57cec5SDimitry Andric GetSizedFilesFromDir(DirPath, &Files);
1090b57cec5SDimitry Andric for (auto &SF : Files) {
1100b57cec5SDimitry Andric auto Name = Basename(SF.File);
1110b57cec5SDimitry Andric if (Name == kFunctionsTxt) continue;
1120b57cec5SDimitry Andric if (!CorporaHashes.count(Name)) continue;
1130b57cec5SDimitry Andric std::ifstream IF(SF.File);
1140b57cec5SDimitry Andric Coverage.AppendCoverage(IF);
1150b57cec5SDimitry Andric }
1160b57cec5SDimitry Andric }
1170b57cec5SDimitry Andric
DFTStringAppendToVector(std::vector<uint8_t> * DFT,const std::string & DFTString)118349cc55cSDimitry Andric static void DFTStringAppendToVector(std::vector<uint8_t> *DFT,
1190b57cec5SDimitry Andric const std::string &DFTString) {
1200b57cec5SDimitry Andric assert(DFT->size() == DFTString.size());
1210b57cec5SDimitry Andric for (size_t I = 0, Len = DFT->size(); I < Len; I++)
1220b57cec5SDimitry Andric (*DFT)[I] = DFTString[I] == '1';
1230b57cec5SDimitry Andric }
1240b57cec5SDimitry Andric
125349cc55cSDimitry Andric // converts a string of '0' and '1' into a std::vector<uint8_t>
DFTStringToVector(const std::string & DFTString)126349cc55cSDimitry Andric static std::vector<uint8_t> DFTStringToVector(const std::string &DFTString) {
127349cc55cSDimitry Andric std::vector<uint8_t> DFT(DFTString.size());
1280b57cec5SDimitry Andric DFTStringAppendToVector(&DFT, DFTString);
1290b57cec5SDimitry Andric return DFT;
1300b57cec5SDimitry Andric }
1310b57cec5SDimitry Andric
ParseError(const char * Err,const std::string & Line)1320b57cec5SDimitry Andric static bool ParseError(const char *Err, const std::string &Line) {
1330b57cec5SDimitry Andric Printf("DataFlowTrace: parse error: %s: Line: %s\n", Err, Line.c_str());
1340b57cec5SDimitry Andric return false;
1350b57cec5SDimitry Andric }
1360b57cec5SDimitry Andric
1370b57cec5SDimitry Andric // TODO(metzman): replace std::string with std::string_view for
1380b57cec5SDimitry Andric // better performance. Need to figure our how to use string_view on Windows.
ParseDFTLine(const std::string & Line,size_t * FunctionNum,std::string * DFTString)1390b57cec5SDimitry Andric static bool ParseDFTLine(const std::string &Line, size_t *FunctionNum,
1400b57cec5SDimitry Andric std::string *DFTString) {
1410b57cec5SDimitry Andric if (!Line.empty() && Line[0] != 'F')
1420b57cec5SDimitry Andric return false; // Ignore coverage.
1430b57cec5SDimitry Andric size_t SpacePos = Line.find(' ');
1440b57cec5SDimitry Andric if (SpacePos == std::string::npos)
1450b57cec5SDimitry Andric return ParseError("no space in the trace line", Line);
1460b57cec5SDimitry Andric if (Line.empty() || Line[0] != 'F')
1470b57cec5SDimitry Andric return ParseError("the trace line doesn't start with 'F'", Line);
1480b57cec5SDimitry Andric *FunctionNum = std::atol(Line.c_str() + 1);
1490b57cec5SDimitry Andric const char *Beg = Line.c_str() + SpacePos + 1;
1500b57cec5SDimitry Andric const char *End = Line.c_str() + Line.size();
1510b57cec5SDimitry Andric assert(Beg < End);
1520b57cec5SDimitry Andric size_t Len = End - Beg;
1530b57cec5SDimitry Andric for (size_t I = 0; I < Len; I++) {
1540b57cec5SDimitry Andric if (Beg[I] != '0' && Beg[I] != '1')
1550b57cec5SDimitry Andric return ParseError("the trace should contain only 0 or 1", Line);
1560b57cec5SDimitry Andric }
1570b57cec5SDimitry Andric *DFTString = Beg;
1580b57cec5SDimitry Andric return true;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric
Init(const std::string & DirPath,std::string * FocusFunction,std::vector<SizedFile> & CorporaFiles,Random & Rand)1610b57cec5SDimitry Andric bool DataFlowTrace::Init(const std::string &DirPath, std::string *FocusFunction,
162349cc55cSDimitry Andric std::vector<SizedFile> &CorporaFiles, Random &Rand) {
1630b57cec5SDimitry Andric if (DirPath.empty()) return false;
1640b57cec5SDimitry Andric Printf("INFO: DataFlowTrace: reading from '%s'\n", DirPath.c_str());
165349cc55cSDimitry Andric std::vector<SizedFile> Files;
1660b57cec5SDimitry Andric GetSizedFilesFromDir(DirPath, &Files);
1670b57cec5SDimitry Andric std::string L;
1680b57cec5SDimitry Andric size_t FocusFuncIdx = SIZE_MAX;
169349cc55cSDimitry Andric std::vector<std::string> FunctionNames;
1700b57cec5SDimitry Andric
1710b57cec5SDimitry Andric // Collect the hashes of the corpus files.
1720b57cec5SDimitry Andric for (auto &SF : CorporaFiles)
1730b57cec5SDimitry Andric CorporaHashes.insert(Hash(FileToVector(SF.File)));
1740b57cec5SDimitry Andric
1750b57cec5SDimitry Andric // Read functions.txt
1760b57cec5SDimitry Andric std::ifstream IF(DirPlusFile(DirPath, kFunctionsTxt));
1770b57cec5SDimitry Andric size_t NumFunctions = 0;
1780b57cec5SDimitry Andric while (std::getline(IF, L, '\n')) {
1790b57cec5SDimitry Andric FunctionNames.push_back(L);
1800b57cec5SDimitry Andric NumFunctions++;
1810b57cec5SDimitry Andric if (*FocusFunction == L)
1820b57cec5SDimitry Andric FocusFuncIdx = NumFunctions - 1;
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric if (!NumFunctions)
1850b57cec5SDimitry Andric return false;
1860b57cec5SDimitry Andric
1870b57cec5SDimitry Andric if (*FocusFunction == "auto") {
1880b57cec5SDimitry Andric // AUTOFOCUS works like this:
1890b57cec5SDimitry Andric // * reads the coverage data from the DFT files.
1900b57cec5SDimitry Andric // * assigns weights to functions based on coverage.
1910b57cec5SDimitry Andric // * chooses a random function according to the weights.
1920b57cec5SDimitry Andric ReadCoverage(DirPath);
1930b57cec5SDimitry Andric auto Weights = Coverage.FunctionWeights(NumFunctions);
194349cc55cSDimitry Andric std::vector<double> Intervals(NumFunctions + 1);
1950b57cec5SDimitry Andric std::iota(Intervals.begin(), Intervals.end(), 0);
1960b57cec5SDimitry Andric auto Distribution = std::piecewise_constant_distribution<double>(
1970b57cec5SDimitry Andric Intervals.begin(), Intervals.end(), Weights.begin());
1980b57cec5SDimitry Andric FocusFuncIdx = static_cast<size_t>(Distribution(Rand));
1990b57cec5SDimitry Andric *FocusFunction = FunctionNames[FocusFuncIdx];
2000b57cec5SDimitry Andric assert(FocusFuncIdx < NumFunctions);
2010b57cec5SDimitry Andric Printf("INFO: AUTOFOCUS: %zd %s\n", FocusFuncIdx,
2020b57cec5SDimitry Andric FunctionNames[FocusFuncIdx].c_str());
2030b57cec5SDimitry Andric for (size_t i = 0; i < NumFunctions; i++) {
204fe6060f1SDimitry Andric if (Weights[i] == 0.0)
205fe6060f1SDimitry Andric continue;
2060b57cec5SDimitry Andric Printf(" [%zd] W %g\tBB-tot %u\tBB-cov %u\tEntryFreq %u:\t%s\n", i,
2070b57cec5SDimitry Andric Weights[i], Coverage.GetNumberOfBlocks(i),
2080b57cec5SDimitry Andric Coverage.GetNumberOfCoveredBlocks(i), Coverage.GetCounter(i, 0),
2090b57cec5SDimitry Andric FunctionNames[i].c_str());
2100b57cec5SDimitry Andric }
2110b57cec5SDimitry Andric }
2120b57cec5SDimitry Andric
2130b57cec5SDimitry Andric if (!NumFunctions || FocusFuncIdx == SIZE_MAX || Files.size() <= 1)
2140b57cec5SDimitry Andric return false;
2150b57cec5SDimitry Andric
2160b57cec5SDimitry Andric // Read traces.
2170b57cec5SDimitry Andric size_t NumTraceFiles = 0;
2180b57cec5SDimitry Andric size_t NumTracesWithFocusFunction = 0;
2190b57cec5SDimitry Andric for (auto &SF : Files) {
2200b57cec5SDimitry Andric auto Name = Basename(SF.File);
2210b57cec5SDimitry Andric if (Name == kFunctionsTxt) continue;
2220b57cec5SDimitry Andric if (!CorporaHashes.count(Name)) continue; // not in the corpus.
2230b57cec5SDimitry Andric NumTraceFiles++;
2240b57cec5SDimitry Andric // Printf("=== %s\n", Name.c_str());
2250b57cec5SDimitry Andric std::ifstream IF(SF.File);
2260b57cec5SDimitry Andric while (std::getline(IF, L, '\n')) {
2270b57cec5SDimitry Andric size_t FunctionNum = 0;
2280b57cec5SDimitry Andric std::string DFTString;
2290b57cec5SDimitry Andric if (ParseDFTLine(L, &FunctionNum, &DFTString) &&
2300b57cec5SDimitry Andric FunctionNum == FocusFuncIdx) {
2310b57cec5SDimitry Andric NumTracesWithFocusFunction++;
2320b57cec5SDimitry Andric
2330b57cec5SDimitry Andric if (FunctionNum >= NumFunctions)
2340b57cec5SDimitry Andric return ParseError("N is greater than the number of functions", L);
2350b57cec5SDimitry Andric Traces[Name] = DFTStringToVector(DFTString);
2360b57cec5SDimitry Andric // Print just a few small traces.
2370b57cec5SDimitry Andric if (NumTracesWithFocusFunction <= 3 && DFTString.size() <= 16)
2380b57cec5SDimitry Andric Printf("%s => |%s|\n", Name.c_str(), std::string(DFTString).c_str());
2390b57cec5SDimitry Andric break; // No need to parse the following lines.
2400b57cec5SDimitry Andric }
2410b57cec5SDimitry Andric }
2420b57cec5SDimitry Andric }
2430b57cec5SDimitry Andric Printf("INFO: DataFlowTrace: %zd trace files, %zd functions, "
2440b57cec5SDimitry Andric "%zd traces with focus function\n",
2450b57cec5SDimitry Andric NumTraceFiles, NumFunctions, NumTracesWithFocusFunction);
2460b57cec5SDimitry Andric return NumTraceFiles > 0;
2470b57cec5SDimitry Andric }
2480b57cec5SDimitry Andric
CollectDataFlow(const std::string & DFTBinary,const std::string & DirPath,const std::vector<SizedFile> & CorporaFiles)2490b57cec5SDimitry Andric int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath,
250349cc55cSDimitry Andric const std::vector<SizedFile> &CorporaFiles) {
2510b57cec5SDimitry Andric Printf("INFO: collecting data flow: bin: %s dir: %s files: %zd\n",
2520b57cec5SDimitry Andric DFTBinary.c_str(), DirPath.c_str(), CorporaFiles.size());
2535ffd83dbSDimitry Andric if (CorporaFiles.empty()) {
2545ffd83dbSDimitry Andric Printf("ERROR: can't collect data flow without corpus provided.");
2555ffd83dbSDimitry Andric return 1;
2565ffd83dbSDimitry Andric }
2575ffd83dbSDimitry Andric
258e8d8bef9SDimitry Andric static char DFSanEnv[] = "DFSAN_OPTIONS=warn_unimplemented=0";
2590b57cec5SDimitry Andric putenv(DFSanEnv);
2600b57cec5SDimitry Andric MkDir(DirPath);
2610b57cec5SDimitry Andric for (auto &F : CorporaFiles) {
2620b57cec5SDimitry Andric // For every input F we need to collect the data flow and the coverage.
2630b57cec5SDimitry Andric // Data flow collection may fail if we request too many DFSan tags at once.
2640b57cec5SDimitry Andric // So, we start from requesting all tags in range [0,Size) and if that fails
2650b57cec5SDimitry Andric // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
2660b57cec5SDimitry Andric // Function number => DFT.
2670b57cec5SDimitry Andric auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
268349cc55cSDimitry Andric std::unordered_map<size_t, std::vector<uint8_t>> DFTMap;
2690b57cec5SDimitry Andric std::unordered_set<std::string> Cov;
2700b57cec5SDimitry Andric Command Cmd;
2710b57cec5SDimitry Andric Cmd.addArgument(DFTBinary);
2720b57cec5SDimitry Andric Cmd.addArgument(F.File);
2730b57cec5SDimitry Andric Cmd.addArgument(OutPath);
2740b57cec5SDimitry Andric Printf("CMD: %s\n", Cmd.toString().c_str());
2750b57cec5SDimitry Andric ExecuteCommand(Cmd);
2760b57cec5SDimitry Andric }
2770b57cec5SDimitry Andric // Write functions.txt if it's currently empty or doesn't exist.
2780b57cec5SDimitry Andric auto FunctionsTxtPath = DirPlusFile(DirPath, kFunctionsTxt);
2790b57cec5SDimitry Andric if (FileToString(FunctionsTxtPath).empty()) {
2800b57cec5SDimitry Andric Command Cmd;
2810b57cec5SDimitry Andric Cmd.addArgument(DFTBinary);
2820b57cec5SDimitry Andric Cmd.setOutputFile(FunctionsTxtPath);
2830b57cec5SDimitry Andric ExecuteCommand(Cmd);
2840b57cec5SDimitry Andric }
2850b57cec5SDimitry Andric return 0;
2860b57cec5SDimitry Andric }
2870b57cec5SDimitry Andric
2880b57cec5SDimitry Andric } // namespace fuzzer
289