1 //===- DependencyScanningFilesystem.h - clang-scan-deps fs ===---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H
10 #define LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H
11 
12 #include "clang/Basic/LLVM.h"
13 #include "clang/Lex/DependencyDirectivesScanner.h"
14 #include "llvm/ADT/DenseMap.h"
15 #include "llvm/ADT/StringMap.h"
16 #include "llvm/Support/Allocator.h"
17 #include "llvm/Support/ErrorOr.h"
18 #include "llvm/Support/VirtualFileSystem.h"
19 #include <mutex>
20 #include <optional>
21 #include <variant>
22 
23 namespace clang {
24 namespace tooling {
25 namespace dependencies {
26 
27 using DependencyDirectivesTy =
28     SmallVector<dependency_directives_scan::Directive, 20>;
29 
30 /// Contents and directive tokens of a cached file entry. Single instance can
31 /// be shared between multiple entries.
32 struct CachedFileContents {
CachedFileContentsCachedFileContents33   CachedFileContents(std::unique_ptr<llvm::MemoryBuffer> Contents)
34       : Original(std::move(Contents)), DepDirectives(nullptr) {}
35 
36   /// Owning storage for the original contents.
37   std::unique_ptr<llvm::MemoryBuffer> Original;
38 
39   /// The mutex that must be locked before mutating directive tokens.
40   std::mutex ValueLock;
41   SmallVector<dependency_directives_scan::Token, 10> DepDirectiveTokens;
42   /// Accessor to the directive tokens that's atomic to avoid data races.
43   /// \p CachedFileContents has ownership of the pointer.
44   std::atomic<const std::optional<DependencyDirectivesTy> *> DepDirectives;
45 
~CachedFileContentsCachedFileContents46   ~CachedFileContents() { delete DepDirectives.load(); }
47 };
48 
49 /// An in-memory representation of a file system entity that is of interest to
50 /// the dependency scanning filesystem.
51 ///
52 /// It represents one of the following:
53 /// - opened file with contents and a stat value,
54 /// - opened file with contents, directive tokens and a stat value,
55 /// - directory entry with its stat value,
56 /// - filesystem error.
57 ///
58 /// Single instance of this class can be shared across different filenames (e.g.
59 /// a regular file and a symlink). For this reason the status filename is empty
60 /// and is only materialized by \c EntryRef that knows the requested filename.
61 class CachedFileSystemEntry {
62 public:
63   /// Creates an entry without contents: either a filesystem error or
64   /// a directory with stat value.
CachedFileSystemEntry(llvm::ErrorOr<llvm::vfs::Status> Stat)65   CachedFileSystemEntry(llvm::ErrorOr<llvm::vfs::Status> Stat)
66       : MaybeStat(std::move(Stat)), Contents(nullptr) {
67     clearStatName();
68   }
69 
70   /// Creates an entry representing a file with contents.
CachedFileSystemEntry(llvm::ErrorOr<llvm::vfs::Status> Stat,CachedFileContents * Contents)71   CachedFileSystemEntry(llvm::ErrorOr<llvm::vfs::Status> Stat,
72                         CachedFileContents *Contents)
73       : MaybeStat(std::move(Stat)), Contents(std::move(Contents)) {
74     clearStatName();
75   }
76 
77   /// \returns True if the entry is a filesystem error.
isError()78   bool isError() const { return !MaybeStat; }
79 
80   /// \returns True if the current entry represents a directory.
isDirectory()81   bool isDirectory() const { return !isError() && MaybeStat->isDirectory(); }
82 
83   /// \returns Original contents of the file.
getOriginalContents()84   StringRef getOriginalContents() const {
85     assert(!isError() && "error");
86     assert(!MaybeStat->isDirectory() && "not a file");
87     assert(Contents && "contents not initialized");
88     return Contents->Original->getBuffer();
89   }
90 
91   /// \returns The scanned preprocessor directive tokens of the file that are
92   /// used to speed up preprocessing, if available.
93   std::optional<ArrayRef<dependency_directives_scan::Directive>>
getDirectiveTokens()94   getDirectiveTokens() const {
95     assert(!isError() && "error");
96     assert(!isDirectory() && "not a file");
97     assert(Contents && "contents not initialized");
98     if (auto *Directives = Contents->DepDirectives.load()) {
99       if (Directives->has_value())
100         return ArrayRef<dependency_directives_scan::Directive>(**Directives);
101     }
102     return std::nullopt;
103   }
104 
105   /// \returns The error.
getError()106   std::error_code getError() const { return MaybeStat.getError(); }
107 
108   /// \returns The entry status with empty filename.
getStatus()109   llvm::vfs::Status getStatus() const {
110     assert(!isError() && "error");
111     assert(MaybeStat->getName().empty() && "stat name must be empty");
112     return *MaybeStat;
113   }
114 
115   /// \returns The unique ID of the entry.
getUniqueID()116   llvm::sys::fs::UniqueID getUniqueID() const {
117     assert(!isError() && "error");
118     return MaybeStat->getUniqueID();
119   }
120 
121   /// \returns The data structure holding both contents and directive tokens.
getCachedContents()122   CachedFileContents *getCachedContents() const {
123     assert(!isError() && "error");
124     assert(!isDirectory() && "not a file");
125     return Contents;
126   }
127 
128 private:
clearStatName()129   void clearStatName() {
130     if (MaybeStat)
131       MaybeStat = llvm::vfs::Status::copyWithNewName(*MaybeStat, "");
132   }
133 
134   /// Either the filesystem error or status of the entry.
135   /// The filename is empty and only materialized by \c EntryRef.
136   llvm::ErrorOr<llvm::vfs::Status> MaybeStat;
137 
138   /// Non-owning pointer to the file contents.
139   ///
140   /// We're using pointer here to keep the size of this class small. Instances
141   /// representing directories and filesystem errors don't hold any contents
142   /// anyway.
143   CachedFileContents *Contents;
144 };
145 
146 using CachedRealPath = llvm::ErrorOr<std::string>;
147 
148 /// This class is a shared cache, that caches the 'stat' and 'open' calls to the
149 /// underlying real file system, and the scanned preprocessor directives of
150 /// files.
151 ///
152 /// It is sharded based on the hash of the key to reduce the lock contention for
153 /// the worker threads.
154 class DependencyScanningFilesystemSharedCache {
155 public:
156   struct CacheShard {
157     /// The mutex that needs to be locked before mutation of any member.
158     mutable std::mutex CacheLock;
159 
160     /// Map from filenames to cached entries and real paths.
161     llvm::StringMap<
162         std::pair<const CachedFileSystemEntry *, const CachedRealPath *>,
163         llvm::BumpPtrAllocator>
164         CacheByFilename;
165 
166     /// Map from unique IDs to cached entries.
167     llvm::DenseMap<llvm::sys::fs::UniqueID, const CachedFileSystemEntry *>
168         EntriesByUID;
169 
170     /// The backing storage for cached entries.
171     llvm::SpecificBumpPtrAllocator<CachedFileSystemEntry> EntryStorage;
172 
173     /// The backing storage for cached contents.
174     llvm::SpecificBumpPtrAllocator<CachedFileContents> ContentsStorage;
175 
176     /// The backing storage for cached real paths.
177     llvm::SpecificBumpPtrAllocator<CachedRealPath> RealPathStorage;
178 
179     /// Returns entry associated with the filename or nullptr if none is found.
180     const CachedFileSystemEntry *findEntryByFilename(StringRef Filename) const;
181 
182     /// Returns entry associated with the unique ID or nullptr if none is found.
183     const CachedFileSystemEntry *
184     findEntryByUID(llvm::sys::fs::UniqueID UID) const;
185 
186     /// Returns entry associated with the filename if there is some. Otherwise,
187     /// constructs new one with the given status, associates it with the
188     /// filename and returns the result.
189     const CachedFileSystemEntry &
190     getOrEmplaceEntryForFilename(StringRef Filename,
191                                  llvm::ErrorOr<llvm::vfs::Status> Stat);
192 
193     /// Returns entry associated with the unique ID if there is some. Otherwise,
194     /// constructs new one with the given status and contents, associates it
195     /// with the unique ID and returns the result.
196     const CachedFileSystemEntry &
197     getOrEmplaceEntryForUID(llvm::sys::fs::UniqueID UID, llvm::vfs::Status Stat,
198                             std::unique_ptr<llvm::MemoryBuffer> Contents);
199 
200     /// Returns entry associated with the filename if there is some. Otherwise,
201     /// associates the given entry with the filename and returns it.
202     const CachedFileSystemEntry &
203     getOrInsertEntryForFilename(StringRef Filename,
204                                 const CachedFileSystemEntry &Entry);
205 
206     /// Returns the real path associated with the filename or nullptr if none is
207     /// found.
208     const CachedRealPath *findRealPathByFilename(StringRef Filename) const;
209 
210     /// Returns the real path associated with the filename if there is some.
211     /// Otherwise, constructs new one with the given one, associates it with the
212     /// filename and returns the result.
213     const CachedRealPath &
214     getOrEmplaceRealPathForFilename(StringRef Filename,
215                                     llvm::ErrorOr<StringRef> RealPath);
216   };
217 
218   DependencyScanningFilesystemSharedCache();
219 
220   /// Returns shard for the given key.
221   CacheShard &getShardForFilename(StringRef Filename) const;
222   CacheShard &getShardForUID(llvm::sys::fs::UniqueID UID) const;
223 
224   struct OutOfDateEntry {
225     // A null terminated string that contains a path.
226     const char *Path = nullptr;
227 
228     struct NegativelyCachedInfo {};
229     struct SizeChangedInfo {
230       uint64_t CachedSize = 0;
231       uint64_t ActualSize = 0;
232     };
233 
234     std::variant<NegativelyCachedInfo, SizeChangedInfo> Info;
235 
OutOfDateEntryOutOfDateEntry236     OutOfDateEntry(const char *Path)
237         : Path(Path), Info(NegativelyCachedInfo{}) {}
238 
OutOfDateEntryOutOfDateEntry239     OutOfDateEntry(const char *Path, uint64_t CachedSize, uint64_t ActualSize)
240         : Path(Path), Info(SizeChangedInfo{CachedSize, ActualSize}) {}
241   };
242 
243   /// Visits all cached entries and re-stat an entry using UnderlyingFS to check
244   /// if the cache contains out-of-date entries. An entry can be out-of-date for
245   /// two reasons:
246   ///  1. The entry contains a stat error, indicating the file did not exist
247   ///     in the cache, but the file exists on the UnderlyingFS.
248   ///  2. The entry is associated with a file whose size is different from the
249   ///     size of the file on the same path on the UnderlyingFS.
250   std::vector<OutOfDateEntry>
251   getOutOfDateEntries(llvm::vfs::FileSystem &UnderlyingFS) const;
252 
253 private:
254   std::unique_ptr<CacheShard[]> CacheShards;
255   unsigned NumShards;
256 };
257 
258 /// This class is a local cache, that caches the 'stat' and 'open' calls to the
259 /// underlying real file system.
260 class DependencyScanningFilesystemLocalCache {
261   llvm::StringMap<
262       std::pair<const CachedFileSystemEntry *, const CachedRealPath *>,
263       llvm::BumpPtrAllocator>
264       Cache;
265 
266 public:
267   /// Returns entry associated with the filename or nullptr if none is found.
findEntryByFilename(StringRef Filename)268   const CachedFileSystemEntry *findEntryByFilename(StringRef Filename) const {
269     assert(llvm::sys::path::is_absolute_gnu(Filename));
270     auto It = Cache.find(Filename);
271     return It == Cache.end() ? nullptr : It->getValue().first;
272   }
273 
274   /// Associates the given entry with the filename and returns the given entry
275   /// pointer (for convenience).
276   const CachedFileSystemEntry &
insertEntryForFilename(StringRef Filename,const CachedFileSystemEntry & Entry)277   insertEntryForFilename(StringRef Filename,
278                          const CachedFileSystemEntry &Entry) {
279     assert(llvm::sys::path::is_absolute_gnu(Filename));
280     auto [It, Inserted] = Cache.insert({Filename, {&Entry, nullptr}});
281     auto &[CachedEntry, CachedRealPath] = It->getValue();
282     if (!Inserted) {
283       // The file is already present in the local cache. If we got here, it only
284       // contains the real path. Let's make sure the entry is populated too.
285       assert((!CachedEntry && CachedRealPath) && "entry already present");
286       CachedEntry = &Entry;
287     }
288     return *CachedEntry;
289   }
290 
291   /// Returns real path associated with the filename or nullptr if none is
292   /// found.
findRealPathByFilename(StringRef Filename)293   const CachedRealPath *findRealPathByFilename(StringRef Filename) const {
294     assert(llvm::sys::path::is_absolute_gnu(Filename));
295     auto It = Cache.find(Filename);
296     return It == Cache.end() ? nullptr : It->getValue().second;
297   }
298 
299   /// Associates the given real path with the filename and returns the given
300   /// entry pointer (for convenience).
301   const CachedRealPath &
insertRealPathForFilename(StringRef Filename,const CachedRealPath & RealPath)302   insertRealPathForFilename(StringRef Filename,
303                             const CachedRealPath &RealPath) {
304     assert(llvm::sys::path::is_absolute_gnu(Filename));
305     auto [It, Inserted] = Cache.insert({Filename, {nullptr, &RealPath}});
306     auto &[CachedEntry, CachedRealPath] = It->getValue();
307     if (!Inserted) {
308       // The file is already present in the local cache. If we got here, it only
309       // contains the entry. Let's make sure the real path is populated too.
310       assert((!CachedRealPath && CachedEntry) && "real path already present");
311       CachedRealPath = &RealPath;
312     }
313     return *CachedRealPath;
314   }
315 };
316 
317 /// Reference to a CachedFileSystemEntry.
318 /// If the underlying entry is an opened file, this wrapper returns the file
319 /// contents and the scanned preprocessor directives.
320 class EntryRef {
321   /// The filename used to access this entry.
322   std::string Filename;
323 
324   /// The underlying cached entry.
325   const CachedFileSystemEntry &Entry;
326 
327   friend class DependencyScanningWorkerFilesystem;
328 
329 public:
EntryRef(StringRef Name,const CachedFileSystemEntry & Entry)330   EntryRef(StringRef Name, const CachedFileSystemEntry &Entry)
331       : Filename(Name), Entry(Entry) {}
332 
getStatus()333   llvm::vfs::Status getStatus() const {
334     llvm::vfs::Status Stat = Entry.getStatus();
335     if (!Stat.isDirectory())
336       Stat = llvm::vfs::Status::copyWithNewSize(Stat, getContents().size());
337     return llvm::vfs::Status::copyWithNewName(Stat, Filename);
338   }
339 
isError()340   bool isError() const { return Entry.isError(); }
isDirectory()341   bool isDirectory() const { return Entry.isDirectory(); }
342 
343   /// If the cached entry represents an error, promotes it into `ErrorOr`.
unwrapError()344   llvm::ErrorOr<EntryRef> unwrapError() const {
345     if (isError())
346       return Entry.getError();
347     return *this;
348   }
349 
getContents()350   StringRef getContents() const { return Entry.getOriginalContents(); }
351 
352   std::optional<ArrayRef<dependency_directives_scan::Directive>>
getDirectiveTokens()353   getDirectiveTokens() const {
354     return Entry.getDirectiveTokens();
355   }
356 };
357 
358 /// A virtual file system optimized for the dependency discovery.
359 ///
360 /// It is primarily designed to work with source files whose contents was
361 /// preprocessed to remove any tokens that are unlikely to affect the dependency
362 /// computation.
363 ///
364 /// This is not a thread safe VFS. A single instance is meant to be used only in
365 /// one thread. Multiple instances are allowed to service multiple threads
366 /// running in parallel.
367 class DependencyScanningWorkerFilesystem
368     : public llvm::RTTIExtends<DependencyScanningWorkerFilesystem,
369                                llvm::vfs::ProxyFileSystem> {
370 public:
371   static const char ID;
372 
373   DependencyScanningWorkerFilesystem(
374       DependencyScanningFilesystemSharedCache &SharedCache,
375       IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS);
376 
377   llvm::ErrorOr<llvm::vfs::Status> status(const Twine &Path) override;
378   llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>>
379   openFileForRead(const Twine &Path) override;
380 
381   std::error_code getRealPath(const Twine &Path,
382                               SmallVectorImpl<char> &Output) override;
383 
384   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
385 
386   /// Make it so that no paths bypass this VFS.
resetBypassedPathPrefix()387   void resetBypassedPathPrefix() { BypassedPathPrefix.reset(); }
388   /// Set the prefix for paths that should bypass this VFS and go straight to
389   /// the underlying VFS.
setBypassedPathPrefix(StringRef Prefix)390   void setBypassedPathPrefix(StringRef Prefix) { BypassedPathPrefix = Prefix; }
391 
392   /// Returns entry for the given filename.
393   ///
394   /// Attempts to use the local and shared caches first, then falls back to
395   /// using the underlying filesystem.
396   llvm::ErrorOr<EntryRef> getOrCreateFileSystemEntry(StringRef Filename);
397 
398   /// Ensure the directive tokens are populated for this file entry.
399   ///
400   /// Returns true if the directive tokens are populated for this file entry,
401   /// false if not (i.e. this entry is not a file or its scan fails).
402   bool ensureDirectiveTokensArePopulated(EntryRef Entry);
403 
404   /// \returns The scanned preprocessor directive tokens of the file that are
405   /// used to speed up preprocessing, if available.
406   std::optional<ArrayRef<dependency_directives_scan::Directive>>
getDirectiveTokens(const Twine & Path)407   getDirectiveTokens(const Twine &Path) {
408     if (llvm::ErrorOr<EntryRef> Entry = getOrCreateFileSystemEntry(Path.str()))
409       if (ensureDirectiveTokensArePopulated(*Entry))
410         return Entry->getDirectiveTokens();
411     return std::nullopt;
412   }
413 
414   /// Check whether \p Path exists. By default checks cached result of \c
415   /// status(), and falls back on FS if unable to do so.
416   bool exists(const Twine &Path) override;
417 
418 private:
419   /// For a filename that's not yet associated with any entry in the caches,
420   /// uses the underlying filesystem to either look up the entry based in the
421   /// shared cache indexed by unique ID, or creates new entry from scratch.
422   /// \p FilenameForLookup will always be an absolute path, and different than
423   /// \p OriginalFilename if \p OriginalFilename is relative.
424   llvm::ErrorOr<const CachedFileSystemEntry &>
425   computeAndStoreResult(StringRef OriginalFilename,
426                         StringRef FilenameForLookup);
427 
428   /// Represents a filesystem entry that has been stat-ed (and potentially read)
429   /// and that's about to be inserted into the cache as `CachedFileSystemEntry`.
430   struct TentativeEntry {
431     llvm::vfs::Status Status;
432     std::unique_ptr<llvm::MemoryBuffer> Contents;
433 
434     TentativeEntry(llvm::vfs::Status Status,
435                    std::unique_ptr<llvm::MemoryBuffer> Contents = nullptr)
StatusTentativeEntry436         : Status(std::move(Status)), Contents(std::move(Contents)) {}
437   };
438 
439   /// Reads file at the given path. Enforces consistency between the file size
440   /// in status and size of read contents.
441   llvm::ErrorOr<TentativeEntry> readFile(StringRef Filename);
442 
443   /// Returns entry associated with the unique ID of the given tentative entry
444   /// if there is some in the shared cache. Otherwise, constructs new one,
445   /// associates it with the unique ID and returns the result.
446   const CachedFileSystemEntry &
447   getOrEmplaceSharedEntryForUID(TentativeEntry TEntry);
448 
449   /// Returns entry associated with the filename or nullptr if none is found.
450   ///
451   /// Returns entry from local cache if there is some. Otherwise, if the entry
452   /// is found in the shared cache, writes it through the local cache and
453   /// returns it. Otherwise returns nullptr.
454   const CachedFileSystemEntry *
455   findEntryByFilenameWithWriteThrough(StringRef Filename);
456 
457   /// Returns entry associated with the unique ID in the shared cache or nullptr
458   /// if none is found.
459   const CachedFileSystemEntry *
findSharedEntryByUID(llvm::vfs::Status Stat)460   findSharedEntryByUID(llvm::vfs::Status Stat) const {
461     return SharedCache.getShardForUID(Stat.getUniqueID())
462         .findEntryByUID(Stat.getUniqueID());
463   }
464 
465   /// Associates the given entry with the filename in the local cache and
466   /// returns it.
467   const CachedFileSystemEntry &
insertLocalEntryForFilename(StringRef Filename,const CachedFileSystemEntry & Entry)468   insertLocalEntryForFilename(StringRef Filename,
469                               const CachedFileSystemEntry &Entry) {
470     return LocalCache.insertEntryForFilename(Filename, Entry);
471   }
472 
473   /// Returns entry associated with the filename in the shared cache if there is
474   /// some. Otherwise, constructs new one with the given error code, associates
475   /// it with the filename and returns the result.
476   const CachedFileSystemEntry &
getOrEmplaceSharedEntryForFilename(StringRef Filename,std::error_code EC)477   getOrEmplaceSharedEntryForFilename(StringRef Filename, std::error_code EC) {
478     return SharedCache.getShardForFilename(Filename)
479         .getOrEmplaceEntryForFilename(Filename, EC);
480   }
481 
482   /// Returns entry associated with the filename in the shared cache if there is
483   /// some. Otherwise, associates the given entry with the filename and returns
484   /// it.
485   const CachedFileSystemEntry &
getOrInsertSharedEntryForFilename(StringRef Filename,const CachedFileSystemEntry & Entry)486   getOrInsertSharedEntryForFilename(StringRef Filename,
487                                     const CachedFileSystemEntry &Entry) {
488     return SharedCache.getShardForFilename(Filename)
489         .getOrInsertEntryForFilename(Filename, Entry);
490   }
491 
printImpl(raw_ostream & OS,PrintType Type,unsigned IndentLevel)492   void printImpl(raw_ostream &OS, PrintType Type,
493                  unsigned IndentLevel) const override {
494     printIndent(OS, IndentLevel);
495     OS << "DependencyScanningFilesystem\n";
496     getUnderlyingFS().print(OS, Type, IndentLevel + 1);
497   }
498 
499   /// Whether this path should bypass this VFS and go straight to the underlying
500   /// VFS.
501   bool shouldBypass(StringRef Path) const;
502 
503   /// The global cache shared between worker threads.
504   DependencyScanningFilesystemSharedCache &SharedCache;
505   /// The local cache is used by the worker thread to cache file system queries
506   /// locally instead of querying the global cache every time.
507   DependencyScanningFilesystemLocalCache LocalCache;
508 
509   /// Prefix of paths that should go straight to the underlying VFS.
510   std::optional<std::string> BypassedPathPrefix;
511 
512   /// The working directory to use for making relative paths absolute before
513   /// using them for cache lookups.
514   llvm::ErrorOr<std::string> WorkingDirForCacheLookup;
515 
516   void updateWorkingDirForCacheLookup();
517 
518   llvm::ErrorOr<StringRef>
519   tryGetFilenameForLookup(StringRef OriginalFilename,
520                           llvm::SmallVectorImpl<char> &PathBuf) const;
521 };
522 
523 } // end namespace dependencies
524 } // end namespace tooling
525 } // end namespace clang
526 
527 #endif // LLVM_CLANG_TOOLING_DEPENDENCYSCANNING_DEPENDENCYSCANNINGFILESYSTEM_H
528