1  /* SPDX-License-Identifier: GPL-2.0 */
2  #ifndef _BCACHE_H
3  #define _BCACHE_H
4  
5  /*
6   * SOME HIGH LEVEL CODE DOCUMENTATION:
7   *
8   * Bcache mostly works with cache sets, cache devices, and backing devices.
9   *
10   * Support for multiple cache devices hasn't quite been finished off yet, but
11   * it's about 95% plumbed through. A cache set and its cache devices are sort of
12   * like an md raid array and its component devices. Most of the code doesn't care
13   * about individual cache devices; the main abstraction is the cache set.
14   *
15   * Multiple cache devices are intended to give us the ability to mirror dirty
16   * cached data and metadata, without mirroring clean cached data.
17   *
18   * Backing devices are different, in that they have a lifetime independent of a
19   * cache set. When you register a newly formatted backing device it'll come up
20   * in passthrough mode, and then you can attach and detach a backing device from
21   * a cache set at runtime - while it's mounted and in use. Detaching implicitly
22   * invalidates any cached data for that backing device.
23   *
24   * A cache set can have multiple (many) backing devices attached to it.
25   *
26   * There's also flash only volumes - this is the reason for the distinction
27   * between struct cached_dev and struct bcache_device. A flash only volume
28   * works much like a bcache device that has a backing device, except the
29   * "cached" data is always dirty. The end result is that we get thin
30   * provisioning with very little additional code.
31   *
32   * Flash only volumes work but they're not production ready because the moving
33   * garbage collector needs more work. More on that later.
34   *
35   * BUCKETS/ALLOCATION:
36   *
37   * Bcache is primarily designed for caching, which means that in normal
38   * operation all of our available space will be allocated. Thus, we need an
39   * efficient way of deleting things from the cache so we can write new things to
40   * it.
41   *
42   * To do this, we first divide the cache device up into buckets. A bucket is the
43   * unit of allocation; they're typically around 1 MB - anywhere from 128 KB to 2 MB+
44   * works efficiently.
45   *
46   * Each bucket has a 16 bit priority, and an 8 bit generation associated with
47   * it. The gens and priorities for all the buckets are stored contiguously and
48   * packed on disk (in a linked list of buckets - aside from the superblock, all
49   * of bcache's metadata is stored in buckets).
50   *
51   * The priority is used to implement an LRU. We reset a bucket's priority when
52   * we allocate it or on a cache hit, and every so often we decrement the priority
53   * of each bucket. It could be used to implement something more sophisticated,
54   * if anyone ever gets around to it.
55   *
56   * The generation is used for invalidating buckets. Each pointer also has an 8
57   * bit generation embedded in it; for a pointer to be considered valid, its gen
58   * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
59   * we have to do is increment its gen (and write its new gen to disk; we batch
60   * this up).
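 *
 * As a rough sketch of that check (the real helpers are gen_after() and
 * ptr_stale(), defined further down in this file):
 *
 *	stale = PTR_BUCKET(c, k, i)->gen - PTR_GEN(k, i);
 *
 * A nonzero result means the bucket has been reused since the pointer was
 * created, and the pointer must be ignored.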
61   *
62   * Bcache is entirely COW - we never write twice to a bucket, even buckets that
63   * contain metadata (including btree nodes).
64   *
65   * THE BTREE:
66   *
67   * Bcache is in large part designed around the btree.
68   *
69   * At a high level, the btree is just an index of key -> ptr tuples.
70   *
71   * Keys represent extents, and thus have a size field. Keys also have a variable
72   * number of pointers attached to them (potentially zero, which is handy for
73   * invalidating the cache).
74   *
75   * The key itself is an inode:offset pair. The inode number corresponds to a
76   * backing device or a flash only volume. The offset is the ending offset of the
77   * extent within the inode - not the starting offset; this makes lookups
78   * slightly more convenient.
79   *
80   * Pointers contain the cache device id, the offset on that device, and an 8 bit
81   * generation number. More on the gen later.
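 *
 * As a worked example (the names are the real key accessors, the values are
 * made up): a key with KEY_INODE == 5, KEY_OFFSET == 1024 and KEY_SIZE == 8
 * describes the extent covering sectors 1016..1023 of inode 5, and each
 * pointer in it gives a (PTR_DEV, PTR_OFFSET, PTR_GEN) location of that data
 * on a cache device.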
82   *
83   * Index lookups are not fully abstracted - cache lookups in particular are
84   * still somewhat mixed in with the btree code, but things are headed in that
85   * direction.
86   *
87   * Updates are fairly well abstracted, though. There are two different ways of
88   * updating the btree; insert and replace.
89   *
90   * BTREE_INSERT will just take a list of keys and insert them into the btree -
91   * overwriting (possibly only partially) any extents they overlap with. This is
92   * used to update the index after a write.
93   *
94   * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
95   * overwriting a key that matches another given key. This is used for inserting
96   * data into the cache after a cache miss, and for background writeback, and for
97   * the moving garbage collector.
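 *
 * Roughly (a sketch of the semantics, not the real calling convention):
 *
 *	BTREE_INSERT:	index[key] = new;
 *	BTREE_REPLACE:	if (index[key] == old) index[key] = new;
 *
 * so a cache miss fill simply loses the race (and inserts nothing) if the
 * extent was overwritten in the meantime.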
98   *
99   * There is no "delete" operation; deleting things from the index is
100   * accomplished either by invalidating pointers (by incrementing a bucket's
101   * gen) or by inserting a key with 0 pointers - which will overwrite anything
102   * previously present at that location in the index.
103   *
104   * This means that there are always stale/invalid keys in the btree. They're
105   * filtered out by the code that iterates through a btree node, and removed when
106   * a btree node is rewritten.
107   *
108   * BTREE NODES:
109   *
110   * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
111   * free smaller than a bucket - so, that's how big our btree nodes are.
112   *
113   * (If buckets are really big we'll only use part of the bucket for a btree node
114   * - no less than 1/4th - but a bucket still contains no more than a single
115   * btree node. I'd actually like to change this, but for now we rely on the
116   * bucket's gen for deleting btree nodes when we rewrite/split a node.)
117   *
118   * Anyways, btree nodes are big - big enough to be inefficient with a textbook
119   * btree implementation.
120   *
121   * The way this is solved is that btree nodes are internally log structured; we
122   * can append new keys to an existing btree node without rewriting it. This
123   * means each set of keys we write is sorted, but the node is not.
124   *
125   * We maintain this log structure in memory - keeping 1Mb of keys sorted would
126   * be expensive, and we have to distinguish between the keys we have written and
127   * the keys we haven't. So to do a lookup in a btree node, we have to search
128   * each sorted set. But we do merge written sets together lazily, so the cost of
129   * these extra searches is quite low (normally most of the keys in a btree node
130   * will be in one big set, and then there'll be one or two sets that are much
131   * smaller).
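 *
 * A lookup is thus, roughly (a sketch; the real iterator lives in bset.c):
 *
 *	for each sorted set in the node
 *		binary search for the start position
 *	then walk the per-set positions as one merged, sorted stream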
132   *
133   * This log structure makes bcache's btree more of a hybrid between a
134   * conventional btree and a compacting data structure, with some of the
135   * advantages of both.
136   *
137   * GARBAGE COLLECTION:
138   *
139   * We can't just invalidate any bucket - it might contain dirty data or
140   * metadata. If it once contained dirty data, other writes might overwrite it
141   * later, leaving no valid pointers into that bucket in the index.
142   *
143   * Thus, the primary purpose of garbage collection is to find buckets to reuse.
144   * It also counts how much valid data each bucket currently contains, so that
145   * allocation can reuse buckets sooner when they've been mostly overwritten.
146   *
147   * It also does some things that are really internal to the btree
148   * implementation. If a btree node contains pointers that are stale by more than
149   * some threshold, it rewrites the btree node to avoid the bucket's generation
150   * wrapping around. It also merges adjacent btree nodes if they're empty enough.
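 *
 * (The "some threshold" is tied to BUCKET_GC_GEN_MAX, defined later in this
 * file: nodes are rewritten well before b->gen - b->last_gc can get close to
 * the point where the 8 bit gen would wrap.)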
151   *
152   * THE JOURNAL:
153   *
154   * Bcache's journal is not necessary for consistency; we always strictly
155   * order metadata writes so that the btree and everything else is consistent on
156   * disk in the event of an unclean shutdown, and in fact bcache had writeback
157   * caching (with recovery from unclean shutdown) before journalling was
158   * implemented.
159   *
160   * Rather, the journal is purely a performance optimization; we can't complete a
161   * write until we've updated the index on disk, otherwise the cache would be
162   * inconsistent in the event of an unclean shutdown. This means that without the
163   * journal, on random write workloads we constantly have to update all the leaf
164   * nodes in the btree, and those writes will be mostly empty (appending at most
165   * a few keys each) - highly inefficient in terms of amount of metadata writes,
166   * and it puts more strain on the various btree resorting/compacting code.
167   *
168   * The journal is just a log of keys we've inserted; on startup we just reinsert
169   * all the keys in the open journal entries. That means that when we're updating
170   * a node in the btree, we can wait until a 4k block of keys fills up before
171   * writing them out.
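 *
 * Replay is therefore conceptually trivial (a sketch, not the real code):
 *
 *	for each open journal entry, oldest first
 *		for each key in the entry
 *			reinsert the key with a normal btree insert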
172   *
173   * For simplicity, we only journal updates to leaf nodes; updates to parent
174   * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
175   * the complexity to deal with journalling them (in particular, journal replay)
176   * - updates to non leaf nodes just happen synchronously (see btree_split()).
177   */
178  
179  #define pr_fmt(fmt) "bcache: %s() " fmt, __func__
180  
181  #include <linux/bio.h>
182  #include <linux/closure.h>
183  #include <linux/kobject.h>
184  #include <linux/list.h>
185  #include <linux/mutex.h>
186  #include <linux/rbtree.h>
187  #include <linux/rwsem.h>
188  #include <linux/refcount.h>
189  #include <linux/types.h>
190  #include <linux/workqueue.h>
191  #include <linux/kthread.h>
192  
193  #include "bcache_ondisk.h"
194  #include "bset.h"
195  #include "util.h"
196  
197  struct bucket {
198  	atomic_t	pin;
199  	uint16_t	prio;
200  	uint8_t		gen;
201  	uint8_t		last_gc; /* Most out of date gen in the btree */
202  	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
203  	uint16_t	reclaimable_in_gc:1;
204  };
205  
206  /*
207   * I'd use bitfields for these, but I don't trust the compiler not to screw me
208   * as multiple threads touch struct bucket without locking
209   */
210  
211  BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
212  #define GC_MARK_RECLAIMABLE	1
213  #define GC_MARK_DIRTY		2
214  #define GC_MARK_METADATA	3
215  #define GC_SECTORS_USED_SIZE	13
216  #define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
217  BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
218  BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
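
/*
 * Layout of the gc_mark bitfield defined above: bits 0-1 hold GC_MARK, bits
 * 2-14 hold GC_SECTORS_USED, and bit 15 holds GC_MOVE. The BITMASK() macro
 * also generates SET_GC_MARK(), SET_GC_SECTORS_USED() and SET_GC_MOVE()
 * setters, used e.g. (roughly what btree GC does):
 *
 *	SET_GC_MARK(b, GC_MARK_DIRTY);
 *	SET_GC_SECTORS_USED(b, min_t(unsigned int, sectors,
 *				     MAX_GC_SECTORS_USED));
 */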
219  
220  #include "journal.h"
221  #include "stats.h"
222  struct search;
223  struct btree;
224  struct keybuf;
225  
226  struct keybuf_key {
227  	struct rb_node		node;
228  	BKEY_PADDED(key);
229  	void			*private;
230  };
231  
232  struct keybuf {
233  	struct bkey		last_scanned;
234  	spinlock_t		lock;
235  
236  	/*
237  	 * Beginning and end of range in rb tree - so that we can skip taking
238  	 * lock and checking the rb tree when we need to check for overlapping
239  	 * keys.
240  	 */
241  	struct bkey		start;
242  	struct bkey		end;
243  
244  	struct rb_root		keys;
245  
246  #define KEYBUF_NR		500
247  	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
248  };
249  
250  struct bcache_device {
251  	struct closure		cl;
252  
253  	struct kobject		kobj;
254  
255  	struct cache_set	*c;
256  	unsigned int		id;
257  #define BCACHEDEVNAME_SIZE	12
258  	char			name[BCACHEDEVNAME_SIZE];
259  
260  	struct gendisk		*disk;
261  
262  	unsigned long		flags;
263  #define BCACHE_DEV_CLOSING		0
264  #define BCACHE_DEV_DETACHING		1
265  #define BCACHE_DEV_UNLINK_DONE		2
266  #define BCACHE_DEV_WB_RUNNING		3
267  #define BCACHE_DEV_RATE_DW_RUNNING	4
268  	int			nr_stripes;
269  #define BCH_MIN_STRIPE_SZ		((4 << 20) >> SECTOR_SHIFT)
270  	unsigned int		stripe_size;
271  	atomic_t		*stripe_sectors_dirty;
272  	unsigned long		*full_dirty_stripes;
273  
274  	struct bio_set		bio_split;
275  
276  	unsigned int		data_csum:1;
277  
278  	int (*cache_miss)(struct btree *b, struct search *s,
279  			  struct bio *bio, unsigned int sectors);
280  	int (*ioctl)(struct bcache_device *d, blk_mode_t mode,
281  		     unsigned int cmd, unsigned long arg);
282  };
283  
284  struct io {
285  	/* Used to track sequential IO so it can be skipped */
286  	struct hlist_node	hash;
287  	struct list_head	lru;
288  
289  	unsigned long		jiffies;
290  	unsigned int		sequential;
291  	sector_t		last;
292  };
293  
294  enum stop_on_failure {
295  	BCH_CACHED_DEV_STOP_AUTO = 0,
296  	BCH_CACHED_DEV_STOP_ALWAYS,
297  	BCH_CACHED_DEV_STOP_MODE_MAX,
298  };
299  
300  struct cached_dev {
301  	struct list_head	list;
302  	struct bcache_device	disk;
303  	struct block_device	*bdev;
304  	struct file		*bdev_file;
305  
306  	struct cache_sb		sb;
307  	struct cache_sb_disk	*sb_disk;
308  	struct bio		sb_bio;
309  	struct bio_vec		sb_bv[1];
310  	struct closure		sb_write;
311  	struct semaphore	sb_write_mutex;
312  
313  	/* Refcount on the cache set. Always nonzero when we're caching. */
314  	refcount_t		count;
315  	struct work_struct	detach;
316  
317  	/*
318  	 * Device might not be running if it's dirty and the cache set hasn't
319  	 * shown up yet.
320  	 */
321  	atomic_t		running;
322  
323  	/*
324  	 * Writes take a shared lock from start to finish; scanning for dirty
325  	 * data to refill the rb tree requires an exclusive lock.
326  	 */
327  	struct rw_semaphore	writeback_lock;
328  
329  	/*
330  	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
331  	 * data in the cache. Protected by writeback_lock; must have an
332  	 * shared lock to set and exclusive lock to clear.
333  	 */
334  	atomic_t		has_dirty;
335  
336  #define BCH_CACHE_READA_ALL		0
337  #define BCH_CACHE_READA_META_ONLY	1
338  	unsigned int		cache_readahead_policy;
339  	struct bch_ratelimit	writeback_rate;
340  	struct delayed_work	writeback_rate_update;
341  
342  	/* Limit number of writeback bios in flight */
343  	struct semaphore	in_flight;
344  	struct task_struct	*writeback_thread;
345  	struct workqueue_struct	*writeback_write_wq;
346  
347  	struct keybuf		writeback_keys;
348  
349  	struct task_struct	*status_update_thread;
350  	/*
351  	 * Order the write-half of writeback operations strongly in dispatch
352  	 * order.  (Maintain LBA order; don't allow reads completing out of
353  	 * order to re-order the writes...)
354  	 */
355  	struct closure_waitlist writeback_ordering_wait;
356  	atomic_t		writeback_sequence_next;
357  
358  	/* For tracking sequential IO */
359  #define RECENT_IO_BITS	7
360  #define RECENT_IO	(1 << RECENT_IO_BITS)
361  	struct io		io[RECENT_IO];
362  	struct hlist_head	io_hash[RECENT_IO + 1];
363  	struct list_head	io_lru;
364  	spinlock_t		io_lock;
365  
366  	struct cache_accounting	accounting;
367  
368  	/* The rest of this all shows up in sysfs */
369  	unsigned int		sequential_cutoff;
370  
371  	unsigned int		io_disable:1;
372  	unsigned int		verify:1;
373  	unsigned int		bypass_torture_test:1;
374  
375  	unsigned int		partial_stripes_expensive:1;
376  	unsigned int		writeback_metadata:1;
377  	unsigned int		writeback_running:1;
378  	unsigned int		writeback_consider_fragment:1;
379  	unsigned char		writeback_percent;
380  	unsigned int		writeback_delay;
381  
382  	uint64_t		writeback_rate_target;
383  	int64_t			writeback_rate_proportional;
384  	int64_t			writeback_rate_integral;
385  	int64_t			writeback_rate_integral_scaled;
386  	int32_t			writeback_rate_change;
387  
388  	unsigned int		writeback_rate_update_seconds;
389  	unsigned int		writeback_rate_i_term_inverse;
390  	unsigned int		writeback_rate_p_term_inverse;
391  	unsigned int		writeback_rate_fp_term_low;
392  	unsigned int		writeback_rate_fp_term_mid;
393  	unsigned int		writeback_rate_fp_term_high;
394  	unsigned int		writeback_rate_minimum;
395  
396  	enum stop_on_failure	stop_when_cache_set_failed;
397  #define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
398  	atomic_t		io_errors;
399  	unsigned int		error_limit;
400  	unsigned int		offline_seconds;
401  
402  	/*
403  	 * Retry to update writeback_rate if contention happens for
404  	 * down_read(dc->writeback_lock) in update_writeback_rate()
405  	 */
406  #define BCH_WBRATE_UPDATE_MAX_SKIPS	15
407  	unsigned int		rate_update_retry;
408  };
409  
410  enum alloc_reserve {
411  	RESERVE_BTREE,
412  	RESERVE_PRIO,
413  	RESERVE_MOVINGGC,
414  	RESERVE_NONE,
415  	RESERVE_NR,
416  };
417  
418  struct cache {
419  	struct cache_set	*set;
420  	struct cache_sb		sb;
421  	struct cache_sb_disk	*sb_disk;
422  	struct bio		sb_bio;
423  	struct bio_vec		sb_bv[1];
424  
425  	struct kobject		kobj;
426  	struct block_device	*bdev;
427  	struct file		*bdev_file;
428  
429  	struct task_struct	*alloc_thread;
430  
431  	struct closure		prio;
432  	struct prio_set		*disk_buckets;
433  
434  	/*
435  	 * When allocating new buckets, prio_write() gets first dibs - since we
436  	 * may not be able to allocate at all without writing priorities and gens.
437  	 * prio_last_buckets[] contains the last buckets we wrote priorities to
438  	 * (so gc can mark them as metadata), prio_buckets[] contains the
439  	 * buckets allocated for the next prio write.
440  	 */
441  	uint64_t		*prio_buckets;
442  	uint64_t		*prio_last_buckets;
443  
444  	/*
445  	 * free: Buckets that are ready to be used
446  	 *
447  	 * free_inc: Incoming buckets - these are buckets that currently have
448  	 * cached data in them, and we can't reuse them until after we write
449  	 * their new gen to disk. After prio_write() finishes writing the new
450  	 * gens/prios, they'll be moved to the free list (and possibly discarded
451  	 * in the process)
452  	 */
453  	DECLARE_FIFO(long, free)[RESERVE_NR];
454  	DECLARE_FIFO(long, free_inc);
455  
456  	size_t			fifo_last_bucket;
457  
458  	/* Allocation stuff: */
459  	struct bucket		*buckets;
460  
461  	DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap;
462  
463  	/*
464  	 * If nonzero, we know we aren't going to find any buckets to invalidate
465  	 * until a gc finishes - otherwise we could pointlessly burn a ton of
466  	 * cpu
467  	 */
468  	unsigned int		invalidate_needs_gc;
469  
470  	bool			discard; /* Get rid of? */
471  
472  	struct journal_device	journal;
473  
474  	/* The rest of this all shows up in sysfs */
475  #define IO_ERROR_SHIFT		20
476  	atomic_t		io_errors;
477  	atomic_t		io_count;
478  
479  	atomic_long_t		meta_sectors_written;
480  	atomic_long_t		btree_sectors_written;
481  	atomic_long_t		sectors_written;
482  };
483  
484  struct gc_stat {
485  	size_t			nodes;
486  	size_t			nodes_pre;
487  	size_t			key_bytes;
488  
489  	size_t			nkeys;
490  	uint64_t		data;	/* sectors */
491  	unsigned int		in_use; /* percent */
492  };
493  
494  /*
495   * Flag bits, for how the cache set is shutting down, and what phase it's at:
496   *
497   * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
498   * all the backing devices first (their cached data gets invalidated, and they
499   * won't automatically reattach).
500   *
501   * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
502   * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
503   * flushing dirty data).
504   *
505   * CACHE_SET_RUNNING means all cache devices have been registered and journal
506   * replay is complete.
507   *
508   * CACHE_SET_IO_DISABLE is set when bcache is stopping the whole cache set; all
509   * external and internal I/O should be denied when this flag is set.
510   *
511   */
512  #define CACHE_SET_UNREGISTERING		0
513  #define	CACHE_SET_STOPPING		1
514  #define	CACHE_SET_RUNNING		2
515  #define CACHE_SET_IO_DISABLE		3
516  
517  struct cache_set {
518  	struct closure		cl;
519  
520  	struct list_head	list;
521  	struct kobject		kobj;
522  	struct kobject		internal;
523  	struct dentry		*debug;
524  	struct cache_accounting accounting;
525  
526  	unsigned long		flags;
527  	atomic_t		idle_counter;
528  	atomic_t		at_max_writeback_rate;
529  
530  	struct cache		*cache;
531  
532  	struct bcache_device	**devices;
533  	unsigned int		devices_max_used;
534  	atomic_t		attached_dev_nr;
535  	struct list_head	cached_devs;
536  	uint64_t		cached_dev_sectors;
537  	atomic_long_t		flash_dev_dirty_sectors;
538  	struct closure		caching;
539  
540  	struct closure		sb_write;
541  	struct semaphore	sb_write_mutex;
542  
543  	mempool_t		search;
544  	mempool_t		bio_meta;
545  	struct bio_set		bio_split;
546  
547  	/* For the btree cache */
548  	struct shrinker		*shrink;
549  
550  	/* For the btree cache and anything allocation related */
551  	struct mutex		bucket_lock;
552  
553  	/* log2(bucket_size), in sectors */
554  	unsigned short		bucket_bits;
555  
556  	/* log2(block_size), in sectors */
557  	unsigned short		block_bits;
558  
559  	/*
560  	 * Default number of pages for a new btree node - may be less than a
561  	 * full bucket
562  	 */
563  	unsigned int		btree_pages;
564  
565  	/*
566  	 * Lists of struct btrees; lru is the list for structs that have memory
567  	 * allocated for an actual btree node; freed is for structs that do not.
568  	 *
569  	 * We never free a struct btree, except on shutdown - we just put it on
570  	 * the btree_cache_freed list and reuse it later. This simplifies the
571  	 * code, and it doesn't cost us much memory as the memory usage is
572  	 * dominated by buffers that hold the actual btree node data and those
573  	 * can be freed - and the number of struct btrees allocated is
574  	 * effectively bounded.
575  	 *
576  	 * btree_cache_freeable effectively is a small cache - we use it because
577  	 * high order page allocations can be rather expensive, and it's quite
578  	 * common to delete and allocate btree nodes in quick succession. It
579  	 * should never grow past ~2-3 nodes in practice.
580  	 */
581  	struct list_head	btree_cache;
582  	struct list_head	btree_cache_freeable;
583  	struct list_head	btree_cache_freed;
584  
585  	/* Number of elements in btree_cache + btree_cache_freeable lists */
586  	unsigned int		btree_cache_used;
587  
588  	/*
589  	 * If we need to allocate memory for a new btree node and that
590  	 * allocation fails, we can cannibalize another node in the btree cache
591  	 * to satisfy the allocation - lock to guarantee only one thread does
592  	 * this at a time:
593  	 */
594  	wait_queue_head_t	btree_cache_wait;
595  	struct task_struct	*btree_cache_alloc_lock;
596  	spinlock_t		btree_cannibalize_lock;
597  
598  	/*
599  	 * When we free a btree node, we increment the gen of the bucket the
600  	 * node is in - but we can't rewrite the prios and gens until we've
601  	 * finished whatever it is we were doing; otherwise, after a crash the
602  	 * btree node would be freed but for say a split, we might not have the
603  	 * pointers to the new nodes inserted into the btree yet.
604  	 *
605  	 * This is a refcount that blocks prio_write() until the new keys are
606  	 * written.
607  	 */
608  	atomic_t		prio_blocked;
609  	wait_queue_head_t	bucket_wait;
610  
611  	/*
612  	 * For any bio we don't skip we subtract the number of sectors from
613  	 * rescale; when it hits 0 we rescale all the bucket priorities.
614  	 */
615  	atomic_t		rescale;
616  	/*
617  	 * Used by GC to identify whether any front-end I/O is in flight
618  	 */
619  	atomic_t		search_inflight;
620  	/*
621  	 * When we invalidate buckets, we use both the priority and the amount
622  	 * of good data to determine which buckets to reuse first - to weight
623  	 * those together consistently we keep track of the smallest nonzero
624  	 * priority of any bucket.
625  	 */
626  	uint16_t		min_prio;
627  
628  	/*
629  	 * max(gen - last_gc) for all buckets. When it gets too big we have to
630  	 * gc to keep gens from wrapping around.
631  	 */
632  	uint8_t			need_gc;
633  	struct gc_stat		gc_stats;
634  	size_t			nbuckets;
635  	size_t			avail_nbuckets;
636  
637  	struct task_struct	*gc_thread;
638  	/* Where in the btree gc currently is */
639  	struct bkey		gc_done;
640  
641  	/*
642  	 * For automatic garbage collection after writeback completes, this
643  	 * variable is used as a bit field:
644  	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
645  	 * - 0000 0010b (BCH_DO_AUTO_GC):     do gc after writeback
646  	 * This is an optimization for write requests that follow after writeback
647  	 * finishes, but the read hit rate may drop because clean data in the
648  	 * cache gets discarded. Unless the user explicitly enables it via sysfs,
649  	 * it won't be enabled.
650  	 */
651  #define BCH_ENABLE_AUTO_GC	1
652  #define BCH_DO_AUTO_GC		2
653  	uint8_t			gc_after_writeback;
654  
655  	/*
656  	 * The allocation code needs gc_mark in struct bucket to be correct, but
657  	 * it's not while a gc is in progress. Protected by bucket_lock.
658  	 */
659  	int			gc_mark_valid;
660  
661  	/* Counts how many sectors bio_insert has added to the cache */
662  	atomic_t		sectors_to_gc;
663  	wait_queue_head_t	gc_wait;
664  
665  	struct keybuf		moving_gc_keys;
666  	/* Number of moving GC bios in flight */
667  	struct semaphore	moving_in_flight;
668  
669  	struct workqueue_struct	*moving_gc_wq;
670  
671  	struct btree		*root;
672  
673  #ifdef CONFIG_BCACHE_DEBUG
674  	struct btree		*verify_data;
675  	struct bset		*verify_ondisk;
676  	struct mutex		verify_lock;
677  #endif
678  
679  	uint8_t			set_uuid[16];
680  	unsigned int		nr_uuids;
681  	struct uuid_entry	*uuids;
682  	BKEY_PADDED(uuid_bucket);
683  	struct closure		uuid_write;
684  	struct semaphore	uuid_write_mutex;
685  
686  	/*
687  	 * A btree node on disk could have too many bsets for an iterator to fit
688  	 * on the stack - have to dynamically allocate them.
689  	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
690  	 * with enough room to host
691  	 *     (sb.bucket_size / sb.block_size)
692  	 * btree_iter_sets, which is more than the static MAX_BSETS.
693  	 */
694  	mempool_t		fill_iter;
695  
696  	struct bset_sort_state	sort;
697  
698  	/* List of buckets we're currently writing data to */
699  	struct list_head	data_buckets;
700  	spinlock_t		data_bucket_lock;
701  
702  	struct journal		journal;
703  
704  #define CONGESTED_MAX		1024
705  	unsigned int		congested_last_us;
706  	atomic_t		congested;
707  
708  	/* The rest of this all shows up in sysfs */
709  	unsigned int		congested_read_threshold_us;
710  	unsigned int		congested_write_threshold_us;
711  
712  	struct time_stats	btree_gc_time;
713  	struct time_stats	btree_split_time;
714  	struct time_stats	btree_read_time;
715  
716  	atomic_long_t		cache_read_races;
717  	atomic_long_t		writeback_keys_done;
718  	atomic_long_t		writeback_keys_failed;
719  
720  	atomic_long_t		reclaim;
721  	atomic_long_t		reclaimed_journal_buckets;
722  	atomic_long_t		flush_write;
723  
724  	enum			{
725  		ON_ERROR_UNREGISTER,
726  		ON_ERROR_PANIC,
727  	}			on_error;
728  #define DEFAULT_IO_ERROR_LIMIT 8
729  	unsigned int		error_limit;
730  	unsigned int		error_decay;
731  
732  	unsigned short		journal_delay_ms;
733  	bool			expensive_debug_checks;
734  	unsigned int		verify:1;
735  	unsigned int		key_merging_disabled:1;
736  	unsigned int		gc_always_rewrite:1;
737  	unsigned int		shrinker_disabled:1;
738  	unsigned int		copy_gc_enabled:1;
739  	unsigned int		idle_max_writeback_rate_enabled:1;
740  
741  #define BUCKET_HASH_BITS	12
742  	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
743  };
744  
745  struct bbio {
746  	unsigned int		submit_time_us;
747  	union {
748  		struct bkey	key;
749  		uint64_t	_pad[3];
750  		/*
751  		 * We only need pad = 3 here because we only ever carry around a
752  		 * single pointer - i.e. the pointer we're doing io to/from.
753  		 */
754  	};
755  	struct bio		bio;
756  };
757  
758  #define BTREE_PRIO		USHRT_MAX
759  #define INITIAL_PRIO		32768U
760  
761  #define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
762  #define btree_blocks(b)							\
763  	((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
764  
765  #define btree_default_blocks(c)						\
766  	((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
767  
768  #define bucket_bytes(ca)	((ca)->sb.bucket_size << 9)
769  #define block_bytes(ca)		((ca)->sb.block_size << 9)
770  
771  static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
772  {
773  	unsigned int n, max_pages;
774  
775  	max_pages = min_t(unsigned int,
776  			  __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
777  			  MAX_ORDER_NR_PAGES);
778  
779  	n = sb->bucket_size / PAGE_SECTORS;
780  	if (n > max_pages)
781  		n = max_pages;
782  
783  	return n;
784  }
785  
786  static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
787  {
788  	return meta_bucket_pages(sb) << PAGE_SHIFT;
789  }
790  
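/*
 * prios_per_bucket(): how many on-disk {prio, gen} pairs (struct bucket_disk)
 * fit in one metadata bucket after the struct prio_set header.
 * prio_buckets(): how many such metadata buckets are needed to cover every
 * bucket on the cache device.
 */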
791  #define prios_per_bucket(ca)						\
792  	((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) /	\
793  	 sizeof(struct bucket_disk))
794  
795  #define prio_buckets(ca)						\
796  	DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
797  
798  static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
799  {
800  	return s >> c->bucket_bits;
801  }
802  
803  static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
804  {
805  	return ((sector_t) b) << c->bucket_bits;
806  }
807  
808  static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
809  {
810  	return s & (c->cache->sb.bucket_size - 1);
811  }
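
/*
 * Worked example (illustrative numbers): with a bucket size of 1024 sectors
 * (512 KiB), bucket_bits is 10, so sector 5000 lands in bucket 4 (sectors
 * 4096..5119) and bucket_remainder() returns 5000 & 1023 = 904.
 */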
812  
813  static inline size_t PTR_BUCKET_NR(struct cache_set *c,
814  				   const struct bkey *k,
815  				   unsigned int ptr)
816  {
817  	return sector_to_bucket(c, PTR_OFFSET(k, ptr));
818  }
819  
820  static inline struct bucket *PTR_BUCKET(struct cache_set *c,
821  					const struct bkey *k,
822  					unsigned int ptr)
823  {
824  	return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr);
825  }
826  
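/*
 * How many generations @a is ahead of @b, treating the 8 bit gen as a
 * wrapping counter: differences above 128 are taken to mean @b is actually
 * the newer one, and 0 is returned.
 */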
827  static inline uint8_t gen_after(uint8_t a, uint8_t b)
828  {
829  	uint8_t r = a - b;
830  
831  	return r > 128U ? 0 : r;
832  }
833  
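/*
 * How many generations stale the @i'th pointer in @k is; 0 means the
 * pointer's gen still matches the bucket's gen and the pointer is valid.
 */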
834  static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
835  				unsigned int i)
836  {
837  	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
838  }
839  
840  static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
841  				 unsigned int i)
842  {
843  	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache;
844  }
845  
846  /* Btree key macros */
847  
848  /*
849   * This is used for various on disk data structures - cache_sb, prio_set, bset,
850   * jset: The checksum is _always_ the first 8 bytes of these structs
851   */
852  #define csum_set(i)							\
853  	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
854  		  ((void *) bset_bkey_last(i)) -			\
855  		  (((void *) (i)) + sizeof(uint64_t)))
856  
857  /* Error handling macros */
858  
859  #define btree_bug(b, ...)						\
860  do {									\
861  	if (bch_cache_set_error((b)->c, __VA_ARGS__))			\
862  		dump_stack();						\
863  } while (0)
864  
865  #define cache_bug(c, ...)						\
866  do {									\
867  	if (bch_cache_set_error(c, __VA_ARGS__))			\
868  		dump_stack();						\
869  } while (0)
870  
871  #define btree_bug_on(cond, b, ...)					\
872  do {									\
873  	if (cond)							\
874  		btree_bug(b, __VA_ARGS__);				\
875  } while (0)
876  
877  #define cache_bug_on(cond, c, ...)					\
878  do {									\
879  	if (cond)							\
880  		cache_bug(c, __VA_ARGS__);				\
881  } while (0)
882  
883  #define cache_set_err_on(cond, c, ...)					\
884  do {									\
885  	if (cond)							\
886  		bch_cache_set_error(c, __VA_ARGS__);			\
887  } while (0)
888  
889  /* Looping macros */
890  
891  #define for_each_bucket(b, ca)						\
892  	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
893  	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
894  
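/*
 * Drop a reference on the cache set attachment (dc->count); when the last
 * reference goes away the detach work is scheduled, which finishes tearing
 * down the attachment.
 */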
895  static inline void cached_dev_put(struct cached_dev *dc)
896  {
897  	if (refcount_dec_and_test(&dc->count))
898  		schedule_work(&dc->detach);
899  }
900  
901  static inline bool cached_dev_get(struct cached_dev *dc)
902  {
903  	if (!refcount_inc_not_zero(&dc->count))
904  		return false;
905  
906  	/* Paired with the mb in cached_dev_attach */
907  	smp_mb__after_atomic();
908  	return true;
909  }
910  
911  /*
912   * bucket_gc_gen() returns the difference between the bucket's current gen and
913   * the oldest gen of any pointer into that bucket in the btree (last_gc).
914   */
915  
916  static inline uint8_t bucket_gc_gen(struct bucket *b)
917  {
918  	return b->gen - b->last_gc;
919  }
920  
921  #define BUCKET_GC_GEN_MAX	96U
922  
923  #define kobj_attribute_write(n, fn)					\
924  	static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn)
925  
926  #define kobj_attribute_rw(n, show, store)				\
927  	static struct kobj_attribute ksysfs_##n =			\
928  		__ATTR(n, 0600, show, store)
929  
930  static inline void wake_up_allocators(struct cache_set *c)
931  {
932  	struct cache *ca = c->cache;
933  
934  	wake_up_process(ca->alloc_thread);
935  }
936  
937  static inline void closure_bio_submit(struct cache_set *c,
938  				      struct bio *bio,
939  				      struct closure *cl)
940  {
941  	closure_get(cl);
942  	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) {
943  		bio->bi_status = BLK_STS_IOERR;
944  		bio_endio(bio);
945  		return;
946  	}
947  	submit_bio_noacct(bio);
948  }
949  
950  /*
951   * Prevent the kthread from exiting directly, and make sure that when
952   * kthread_stop() is called to stop a kthread, it is still alive. If a
953   * kthread might be stopped by the CACHE_SET_IO_DISABLE bit being set,
954   * wait_for_kthread_stop() is necessary before the kthread returns.
955   */
956  static inline void wait_for_kthread_stop(void)
957  {
958  	while (!kthread_should_stop()) {
959  		set_current_state(TASK_INTERRUPTIBLE);
960  		schedule();
961  	}
962  }
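
/*
 * Typical use (a sketch): a kthread that bails out because the
 * CACHE_SET_IO_DISABLE bit was set does
 *
 *	if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
 *		wait_for_kthread_stop();
 *		return 0;
 *	}
 *
 * so that a later kthread_stop() from the shutdown path still finds the
 * thread alive.
 */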
963  
964  /* Forward declarations */
965  
966  void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio);
967  void bch_count_io_errors(struct cache *ca, blk_status_t error,
968  			 int is_read, const char *m);
969  void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
970  			      blk_status_t error, const char *m);
971  void bch_bbio_endio(struct cache_set *c, struct bio *bio,
972  		    blk_status_t error, const char *m);
973  void bch_bbio_free(struct bio *bio, struct cache_set *c);
974  struct bio *bch_bbio_alloc(struct cache_set *c);
975  
976  void __bch_submit_bbio(struct bio *bio, struct cache_set *c);
977  void bch_submit_bbio(struct bio *bio, struct cache_set *c,
978  		     struct bkey *k, unsigned int ptr);
979  
980  uint8_t bch_inc_gen(struct cache *ca, struct bucket *b);
981  void bch_rescale_priorities(struct cache_set *c, int sectors);
982  
983  bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b);
984  void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);
985  
986  void __bch_bucket_free(struct cache *ca, struct bucket *b);
987  void bch_bucket_free(struct cache_set *c, struct bkey *k);
988  
989  long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
990  int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
991  			   struct bkey *k, bool wait);
992  int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
993  			 struct bkey *k, bool wait);
994  bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
995  		       unsigned int sectors, unsigned int write_point,
996  		       unsigned int write_prio, bool wait);
997  bool bch_cached_dev_error(struct cached_dev *dc);
998  
999  __printf(2, 3)
1000  bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
1001  
1002  int bch_prio_write(struct cache *ca, bool wait);
1003  void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
1004  
1005  extern struct workqueue_struct *bcache_wq;
1006  extern struct workqueue_struct *bch_journal_wq;
1007  extern struct workqueue_struct *bch_flush_wq;
1008  extern struct mutex bch_register_lock;
1009  extern struct list_head bch_cache_sets;
1010  
1011  extern const struct kobj_type bch_cached_dev_ktype;
1012  extern const struct kobj_type bch_flash_dev_ktype;
1013  extern const struct kobj_type bch_cache_set_ktype;
1014  extern const struct kobj_type bch_cache_set_internal_ktype;
1015  extern const struct kobj_type bch_cache_ktype;
1016  
1017  void bch_cached_dev_release(struct kobject *kobj);
1018  void bch_flash_dev_release(struct kobject *kobj);
1019  void bch_cache_set_release(struct kobject *kobj);
1020  void bch_cache_release(struct kobject *kobj);
1021  
1022  int bch_uuid_write(struct cache_set *c);
1023  void bcache_write_super(struct cache_set *c);
1024  
1025  int bch_flash_dev_create(struct cache_set *c, uint64_t size);
1026  
1027  int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1028  			  uint8_t *set_uuid);
1029  void bch_cached_dev_detach(struct cached_dev *dc);
1030  int bch_cached_dev_run(struct cached_dev *dc);
1031  void bcache_device_stop(struct bcache_device *d);
1032  
1033  void bch_cache_set_unregister(struct cache_set *c);
1034  void bch_cache_set_stop(struct cache_set *c);
1035  
1036  struct cache_set *bch_cache_set_alloc(struct cache_sb *sb);
1037  void bch_btree_cache_free(struct cache_set *c);
1038  int bch_btree_cache_alloc(struct cache_set *c);
1039  void bch_moving_init_cache_set(struct cache_set *c);
1040  int bch_open_buckets_alloc(struct cache_set *c);
1041  void bch_open_buckets_free(struct cache_set *c);
1042  
1043  int bch_cache_allocator_start(struct cache *ca);
1044  
1045  void bch_debug_exit(void);
1046  void bch_debug_init(void);
1047  void bch_request_exit(void);
1048  int bch_request_init(void);
1049  void bch_btree_exit(void);
1050  int bch_btree_init(void);
1051  
1052  #endif /* _BCACHE_H */
1053