xref: /linux/drivers/md/bcache/btree.c (revision c4ee0af3fa0dc65f690fc908f02b8355f9576ea0)
1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  *
4  * Uses a block device as cache for other block devices; optimized for SSDs.
5  * All allocation is done in buckets, which should match the erase block size
6  * of the device.
7  *
8  * Buckets containing cached data are kept on a heap sorted by priority;
9  * bucket priority is increased on cache hit, and periodically all the buckets
10  * on the heap have their priority scaled down. This currently is just used as
11  * on the heap have their priority scaled down. This is currently used only as
12  * an LRU, but in the future it should allow for more intelligent heuristics.
13  * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14  * counter. Garbage collection is used to remove stale pointers.
15  *
16  * Indexing is done via a btree; nodes are not necessarily fully sorted; rather,
17  * as keys are inserted we only sort the pages that have not yet been written.
18  * When garbage collection is run, we resort the entire node.
19  *
20  * All configuration is done via sysfs; see Documentation/bcache.txt.
21  */
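
/*
 * A rough sketch of the generation scheme described above (illustrative
 * only; the real logic lives in ptr_stale() and bch_inc_gen()): every
 * pointer records the generation its bucket had when the pointer was
 * created, so a pointer is stale once the bucket's generation has moved
 * on, roughly
 *
 *	stale = PTR_BUCKET(c, k, i)->gen != PTR_GEN(k, i);
 *
 * Incrementing a bucket's generation therefore invalidates every
 * outstanding pointer into it at once, and garbage collection later drops
 * those stale pointers from the btree.
 */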
22 
23 #include "bcache.h"
24 #include "btree.h"
25 #include "debug.h"
26 #include "writeback.h"
27 
28 #include <linux/slab.h>
29 #include <linux/bitops.h>
30 #include <linux/freezer.h>
31 #include <linux/hash.h>
32 #include <linux/kthread.h>
33 #include <linux/prefetch.h>
34 #include <linux/random.h>
35 #include <linux/rcupdate.h>
36 #include <trace/events/bcache.h>
37 
38 /*
39  * Todo:
40  * register_bcache: Return errors out to userspace correctly
41  *
42  * Writeback: don't undirty key until after a cache flush
43  *
44  * Create an iterator for key pointers
45  *
46  * On btree write error, mark bucket such that it won't be freed from the cache
47  *
48  * Journalling:
49  *   Check for bad keys in replay
50  *   Propagate barriers
51  *   Refcount journal entries in journal_replay
52  *
53  * Garbage collection:
54  *   Finish incremental gc
55  *   Gc should free old UUIDs, data for invalid UUIDs
56  *
57  * Provide a way to list backing device UUIDs we have data cached for, and
58  * probably how long it's been since we've seen them, and a way to invalidate
59  * dirty data for devices that will never be attached again
60  *
61  * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
62  * that based on that and how much dirty data we have we can keep writeback
63  * from being starved
64  *
65  * Add a tracepoint or somesuch to watch for writeback starvation
66  *
67  * When btree depth > 1 and splitting an interior node, we have to make sure
68  * alloc_bucket() cannot fail. This should be true but is not completely
69  * obvious.
70  *
71  * Make sure all allocations get charged to the root cgroup
72  *
73  * Plugging?
74  *
75  * If data write is less than hard sector size of ssd, round up offset in open
76  * bucket to the next whole sector
77  *
78  * Also lookup by cgroup in get_open_bucket()
79  *
80  * Superblock needs to be fleshed out for multiple cache devices
81  *
82  * Add a sysfs tunable for the number of writeback IOs in flight
83  *
84  * Add a sysfs tunable for the number of open data buckets
85  *
86  * IO tracking: Can we track when one process is doing io on behalf of another?
87  * IO tracking: Don't use just an average, weigh more recent stuff higher
88  *
89  * Test module load/unload
90  */
91 
92 enum {
93 	BTREE_INSERT_STATUS_INSERT,
94 	BTREE_INSERT_STATUS_BACK_MERGE,
95 	BTREE_INSERT_STATUS_OVERWROTE,
96 	BTREE_INSERT_STATUS_FRONT_MERGE,
97 };
98 
99 #define MAX_NEED_GC		64
100 #define MAX_SAVE_PRIO		72
101 
102 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
103 
104 #define PTR_HASH(c, k)							\
105 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
106 
107 static struct workqueue_struct *btree_io_wq;
108 
109 static inline bool should_split(struct btree *b)
110 {
111 	struct bset *i = write_block(b);
112 	return b->written >= btree_blocks(b) ||
113 		(b->written + __set_blocks(i, i->keys + 15, b->c)
114 		 > btree_blocks(b));
115 }
116 
117 #define insert_lock(s, b)	((b)->level <= (s)->lock)
118 
119 /*
120  * These macros are for recursing down the btree - they handle the details of
121  * locking and looking up nodes in the cache for you. They're best treated as
122  * mere syntax when reading code that uses them.
123  *
124  * op->lock determines whether we take a read or a write lock at a given depth.
125  * If you've got a read lock and find that you need a write lock (i.e. you're
126  * going to have to split), set op->lock and return -EINTR; btree_root() will
127  * call you again and you'll have the correct lock.
128  */
129 
130 /**
131  * btree - recurse down the btree on a specified key
132  * @fn:		function to call, which will be passed the child node
133  * @key:	key to recurse on
134  * @b:		parent btree node
135  * @op:		pointer to struct btree_op
136  */
137 #define btree(fn, key, b, op, ...)					\
138 ({									\
139 	int _r, l = (b)->level - 1;					\
140 	bool _w = l <= (op)->lock;					\
141 	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
142 	if (!IS_ERR(_child)) {						\
143 		_child->parent = (b);					\
144 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
145 		rw_unlock(_w, _child);					\
146 	} else								\
147 		_r = PTR_ERR(_child);					\
148 	_r;								\
149 })
150 
151 /**
152  * btree_root - call a function on the root of the btree
153  * @fn:		function to call, which will be passed the child node
154  * @c:		cache set
155  * @op:		pointer to struct btree_op
156  */
157 #define btree_root(fn, c, op, ...)					\
158 ({									\
159 	int _r = -EINTR;						\
160 	do {								\
161 		struct btree *_b = (c)->root;				\
162 		bool _w = insert_lock(op, _b);				\
163 		rw_lock(_w, _b, _b->level);				\
164 		if (_b == (c)->root &&					\
165 		    _w == insert_lock(op, _b)) {			\
166 			_b->parent = NULL;				\
167 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
168 		}							\
169 		rw_unlock(_w, _b);					\
170 		bch_cannibalize_unlock(c);				\
171 		if (_r == -ENOSPC) {					\
172 			wait_event((c)->try_wait,			\
173 				   !(c)->try_harder);			\
174 			_r = -EINTR;					\
175 		}							\
176 	} while (_r == -EINTR);						\
177 									\
178 	_r;								\
179 })
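
/*
 * An illustrative sketch of how these macros are typically used (the
 * function below and its name are hypothetical, not part of bcache;
 * example_process_extent() stands in for whatever per-key work the
 * operation does): a recursive operation is written against a single node
 * and recurses via btree(); the top level goes through btree_root(), which
 * retries with the correct lock whenever the callee returns -EINTR.
 *
 *	static int bch_btree_example_walk(struct btree *b, struct btree_op *op)
 *	{
 *		int ret = 0;
 *		struct bkey *k;
 *		struct btree_iter iter;
 *
 *		bch_btree_iter_init(b, &iter, NULL);
 *
 *		while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
 *			if (b->level)
 *				ret = btree(example_walk, k, b, op);
 *			else
 *				ret = example_process_extent(b, k);
 *			if (ret)
 *				break;
 *		}
 *
 *		return ret;
 *	}
 *
 * and from the top, after bch_btree_op_init(&op, SHRT_MAX):
 *
 *	ret = btree_root(example_walk, c, &op);
 */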
180 
181 /* Btree key manipulation */
182 
183 void bkey_put(struct cache_set *c, struct bkey *k)
184 {
185 	unsigned i;
186 
187 	for (i = 0; i < KEY_PTRS(k); i++)
188 		if (ptr_available(c, k, i))
189 			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
190 }
191 
192 /* Btree IO */
193 
194 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195 {
196 	uint64_t crc = b->key.ptr[0];
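	/*
	 * The first 8 bytes of the bset hold the csum field itself, which the
	 * checksum obviously can't cover, hence the "+ 8" below.
	 */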
197 	void *data = (void *) i + 8, *end = end(i);
198 
199 	crc = bch_crc64_update(crc, data, end - data);
200 	return crc ^ 0xffffffffffffffffULL;
201 }
202 
203 static void bch_btree_node_read_done(struct btree *b)
204 {
205 	const char *err = "bad btree header";
206 	struct bset *i = b->sets[0].data;
207 	struct btree_iter *iter;
208 
209 	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
210 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
211 	iter->used = 0;
212 
213 #ifdef CONFIG_BCACHE_DEBUG
214 	iter->b = b;
215 #endif
216 
217 	if (!i->seq)
218 		goto err;
219 
220 	for (;
221 	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
222 	     i = write_block(b)) {
223 		err = "unsupported bset version";
224 		if (i->version > BCACHE_BSET_VERSION)
225 			goto err;
226 
227 		err = "bad btree header";
228 		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
229 			goto err;
230 
231 		err = "bad magic";
232 		if (i->magic != bset_magic(&b->c->sb))
233 			goto err;
234 
235 		err = "bad checksum";
236 		switch (i->version) {
237 		case 0:
238 			if (i->csum != csum_set(i))
239 				goto err;
240 			break;
241 		case BCACHE_BSET_VERSION:
242 			if (i->csum != btree_csum_set(b, i))
243 				goto err;
244 			break;
245 		}
246 
247 		err = "empty set";
248 		if (i != b->sets[0].data && !i->keys)
249 			goto err;
250 
251 		bch_btree_iter_push(iter, i->start, end(i));
252 
253 		b->written += set_blocks(i, b->c);
254 	}
255 
256 	err = "corrupted btree";
257 	for (i = write_block(b);
258 	     index(i, b) < btree_blocks(b);
259 	     i = ((void *) i) + block_bytes(b->c))
260 		if (i->seq == b->sets[0].data->seq)
261 			goto err;
262 
263 	bch_btree_sort_and_fix_extents(b, iter);
264 
265 	i = b->sets[0].data;
266 	err = "short btree key";
267 	if (b->sets[0].size &&
268 	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
269 		goto err;
270 
271 	if (b->written < btree_blocks(b))
272 		bch_bset_init_next(b);
273 out:
274 	mempool_free(iter, b->c->fill_iter);
275 	return;
276 err:
277 	set_btree_node_io_error(b);
278 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
279 			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 			    index(i, b), i->keys);
281 	goto out;
282 }
283 
284 static void btree_node_read_endio(struct bio *bio, int error)
285 {
286 	struct closure *cl = bio->bi_private;
287 	closure_put(cl);
288 }
289 
290 void bch_btree_node_read(struct btree *b)
291 {
292 	uint64_t start_time = local_clock();
293 	struct closure cl;
294 	struct bio *bio;
295 
296 	trace_bcache_btree_read(b);
297 
298 	closure_init_stack(&cl);
299 
300 	bio = bch_bbio_alloc(b->c);
301 	bio->bi_rw	= REQ_META|READ_SYNC;
302 	bio->bi_size	= KEY_SIZE(&b->key) << 9;
303 	bio->bi_end_io	= btree_node_read_endio;
304 	bio->bi_private	= &cl;
305 
306 	bch_bio_map(bio, b->sets[0].data);
307 
308 	bch_submit_bbio(bio, b->c, &b->key, 0);
309 	closure_sync(&cl);
310 
311 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
312 		set_btree_node_io_error(b);
313 
314 	bch_bbio_free(bio, b->c);
315 
316 	if (btree_node_io_error(b))
317 		goto err;
318 
319 	bch_btree_node_read_done(b);
320 	bch_time_stats_update(&b->c->btree_read_time, start_time);
321 
322 	return;
323 err:
324 	bch_cache_set_error(b->c, "io error reading bucket %zu",
325 			    PTR_BUCKET_NR(b->c, &b->key, 0));
326 }
327 
328 static void btree_complete_write(struct btree *b, struct btree_write *w)
329 {
330 	if (w->prio_blocked &&
331 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
332 		wake_up_allocators(b->c);
333 
334 	if (w->journal) {
335 		atomic_dec_bug(w->journal);
336 		__closure_wake_up(&b->c->journal.wait);
337 	}
338 
339 	w->prio_blocked	= 0;
340 	w->journal	= NULL;
341 }
342 
343 static void __btree_node_write_done(struct closure *cl)
344 {
345 	struct btree *b = container_of(cl, struct btree, io.cl);
346 	struct btree_write *w = btree_prev_write(b);
347 
348 	bch_bbio_free(b->bio, b->c);
349 	b->bio = NULL;
350 	btree_complete_write(b, w);
351 
352 	if (btree_node_dirty(b))
353 		queue_delayed_work(btree_io_wq, &b->work,
354 				   msecs_to_jiffies(30000));
355 
356 	closure_return(cl);
357 }
358 
359 static void btree_node_write_done(struct closure *cl)
360 {
361 	struct btree *b = container_of(cl, struct btree, io.cl);
362 	struct bio_vec *bv;
363 	int n;
364 
365 	__bio_for_each_segment(bv, b->bio, n, 0)
366 		__free_page(bv->bv_page);
367 
368 	__btree_node_write_done(cl);
369 }
370 
371 static void btree_node_write_endio(struct bio *bio, int error)
372 {
373 	struct closure *cl = bio->bi_private;
374 	struct btree *b = container_of(cl, struct btree, io.cl);
375 
376 	if (error)
377 		set_btree_node_io_error(b);
378 
379 	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
380 	closure_put(cl);
381 }
382 
383 static void do_btree_node_write(struct btree *b)
384 {
385 	struct closure *cl = &b->io.cl;
386 	struct bset *i = b->sets[b->nsets].data;
387 	BKEY_PADDED(key) k;
388 
389 	i->version	= BCACHE_BSET_VERSION;
390 	i->csum		= btree_csum_set(b, i);
391 
392 	BUG_ON(b->bio);
393 	b->bio = bch_bbio_alloc(b->c);
394 
395 	b->bio->bi_end_io	= btree_node_write_endio;
396 	b->bio->bi_private	= cl;
397 	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
398 	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
399 	bch_bio_map(b->bio, i);
400 
401 	/*
402 	 * If we're appending to a leaf node, we don't technically need FUA -
403 	 * this write just needs to be persisted before the next journal write,
404 	 * which will be marked FLUSH|FUA.
405 	 *
406 	 * Similarly if we're writing a new btree root - the pointer is going to
407 	 * be in the next journal entry.
408 	 *
409 	 * But if we're writing a new btree node (that isn't a root) or
410 	 * appending to a non leaf btree node, we need either FUA or a flush
411 	 * when we write the parent with the new pointer. FUA is cheaper than a
412 	 * flush, and writes appending to leaf nodes aren't blocking anything so
413 	 * just make all btree node writes FUA to keep things sane.
414 	 */
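	/*
	 * Put differently (illustrative ordering only), for a non leaf node
	 * or a brand new non root node the requirement is:
	 *
	 *	1) write the node itself durably (FUA, or write + flush)
	 *	2) only then write the parent node / journal entry that
	 *	   contains the pointer to it
	 *
	 * otherwise a crash between the two could leave the parent pointing
	 * at a node that never reached stable media.
	 */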
415 
416 	bkey_copy(&k.key, &b->key);
417 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
418 
419 	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 		int j;
421 		struct bio_vec *bv;
422 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
423 
424 		bio_for_each_segment(bv, b->bio, j)
425 			memcpy(page_address(bv->bv_page),
426 			       base + j * PAGE_SIZE, PAGE_SIZE);
427 
428 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
429 
430 		continue_at(cl, btree_node_write_done, NULL);
431 	} else {
432 		b->bio->bi_vcnt = 0;
433 		bch_bio_map(b->bio, i);
434 
435 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 
437 		closure_sync(cl);
438 		__btree_node_write_done(cl);
439 	}
440 }
441 
442 void bch_btree_node_write(struct btree *b, struct closure *parent)
443 {
444 	struct bset *i = b->sets[b->nsets].data;
445 
446 	trace_bcache_btree_write(b);
447 
448 	BUG_ON(current->bio_list);
449 	BUG_ON(b->written >= btree_blocks(b));
450 	BUG_ON(b->written && !i->keys);
451 	BUG_ON(b->sets->data->seq != i->seq);
452 	bch_check_keys(b, "writing");
453 
454 	cancel_delayed_work(&b->work);
455 
456 	/* If caller isn't waiting for write, parent refcount is cache set */
457 	closure_lock(&b->io, parent ?: &b->c->cl);
458 
459 	clear_bit(BTREE_NODE_dirty,	 &b->flags);
460 	change_bit(BTREE_NODE_write_idx, &b->flags);
461 
462 	do_btree_node_write(b);
463 
464 	b->written += set_blocks(i, b->c);
465 	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 
468 	bch_btree_sort_lazy(b);
469 
470 	if (b->written < btree_blocks(b))
471 		bch_bset_init_next(b);
472 }
473 
474 static void bch_btree_node_write_sync(struct btree *b)
475 {
476 	struct closure cl;
477 
478 	closure_init_stack(&cl);
479 	bch_btree_node_write(b, &cl);
480 	closure_sync(&cl);
481 }
482 
483 static void btree_node_write_work(struct work_struct *w)
484 {
485 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
486 
487 	rw_lock(true, b, b->level);
488 
489 	if (btree_node_dirty(b))
490 		bch_btree_node_write(b, NULL);
491 	rw_unlock(true, b);
492 }
493 
494 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495 {
496 	struct bset *i = b->sets[b->nsets].data;
497 	struct btree_write *w = btree_current_write(b);
498 
499 	BUG_ON(!b->written);
500 	BUG_ON(!i->keys);
501 
502 	if (!btree_node_dirty(b))
503 		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
504 
505 	set_btree_node_dirty(b);
506 
507 	if (journal_ref) {
508 		if (w->journal &&
509 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
510 			atomic_dec_bug(w->journal);
511 			w->journal = NULL;
512 		}
513 
514 		if (!w->journal) {
515 			w->journal = journal_ref;
516 			atomic_inc(w->journal);
517 		}
518 	}
519 
520 	/* Force write if set is too big */
521 	if (set_bytes(i) > PAGE_SIZE - 48 &&
522 	    !current->bio_list)
523 		bch_btree_node_write(b, NULL);
524 }
525 
526 /*
527  * Btree in memory cache - allocation/freeing
528  * mca -> memory cache
529  */
530 
531 static void mca_reinit(struct btree *b)
532 {
533 	unsigned i;
534 
535 	b->flags	= 0;
536 	b->written	= 0;
537 	b->nsets	= 0;
538 
539 	for (i = 0; i < MAX_BSETS; i++)
540 		b->sets[i].size = 0;
541 	/*
542 	 * Second loop starts at 1 because b->sets[0].data is the memory we
543 	 * allocated
544 	 */
545 	for (i = 1; i < MAX_BSETS; i++)
546 		b->sets[i].data = NULL;
547 }
548 
549 #define mca_reserve(c)	(((c->root && c->root->level)		\
550 			  ? c->root->level : 1) * 8 + 16)
551 #define mca_can_free(c)						\
552 	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
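
/*
 * For example (numbers purely illustrative): with the root at level 2,
 * mca_reserve() is 2 * 8 + 16 = 32 nodes; with bucket_cache_used at 100,
 * mca_can_free() is 100 - 32 = 68 nodes. bch_mca_count() below reports that
 * to the shrinker in units of btree pages (68 * c->btree_pages), and
 * bch_mca_scan() converts sc->nr_to_scan back to whole nodes.
 */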
553 
554 static void mca_data_free(struct btree *b)
555 {
556 	struct bset_tree *t = b->sets;
557 	BUG_ON(!closure_is_unlocked(&b->io.cl));
558 
559 	if (bset_prev_bytes(b) < PAGE_SIZE)
560 		kfree(t->prev);
561 	else
562 		free_pages((unsigned long) t->prev,
563 			   get_order(bset_prev_bytes(b)));
564 
565 	if (bset_tree_bytes(b) < PAGE_SIZE)
566 		kfree(t->tree);
567 	else
568 		free_pages((unsigned long) t->tree,
569 			   get_order(bset_tree_bytes(b)));
570 
571 	free_pages((unsigned long) t->data, b->page_order);
572 
573 	t->prev = NULL;
574 	t->tree = NULL;
575 	t->data = NULL;
576 	list_move(&b->list, &b->c->btree_cache_freed);
577 	b->c->bucket_cache_used--;
578 }
579 
580 static void mca_bucket_free(struct btree *b)
581 {
582 	BUG_ON(btree_node_dirty(b));
583 
584 	b->key.ptr[0] = 0;
585 	hlist_del_init_rcu(&b->hash);
586 	list_move(&b->list, &b->c->btree_cache_freeable);
587 }
588 
589 static unsigned btree_order(struct bkey *k)
590 {
591 	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
592 }
593 
594 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595 {
596 	struct bset_tree *t = b->sets;
597 	BUG_ON(t->data);
598 
599 	b->page_order = max_t(unsigned,
600 			      ilog2(b->c->btree_pages),
601 			      btree_order(k));
602 
603 	t->data = (void *) __get_free_pages(gfp, b->page_order);
604 	if (!t->data)
605 		goto err;
606 
607 	t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 		? kmalloc(bset_tree_bytes(b), gfp)
609 		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 	if (!t->tree)
611 		goto err;
612 
613 	t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 		? kmalloc(bset_prev_bytes(b), gfp)
615 		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 	if (!t->prev)
617 		goto err;
618 
619 	list_move(&b->list, &b->c->btree_cache);
620 	b->c->bucket_cache_used++;
621 	return;
622 err:
623 	mca_data_free(b);
624 }
625 
626 static struct btree *mca_bucket_alloc(struct cache_set *c,
627 				      struct bkey *k, gfp_t gfp)
628 {
629 	struct btree *b = kzalloc(sizeof(struct btree), gfp);
630 	if (!b)
631 		return NULL;
632 
633 	init_rwsem(&b->lock);
634 	lockdep_set_novalidate_class(&b->lock);
635 	INIT_LIST_HEAD(&b->list);
636 	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 	b->c = c;
638 	closure_init_unlocked(&b->io);
639 
640 	mca_data_alloc(b, k, gfp);
641 	return b;
642 }
643 
644 static int mca_reap(struct btree *b, unsigned min_order, bool flush)
645 {
646 	struct closure cl;
647 
648 	closure_init_stack(&cl);
649 	lockdep_assert_held(&b->c->bucket_lock);
650 
651 	if (!down_write_trylock(&b->lock))
652 		return -ENOMEM;
653 
654 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
655 
656 	if (b->page_order < min_order ||
657 	    (!flush &&
658 	     (btree_node_dirty(b) ||
659 	      atomic_read(&b->io.cl.remaining) != -1))) {
660 		rw_unlock(true, b);
661 		return -ENOMEM;
662 	}
663 
664 	if (btree_node_dirty(b))
665 		bch_btree_node_write_sync(b);
666 
667 	/* wait for any in flight btree write */
668 	closure_wait_event(&b->io.wait, &cl,
669 			   atomic_read(&b->io.cl.remaining) == -1);
670 
671 	return 0;
672 }
673 
674 static unsigned long bch_mca_scan(struct shrinker *shrink,
675 				  struct shrink_control *sc)
676 {
677 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
678 	struct btree *b, *t;
679 	unsigned long i, nr = sc->nr_to_scan;
680 	unsigned long freed = 0;
681 
682 	if (c->shrinker_disabled)
683 		return SHRINK_STOP;
684 
685 	if (c->try_harder)
686 		return SHRINK_STOP;
687 
688 	/* Return -1 if we can't do anything right now */
689 	if (sc->gfp_mask & __GFP_IO)
690 		mutex_lock(&c->bucket_lock);
691 	else if (!mutex_trylock(&c->bucket_lock))
692 		return -1;
693 
694 	/*
695 	 * It's _really_ critical that we don't free too many btree nodes - we
696 	 * have to always leave ourselves a reserve. The reserve is how we
697 	 * guarantee that allocating memory for a new btree node can always
698 	 * succeed, so that inserting keys into the btree can always succeed and
699 	 * IO can always make forward progress:
700 	 */
701 	nr /= c->btree_pages;
702 	nr = min_t(unsigned long, nr, mca_can_free(c));
703 
704 	i = 0;
705 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
706 		if (freed >= nr)
707 			break;
708 
709 		if (++i > 3 &&
710 		    !mca_reap(b, 0, false)) {
711 			mca_data_free(b);
712 			rw_unlock(true, b);
713 			freed++;
714 		}
715 	}
716 
717 	/*
718 	 * Can happen right when we first start up, before we've read in any
719 	 * btree nodes
720 	 */
721 	if (list_empty(&c->btree_cache))
722 		goto out;
723 
724 	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
725 		b = list_first_entry(&c->btree_cache, struct btree, list);
726 		list_rotate_left(&c->btree_cache);
727 
728 		if (!b->accessed &&
729 		    !mca_reap(b, 0, false)) {
730 			mca_bucket_free(b);
731 			mca_data_free(b);
732 			rw_unlock(true, b);
733 			freed++;
734 		} else
735 			b->accessed = 0;
736 	}
737 out:
738 	mutex_unlock(&c->bucket_lock);
739 	return freed;
740 }
741 
742 static unsigned long bch_mca_count(struct shrinker *shrink,
743 				   struct shrink_control *sc)
744 {
745 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
746 
747 	if (c->shrinker_disabled)
748 		return 0;
749 
750 	if (c->try_harder)
751 		return 0;
752 
753 	return mca_can_free(c) * c->btree_pages;
754 }
755 
756 void bch_btree_cache_free(struct cache_set *c)
757 {
758 	struct btree *b;
759 	struct closure cl;
760 	closure_init_stack(&cl);
761 
762 	if (c->shrink.list.next)
763 		unregister_shrinker(&c->shrink);
764 
765 	mutex_lock(&c->bucket_lock);
766 
767 #ifdef CONFIG_BCACHE_DEBUG
768 	if (c->verify_data)
769 		list_move(&c->verify_data->list, &c->btree_cache);
770 #endif
771 
772 	list_splice(&c->btree_cache_freeable,
773 		    &c->btree_cache);
774 
775 	while (!list_empty(&c->btree_cache)) {
776 		b = list_first_entry(&c->btree_cache, struct btree, list);
777 
778 		if (btree_node_dirty(b))
779 			btree_complete_write(b, btree_current_write(b));
780 		clear_bit(BTREE_NODE_dirty, &b->flags);
781 
782 		mca_data_free(b);
783 	}
784 
785 	while (!list_empty(&c->btree_cache_freed)) {
786 		b = list_first_entry(&c->btree_cache_freed,
787 				     struct btree, list);
788 		list_del(&b->list);
789 		cancel_delayed_work_sync(&b->work);
790 		kfree(b);
791 	}
792 
793 	mutex_unlock(&c->bucket_lock);
794 }
795 
796 int bch_btree_cache_alloc(struct cache_set *c)
797 {
798 	unsigned i;
799 
800 	for (i = 0; i < mca_reserve(c); i++)
801 		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
802 			return -ENOMEM;
803 
804 	list_splice_init(&c->btree_cache,
805 			 &c->btree_cache_freeable);
806 
807 #ifdef CONFIG_BCACHE_DEBUG
808 	mutex_init(&c->verify_lock);
809 
810 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 
812 	if (c->verify_data &&
813 	    c->verify_data->sets[0].data)
814 		list_del_init(&c->verify_data->list);
815 	else
816 		c->verify_data = NULL;
817 #endif
818 
819 	c->shrink.count_objects = bch_mca_count;
820 	c->shrink.scan_objects = bch_mca_scan;
821 	c->shrink.seeks = 4;
822 	c->shrink.batch = c->btree_pages * 2;
823 	register_shrinker(&c->shrink);
824 
825 	return 0;
826 }
827 
828 /* Btree in memory cache - hash table */
829 
830 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
831 {
832 	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
833 }
834 
835 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
836 {
837 	struct btree *b;
838 
839 	rcu_read_lock();
840 	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
841 		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
842 			goto out;
843 	b = NULL;
844 out:
845 	rcu_read_unlock();
846 	return b;
847 }
848 
849 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
850 {
851 	struct btree *b;
852 
853 	trace_bcache_btree_cache_cannibalize(c);
854 
855 	if (!c->try_harder) {
856 		c->try_harder = current;
857 		c->try_harder_start = local_clock();
858 	} else if (c->try_harder != current)
859 		return ERR_PTR(-ENOSPC);
860 
861 	list_for_each_entry_reverse(b, &c->btree_cache, list)
862 		if (!mca_reap(b, btree_order(k), false))
863 			return b;
864 
865 	list_for_each_entry_reverse(b, &c->btree_cache, list)
866 		if (!mca_reap(b, btree_order(k), true))
867 			return b;
868 
869 	return ERR_PTR(-ENOMEM);
870 }
871 
872 /*
873  * We can only have one thread cannibalizing other cached btree nodes at a time,
874  * or we'll deadlock. We use an open coded mutex to ensure that, which
875  * mca_cannibalize() will take. This means every time we unlock the root of
876  * the btree, we need to release this lock if we have it held.
877  */
878 static void bch_cannibalize_unlock(struct cache_set *c)
879 {
880 	if (c->try_harder == current) {
881 		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
882 		c->try_harder = NULL;
883 		wake_up(&c->try_wait);
884 	}
885 }
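
/*
 * An illustrative summary of the handshake (the fields and functions named
 * are the real ones; the flow is just paraphrased):
 *
 *	mca_cannibalize():	  if c->try_harder is unset, claim it by
 *				  setting it to current; if another task
 *				  already holds it, return -ENOSPC
 *	btree_root():		  on -ENOSPC, wait_event(c->try_wait,
 *				  !c->try_harder) and retry the operation
 *	bch_cannibalize_unlock(): clear c->try_harder and wake_up(&c->try_wait)
 */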
886 
887 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
888 {
889 	struct btree *b;
890 
891 	BUG_ON(current->bio_list);
892 
893 	lockdep_assert_held(&c->bucket_lock);
894 
895 	if (mca_find(c, k))
896 		return NULL;
897 
898 	/* btree_node_free() doesn't free memory; it sticks the node on the end of
899 	 * the list. Check if there are any freed nodes there:
900 	 */
901 	list_for_each_entry(b, &c->btree_cache_freeable, list)
902 		if (!mca_reap(b, btree_order(k), false))
903 			goto out;
904 
905 	/* We never free struct btree itself, just the memory that holds the on
906 	 * disk node. Check the freed list before allocating a new one:
907 	 */
908 	list_for_each_entry(b, &c->btree_cache_freed, list)
909 		if (!mca_reap(b, 0, false)) {
910 			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 			if (!b->sets[0].data)
912 				goto err;
913 			else
914 				goto out;
915 		}
916 
917 	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
918 	if (!b)
919 		goto err;
920 
921 	BUG_ON(!down_write_trylock(&b->lock));
922 	if (!b->sets->data)
923 		goto err;
924 out:
925 	BUG_ON(!closure_is_unlocked(&b->io.cl));
926 
927 	bkey_copy(&b->key, k);
928 	list_move(&b->list, &c->btree_cache);
929 	hlist_del_init_rcu(&b->hash);
930 	hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 
932 	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 	b->level	= level;
934 	b->parent	= (void *) ~0UL;
935 
936 	mca_reinit(b);
937 
938 	return b;
939 err:
940 	if (b)
941 		rw_unlock(true, b);
942 
943 	b = mca_cannibalize(c, k);
944 	if (!IS_ERR(b))
945 		goto out;
946 
947 	return b;
948 }
949 
950 /**
951  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
952  * in from disk if necessary.
953  *
954  * If IO is necessary and running under generic_make_request, returns -EAGAIN.
955  *
956  * The btree node will have either a read or a write lock held, depending on
957  * level and op->lock.
958  */
959 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
960 				 int level, bool write)
961 {
962 	int i = 0;
963 	struct btree *b;
964 
965 	BUG_ON(level < 0);
966 retry:
967 	b = mca_find(c, k);
968 
969 	if (!b) {
970 		if (current->bio_list)
971 			return ERR_PTR(-EAGAIN);
972 
973 		mutex_lock(&c->bucket_lock);
974 		b = mca_alloc(c, k, level);
975 		mutex_unlock(&c->bucket_lock);
976 
977 		if (!b)
978 			goto retry;
979 		if (IS_ERR(b))
980 			return b;
981 
982 		bch_btree_node_read(b);
983 
984 		if (!write)
985 			downgrade_write(&b->lock);
986 	} else {
987 		rw_lock(write, b, level);
988 		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
989 			rw_unlock(write, b);
990 			goto retry;
991 		}
992 		BUG_ON(b->level != level);
993 	}
994 
995 	b->accessed = 1;
996 
997 	for (; i <= b->nsets && b->sets[i].size; i++) {
998 		prefetch(b->sets[i].tree);
999 		prefetch(b->sets[i].data);
1000 	}
1001 
1002 	for (; i <= b->nsets; i++)
1003 		prefetch(b->sets[i].data);
1004 
1005 	if (btree_node_io_error(b)) {
1006 		rw_unlock(write, b);
1007 		return ERR_PTR(-EIO);
1008 	}
1009 
1010 	BUG_ON(!b->written);
1011 
1012 	return b;
1013 }
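
/*
 * Illustrative use of bch_btree_node_get() outside of the btree() macro
 * (sketch only; most callers go through btree()/btree_root() instead):
 *
 *	struct btree *b = bch_btree_node_get(c, k, level, write);
 *
 *	if (IS_ERR(b))
 *		return PTR_ERR(b);
 *
 *	... use b, which is now held with a read or write lock
 *	    depending on the write argument ...
 *
 *	rw_unlock(write, b);
 *
 * where PTR_ERR(b) may be -EAGAIN (IO needed under generic_make_request),
 * -EIO (read error) or an allocation failure.
 */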
1014 
1015 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
1016 {
1017 	struct btree *b;
1018 
1019 	mutex_lock(&c->bucket_lock);
1020 	b = mca_alloc(c, k, level);
1021 	mutex_unlock(&c->bucket_lock);
1022 
1023 	if (!IS_ERR_OR_NULL(b)) {
1024 		bch_btree_node_read(b);
1025 		rw_unlock(true, b);
1026 	}
1027 }
1028 
1029 /* Btree alloc */
1030 
1031 static void btree_node_free(struct btree *b)
1032 {
1033 	unsigned i;
1034 
1035 	trace_bcache_btree_node_free(b);
1036 
1037 	BUG_ON(b == b->c->root);
1038 
1039 	if (btree_node_dirty(b))
1040 		btree_complete_write(b, btree_current_write(b));
1041 	clear_bit(BTREE_NODE_dirty, &b->flags);
1042 
1043 	cancel_delayed_work(&b->work);
1044 
1045 	mutex_lock(&b->c->bucket_lock);
1046 
1047 	for (i = 0; i < KEY_PTRS(&b->key); i++) {
1048 		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1049 
1050 		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1051 			    PTR_BUCKET(b->c, &b->key, i));
1052 	}
1053 
1054 	bch_bucket_free(b->c, &b->key);
1055 	mca_bucket_free(b);
1056 	mutex_unlock(&b->c->bucket_lock);
1057 }
1058 
1059 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1060 {
1061 	BKEY_PADDED(key) k;
1062 	struct btree *b = ERR_PTR(-EAGAIN);
1063 
1064 	mutex_lock(&c->bucket_lock);
1065 retry:
1066 	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
1067 		goto err;
1068 
1069 	bkey_put(c, &k.key);
1070 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1071 
1072 	b = mca_alloc(c, &k.key, level);
1073 	if (IS_ERR(b))
1074 		goto err_free;
1075 
1076 	if (!b) {
1077 		cache_bug(c,
1078 			"Tried to allocate bucket that was in btree cache");
1079 		goto retry;
1080 	}
1081 
1082 	b->accessed = 1;
1083 	bch_bset_init_next(b);
1084 
1085 	mutex_unlock(&c->bucket_lock);
1086 
1087 	trace_bcache_btree_node_alloc(b);
1088 	return b;
1089 err_free:
1090 	bch_bucket_free(c, &k.key);
1091 err:
1092 	mutex_unlock(&c->bucket_lock);
1093 
1094 	trace_bcache_btree_node_alloc_fail(b);
1095 	return b;
1096 }
1097 
1098 static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099 {
1100 	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 	if (!IS_ERR_OR_NULL(n))
1102 		bch_btree_sort_into(b, n);
1103 
1104 	return n;
1105 }
1106 
1107 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1108 {
1109 	unsigned i;
1110 
1111 	bkey_copy(k, &b->key);
1112 	bkey_copy_key(k, &ZERO_KEY);
1113 
1114 	for (i = 0; i < KEY_PTRS(k); i++) {
1115 		uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1116 
1117 		SET_PTR_GEN(k, i, g);
1118 	}
1119 
1120 	atomic_inc(&b->c->prio_blocked);
1121 }
1122 
1123 /* Garbage collection */
1124 
1125 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1126 {
1127 	uint8_t stale = 0;
1128 	unsigned i;
1129 	struct bucket *g;
1130 
1131 	/*
1132 	 * ptr_invalid() can't return true for the keys that mark btree nodes as
1133 	 * freed, but since ptr_bad() returns true we'll never actually use them
1134 	 * for anything and thus we don't want to mark their pointers here
1135 	 */
1136 	if (!bkey_cmp(k, &ZERO_KEY))
1137 		return stale;
1138 
1139 	for (i = 0; i < KEY_PTRS(k); i++) {
1140 		if (!ptr_available(c, k, i))
1141 			continue;
1142 
1143 		g = PTR_BUCKET(c, k, i);
1144 
1145 		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1146 			g->gc_gen = PTR_GEN(k, i);
1147 
1148 		if (ptr_stale(c, k, i)) {
1149 			stale = max(stale, ptr_stale(c, k, i));
1150 			continue;
1151 		}
1152 
1153 		cache_bug_on(GC_MARK(g) &&
1154 			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1155 			     c, "inconsistent ptrs: mark = %llu, level = %i",
1156 			     GC_MARK(g), level);
1157 
1158 		if (level)
1159 			SET_GC_MARK(g, GC_MARK_METADATA);
1160 		else if (KEY_DIRTY(k))
1161 			SET_GC_MARK(g, GC_MARK_DIRTY);
1162 
1163 		/* guard against overflow */
1164 		SET_GC_SECTORS_USED(g, min_t(unsigned,
1165 					     GC_SECTORS_USED(g) + KEY_SIZE(k),
1166 					     (1 << 14) - 1));
1167 
1168 		BUG_ON(!GC_SECTORS_USED(g));
1169 	}
1170 
1171 	return stale;
1172 }
1173 
1174 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
1175 
1176 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1177 {
1178 	uint8_t stale = 0;
1179 	unsigned keys = 0, good_keys = 0;
1180 	struct bkey *k;
1181 	struct btree_iter iter;
1182 	struct bset_tree *t;
1183 
1184 	gc->nodes++;
1185 
1186 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1187 		stale = max(stale, btree_mark_key(b, k));
1188 		keys++;
1189 
1190 		if (bch_ptr_bad(b, k))
1191 			continue;
1192 
1193 		gc->key_bytes += bkey_u64s(k);
1194 		gc->nkeys++;
1195 		good_keys++;
1196 
1197 		gc->data += KEY_SIZE(k);
1198 	}
1199 
1200 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1201 		btree_bug_on(t->size &&
1202 			     bset_written(b, t) &&
1203 			     bkey_cmp(&b->key, &t->end) < 0,
1204 			     b, "found short btree key in gc");
1205 
1206 	if (b->c->gc_always_rewrite)
1207 		return true;
1208 
1209 	if (stale > 10)
1210 		return true;
1211 
1212 	if ((keys - good_keys) * 2 > keys)
1213 		return true;
1214 
1215 	return false;
1216 }
1217 
1218 #define GC_MERGE_NODES	4U
1219 
1220 struct gc_merge_info {
1221 	struct btree	*b;
1222 	unsigned	keys;
1223 };
1224 
1225 static int bch_btree_insert_node(struct btree *, struct btree_op *,
1226 				 struct keylist *, atomic_t *, struct bkey *);
1227 
1228 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1229 			     struct keylist *keylist, struct gc_stat *gc,
1230 			     struct gc_merge_info *r)
1231 {
1232 	unsigned i, nodes = 0, keys = 0, blocks;
1233 	struct btree *new_nodes[GC_MERGE_NODES];
1234 	struct closure cl;
1235 	struct bkey *k;
1236 
1237 	memset(new_nodes, 0, sizeof(new_nodes));
1238 	closure_init_stack(&cl);
1239 
1240 	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1241 		keys += r[nodes++].keys;
1242 
1243 	blocks = btree_default_blocks(b->c) * 2 / 3;
1244 
1245 	if (nodes < 2 ||
1246 	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1247 		return 0;
1248 
1249 	for (i = 0; i < nodes; i++) {
1250 		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
1251 		if (IS_ERR_OR_NULL(new_nodes[i]))
1252 			goto out_nocoalesce;
1253 	}
1254 
1255 	for (i = nodes - 1; i > 0; --i) {
1256 		struct bset *n1 = new_nodes[i]->sets->data;
1257 		struct bset *n2 = new_nodes[i - 1]->sets->data;
1258 		struct bkey *k, *last = NULL;
1259 
1260 		keys = 0;
1261 
1262 		if (i > 1) {
1263 			for (k = n2->start;
1264 			     k < end(n2);
1265 			     k = bkey_next(k)) {
1266 				if (__set_blocks(n1, n1->keys + keys +
1267 						 bkey_u64s(k), b->c) > blocks)
1268 					break;
1269 
1270 				last = k;
1271 				keys += bkey_u64s(k);
1272 			}
1273 		} else {
1274 			/*
1275 			 * This is the last node we're keeping - we're getting
1276 			 * rid of the node at r[0] instead. We have to try to
1277 			 * fit all of the remaining keys into this node; we
1278 			 * can't guarantee they will always fit, due to rounding
1279 			 * and variable length keys (that shouldn't happen in
1280 			 * practice, though)
1281 			 */
1282 			if (__set_blocks(n1, n1->keys + n2->keys,
1283 					 b->c) > btree_blocks(new_nodes[i]))
1284 				goto out_nocoalesce;
1285 
1286 			keys = n2->keys;
1287 			/* Take the key of the node we're getting rid of */
1288 			last = &r->b->key;
1289 		}
1290 
1291 		BUG_ON(__set_blocks(n1, n1->keys + keys,
1292 				    b->c) > btree_blocks(new_nodes[i]));
1293 
1294 		if (last)
1295 			bkey_copy_key(&new_nodes[i]->key, last);
1296 
1297 		memcpy(end(n1),
1298 		       n2->start,
1299 		       (void *) node(n2, keys) - (void *) n2->start);
1300 
1301 		n1->keys += keys;
1302 		r[i].keys = n1->keys;
1303 
1304 		memmove(n2->start,
1305 			node(n2, keys),
1306 			(void *) end(n2) - (void *) node(n2, keys));
1307 
1308 		n2->keys -= keys;
1309 
1310 		if (bch_keylist_realloc(keylist,
1311 					KEY_PTRS(&new_nodes[i]->key), b->c))
1312 			goto out_nocoalesce;
1313 
1314 		bch_btree_node_write(new_nodes[i], &cl);
1315 		bch_keylist_add(keylist, &new_nodes[i]->key);
1316 	}
1317 
1318 	for (i = 0; i < nodes; i++) {
1319 		if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
1320 			goto out_nocoalesce;
1321 
1322 		make_btree_freeing_key(r[i].b, keylist->top);
1323 		bch_keylist_push(keylist);
1324 	}
1325 
1326 	/* We emptied out this node */
1327 	BUG_ON(new_nodes[0]->sets->data->keys);
1328 	btree_node_free(new_nodes[0]);
1329 	rw_unlock(true, new_nodes[0]);
1330 
1331 	closure_sync(&cl);
1332 
1333 	for (i = 0; i < nodes; i++) {
1334 		btree_node_free(r[i].b);
1335 		rw_unlock(true, r[i].b);
1336 
1337 		r[i].b = new_nodes[i];
1338 	}
1339 
1340 	bch_btree_insert_node(b, op, keylist, NULL, NULL);
1341 	BUG_ON(!bch_keylist_empty(keylist));
1342 
1343 	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1344 	r[nodes - 1].b = ERR_PTR(-EINTR);
1345 
1346 	trace_bcache_btree_gc_coalesce(nodes);
1347 	gc->nodes--;
1348 
1349 	/* Invalidated our iterator */
1350 	return -EINTR;
1351 
1352 out_nocoalesce:
1353 	closure_sync(&cl);
1354 
1355 	while ((k = bch_keylist_pop(keylist)))
1356 		if (!bkey_cmp(k, &ZERO_KEY))
1357 			atomic_dec(&b->c->prio_blocked);
1358 
1359 	for (i = 0; i < nodes; i++)
1360 		if (!IS_ERR_OR_NULL(new_nodes[i])) {
1361 			btree_node_free(new_nodes[i]);
1362 			rw_unlock(true, new_nodes[i]);
1363 		}
1364 	return 0;
1365 }
1366 
1367 static unsigned btree_gc_count_keys(struct btree *b)
1368 {
1369 	struct bkey *k;
1370 	struct btree_iter iter;
1371 	unsigned ret = 0;
1372 
1373 	for_each_key_filter(b, k, &iter, bch_ptr_bad)
1374 		ret += bkey_u64s(k);
1375 
1376 	return ret;
1377 }
1378 
1379 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1380 			    struct closure *writes, struct gc_stat *gc)
1381 {
1382 	unsigned i;
1383 	int ret = 0;
1384 	bool should_rewrite;
1385 	struct btree *n;
1386 	struct bkey *k;
1387 	struct keylist keys;
1388 	struct btree_iter iter;
1389 	struct gc_merge_info r[GC_MERGE_NODES];
1390 	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 
1392 	bch_keylist_init(&keys);
1393 	bch_btree_iter_init(b, &iter, &b->c->gc_done);
1394 
1395 	for (i = 0; i < GC_MERGE_NODES; i++)
1396 		r[i].b = ERR_PTR(-EINTR);
1397 
1398 	while (1) {
1399 		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1400 		if (k) {
1401 			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 			if (IS_ERR(r->b)) {
1403 				ret = PTR_ERR(r->b);
1404 				break;
1405 			}
1406 
1407 			r->keys = btree_gc_count_keys(r->b);
1408 
1409 			ret = btree_gc_coalesce(b, op, &keys, gc, r);
1410 			if (ret)
1411 				break;
1412 		}
1413 
1414 		if (!last->b)
1415 			break;
1416 
1417 		if (!IS_ERR(last->b)) {
1418 			should_rewrite = btree_gc_mark_node(last->b, gc);
1419 			if (should_rewrite) {
1420 				n = btree_node_alloc_replacement(last->b,
1421 								 false);
1422 
1423 				if (!IS_ERR_OR_NULL(n)) {
1424 					bch_btree_node_write_sync(n);
1425 					bch_keylist_add(&keys, &n->key);
1426 
1427 					make_btree_freeing_key(last->b,
1428 							       keys.top);
1429 					bch_keylist_push(&keys);
1430 
1431 					btree_node_free(last->b);
1432 
1433 					bch_btree_insert_node(b, op, &keys,
1434 							      NULL, NULL);
1435 					BUG_ON(!bch_keylist_empty(&keys));
1436 
1437 					rw_unlock(true, last->b);
1438 					last->b = n;
1439 
1440 					/* Invalidated our iterator */
1441 					ret = -EINTR;
1442 					break;
1443 				}
1444 			}
1445 
1446 			if (last->b->level) {
1447 				ret = btree_gc_recurse(last->b, op, writes, gc);
1448 				if (ret)
1449 					break;
1450 			}
1451 
1452 			bkey_copy_key(&b->c->gc_done, &last->b->key);
1453 
1454 			/*
1455 			 * Must flush leaf nodes before gc ends, since replace
1456 			 * operations aren't journalled
1457 			 */
1458 			if (btree_node_dirty(last->b))
1459 				bch_btree_node_write(last->b, writes);
1460 			rw_unlock(true, last->b);
1461 		}
1462 
1463 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1464 		r->b = NULL;
1465 
1466 		if (need_resched()) {
1467 			ret = -EAGAIN;
1468 			break;
1469 		}
1470 	}
1471 
1472 	for (i = 0; i < GC_MERGE_NODES; i++)
1473 		if (!IS_ERR_OR_NULL(r[i].b)) {
1474 			if (btree_node_dirty(r[i].b))
1475 				bch_btree_node_write(r[i].b, writes);
1476 			rw_unlock(true, r[i].b);
1477 		}
1478 
1479 	bch_keylist_free(&keys);
1480 
1481 	return ret;
1482 }
1483 
1484 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1485 			     struct closure *writes, struct gc_stat *gc)
1486 {
1487 	struct btree *n = NULL;
1488 	int ret = 0;
1489 	bool should_rewrite;
1490 
1491 	should_rewrite = btree_gc_mark_node(b, gc);
1492 	if (should_rewrite) {
1493 		n = btree_node_alloc_replacement(b, false);
1494 
1495 		if (!IS_ERR_OR_NULL(n)) {
1496 			bch_btree_node_write_sync(n);
1497 			bch_btree_set_root(n);
1498 			btree_node_free(b);
1499 			rw_unlock(true, n);
1500 
1501 			return -EINTR;
1502 		}
1503 	}
1504 
1505 	if (b->level) {
1506 		ret = btree_gc_recurse(b, op, writes, gc);
1507 		if (ret)
1508 			return ret;
1509 	}
1510 
1511 	bkey_copy_key(&b->c->gc_done, &b->key);
1512 
1513 	return ret;
1514 }
1515 
1516 static void btree_gc_start(struct cache_set *c)
1517 {
1518 	struct cache *ca;
1519 	struct bucket *b;
1520 	unsigned i;
1521 
1522 	if (!c->gc_mark_valid)
1523 		return;
1524 
1525 	mutex_lock(&c->bucket_lock);
1526 
1527 	c->gc_mark_valid = 0;
1528 	c->gc_done = ZERO_KEY;
1529 
1530 	for_each_cache(ca, c, i)
1531 		for_each_bucket(b, ca) {
1532 			b->gc_gen = b->gen;
1533 			if (!atomic_read(&b->pin)) {
1534 				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1535 				SET_GC_SECTORS_USED(b, 0);
1536 			}
1537 		}
1538 
1539 	mutex_unlock(&c->bucket_lock);
1540 }
1541 
1542 size_t bch_btree_gc_finish(struct cache_set *c)
1543 {
1544 	size_t available = 0;
1545 	struct bucket *b;
1546 	struct cache *ca;
1547 	unsigned i;
1548 
1549 	mutex_lock(&c->bucket_lock);
1550 
1551 	set_gc_sectors(c);
1552 	c->gc_mark_valid = 1;
1553 	c->need_gc	= 0;
1554 
1555 	if (c->root)
1556 		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1557 			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1558 				    GC_MARK_METADATA);
1559 
1560 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1561 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1562 			    GC_MARK_METADATA);
1563 
1564 	/* don't reclaim buckets to which writeback keys point */
1565 	rcu_read_lock();
1566 	for (i = 0; i < c->nr_uuids; i++) {
1567 		struct bcache_device *d = c->devices[i];
1568 		struct cached_dev *dc;
1569 		struct keybuf_key *w, *n;
1570 		unsigned j;
1571 
1572 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1573 			continue;
1574 		dc = container_of(d, struct cached_dev, disk);
1575 
1576 		spin_lock(&dc->writeback_keys.lock);
1577 		rbtree_postorder_for_each_entry_safe(w, n,
1578 					&dc->writeback_keys.keys, node)
1579 			for (j = 0; j < KEY_PTRS(&w->key); j++)
1580 				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1581 					    GC_MARK_DIRTY);
1582 		spin_unlock(&dc->writeback_keys.lock);
1583 	}
1584 	rcu_read_unlock();
1585 
1586 	for_each_cache(ca, c, i) {
1587 		uint64_t *i;
1588 
1589 		ca->invalidate_needs_gc = 0;
1590 
1591 		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1592 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1593 
1594 		for (i = ca->prio_buckets;
1595 		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1596 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1597 
1598 		for_each_bucket(b, ca) {
1599 			b->last_gc	= b->gc_gen;
1600 			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
1601 
1602 			if (!atomic_read(&b->pin) &&
1603 			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1604 				available++;
1605 				if (!GC_SECTORS_USED(b))
1606 					bch_bucket_add_unused(ca, b);
1607 			}
1608 		}
1609 	}
1610 
1611 	mutex_unlock(&c->bucket_lock);
1612 	return available;
1613 }
1614 
1615 static void bch_btree_gc(struct cache_set *c)
1616 {
1617 	int ret;
1618 	unsigned long available;
1619 	struct gc_stat stats;
1620 	struct closure writes;
1621 	struct btree_op op;
1622 	uint64_t start_time = local_clock();
1623 
1624 	trace_bcache_gc_start(c);
1625 
1626 	memset(&stats, 0, sizeof(struct gc_stat));
1627 	closure_init_stack(&writes);
1628 	bch_btree_op_init(&op, SHRT_MAX);
1629 
1630 	btree_gc_start(c);
1631 
1632 	do {
1633 		ret = btree_root(gc_root, c, &op, &writes, &stats);
1634 		closure_sync(&writes);
1635 
1636 		if (ret && ret != -EAGAIN)
1637 			pr_warn("gc failed!");
1638 	} while (ret);
1639 
1640 	available = bch_btree_gc_finish(c);
1641 	wake_up_allocators(c);
1642 
1643 	bch_time_stats_update(&c->btree_gc_time, start_time);
1644 
1645 	stats.key_bytes *= sizeof(uint64_t);
1646 	stats.data	<<= 9;
1647 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
1648 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1649 
1650 	trace_bcache_gc_end(c);
1651 
1652 	bch_moving_gc(c);
1653 }
1654 
1655 static int bch_gc_thread(void *arg)
1656 {
1657 	struct cache_set *c = arg;
1658 	struct cache *ca;
1659 	unsigned i;
1660 
1661 	while (1) {
1662 again:
1663 		bch_btree_gc(c);
1664 
1665 		set_current_state(TASK_INTERRUPTIBLE);
1666 		if (kthread_should_stop())
1667 			break;
1668 
1669 		mutex_lock(&c->bucket_lock);
1670 
1671 		for_each_cache(ca, c, i)
1672 			if (ca->invalidate_needs_gc) {
1673 				mutex_unlock(&c->bucket_lock);
1674 				set_current_state(TASK_RUNNING);
1675 				goto again;
1676 			}
1677 
1678 		mutex_unlock(&c->bucket_lock);
1679 
1680 		try_to_freeze();
1681 		schedule();
1682 	}
1683 
1684 	return 0;
1685 }
1686 
1687 int bch_gc_thread_start(struct cache_set *c)
1688 {
1689 	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1690 	if (IS_ERR(c->gc_thread))
1691 		return PTR_ERR(c->gc_thread);
1692 
1693 	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1694 	return 0;
1695 }
1696 
1697 /* Initial partial gc */
1698 
1699 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1700 				   unsigned long **seen)
1701 {
1702 	int ret = 0;
1703 	unsigned i;
1704 	struct bkey *k, *p = NULL;
1705 	struct bucket *g;
1706 	struct btree_iter iter;
1707 
1708 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1709 		for (i = 0; i < KEY_PTRS(k); i++) {
1710 			if (!ptr_available(b->c, k, i))
1711 				continue;
1712 
1713 			g = PTR_BUCKET(b->c, k, i);
1714 
1715 			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1716 						seen[PTR_DEV(k, i)]) ||
1717 			    !ptr_stale(b->c, k, i)) {
1718 				g->gen = PTR_GEN(k, i);
1719 
1720 				if (b->level)
1721 					g->prio = BTREE_PRIO;
1722 				else if (g->prio == BTREE_PRIO)
1723 					g->prio = INITIAL_PRIO;
1724 			}
1725 		}
1726 
1727 		btree_mark_key(b, k);
1728 	}
1729 
1730 	if (b->level) {
1731 		bch_btree_iter_init(b, &iter, NULL);
1732 
1733 		do {
1734 			k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1735 			if (k)
1736 				btree_node_prefetch(b->c, k, b->level - 1);
1737 
1738 			if (p)
1739 				ret = btree(check_recurse, p, b, op, seen);
1740 
1741 			p = k;
1742 		} while (p && !ret);
1743 	}
1744 
1745 	return 0;
1746 }
1747 
1748 int bch_btree_check(struct cache_set *c)
1749 {
1750 	int ret = -ENOMEM;
1751 	unsigned i;
1752 	unsigned long *seen[MAX_CACHES_PER_SET];
1753 	struct btree_op op;
1754 
1755 	memset(seen, 0, sizeof(seen));
1756 	bch_btree_op_init(&op, SHRT_MAX);
1757 
1758 	for (i = 0; c->cache[i]; i++) {
1759 		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1760 		seen[i] = kmalloc(n, GFP_KERNEL);
1761 		if (!seen[i])
1762 			goto err;
1763 
1764 		/* Disables the seen array until prio_read() uses it too */
1765 		memset(seen[i], 0xFF, n);
1766 	}
1767 
1768 	ret = btree_root(check_recurse, c, &op, seen);
1769 err:
1770 	for (i = 0; i < MAX_CACHES_PER_SET; i++)
1771 		kfree(seen[i]);
1772 	return ret;
1773 }
1774 
1775 /* Btree insertion */
1776 
1777 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1778 {
1779 	struct bset *i = b->sets[b->nsets].data;
1780 
1781 	memmove((uint64_t *) where + bkey_u64s(insert),
1782 		where,
1783 		(void *) end(i) - (void *) where);
1784 
1785 	i->keys += bkey_u64s(insert);
1786 	bkey_copy(where, insert);
1787 	bch_bset_fix_lookup_table(b, where);
1788 }
1789 
1790 static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 				    struct btree_iter *iter,
1792 				    struct bkey *replace_key)
1793 {
1794 	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1795 	{
1796 		if (KEY_DIRTY(k))
1797 			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 						     offset, -sectors);
1799 	}
1800 
1801 	uint64_t old_offset;
1802 	unsigned old_size, sectors_found = 0;
1803 
1804 	while (1) {
1805 		struct bkey *k = bch_btree_iter_next(iter);
1806 		if (!k ||
1807 		    bkey_cmp(&START_KEY(k), insert) >= 0)
1808 			break;
1809 
1810 		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 			continue;
1812 
1813 		old_offset = KEY_START(k);
1814 		old_size = KEY_SIZE(k);
1815 
1816 		/*
1817 		 * We might overlap with 0 size extents; we can't skip these
1818 		 * because if they're in the set we're inserting to we have to
1819 		 * adjust them so they don't overlap with the key we're
1820 		 * inserting. But we don't want to check them for replace
1821 		 * operations.
1822 		 */
1823 
1824 		if (replace_key && KEY_SIZE(k)) {
1825 			/*
1826 			 * k might have been split since we inserted/found the
1827 			 * key we're replacing
1828 			 */
1829 			unsigned i;
1830 			uint64_t offset = KEY_START(k) -
1831 				KEY_START(replace_key);
1832 
1833 			/* But it must be a subset of the replace key */
1834 			if (KEY_START(k) < KEY_START(replace_key) ||
1835 			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 				goto check_failed;
1837 
1838 			/* We didn't find a key that we were supposed to */
1839 			if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 				goto check_failed;
1841 
1842 			if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 			    KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 				goto check_failed;
1845 
1846 			/* skip past gen */
1847 			offset <<= 8;
1848 
1849 			BUG_ON(!KEY_PTRS(replace_key));
1850 
1851 			for (i = 0; i < KEY_PTRS(replace_key); i++)
1852 				if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 					goto check_failed;
1854 
1855 			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 		}
1857 
1858 		if (bkey_cmp(insert, k) < 0 &&
1859 		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 			/*
1861 			 * We overlapped in the middle of an existing key: that
1862 			 * means we have to split the old key. But we have to do
1863 			 * slightly different things depending on whether the
1864 			 * old key has been written out yet.
1865 			 */
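			/*
			 * For example: if k covers sectors [20, 40) and
			 * insert covers [25, 30), we end up with k trimmed
			 * to [20, 25) plus a new key covering [30, 40),
			 * with insert itself filling the gap in between.
			 */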
1866 
1867 			struct bkey *top;
1868 
1869 			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870 
1871 			if (bkey_written(b, k)) {
1872 				/*
1873 				 * We insert a new key to cover the top of the
1874 				 * old key, and the old key is modified in place
1875 				 * to represent the bottom split.
1876 				 *
1877 				 * It's completely arbitrary whether the new key
1878 				 * is the top or the bottom, but it has to match
1879 				 * up with what btree_sort_fixup() does - it
1880 				 * doesn't check for this kind of overlap, it
1881 				 * depends on us inserting a new key for the top
1882 				 * here.
1883 				 */
1884 				top = bch_bset_search(b, &b->sets[b->nsets],
1885 						      insert);
1886 				shift_keys(b, top, k);
1887 			} else {
1888 				BKEY_PADDED(key) temp;
1889 				bkey_copy(&temp.key, k);
1890 				shift_keys(b, k, &temp.key);
1891 				top = bkey_next(k);
1892 			}
1893 
1894 			bch_cut_front(insert, top);
1895 			bch_cut_back(&START_KEY(insert), k);
1896 			bch_bset_fix_invalidated_key(b, k);
1897 			return false;
1898 		}
1899 
1900 		if (bkey_cmp(insert, k) < 0) {
1901 			bch_cut_front(insert, k);
1902 		} else {
1903 			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 				old_offset = KEY_START(insert);
1905 
1906 			if (bkey_written(b, k) &&
1907 			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 				/*
1909 				 * Completely overwrote, so we don't have to
1910 				 * invalidate the binary search tree
1911 				 */
1912 				bch_cut_front(k, k);
1913 			} else {
1914 				__bch_cut_back(&START_KEY(insert), k);
1915 				bch_bset_fix_invalidated_key(b, k);
1916 			}
1917 		}
1918 
1919 		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 	}
1921 
1922 check_failed:
1923 	if (replace_key) {
1924 		if (!sectors_found) {
1925 			return true;
1926 		} else if (sectors_found < KEY_SIZE(insert)) {
1927 			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 				       (KEY_SIZE(insert) - sectors_found));
1929 			SET_KEY_SIZE(insert, sectors_found);
1930 		}
1931 	}
1932 
1933 	return false;
1934 }
1935 
1936 static bool btree_insert_key(struct btree *b, struct btree_op *op,
1937 			     struct bkey *k, struct bkey *replace_key)
1938 {
1939 	struct bset *i = b->sets[b->nsets].data;
1940 	struct bkey *m, *prev;
1941 	unsigned status = BTREE_INSERT_STATUS_INSERT;
1942 
1943 	BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 	BUG_ON(b->level && !KEY_PTRS(k));
1945 	BUG_ON(!b->level && !KEY_OFFSET(k));
1946 
1947 	if (!b->level) {
1948 		struct btree_iter iter;
1949 
1950 		/*
1951 		 * bset_search() returns the first key that is strictly greater
1952 		 * than the search key - but for back merging, we want to find
1953 		 * the previous key.
1954 		 */
1955 		prev = NULL;
1956 		m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 
1958 		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
1959 			op->insert_collision = true;
1960 			return false;
1961 		}
1962 
1963 		if (KEY_DIRTY(k))
1964 			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 						     KEY_START(k), KEY_SIZE(k));
1966 
1967 		while (m != end(i) &&
1968 		       bkey_cmp(k, &START_KEY(m)) > 0)
1969 			prev = m, m = bkey_next(m);
1970 
1971 		if (key_merging_disabled(b->c))
1972 			goto insert;
1973 
1974 		/* prev is in the tree; if we merge, we're done */
1975 		status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 		if (prev &&
1977 		    bch_bkey_try_merge(b, prev, k))
1978 			goto merged;
1979 
1980 		status = BTREE_INSERT_STATUS_OVERWROTE;
1981 		if (m != end(i) &&
1982 		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 			goto copy;
1984 
1985 		status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 		if (m != end(i) &&
1987 		    bch_bkey_try_merge(b, k, m))
1988 			goto copy;
1989 	} else {
1990 		BUG_ON(replace_key);
1991 		m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 	}
1993 
1994 insert:	shift_keys(b, m, k);
1995 copy:	bkey_copy(m, k);
1996 merged:
1997 	bch_check_keys(b, "%u for %s", status,
1998 		       replace_key ? "replace" : "insert");
1999 
2000 	if (b->level && !KEY_OFFSET(k))
2001 		btree_current_write(b)->prio_blocked++;
2002 
2003 	trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 
2005 	return true;
2006 }
2007 
2008 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2009 				  struct keylist *insert_keys,
2010 				  struct bkey *replace_key)
2011 {
2012 	bool ret = false;
2013 	int oldsize = bch_count_data(b);
2014 
2015 	while (!bch_keylist_empty(insert_keys)) {
2016 		struct bset *i = write_block(b);
2017 		struct bkey *k = insert_keys->keys;
2018 
2019 		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
2020 		    > btree_blocks(b))
2021 			break;
2022 
2023 		if (bkey_cmp(k, &b->key) <= 0) {
2024 			if (!b->level)
2025 				bkey_put(b->c, k);
2026 
2027 			ret |= btree_insert_key(b, op, k, replace_key);
2028 			bch_keylist_pop_front(insert_keys);
2029 		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
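			/*
			 * The key straddles the end of this node: insert
			 * the part that belongs here and leave the rest on
			 * the keylist for the caller. For example, if this
			 * node ends at sector 100 and k covers [90, 110),
			 * temp becomes [90, 100) and is inserted here while
			 * the key on the keylist is trimmed to [100, 110).
			 */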
2030 			BKEY_PADDED(key) temp;
2031 			bkey_copy(&temp.key, insert_keys->keys);
2032 
2033 			bch_cut_back(&b->key, &temp.key);
2034 			bch_cut_front(&b->key, insert_keys->keys);
2035 
2036 			ret |= btree_insert_key(b, op, &temp.key, replace_key);
2037 			break;
2038 		} else {
2039 			break;
2040 		}
2041 	}
2042 
2043 	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 
2045 	BUG_ON(bch_count_data(b) < oldsize);
2046 	return ret;
2047 }
2048 
2049 static int btree_split(struct btree *b, struct btree_op *op,
2050 		       struct keylist *insert_keys,
2051 		       struct bkey *replace_key)
2052 {
2053 	bool split;
2054 	struct btree *n1, *n2 = NULL, *n3 = NULL;
2055 	uint64_t start_time = local_clock();
2056 	struct closure cl;
2057 	struct keylist parent_keys;
2058 
2059 	closure_init_stack(&cl);
2060 	bch_keylist_init(&parent_keys);
2061 
2062 	n1 = btree_node_alloc_replacement(b, true);
2063 	if (IS_ERR(n1))
2064 		goto err;
2065 
2066 	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
2067 
2068 	if (split) {
2069 		unsigned keys = 0;
2070 
2071 		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
2072 
2073 		n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 		if (IS_ERR(n2))
2075 			goto err_free1;
2076 
2077 		if (!b->parent) {
2078 			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
2079 			if (IS_ERR(n3))
2080 				goto err_free2;
2081 		}
2082 
2083 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2084 
2085 		/*
2086 		 * The split point has to be found with a linear search because
2087 		 * the new node doesn't have an auxiliary search tree yet.
2088 		 */
2089 
2090 		while (keys < (n1->sets[0].data->keys * 3) / 5)
2091 			keys += bkey_u64s(node(n1->sets[0].data, keys));
2092 
2093 		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
2094 		keys += bkey_u64s(node(n1->sets[0].data, keys));
2095 
2096 		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
2097 		n1->sets[0].data->keys = keys;
2098 
2099 		memcpy(n2->sets[0].data->start,
2100 		       end(n1->sets[0].data),
2101 		       n2->sets[0].data->keys * sizeof(uint64_t));
2102 
2103 		bkey_copy_key(&n2->key, &b->key);
2104 
2105 		bch_keylist_add(&parent_keys, &n2->key);
2106 		bch_btree_node_write(n2, &cl);
2107 		rw_unlock(true, n2);
2108 	} else {
2109 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
2110 
2111 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 	}
2113 
2114 	bch_keylist_add(&parent_keys, &n1->key);
2115 	bch_btree_node_write(n1, &cl);
2116 
2117 	if (n3) {
2118 		/* Depth increases; make a new root */
2119 		bkey_copy_key(&n3->key, &MAX_KEY);
2120 		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2121 		bch_btree_node_write(n3, &cl);
2122 
2123 		closure_sync(&cl);
2124 		bch_btree_set_root(n3);
2125 		rw_unlock(true, n3);
2126 
2127 		btree_node_free(b);
2128 	} else if (!b->parent) {
2129 		/* Root filled up but didn't need to be split */
2130 		closure_sync(&cl);
2131 		bch_btree_set_root(n1);
2132 
2133 		btree_node_free(b);
2134 	} else {
2135 		/* Split a non-root node */
2136 		closure_sync(&cl);
2137 		make_btree_freeing_key(b, parent_keys.top);
2138 		bch_keylist_push(&parent_keys);
2139 
2140 		btree_node_free(b);
2141 
2142 		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2143 		BUG_ON(!bch_keylist_empty(&parent_keys));
2144 	}
2145 
2146 	rw_unlock(true, n1);
2147 
2148 	bch_time_stats_update(&b->c->btree_split_time, start_time);
2149 
2150 	return 0;
2151 err_free2:
2152 	btree_node_free(n2);
2153 	rw_unlock(true, n2);
2154 err_free1:
2155 	btree_node_free(n1);
2156 	rw_unlock(true, n1);
2157 err:
2158 	if (n3 == ERR_PTR(-EAGAIN) ||
2159 	    n2 == ERR_PTR(-EAGAIN) ||
2160 	    n1 == ERR_PTR(-EAGAIN))
2161 		return -EAGAIN;
2162 
2163 	pr_warn("couldn't split");
2164 	return -ENOMEM;
2165 }
2166 
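/*
 * Insert @insert_keys into node @b, splitting @b first if it doesn't
 * have room.
 *
 * Splitting isn't attempted while we're running under
 * generic_make_request() (current->bio_list is set); op->lock is raised
 * and -EAGAIN returned so the caller retries from a context that can
 * block. -EINTR means the caller has to retraverse with op->lock raised,
 * either because we didn't hold locks far enough up the tree or because a
 * split invalidated the iterators. If no split is needed and something
 * was inserted, leaf nodes are marked dirty against @journal_ref and
 * interior nodes are written out synchronously.
 */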
2167 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2168 				 struct keylist *insert_keys,
2169 				 atomic_t *journal_ref,
2170 				 struct bkey *replace_key)
2171 {
2172 	BUG_ON(b->level && replace_key);
2173 
2174 	if (should_split(b)) {
2175 		if (current->bio_list) {
2176 			op->lock = b->c->root->level + 1;
2177 			return -EAGAIN;
2178 		} else if (op->lock <= b->c->root->level) {
2179 			op->lock = b->c->root->level + 1;
2180 			return -EINTR;
2181 		} else {
2182 			/* Splitting invalidates any iterators, hence the -EINTR */
2183 			return btree_split(b, op, insert_keys, replace_key) ?:
2184 				-EINTR;
2185 		}
2186 	} else {
2187 		BUG_ON(write_block(b) != b->sets[b->nsets].data);
2188 
2189 		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 			if (!b->level)
2191 				bch_btree_leaf_dirty(b, journal_ref);
2192 			else
2193 				bch_btree_node_write_sync(b);
2194 		}
2195 
2196 		return 0;
2197 	}
2198 }
2199 
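/*
 * Insert a "check" key: @check_key is given a single pointer, filled with
 * random bytes and aimed at the special PTR_CHECK_DEV device, and is
 * inserted into @b. A caller can later use the same key as a replace_key;
 * if that replace collides, something else touched this part of the
 * keyspace in the meantime (the read path uses this to detect races on a
 * cache miss).
 *
 * If we only hold a read lock (op->lock == -1) it is upgraded to a write
 * lock first, and -EINTR is returned if the node changed while the lock
 * was dropped.
 */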
2200 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2201 			       struct bkey *check_key)
2202 {
2203 	int ret = -EINTR;
2204 	uint64_t btree_ptr = b->key.ptr[0];
2205 	unsigned long seq = b->seq;
2206 	struct keylist insert;
2207 	bool upgrade = op->lock == -1;
2208 
2209 	bch_keylist_init(&insert);
2210 
2211 	if (upgrade) {
2212 		rw_unlock(false, b);
2213 		rw_lock(true, b, b->level);
2214 
2215 		if (b->key.ptr[0] != btree_ptr ||
2216 		    b->seq != seq + 1)
2217 			goto out;
2218 	}
2219 
2220 	SET_KEY_PTRS(check_key, 1);
2221 	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2222 
2223 	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2224 
2225 	bch_keylist_add(&insert, check_key);
2226 
2227 	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2228 
2229 	BUG_ON(!ret && !bch_keylist_empty(&insert));
2230 out:
2231 	if (upgrade)
2232 		downgrade_write(&b->lock);
2233 	return ret;
2234 }
2235 
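/*
 * Glue for bch_btree_insert(): btree_insert_fn() hands the keylist to one
 * leaf node at a time, returning MAP_DONE (unless the insert failed) so
 * that the loop in bch_btree_insert() restarts the traversal at the first
 * key still on the list.
 */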
2236 struct btree_insert_op {
2237 	struct btree_op	op;
2238 	struct keylist	*keys;
2239 	atomic_t	*journal_ref;
2240 	struct bkey	*replace_key;
2241 };
2242 
2243 static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2244 {
2245 	struct btree_insert_op *op = container_of(b_op,
2246 					struct btree_insert_op, op);
2247 
2248 	int ret = bch_btree_insert_node(b, &op->op, op->keys,
2249 					op->journal_ref, op->replace_key);
2250 	if (ret && !bch_keylist_empty(op->keys))
2251 		return ret;
2252 	else
2253 		return MAP_DONE;
2254 }
2255 
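/*
 * Public insert path: walk the leaf nodes covering @keys with
 * bch_btree_map_leaf_nodes(), inserting into each until the keylist has
 * been drained. On error the remaining keys are dropped with bkey_put();
 * an insert that collided with @replace_key is reported as -ESRCH.
 */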
2256 int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2257 		     atomic_t *journal_ref, struct bkey *replace_key)
2258 {
2259 	struct btree_insert_op op;
2260 	int ret = 0;
2261 
2262 	BUG_ON(current->bio_list);
2263 	BUG_ON(bch_keylist_empty(keys));
2264 
2265 	bch_btree_op_init(&op.op, 0);
2266 	op.keys		= keys;
2267 	op.journal_ref	= journal_ref;
2268 	op.replace_key	= replace_key;
2269 
2270 	while (!ret && !bch_keylist_empty(keys)) {
2271 		op.op.lock = 0;
2272 		ret = bch_btree_map_leaf_nodes(&op.op, c,
2273 					       &START_KEY(keys->keys),
2274 					       btree_insert_fn);
2275 	}
2276 
2277 	if (ret) {
2278 		struct bkey *k;
2279 
2280 		pr_err("error %i", ret);
2281 
2282 		while ((k = bch_keylist_pop(keys)))
2283 			bkey_put(c, k);
2284 	} else if (op.op.insert_collision)
2285 		ret = -ESRCH;
2286 
2287 	return ret;
2288 }
2289 
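/*
 * Make @b the new btree root. The node must already have been written
 * and its buckets must be at BTREE_PRIO; it is taken off the btree cache
 * lists, and the change is made persistent by writing a journal entry
 * (bch_journal_meta()) before we return.
 */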
2290 void bch_btree_set_root(struct btree *b)
2291 {
2292 	unsigned i;
2293 	struct closure cl;
2294 
2295 	closure_init_stack(&cl);
2296 
2297 	trace_bcache_btree_set_root(b);
2298 
2299 	BUG_ON(!b->written);
2300 
2301 	for (i = 0; i < KEY_PTRS(&b->key); i++)
2302 		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2303 
2304 	mutex_lock(&b->c->bucket_lock);
2305 	list_del_init(&b->list);
2306 	mutex_unlock(&b->c->bucket_lock);
2307 
2308 	b->c->root = b;
2309 
2310 	bch_journal_meta(b->c, &cl);
2311 	closure_sync(&cl);
2312 }
2313 
2314 /* Map across nodes or keys */
2315 
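/*
 * Recursively walk the btree starting from the node containing @from,
 * calling @fn on each node. Leaves are always visited; interior nodes are
 * passed to @fn as well (after their children) when @flags is
 * MAP_ALL_NODES.
 */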
2316 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2317 				       struct bkey *from,
2318 				       btree_map_nodes_fn *fn, int flags)
2319 {
2320 	int ret = MAP_CONTINUE;
2321 
2322 	if (b->level) {
2323 		struct bkey *k;
2324 		struct btree_iter iter;
2325 
2326 		bch_btree_iter_init(b, &iter, from);
2327 
2328 		while ((k = bch_btree_iter_next_filter(&iter, b,
2329 						       bch_ptr_bad))) {
2330 			ret = btree(map_nodes_recurse, k, b,
2331 				    op, from, fn, flags);
2332 			from = NULL;
2333 
2334 			if (ret != MAP_CONTINUE)
2335 				return ret;
2336 		}
2337 	}
2338 
2339 	if (!b->level || flags == MAP_ALL_NODES)
2340 		ret = fn(op, b);
2341 
2342 	return ret;
2343 }
2344 
2345 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2346 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
2347 {
2348 	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2349 }
2350 
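/*
 * Walk keys starting from @from, recursing through interior nodes and
 * calling @fn on every leaf key that bch_ptr_bad() doesn't reject. With
 * MAP_END_KEY, @fn is additionally called with a zero size key at the end
 * of each leaf.
 */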
2351 static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2352 				      struct bkey *from, btree_map_keys_fn *fn,
2353 				      int flags)
2354 {
2355 	int ret = MAP_CONTINUE;
2356 	struct bkey *k;
2357 	struct btree_iter iter;
2358 
2359 	bch_btree_iter_init(b, &iter, from);
2360 
2361 	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
2362 		ret = !b->level
2363 			? fn(op, b, k)
2364 			: btree(map_keys_recurse, k, b, op, from, fn, flags);
2365 		from = NULL;
2366 
2367 		if (ret != MAP_CONTINUE)
2368 			return ret;
2369 	}
2370 
2371 	if (!b->level && (flags & MAP_END_KEY))
2372 		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2373 				     KEY_OFFSET(&b->key), 0));
2374 
2375 	return ret;
2376 }
2377 
2378 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2379 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
2380 {
2381 	return btree_root(map_keys_recurse, c, op, from, fn, flags);
2382 }
2383 
2384 /* Keybuf code */
2385 
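/*
 * A keybuf is a buffer of keys scanned out of the btree: an rbtree of
 * keybuf_keys backed by a fixed size freelist. It is filled by
 * bch_refill_keybuf() with keys matching a predicate and drained with
 * bch_keybuf_next(); users such as the writeback code use it to batch up
 * the work they find in the btree.
 */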
2386 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2387 {
2388 	/* Overlapping keys compare equal */
2389 	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2390 		return -1;
2391 	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2392 		return 1;
2393 	return 0;
2394 }
2395 
2396 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2397 					    struct keybuf_key *r)
2398 {
2399 	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2400 }
2401 
2402 struct refill {
2403 	struct btree_op	op;
2404 	unsigned	nr_found;
2405 	struct keybuf	*buf;
2406 	struct bkey	*end;
2407 	keybuf_pred_fn	*pred;
2408 };
2409 
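/*
 * Map callback for bch_refill_keybuf(): stop once refill->end is reached,
 * skip zero size end-of-leaf keys, and add every key matching
 * refill->pred to the keybuf until its freelist runs dry.
 * buf->last_scanned is advanced as we go so the next refill resumes where
 * this one left off.
 */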
2410 static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2411 			    struct bkey *k)
2412 {
2413 	struct refill *refill = container_of(op, struct refill, op);
2414 	struct keybuf *buf = refill->buf;
2415 	int ret = MAP_CONTINUE;
2416 
2417 	if (bkey_cmp(k, refill->end) >= 0) {
2418 		ret = MAP_DONE;
2419 		goto out;
2420 	}
2421 
2422 	if (!KEY_SIZE(k)) /* end key */
2423 		goto out;
2424 
2425 	if (refill->pred(buf, k)) {
2426 		struct keybuf_key *w;
2427 
2428 		spin_lock(&buf->lock);
2429 
2430 		w = array_alloc(&buf->freelist);
2431 		if (!w) {
2432 			spin_unlock(&buf->lock);
2433 			return MAP_DONE;
2434 		}
2435 
2436 		w->private = NULL;
2437 		bkey_copy(&w->key, k);
2438 
2439 		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2440 			array_free(&buf->freelist, w);
2441 		else
2442 			refill->nr_found++;
2443 
2444 		if (array_freelist_empty(&buf->freelist))
2445 			ret = MAP_DONE;
2446 
2447 		spin_unlock(&buf->lock);
2448 	}
2449 out:
2450 	buf->last_scanned = *k;
2451 	return ret;
2452 }
2453 
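/*
 * Scan the btree from buf->last_scanned towards @end, buffering keys that
 * match @pred until either @end is reached or the keybuf is full.
 * buf->start and buf->end are updated to the range of keys currently
 * held, which bch_keybuf_check_overlapping() uses as a quick exclusion
 * check.
 */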
2454 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2455 		       struct bkey *end, keybuf_pred_fn *pred)
2456 {
2457 	struct bkey start = buf->last_scanned;
2458 	struct refill refill;
2459 
2460 	cond_resched();
2461 
2462 	bch_btree_op_init(&refill.op, -1);
2463 	refill.nr_found	= 0;
2464 	refill.buf	= buf;
2465 	refill.end	= end;
2466 	refill.pred	= pred;
2467 
2468 	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2469 			   refill_keybuf_fn, MAP_END_KEY);
2470 
2471 	trace_bcache_keyscan(refill.nr_found,
2472 			     KEY_INODE(&start), KEY_OFFSET(&start),
2473 			     KEY_INODE(&buf->last_scanned),
2474 			     KEY_OFFSET(&buf->last_scanned));
2475 
2476 	spin_lock(&buf->lock);
2477 
2478 	if (!RB_EMPTY_ROOT(&buf->keys)) {
2479 		struct keybuf_key *w;
2480 		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2481 		buf->start	= START_KEY(&w->key);
2482 
2483 		w = RB_LAST(&buf->keys, struct keybuf_key, node);
2484 		buf->end	= w->key;
2485 	} else {
2486 		buf->start	= MAX_KEY;
2487 		buf->end	= MAX_KEY;
2488 	}
2489 
2490 	spin_unlock(&buf->lock);
2491 }
2492 
2493 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2494 {
2495 	rb_erase(&w->node, &buf->keys);
2496 	array_free(&buf->freelist, w);
2497 }
2498 
2499 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2500 {
2501 	spin_lock(&buf->lock);
2502 	__bch_keybuf_del(buf, w);
2503 	spin_unlock(&buf->lock);
2504 }
2505 
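/*
 * Check whether @buf holds any keys overlapping the range from @start to
 * @end. Overlapping keys that no one has claimed (w->private == NULL) are
 * simply dropped from the buffer; we return true only if an overlapping
 * key is currently in use.
 */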
2506 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2507 				  struct bkey *end)
2508 {
2509 	bool ret = false;
2510 	struct keybuf_key *p, *w, s;
2511 	s.key = *start;
2512 
2513 	if (bkey_cmp(end, &buf->start) <= 0 ||
2514 	    bkey_cmp(start, &buf->end) >= 0)
2515 		return false;
2516 
2517 	spin_lock(&buf->lock);
2518 	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2519 
2520 	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2521 		p = w;
2522 		w = RB_NEXT(w, node);
2523 
2524 		if (p->private)
2525 			ret = true;
2526 		else
2527 			__bch_keybuf_del(buf, p);
2528 	}
2529 
2530 	spin_unlock(&buf->lock);
2531 	return ret;
2532 }
2533 
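/*
 * Hand out the first key in @buf that hasn't been claimed yet, marking it
 * as in use (w->private = ERR_PTR(-EINTR)) so it isn't handed out twice.
 * Returns NULL if every buffered key is already claimed.
 */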
2534 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2535 {
2536 	struct keybuf_key *w;
2537 	spin_lock(&buf->lock);
2538 
2539 	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2540 
2541 	while (w && w->private)
2542 		w = RB_NEXT(w, node);
2543 
2544 	if (w)
2545 		w->private = ERR_PTR(-EINTR);
2546 
2547 	spin_unlock(&buf->lock);
2548 	return w;
2549 }
2550 
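/*
 * Like bch_keybuf_next(), but refills @buf from the btree when it runs
 * out of keys, until either a key is returned or the scan reaches @end.
 */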
2551 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2552 					  struct keybuf *buf,
2553 					  struct bkey *end,
2554 					  keybuf_pred_fn *pred)
2555 {
2556 	struct keybuf_key *ret;
2557 
2558 	while (1) {
2559 		ret = bch_keybuf_next(buf);
2560 		if (ret)
2561 			break;
2562 
2563 		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2564 			pr_debug("scan finished");
2565 			break;
2566 		}
2567 
2568 		bch_refill_keybuf(c, buf, end, pred);
2569 	}
2570 
2571 	return ret;
2572 }
2573 
2574 void bch_keybuf_init(struct keybuf *buf)
2575 {
2576 	buf->last_scanned	= MAX_KEY;
2577 	buf->keys		= RB_ROOT;
2578 
2579 	spin_lock_init(&buf->lock);
2580 	array_allocator_init(&buf->freelist);
2581 }
2582 
2583 void bch_btree_exit(void)
2584 {
2585 	if (btree_io_wq)
2586 		destroy_workqueue(btree_io_wq);
2587 }
2588 
2589 int __init bch_btree_init(void)
2590 {
2591 	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
2592 	if (!btree_io_wq)
2593 		return -ENOMEM;
2594 
2595 	return 0;
2596 }
2597