// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>
#include <linux/delay.h>
#include <trace/events/bcache.h>
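
/*
 * A sketch of the generation scheme described above (paraphrase, not text
 * from the original file): every pointer embeds the generation the bucket
 * had when the pointer was created, so a staleness check is roughly
 *
 *	stale = PTR_BUCKET(c, k, i)->gen != PTR_GEN(k, i);
 *
 * Freeing a bucket only increments its gen; garbage collection later drops
 * the pointers that became stale as a result.
 */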

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72
#define MAX_GC_TIMES		100
#define MIN_GC_NODES		100
#define GC_SLEEP_MS		100

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

#define insert_lock(s, b)	((b)->level <= (s)->lock)

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */
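
/*
 * An illustrative sketch of that contract (hypothetical caller, not code
 * from the original file): a walker that holds a read lock and discovers
 * it will have to split does something like
 *
 *	op->lock = level_that_needs_a_write_lock;
 *	return -EINTR;
 *
 * and btree_root() below just loops, retaking locks according to the new
 * op->lock before calling back in.
 */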

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -EINTR)					\
			schedule();					\
	} while (_r == -EINTR);						\
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
	_r;								\
})

static inline struct bset *write_block(struct btree *b)
{
	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}

static void bch_btree_init_next(struct btree *b)
{
	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
}

/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned int i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = bset_bkey_last(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = btree_bset_first(b);
	struct btree_iter *iter;

	/*
	 * c->fill_iter can allocate an iterator with more memory space
	 * than static MAX_BSETS.
	 * See the comment around cache_set->fill_iter.
	 */
	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = &b->keys;
#endif

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, block_bytes(b->c)) >
		    btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(&b->c->sb))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}
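
		/*
		 * Checksum note (summary, not comment text from the original
		 * file): version-0 bsets are covered by csum_set(), while
		 * BCACHE_BSET_VERSION bsets use btree_csum_set() above,
		 * which seeds the crc64 with the node's first pointer and
		 * skips the 8-byte csum field itself - so a bset read back
		 * from the wrong bucket fails even if its contents are
		 * intact.
		 */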

		err = "empty set";
		if (i != b->keys.set[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));

		b->written += set_blocks(i, block_bytes(b->c));
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->keys.set[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);

	i = b->keys.set[0].data;
	err = "short btree key";
	if (b->keys.set[0].size &&
	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
out:
	mempool_free(iter, &b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    bset_block_offset(b, i), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	closure_put(cl);
}

static void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io = btree_node_read_endio;
	bio->bi_private = &cl;
	bio->bi_opf = REQ_OP_READ | REQ_META;

	bch_bio_map(bio, b->keys.set[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (bio->bi_status)
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked = 0;
	w->journal = NULL;
}

static void btree_node_write_unlock(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	up(&b->io_mutex);
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	closure_return_with_destructor(cl, btree_node_write_unlock);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	bio_free_pages(b->bio);
	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io);

	if (bio->bi_status)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
	closure_put(cl);
}
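
/*
 * Rough map of the write path's closure chain (a summary of the functions
 * above and below, not comment text from the original file):
 *
 *	__bch_btree_node_write()
 *	  -> do_btree_node_write()		submits the bio
 *	    -> btree_node_write_endio()		drops the closure ref
 *	      -> btree_node_write_done()	frees bounce pages, if any
 *	        -> __btree_node_write_done()	completes the btree_write and
 *	           releases b->io_mutex via btree_node_write_unlock()
 */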

static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io;
	struct bset *i = btree_bset_last(b);
	BKEY_PADDED(key) k;

	i->version = BCACHE_BSET_VERSION;
	i->csum = btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io = btree_node_write_endio;
	b->bio->bi_private = cl;
	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
	b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
		       bset_sector_offset(&b->keys, i));

	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
		struct bio_vec *bv;
		void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
		struct bvec_iter_all iter_all;

		bio_for_each_segment_all(bv, b->bio, iter_all) {
			memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
			addr += PAGE_SIZE;
		}

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		/*
		 * No problem for multipage bvec since the bio is
		 * just allocated
		 */
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
	}
}
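
/*
 * A note on the two submission paths above (summary, not comment text from
 * the original file): when bch_bio_alloc_pages() succeeds, the bset is
 * memcpy()d into freshly allocated bounce pages and the write completes
 * asynchronously, so new keys can keep being appended to the node in the
 * meantime; on allocation failure the bio maps the bset's own memory and
 * the write is waited on synchronously instead.
 */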

void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = btree_bset_last(b);

	lockdep_assert_held(&b->write_lock);

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(btree_bset_first(b)->seq != i->seq);
	bch_check_keys(&b->keys, "writing");

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty, &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	b->written += set_blocks(i, block_bytes(b->c));
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	unsigned int nsets = b->keys.nsets;

	lockdep_assert_held(&b->lock);

	__bch_btree_node_write(b, parent);

	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
	if (nsets && !b->keys.nsets)
		bch_btree_verify(b);

	bch_btree_init_next(b);
}

static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);
	bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, NULL);
	mutex_unlock(&b->write_lock);
}

/* return true if journal pin 'l' is newer than 'r' */
static bool journal_pin_cmp(struct cache_set *c,
			    atomic_t *l,
			    atomic_t *r)
{
	int l_idx, r_idx, f_idx, b_idx;
	bool ret = false;

	l_idx = fifo_idx(&(c)->journal.pin, (l));
	r_idx = fifo_idx(&(c)->journal.pin, (r));
	f_idx = (c)->journal.pin.front;
	b_idx = (c)->journal.pin.back;

	if (l_idx > r_idx)
		ret = true;
	/* in case fifo back pointer is swapped */
	if (b_idx < f_idx) {
		if (l_idx <= b_idx && r_idx >= f_idx)
			ret = true;
		else if (l_idx >= f_idx && r_idx <= b_idx)
			ret = false;
	}

	return ret;
}
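
/*
 * Worked example for the wrapped-fifo case above (illustrative numbers,
 * not from the original file): with front = 6 and back = 2 the live
 * entries run 6, 7, then wrap to 0, 1. Comparing l_idx = 1 against
 * r_idx = 7 hits the (l_idx <= b_idx && r_idx >= f_idx) branch, so 'l'
 * is correctly reported as newer despite its smaller raw index.
 */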

static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
	struct bset *i = btree_bset_last(b);
	struct btree_write *w = btree_current_write(b);

	lockdep_assert_held(&b->write_lock);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	set_btree_node_dirty(b);

	/*
	 * w->journal is always the oldest journal pin of all bkeys
	 * in the leaf node, to make sure the oldest jset seq won't
	 * be increased before this btree node is flushed.
	 */
	if (journal_ref) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = journal_ref;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
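
/*
 * Example of the reserve arithmetic (illustrative, not comment text from
 * the original file): with a three-level btree, mca_reserve() keeps
 * 3 * 8 + 16 = 40 nodes' worth of memory cached so a traversal plus any
 * splits can always find nodes; mca_can_free() is whatever the cache
 * holds beyond that.
 */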

static void mca_data_free(struct btree *b)
{
	BUG_ON(b->io_mutex.count != 1);

	bch_btree_keys_free(&b->keys);

	b->c->btree_cache_used--;
	list_move(&b->list, &b->c->btree_cache_freed);
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

static unsigned int btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	if (!bch_btree_keys_alloc(&b->keys,
				  max_t(unsigned int,
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
		b->c->btree_cache_used++;
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	/*
	 * kzalloc() is necessary here for initialization,
	 * see code comments in bch_btree_keys_init().
	 */
	struct btree *b = kzalloc(sizeof(struct btree), gfp);

	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	mutex_init(&b->write_lock);
	lockdep_set_novalidate_class(&b->write_lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	sema_init(&b->io_mutex, 1);

	mca_data_alloc(b, k, gfp);
	return b;
}

static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
{
	struct closure cl;

	closure_init_stack(&cl);
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);

	if (b->keys.page_order < min_order)
		goto out_unlock;

	if (!flush) {
		if (btree_node_dirty(b))
			goto out_unlock;

		if (down_trylock(&b->io_mutex))
			goto out_unlock;
		up(&b->io_mutex);
	}

retry:
	/*
	 * BTREE_NODE_dirty might be cleared in btree_flush_write() by
	 * __bch_btree_node_write(). To avoid an extra flush, acquire
	 * b->write_lock before checking the BTREE_NODE_dirty bit.
	 */
	mutex_lock(&b->write_lock);
	/*
	 * If this btree node is selected in btree_flush_write() by journal
	 * code, delay and retry until the node is flushed by journal code
	 * and the BTREE_NODE_journal_flush bit is cleared by
	 * btree_flush_write().
	 */
	if (btree_node_journal_flush(b)) {
		pr_debug("bnode %p is flushing by journal, retry", b);
		mutex_unlock(&b->write_lock);
		udelay(1);
		goto retry;
	}

	if (btree_node_dirty(b))
		__bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);

	/* wait for any in flight btree write */
	down(&b->io_mutex);
	up(&b->io_mutex);

	return 0;
out_unlock:
	rw_unlock(true, b);
	return -ENOMEM;
}
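
/*
 * mca_reap()'s contract, spelled out (a summary of the code above, not
 * comment text from the original file): on success it returns 0 with
 * b->lock held for writing and the node clean and idle, so the caller may
 * free or reuse its memory; on any failure it returns -ENOMEM with the
 * node unlocked and untouched.
 */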

static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
	unsigned long freed = 0;
	unsigned int btree_cache_used;

	if (c->shrinker_disabled)
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	btree_cache_used = c->btree_cache_used;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (nr <= 0)
			goto out;

		if (++i > 3 &&
		    !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
		nr--;
	}

	for (; (nr--) && i < btree_cache_used; i++) {
		if (list_empty(&c->btree_cache))
			goto out;

		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, 0, false)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		} else
			b->accessed = 0;
	}
out:
	mutex_unlock(&c->bucket_lock);
	return freed * c->btree_pages;
}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * c->btree_pages;
}

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;

	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		/*
		 * This function is called by cache_set_free(); there is no
		 * I/O request on the cache now, so it is unnecessary to
		 * acquire b->write_lock before clearing BTREE_NODE_dirty
		 * anymore.
		 */
		if (btree_node_dirty(b)) {
			btree_complete_write(b, btree_current_write(b));
			clear_bit(BTREE_NODE_dirty, &b->flags);
		}
		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}

int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned int i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->keys.set->data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;

	if (register_shrinker(&c->shrink))
		pr_warn("bcache: %s: could not register shrinker",
			__func__);

	return 0;
}

/* Btree in memory cache - hash table */
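
/*
 * A note on the lookup below (summary, not comment text from the original
 * file): readers walk the hash chains under rcu_read_lock() only, while
 * all modifications (mca_alloc(), mca_bucket_free()) happen with
 * c->bucket_lock held. PTR_HASH() folds the pointer generation into the
 * value compared, so a stale pointer to a reused bucket misses rather
 * than aliasing the node now living there.
 */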

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	spin_lock(&c->btree_cannibalize_lock);
	if (likely(c->btree_cache_alloc_lock == NULL)) {
		c->btree_cache_alloc_lock = current;
	} else if (c->btree_cache_alloc_lock != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock(&c->btree_cannibalize_lock);
		return -EINTR;
	}
	spin_unlock(&c->btree_cannibalize_lock);

	return 0;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
{
	struct btree *b;

	trace_bcache_btree_cache_cannibalize(c);

	if (mca_cannibalize_lock(c, op))
		return ERR_PTR(-EINTR);

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;

	WARN(1, "btree cache cannibalize failed\n");
	return ERR_PTR(-ENOMEM);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
static void bch_cannibalize_unlock(struct cache_set *c)
{
	spin_lock(&c->btree_cannibalize_lock);
	if (c->btree_cache_alloc_lock == current) {
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
	}
	spin_unlock(&c->btree_cannibalize_lock);
}
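
/*
 * mca_alloc() below tries its sources in order (a summary of the function,
 * not comment text from the original file): first a node on
 * btree_cache_freeable whose buffer is already big enough, then a bare
 * struct btree on btree_cache_freed that gets a fresh data allocation,
 * then a brand new kzalloc()d node - and if all of that fails, its error
 * path falls back to mca_cannibalize(), which evicts another cached node.
 */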

static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
{
	struct btree *b;

	BUG_ON(current->bio_list);

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, btree_order(k), false))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, 0, false)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->keys.set[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->keys.set->data)
		goto err;
out:
	BUG_ON(b->io_mutex.count != 1);

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->parent = (void *) ~0UL;
	b->flags = 0;
	b->written = 0;
	b->level = level;

	if (!b->level)
		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
				    &b->c->expensive_debug_checks);
	else
		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
				    &b->c->expensive_debug_checks);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, op, k);
	if (!IS_ERR(b))
		goto out;

	return b;
}

/*
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write,
				 struct btree *parent)
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, op, k, level);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	b->parent = parent;
	b->accessed = 1;

	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
		prefetch(b->keys.set[i].tree);
		prefetch(b->keys.set[i].data);
	}

	for (; i <= b->keys.nsets; i++)
		prefetch(b->keys.set[i].data);

	return b;
}
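
/*
 * A note on the retry above (summary, not comment text from the original
 * file): between mca_find() and rw_lock() the node can be freed and its
 * bucket reused for a different node, which is why bch_btree_node_get()
 * rechecks PTR_HASH() after taking the lock and starts over on a mismatch.
 */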

static void btree_node_prefetch(struct btree *parent, struct bkey *k)
{
	struct btree *b;

	mutex_lock(&parent->c->bucket_lock);
	b = mca_alloc(parent->c, NULL, k, parent->level - 1);
	mutex_unlock(&parent->c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		b->parent = parent;
		bch_btree_node_read(b);
		rw_unlock(true, b);
	}
}

/* Btree alloc */

static void btree_node_free(struct btree *b)
{
	trace_bcache_btree_node_free(b);

	BUG_ON(b == b->c->root);

retry:
	mutex_lock(&b->write_lock);
	/*
	 * If the btree node is selected and being flushed in
	 * btree_flush_write(), delay and retry until the
	 * BTREE_NODE_journal_flush bit is cleared; only then is it safe to
	 * free the btree node here, otherwise we would race with the
	 * journal code.
	 */
	if (btree_node_journal_flush(b)) {
		mutex_unlock(&b->write_lock);
		pr_debug("bnode %p journal_flush set, retry", b);
		udelay(1);
		goto retry;
	}

	if (btree_node_dirty(b)) {
		btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);
	}

	mutex_unlock(&b->write_lock);

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);
	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}
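
/*
 * Note the split of responsibilities above (summary, not comment text from
 * the original file): btree_node_free() returns the node's on-disk bucket
 * to the allocator and drops any pending write without issuing it, while
 * mca_bucket_free() only unhashes the in-memory node and parks it on the
 * freeable list for reuse.
 */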
(IS_ERR(b)) 1179cafe5635SKent Overstreet goto err_free; 1180cafe5635SKent Overstreet 1181cafe5635SKent Overstreet if (!b) { 1182b1a67b0fSKent Overstreet cache_bug(c, 1183b1a67b0fSKent Overstreet "Tried to allocate bucket that was in btree cache"); 1184cafe5635SKent Overstreet goto retry; 1185cafe5635SKent Overstreet } 1186cafe5635SKent Overstreet 1187cafe5635SKent Overstreet b->accessed = 1; 11882452cc89SSlava Pestov b->parent = parent; 1189a85e968eSKent Overstreet bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); 1190cafe5635SKent Overstreet 1191cafe5635SKent Overstreet mutex_unlock(&c->bucket_lock); 1192c37511b8SKent Overstreet 1193c37511b8SKent Overstreet trace_bcache_btree_node_alloc(b); 1194cafe5635SKent Overstreet return b; 1195cafe5635SKent Overstreet err_free: 1196cafe5635SKent Overstreet bch_bucket_free(c, &k.key); 1197cafe5635SKent Overstreet err: 1198cafe5635SKent Overstreet mutex_unlock(&c->bucket_lock); 1199c37511b8SKent Overstreet 1200913dc33fSSlava Pestov trace_bcache_btree_node_alloc_fail(c); 1201cafe5635SKent Overstreet return b; 1202cafe5635SKent Overstreet } 1203cafe5635SKent Overstreet 1204c5aa4a31SSlava Pestov static struct btree *bch_btree_node_alloc(struct cache_set *c, 12052452cc89SSlava Pestov struct btree_op *op, int level, 12062452cc89SSlava Pestov struct btree *parent) 1207c5aa4a31SSlava Pestov { 12082452cc89SSlava Pestov return __bch_btree_node_alloc(c, op, level, op != NULL, parent); 1209c5aa4a31SSlava Pestov } 1210c5aa4a31SSlava Pestov 12110a63b66dSKent Overstreet static struct btree *btree_node_alloc_replacement(struct btree *b, 12120a63b66dSKent Overstreet struct btree_op *op) 1213cafe5635SKent Overstreet { 12142452cc89SSlava Pestov struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); 12151fae7cf0SColy Li 121667539e85SKent Overstreet if (!IS_ERR_OR_NULL(n)) { 12172a285686SKent Overstreet mutex_lock(&n->write_lock); 121889ebb4a2SKent Overstreet bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); 121967539e85SKent Overstreet bkey_copy_key(&n->key, &b->key); 12202a285686SKent Overstreet mutex_unlock(&n->write_lock); 122167539e85SKent Overstreet } 1222cafe5635SKent Overstreet 1223cafe5635SKent Overstreet return n; 1224cafe5635SKent Overstreet } 1225cafe5635SKent Overstreet 12268835c123SKent Overstreet static void make_btree_freeing_key(struct btree *b, struct bkey *k) 12278835c123SKent Overstreet { 12286f10f7d1SColy Li unsigned int i; 12298835c123SKent Overstreet 123005335cffSKent Overstreet mutex_lock(&b->c->bucket_lock); 123105335cffSKent Overstreet 123205335cffSKent Overstreet atomic_inc(&b->c->prio_blocked); 123305335cffSKent Overstreet 12348835c123SKent Overstreet bkey_copy(k, &b->key); 12358835c123SKent Overstreet bkey_copy_key(k, &ZERO_KEY); 12368835c123SKent Overstreet 123705335cffSKent Overstreet for (i = 0; i < KEY_PTRS(k); i++) 123805335cffSKent Overstreet SET_PTR_GEN(k, i, 123905335cffSKent Overstreet bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 124005335cffSKent Overstreet PTR_BUCKET(b->c, &b->key, i))); 12418835c123SKent Overstreet 124205335cffSKent Overstreet mutex_unlock(&b->c->bucket_lock); 12438835c123SKent Overstreet } 12448835c123SKent Overstreet 124578365411SKent Overstreet static int btree_check_reserve(struct btree *b, struct btree_op *op) 124678365411SKent Overstreet { 124778365411SKent Overstreet struct cache_set *c = b->c; 124878365411SKent Overstreet struct cache *ca; 12496f10f7d1SColy Li unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; 125078365411SKent Overstreet 125178365411SKent 
Overstreet mutex_lock(&c->bucket_lock); 125278365411SKent Overstreet 125378365411SKent Overstreet for_each_cache(ca, c, i) 125478365411SKent Overstreet if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { 125578365411SKent Overstreet if (op) 12560a63b66dSKent Overstreet prepare_to_wait(&c->btree_cache_wait, &op->wait, 125778365411SKent Overstreet TASK_UNINTERRUPTIBLE); 12580a63b66dSKent Overstreet mutex_unlock(&c->bucket_lock); 12590a63b66dSKent Overstreet return -EINTR; 126078365411SKent Overstreet } 126178365411SKent Overstreet 126278365411SKent Overstreet mutex_unlock(&c->bucket_lock); 12630a63b66dSKent Overstreet 12640a63b66dSKent Overstreet return mca_cannibalize_lock(b->c, op); 126578365411SKent Overstreet } 126678365411SKent Overstreet 1267cafe5635SKent Overstreet /* Garbage collection */ 1268cafe5635SKent Overstreet 1269487dded8SKent Overstreet static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, 1270487dded8SKent Overstreet struct bkey *k) 1271cafe5635SKent Overstreet { 1272cafe5635SKent Overstreet uint8_t stale = 0; 12736f10f7d1SColy Li unsigned int i; 1274cafe5635SKent Overstreet struct bucket *g; 1275cafe5635SKent Overstreet 1276cafe5635SKent Overstreet /* 1277cafe5635SKent Overstreet * ptr_invalid() can't return true for the keys that mark btree nodes as 1278cafe5635SKent Overstreet * freed, but since ptr_bad() returns true we'll never actually use them 1279cafe5635SKent Overstreet * for anything and thus we don't want to mark their pointers here 1280cafe5635SKent Overstreet */ 1281cafe5635SKent Overstreet if (!bkey_cmp(k, &ZERO_KEY)) 1282cafe5635SKent Overstreet return stale; 1283cafe5635SKent Overstreet 1284cafe5635SKent Overstreet for (i = 0; i < KEY_PTRS(k); i++) { 1285cafe5635SKent Overstreet if (!ptr_available(c, k, i)) 1286cafe5635SKent Overstreet continue; 1287cafe5635SKent Overstreet 1288cafe5635SKent Overstreet g = PTR_BUCKET(c, k, i); 1289cafe5635SKent Overstreet 12903a2fd9d5SKent Overstreet if (gen_after(g->last_gc, PTR_GEN(k, i))) 12913a2fd9d5SKent Overstreet g->last_gc = PTR_GEN(k, i); 1292cafe5635SKent Overstreet 1293cafe5635SKent Overstreet if (ptr_stale(c, k, i)) { 1294cafe5635SKent Overstreet stale = max(stale, ptr_stale(c, k, i)); 1295cafe5635SKent Overstreet continue; 1296cafe5635SKent Overstreet } 1297cafe5635SKent Overstreet 1298cafe5635SKent Overstreet cache_bug_on(GC_MARK(g) && 1299cafe5635SKent Overstreet (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1300cafe5635SKent Overstreet c, "inconsistent ptrs: mark = %llu, level = %i", 1301cafe5635SKent Overstreet GC_MARK(g), level); 1302cafe5635SKent Overstreet 1303cafe5635SKent Overstreet if (level) 1304cafe5635SKent Overstreet SET_GC_MARK(g, GC_MARK_METADATA); 1305cafe5635SKent Overstreet else if (KEY_DIRTY(k)) 1306cafe5635SKent Overstreet SET_GC_MARK(g, GC_MARK_DIRTY); 13074fe6a816SKent Overstreet else if (!GC_MARK(g)) 13084fe6a816SKent Overstreet SET_GC_MARK(g, GC_MARK_RECLAIMABLE); 1309cafe5635SKent Overstreet 1310cafe5635SKent Overstreet /* guard against overflow */ 13116f10f7d1SColy Li SET_GC_SECTORS_USED(g, min_t(unsigned int, 1312cafe5635SKent Overstreet GC_SECTORS_USED(g) + KEY_SIZE(k), 131394717447SDarrick J.
Wong MAX_GC_SECTORS_USED)); 1314cafe5635SKent Overstreet 1315cafe5635SKent Overstreet BUG_ON(!GC_SECTORS_USED(g)); 1316cafe5635SKent Overstreet } 1317cafe5635SKent Overstreet 1318cafe5635SKent Overstreet return stale; 1319cafe5635SKent Overstreet } 1320cafe5635SKent Overstreet 1321cafe5635SKent Overstreet #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1322cafe5635SKent Overstreet 1323487dded8SKent Overstreet void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) 1324487dded8SKent Overstreet { 13256f10f7d1SColy Li unsigned int i; 1326487dded8SKent Overstreet 1327487dded8SKent Overstreet for (i = 0; i < KEY_PTRS(k); i++) 1328487dded8SKent Overstreet if (ptr_available(c, k, i) && 1329487dded8SKent Overstreet !ptr_stale(c, k, i)) { 1330487dded8SKent Overstreet struct bucket *b = PTR_BUCKET(c, k, i); 1331487dded8SKent Overstreet 1332487dded8SKent Overstreet b->gen = PTR_GEN(k, i); 1333487dded8SKent Overstreet 1334487dded8SKent Overstreet if (level && bkey_cmp(k, &ZERO_KEY)) 1335487dded8SKent Overstreet b->prio = BTREE_PRIO; 1336487dded8SKent Overstreet else if (!level && b->prio == BTREE_PRIO) 1337487dded8SKent Overstreet b->prio = INITIAL_PRIO; 1338487dded8SKent Overstreet } 1339487dded8SKent Overstreet 1340487dded8SKent Overstreet __bch_btree_mark_key(c, level, k); 1341487dded8SKent Overstreet } 1342487dded8SKent Overstreet 1343d44c2f9eSTang Junhui void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) 1344d44c2f9eSTang Junhui { 1345d44c2f9eSTang Junhui stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; 1346d44c2f9eSTang Junhui } 1347d44c2f9eSTang Junhui 1348a1f0358bSKent Overstreet static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 1349cafe5635SKent Overstreet { 1350cafe5635SKent Overstreet uint8_t stale = 0; 13516f10f7d1SColy Li unsigned int keys = 0, good_keys = 0; 1352cafe5635SKent Overstreet struct bkey *k; 1353cafe5635SKent Overstreet struct btree_iter iter; 1354cafe5635SKent Overstreet struct bset_tree *t; 1355cafe5635SKent Overstreet 1356cafe5635SKent Overstreet gc->nodes++; 1357cafe5635SKent Overstreet 1358c052dd9aSKent Overstreet for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 1359cafe5635SKent Overstreet stale = max(stale, btree_mark_key(b, k)); 1360a1f0358bSKent Overstreet keys++; 1361cafe5635SKent Overstreet 1362a85e968eSKent Overstreet if (bch_ptr_bad(&b->keys, k)) 1363cafe5635SKent Overstreet continue; 1364cafe5635SKent Overstreet 1365cafe5635SKent Overstreet gc->key_bytes += bkey_u64s(k); 1366cafe5635SKent Overstreet gc->nkeys++; 1367a1f0358bSKent Overstreet good_keys++; 1368cafe5635SKent Overstreet 1369cafe5635SKent Overstreet gc->data += KEY_SIZE(k); 1370cafe5635SKent Overstreet } 1371cafe5635SKent Overstreet 1372a85e968eSKent Overstreet for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) 1373cafe5635SKent Overstreet btree_bug_on(t->size && 1374a85e968eSKent Overstreet bset_written(&b->keys, t) && 1375cafe5635SKent Overstreet bkey_cmp(&b->key, &t->end) < 0, 1376cafe5635SKent Overstreet b, "found short btree key in gc"); 1377cafe5635SKent Overstreet 1378a1f0358bSKent Overstreet if (b->c->gc_always_rewrite) 1379a1f0358bSKent Overstreet return true; 1380a1f0358bSKent Overstreet 1381a1f0358bSKent Overstreet if (stale > 10) 1382a1f0358bSKent Overstreet return true; 1383a1f0358bSKent Overstreet 1384a1f0358bSKent Overstreet if ((keys - good_keys) * 2 > keys) 1385a1f0358bSKent Overstreet return true; 1386a1f0358bSKent Overstreet 1387a1f0358bSKent Overstreet return 
false; 1388cafe5635SKent Overstreet } 1389cafe5635SKent Overstreet 1390a1f0358bSKent Overstreet #define GC_MERGE_NODES 4U 1391cafe5635SKent Overstreet 1392cafe5635SKent Overstreet struct gc_merge_info { 1393cafe5635SKent Overstreet struct btree *b; 13946f10f7d1SColy Li unsigned int keys; 1395cafe5635SKent Overstreet }; 1396cafe5635SKent Overstreet 1397fc2d5988SColy Li static int bch_btree_insert_node(struct btree *b, struct btree_op *op, 1398fc2d5988SColy Li struct keylist *insert_keys, 1399fc2d5988SColy Li atomic_t *journal_ref, 1400fc2d5988SColy Li struct bkey *replace_key); 1401a1f0358bSKent Overstreet 1402a1f0358bSKent Overstreet static int btree_gc_coalesce(struct btree *b, struct btree_op *op, 14030a63b66dSKent Overstreet struct gc_stat *gc, struct gc_merge_info *r) 1404cafe5635SKent Overstreet { 14056f10f7d1SColy Li unsigned int i, nodes = 0, keys = 0, blocks; 1406a1f0358bSKent Overstreet struct btree *new_nodes[GC_MERGE_NODES]; 14070a63b66dSKent Overstreet struct keylist keylist; 1408b54d6934SKent Overstreet struct closure cl; 1409a1f0358bSKent Overstreet struct bkey *k; 1410b54d6934SKent Overstreet 14110a63b66dSKent Overstreet bch_keylist_init(&keylist); 14120a63b66dSKent Overstreet 14130a63b66dSKent Overstreet if (btree_check_reserve(b, NULL)) 14140a63b66dSKent Overstreet return 0; 14150a63b66dSKent Overstreet 1416a1f0358bSKent Overstreet memset(new_nodes, 0, sizeof(new_nodes)); 1417b54d6934SKent Overstreet closure_init_stack(&cl); 1418cafe5635SKent Overstreet 1419a1f0358bSKent Overstreet while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 1420cafe5635SKent Overstreet keys += r[nodes++].keys; 1421cafe5635SKent Overstreet 1422cafe5635SKent Overstreet blocks = btree_default_blocks(b->c) * 2 / 3; 1423cafe5635SKent Overstreet 1424cafe5635SKent Overstreet if (nodes < 2 || 1425a85e968eSKent Overstreet __set_blocks(b->keys.set[0].data, keys, 1426ee811287SKent Overstreet block_bytes(b->c)) > blocks * (nodes - 1)) 1427a1f0358bSKent Overstreet return 0; 1428cafe5635SKent Overstreet 1429a1f0358bSKent Overstreet for (i = 0; i < nodes; i++) { 14300a63b66dSKent Overstreet new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); 1431a1f0358bSKent Overstreet if (IS_ERR_OR_NULL(new_nodes[i])) 1432a1f0358bSKent Overstreet goto out_nocoalesce; 1433cafe5635SKent Overstreet } 1434cafe5635SKent Overstreet 14350a63b66dSKent Overstreet /* 14360a63b66dSKent Overstreet * We have to check the reserve here, after we've allocated our new 14370a63b66dSKent Overstreet * nodes, to make sure the insert below will succeed - we also check 14380a63b66dSKent Overstreet * before as an optimization to potentially avoid a bunch of expensive 14390a63b66dSKent Overstreet * allocs/sorts 14400a63b66dSKent Overstreet */ 14410a63b66dSKent Overstreet if (btree_check_reserve(b, NULL)) 14420a63b66dSKent Overstreet goto out_nocoalesce; 14430a63b66dSKent Overstreet 14442a285686SKent Overstreet for (i = 0; i < nodes; i++) 14452a285686SKent Overstreet mutex_lock(&new_nodes[i]->write_lock); 14462a285686SKent Overstreet 1447cafe5635SKent Overstreet for (i = nodes - 1; i > 0; --i) { 1448ee811287SKent Overstreet struct bset *n1 = btree_bset_first(new_nodes[i]); 1449ee811287SKent Overstreet struct bset *n2 = btree_bset_first(new_nodes[i - 1]); 1450cafe5635SKent Overstreet struct bkey *k, *last = NULL; 1451cafe5635SKent Overstreet 1452cafe5635SKent Overstreet keys = 0; 1453cafe5635SKent Overstreet 1454a1f0358bSKent Overstreet if (i > 1) { 1455cafe5635SKent Overstreet for (k = n2->start; 1456fafff81cSKent Overstreet k < 
bset_bkey_last(n2); 1457cafe5635SKent Overstreet k = bkey_next(k)) { 1458cafe5635SKent Overstreet if (__set_blocks(n1, n1->keys + keys + 1459ee811287SKent Overstreet bkey_u64s(k), 1460ee811287SKent Overstreet block_bytes(b->c)) > blocks) 1461cafe5635SKent Overstreet break; 1462cafe5635SKent Overstreet 1463cafe5635SKent Overstreet last = k; 1464cafe5635SKent Overstreet keys += bkey_u64s(k); 1465cafe5635SKent Overstreet } 1466a1f0358bSKent Overstreet } else { 1467a1f0358bSKent Overstreet /* 1468a1f0358bSKent Overstreet * Last node we're not getting rid of - we're getting 1469a1f0358bSKent Overstreet * rid of the node at r[0]. Have to try and fit all of 1470a1f0358bSKent Overstreet * the remaining keys into this node; we can't ensure 1471a1f0358bSKent Overstreet * they will always fit due to rounding and variable 1472a1f0358bSKent Overstreet * length keys (shouldn't be possible in practice, 1473a1f0358bSKent Overstreet * though) 1474a1f0358bSKent Overstreet */ 1475a1f0358bSKent Overstreet if (__set_blocks(n1, n1->keys + n2->keys, 1476ee811287SKent Overstreet block_bytes(b->c)) > 1477ee811287SKent Overstreet btree_blocks(new_nodes[i])) 1478a1f0358bSKent Overstreet goto out_nocoalesce; 1479a1f0358bSKent Overstreet 1480a1f0358bSKent Overstreet keys = n2->keys; 1481a1f0358bSKent Overstreet /* Take the key of the node we're getting rid of */ 1482a1f0358bSKent Overstreet last = &r->b->key; 1483a1f0358bSKent Overstreet } 1484cafe5635SKent Overstreet 1485ee811287SKent Overstreet BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > 1486ee811287SKent Overstreet btree_blocks(new_nodes[i])); 1487cafe5635SKent Overstreet 1488a1f0358bSKent Overstreet if (last) 1489a1f0358bSKent Overstreet bkey_copy_key(&new_nodes[i]->key, last); 1490cafe5635SKent Overstreet 1491fafff81cSKent Overstreet memcpy(bset_bkey_last(n1), 1492cafe5635SKent Overstreet n2->start, 1493fafff81cSKent Overstreet (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); 1494cafe5635SKent Overstreet 1495cafe5635SKent Overstreet n1->keys += keys; 1496a1f0358bSKent Overstreet r[i].keys = n1->keys; 1497cafe5635SKent Overstreet 1498cafe5635SKent Overstreet memmove(n2->start, 1499fafff81cSKent Overstreet bset_bkey_idx(n2, keys), 1500fafff81cSKent Overstreet (void *) bset_bkey_last(n2) - 1501fafff81cSKent Overstreet (void *) bset_bkey_idx(n2, keys)); 1502cafe5635SKent Overstreet 1503cafe5635SKent Overstreet n2->keys -= keys; 1504cafe5635SKent Overstreet 15050a63b66dSKent Overstreet if (__bch_keylist_realloc(&keylist, 1506085d2a3dSKent Overstreet bkey_u64s(&new_nodes[i]->key))) 1507a1f0358bSKent Overstreet goto out_nocoalesce; 1508a1f0358bSKent Overstreet 1509a1f0358bSKent Overstreet bch_btree_node_write(new_nodes[i], &cl); 15100a63b66dSKent Overstreet bch_keylist_add(&keylist, &new_nodes[i]->key); 1511cafe5635SKent Overstreet } 1512cafe5635SKent Overstreet 15132a285686SKent Overstreet for (i = 0; i < nodes; i++) 15142a285686SKent Overstreet mutex_unlock(&new_nodes[i]->write_lock); 15152a285686SKent Overstreet 151605335cffSKent Overstreet closure_sync(&cl); 151705335cffSKent Overstreet 151805335cffSKent Overstreet /* We emptied out this node */ 151905335cffSKent Overstreet BUG_ON(btree_bset_first(new_nodes[0])->keys); 152005335cffSKent Overstreet btree_node_free(new_nodes[0]); 152105335cffSKent Overstreet rw_unlock(true, new_nodes[0]); 1522400ffaa2SSlava Pestov new_nodes[0] = NULL; 152305335cffSKent Overstreet 1524a1f0358bSKent Overstreet for (i = 0; i < nodes; i++) { 15250a63b66dSKent Overstreet if (__bch_keylist_realloc(&keylist, 
bkey_u64s(&r[i].b->key))) 1526a1f0358bSKent Overstreet goto out_nocoalesce; 1527a1f0358bSKent Overstreet 15280a63b66dSKent Overstreet make_btree_freeing_key(r[i].b, keylist.top); 15290a63b66dSKent Overstreet bch_keylist_push(&keylist); 1530a1f0358bSKent Overstreet } 1531a1f0358bSKent Overstreet 15320a63b66dSKent Overstreet bch_btree_insert_node(b, op, &keylist, NULL, NULL); 15330a63b66dSKent Overstreet BUG_ON(!bch_keylist_empty(&keylist)); 1534a1f0358bSKent Overstreet 1535a1f0358bSKent Overstreet for (i = 0; i < nodes; i++) { 1536a1f0358bSKent Overstreet btree_node_free(r[i].b); 1537a1f0358bSKent Overstreet rw_unlock(true, r[i].b); 1538a1f0358bSKent Overstreet 1539a1f0358bSKent Overstreet r[i].b = new_nodes[i]; 1540a1f0358bSKent Overstreet } 1541a1f0358bSKent Overstreet 1542a1f0358bSKent Overstreet memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); 1543a1f0358bSKent Overstreet r[nodes - 1].b = ERR_PTR(-EINTR); 1544cafe5635SKent Overstreet 1545c37511b8SKent Overstreet trace_bcache_btree_gc_coalesce(nodes); 1546cafe5635SKent Overstreet gc->nodes--; 1547cafe5635SKent Overstreet 15480a63b66dSKent Overstreet bch_keylist_free(&keylist); 15490a63b66dSKent Overstreet 1550a1f0358bSKent Overstreet /* Invalidated our iterator */ 1551a1f0358bSKent Overstreet return -EINTR; 1552a1f0358bSKent Overstreet 1553a1f0358bSKent Overstreet out_nocoalesce: 1554a1f0358bSKent Overstreet closure_sync(&cl); 1555a1f0358bSKent Overstreet 15560a63b66dSKent Overstreet while ((k = bch_keylist_pop(&keylist))) 1557a1f0358bSKent Overstreet if (!bkey_cmp(k, &ZERO_KEY)) 1558a1f0358bSKent Overstreet atomic_dec(&b->c->prio_blocked); 1559f16277caSShenghui Wang bch_keylist_free(&keylist); 1560a1f0358bSKent Overstreet 1561a1f0358bSKent Overstreet for (i = 0; i < nodes; i++) 1562a1f0358bSKent Overstreet if (!IS_ERR_OR_NULL(new_nodes[i])) { 1563a1f0358bSKent Overstreet btree_node_free(new_nodes[i]); 1564a1f0358bSKent Overstreet rw_unlock(true, new_nodes[i]); 1565a1f0358bSKent Overstreet } 1566a1f0358bSKent Overstreet return 0; 1567a1f0358bSKent Overstreet } 1568a1f0358bSKent Overstreet 15690a63b66dSKent Overstreet static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, 15700a63b66dSKent Overstreet struct btree *replace) 15710a63b66dSKent Overstreet { 15720a63b66dSKent Overstreet struct keylist keys; 15730a63b66dSKent Overstreet struct btree *n; 15740a63b66dSKent Overstreet 15750a63b66dSKent Overstreet if (btree_check_reserve(b, NULL)) 15760a63b66dSKent Overstreet return 0; 15770a63b66dSKent Overstreet 15780a63b66dSKent Overstreet n = btree_node_alloc_replacement(replace, NULL); 15790a63b66dSKent Overstreet 15800a63b66dSKent Overstreet /* recheck reserve after allocating replacement node */ 15810a63b66dSKent Overstreet if (btree_check_reserve(b, NULL)) { 15820a63b66dSKent Overstreet btree_node_free(n); 15830a63b66dSKent Overstreet rw_unlock(true, n); 15840a63b66dSKent Overstreet return 0; 15850a63b66dSKent Overstreet } 15860a63b66dSKent Overstreet 15870a63b66dSKent Overstreet bch_btree_node_write_sync(n); 15880a63b66dSKent Overstreet 15890a63b66dSKent Overstreet bch_keylist_init(&keys); 15900a63b66dSKent Overstreet bch_keylist_add(&keys, &n->key); 15910a63b66dSKent Overstreet 15920a63b66dSKent Overstreet make_btree_freeing_key(replace, keys.top); 15930a63b66dSKent Overstreet bch_keylist_push(&keys); 15940a63b66dSKent Overstreet 15950a63b66dSKent Overstreet bch_btree_insert_node(b, op, &keys, NULL, NULL); 15960a63b66dSKent Overstreet BUG_ON(!bch_keylist_empty(&keys)); 15970a63b66dSKent Overstreet 15980a63b66dSKent 
Overstreet btree_node_free(replace); 15990a63b66dSKent Overstreet rw_unlock(true, n); 16000a63b66dSKent Overstreet 16010a63b66dSKent Overstreet /* Invalidated our iterator */ 16020a63b66dSKent Overstreet return -EINTR; 16030a63b66dSKent Overstreet } 16040a63b66dSKent Overstreet 16056f10f7d1SColy Li static unsigned int btree_gc_count_keys(struct btree *b) 1606a1f0358bSKent Overstreet { 1607a1f0358bSKent Overstreet struct bkey *k; 1608a1f0358bSKent Overstreet struct btree_iter iter; 16096f10f7d1SColy Li unsigned int ret = 0; 1610a1f0358bSKent Overstreet 1611c052dd9aSKent Overstreet for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 1612a1f0358bSKent Overstreet ret += bkey_u64s(k); 1613a1f0358bSKent Overstreet 1614a1f0358bSKent Overstreet return ret; 1615cafe5635SKent Overstreet } 1616cafe5635SKent Overstreet 16177f4a59deSTang Junhui static size_t btree_gc_min_nodes(struct cache_set *c) 16187f4a59deSTang Junhui { 16197f4a59deSTang Junhui size_t min_nodes; 16207f4a59deSTang Junhui 16217f4a59deSTang Junhui /* 16227f4a59deSTang Junhui * Incremental GC pauses for 100ms whenever front-side 16237f4a59deSTang Junhui * I/O arrives, so with many btree nodes, if GC only 16247f4a59deSTang Junhui * processed a constant number (100) of nodes each time, 16257f4a59deSTang Junhui * a full GC would take a long time and the front-side 16267f4a59deSTang Junhui * I/Os would use up the free buckets (since no new 16277f4a59deSTang Junhui * bucket can be allocated during GC) and be blocked again. 16287f4a59deSTang Junhui * So GC should process a varying number of nodes, scaled 16297f4a59deSTang Junhui * to the total number of btree nodes; this is realized by 16307f4a59deSTang Junhui * dividing GC into a constant number (100) of passes, so 16317f4a59deSTang Junhui * with many btree nodes GC processes more nodes per pass, 16327f4a59deSTang Junhui * and otherwise it processes fewer nodes per pass (but 16337f4a59deSTang Junhui * never less than MIN_GC_NODES). 16347f4a59deSTang Junhui */ 16357f4a59deSTang Junhui min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; 16367f4a59deSTang Junhui if (min_nodes < MIN_GC_NODES) 16377f4a59deSTang Junhui min_nodes = MIN_GC_NODES; 16387f4a59deSTang Junhui 16397f4a59deSTang Junhui return min_nodes; 16407f4a59deSTang Junhui } 16417f4a59deSTang Junhui 16427f4a59deSTang Junhui 1643cafe5635SKent Overstreet static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1644cafe5635SKent Overstreet struct closure *writes, struct gc_stat *gc) 1645cafe5635SKent Overstreet { 1646a1f0358bSKent Overstreet int ret = 0; 1647a1f0358bSKent Overstreet bool should_rewrite; 1648a1f0358bSKent Overstreet struct bkey *k; 1649a1f0358bSKent Overstreet struct btree_iter iter; 1650cafe5635SKent Overstreet struct gc_merge_info r[GC_MERGE_NODES]; 16512a285686SKent Overstreet struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; 1652cafe5635SKent Overstreet 1653c052dd9aSKent Overstreet bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1654cafe5635SKent Overstreet 16552a285686SKent Overstreet for (i = r; i < r + ARRAY_SIZE(r); i++) 16562a285686SKent Overstreet i->b = ERR_PTR(-EINTR); 1657cafe5635SKent Overstreet 1658a1f0358bSKent Overstreet while (1) { 1659a85e968eSKent Overstreet k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1660a1f0358bSKent Overstreet if (k) { 16610a63b66dSKent Overstreet r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 16622452cc89SSlava Pestov true, b); 1663cafe5635SKent Overstreet if (IS_ERR(r->b)) { 1664cafe5635SKent Overstreet ret = PTR_ERR(r->b); 1665cafe5635SKent
Overstreet break; 1666cafe5635SKent Overstreet } 1667cafe5635SKent Overstreet 1668a1f0358bSKent Overstreet r->keys = btree_gc_count_keys(r->b); 1669cafe5635SKent Overstreet 16700a63b66dSKent Overstreet ret = btree_gc_coalesce(b, op, gc, r); 1671a1f0358bSKent Overstreet if (ret) 1672cafe5635SKent Overstreet break; 1673cafe5635SKent Overstreet } 1674cafe5635SKent Overstreet 1675a1f0358bSKent Overstreet if (!last->b) 1676a1f0358bSKent Overstreet break; 1677cafe5635SKent Overstreet 1678a1f0358bSKent Overstreet if (!IS_ERR(last->b)) { 1679a1f0358bSKent Overstreet should_rewrite = btree_gc_mark_node(last->b, gc); 16800a63b66dSKent Overstreet if (should_rewrite) { 16810a63b66dSKent Overstreet ret = btree_gc_rewrite_node(b, op, last->b); 16820a63b66dSKent Overstreet if (ret) 1683a1f0358bSKent Overstreet break; 1684a1f0358bSKent Overstreet } 1685a1f0358bSKent Overstreet 1686a1f0358bSKent Overstreet if (last->b->level) { 1687a1f0358bSKent Overstreet ret = btree_gc_recurse(last->b, op, writes, gc); 1688a1f0358bSKent Overstreet if (ret) 1689a1f0358bSKent Overstreet break; 1690a1f0358bSKent Overstreet } 1691a1f0358bSKent Overstreet 1692a1f0358bSKent Overstreet bkey_copy_key(&b->c->gc_done, &last->b->key); 1693a1f0358bSKent Overstreet 1694a1f0358bSKent Overstreet /* 1695a1f0358bSKent Overstreet * Must flush leaf nodes before gc ends, since replace 1696a1f0358bSKent Overstreet * operations aren't journalled 1697cafe5635SKent Overstreet */ 16982a285686SKent Overstreet mutex_lock(&last->b->write_lock); 1699a1f0358bSKent Overstreet if (btree_node_dirty(last->b)) 1700a1f0358bSKent Overstreet bch_btree_node_write(last->b, writes); 17012a285686SKent Overstreet mutex_unlock(&last->b->write_lock); 1702a1f0358bSKent Overstreet rw_unlock(true, last->b); 1703a1f0358bSKent Overstreet } 1704a1f0358bSKent Overstreet 1705a1f0358bSKent Overstreet memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); 1706a1f0358bSKent Overstreet r->b = NULL; 1707a1f0358bSKent Overstreet 17085c25c4fcSTang Junhui if (atomic_read(&b->c->search_inflight) && 17097f4a59deSTang Junhui gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { 17105c25c4fcSTang Junhui gc->nodes_pre = gc->nodes; 17115c25c4fcSTang Junhui ret = -EAGAIN; 17125c25c4fcSTang Junhui break; 17135c25c4fcSTang Junhui } 17145c25c4fcSTang Junhui 1715cafe5635SKent Overstreet if (need_resched()) { 1716cafe5635SKent Overstreet ret = -EAGAIN; 1717cafe5635SKent Overstreet break; 1718cafe5635SKent Overstreet } 1719cafe5635SKent Overstreet } 1720cafe5635SKent Overstreet 17212a285686SKent Overstreet for (i = r; i < r + ARRAY_SIZE(r); i++) 17222a285686SKent Overstreet if (!IS_ERR_OR_NULL(i->b)) { 17232a285686SKent Overstreet mutex_lock(&i->b->write_lock); 17242a285686SKent Overstreet if (btree_node_dirty(i->b)) 17252a285686SKent Overstreet bch_btree_node_write(i->b, writes); 17262a285686SKent Overstreet mutex_unlock(&i->b->write_lock); 17272a285686SKent Overstreet rw_unlock(true, i->b); 1728a1f0358bSKent Overstreet } 1729cafe5635SKent Overstreet 1730cafe5635SKent Overstreet return ret; 1731cafe5635SKent Overstreet } 1732cafe5635SKent Overstreet 1733cafe5635SKent Overstreet static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1734cafe5635SKent Overstreet struct closure *writes, struct gc_stat *gc) 1735cafe5635SKent Overstreet { 1736cafe5635SKent Overstreet struct btree *n = NULL; 1737a1f0358bSKent Overstreet int ret = 0; 1738a1f0358bSKent Overstreet bool should_rewrite; 1739cafe5635SKent Overstreet 1740a1f0358bSKent Overstreet should_rewrite = btree_gc_mark_node(b, gc); 
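/*
 * btree_gc_mark_node() requests a rewrite when gc_always_rewrite is set,
 * when too many of the node's pointers are stale, or when more than half
 * of its keys are bad; rewriting into a freshly allocated node compacts
 * that garbage away.
 */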
1741a1f0358bSKent Overstreet if (should_rewrite) { 17420a63b66dSKent Overstreet n = btree_node_alloc_replacement(b, NULL); 1743cafe5635SKent Overstreet 1744cafe5635SKent Overstreet if (!IS_ERR_OR_NULL(n)) { 1745a1f0358bSKent Overstreet bch_btree_node_write_sync(n); 17462a285686SKent Overstreet 1747a1f0358bSKent Overstreet bch_btree_set_root(n); 1748a1f0358bSKent Overstreet btree_node_free(b); 1749a1f0358bSKent Overstreet rw_unlock(true, n); 1750a1f0358bSKent Overstreet 1751a1f0358bSKent Overstreet return -EINTR; 1752cafe5635SKent Overstreet } 1753a1f0358bSKent Overstreet } 1754a1f0358bSKent Overstreet 1755487dded8SKent Overstreet __bch_btree_mark_key(b->c, b->level + 1, &b->key); 1756487dded8SKent Overstreet 1757a1f0358bSKent Overstreet if (b->level) { 1758a1f0358bSKent Overstreet ret = btree_gc_recurse(b, op, writes, gc); 1759a1f0358bSKent Overstreet if (ret) 1760a1f0358bSKent Overstreet return ret; 1761a1f0358bSKent Overstreet } 1762a1f0358bSKent Overstreet 1763a1f0358bSKent Overstreet bkey_copy_key(&b->c->gc_done, &b->key); 1764cafe5635SKent Overstreet 1765cafe5635SKent Overstreet return ret; 1766cafe5635SKent Overstreet } 1767cafe5635SKent Overstreet 1768cafe5635SKent Overstreet static void btree_gc_start(struct cache_set *c) 1769cafe5635SKent Overstreet { 1770cafe5635SKent Overstreet struct cache *ca; 1771cafe5635SKent Overstreet struct bucket *b; 17726f10f7d1SColy Li unsigned int i; 1773cafe5635SKent Overstreet 1774cafe5635SKent Overstreet if (!c->gc_mark_valid) 1775cafe5635SKent Overstreet return; 1776cafe5635SKent Overstreet 1777cafe5635SKent Overstreet mutex_lock(&c->bucket_lock); 1778cafe5635SKent Overstreet 1779cafe5635SKent Overstreet c->gc_mark_valid = 0; 1780cafe5635SKent Overstreet c->gc_done = ZERO_KEY; 1781cafe5635SKent Overstreet 1782cafe5635SKent Overstreet for_each_cache(ca, c, i) 1783cafe5635SKent Overstreet for_each_bucket(b, ca) { 17843a2fd9d5SKent Overstreet b->last_gc = b->gen; 178529ebf465SKent Overstreet if (!atomic_read(&b->pin)) { 17864fe6a816SKent Overstreet SET_GC_MARK(b, 0); 178729ebf465SKent Overstreet SET_GC_SECTORS_USED(b, 0); 178829ebf465SKent Overstreet } 1789cafe5635SKent Overstreet } 1790cafe5635SKent Overstreet 1791cafe5635SKent Overstreet mutex_unlock(&c->bucket_lock); 1792cafe5635SKent Overstreet } 1793cafe5635SKent Overstreet 1794d44c2f9eSTang Junhui static void bch_btree_gc_finish(struct cache_set *c) 1795cafe5635SKent Overstreet { 1796cafe5635SKent Overstreet struct bucket *b; 1797cafe5635SKent Overstreet struct cache *ca; 17986f10f7d1SColy Li unsigned int i; 1799cafe5635SKent Overstreet 1800cafe5635SKent Overstreet mutex_lock(&c->bucket_lock); 1801cafe5635SKent Overstreet 1802cafe5635SKent Overstreet set_gc_sectors(c); 1803cafe5635SKent Overstreet c->gc_mark_valid = 1; 1804cafe5635SKent Overstreet c->need_gc = 0; 1805cafe5635SKent Overstreet 1806cafe5635SKent Overstreet for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1807cafe5635SKent Overstreet SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1808cafe5635SKent Overstreet GC_MARK_METADATA); 1809cafe5635SKent Overstreet 1810bf0a628aSNicholas Swenson /* don't reclaim buckets to which writeback keys point */ 1811bf0a628aSNicholas Swenson rcu_read_lock(); 18122831231dSColy Li for (i = 0; i < c->devices_max_used; i++) { 1813bf0a628aSNicholas Swenson struct bcache_device *d = c->devices[i]; 1814bf0a628aSNicholas Swenson struct cached_dev *dc; 1815bf0a628aSNicholas Swenson struct keybuf_key *w, *n; 18166f10f7d1SColy Li unsigned int j; 1817bf0a628aSNicholas Swenson 1818bf0a628aSNicholas Swenson if (!d || 
UUID_FLASH_ONLY(&c->uuids[i])) 1819bf0a628aSNicholas Swenson continue; 1820bf0a628aSNicholas Swenson dc = container_of(d, struct cached_dev, disk); 1821bf0a628aSNicholas Swenson 1822bf0a628aSNicholas Swenson spin_lock(&dc->writeback_keys.lock); 1823bf0a628aSNicholas Swenson rbtree_postorder_for_each_entry_safe(w, n, 1824bf0a628aSNicholas Swenson &dc->writeback_keys.keys, node) 1825bf0a628aSNicholas Swenson for (j = 0; j < KEY_PTRS(&w->key); j++) 1826bf0a628aSNicholas Swenson SET_GC_MARK(PTR_BUCKET(c, &w->key, j), 1827bf0a628aSNicholas Swenson GC_MARK_DIRTY); 1828bf0a628aSNicholas Swenson spin_unlock(&dc->writeback_keys.lock); 1829bf0a628aSNicholas Swenson } 1830bf0a628aSNicholas Swenson rcu_read_unlock(); 1831bf0a628aSNicholas Swenson 1832d44c2f9eSTang Junhui c->avail_nbuckets = 0; 1833cafe5635SKent Overstreet for_each_cache(ca, c, i) { 1834cafe5635SKent Overstreet uint64_t *i; 1835cafe5635SKent Overstreet 1836cafe5635SKent Overstreet ca->invalidate_needs_gc = 0; 1837cafe5635SKent Overstreet 1838cafe5635SKent Overstreet for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1839cafe5635SKent Overstreet SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1840cafe5635SKent Overstreet 1841cafe5635SKent Overstreet for (i = ca->prio_buckets; 1842cafe5635SKent Overstreet i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1843cafe5635SKent Overstreet SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1844cafe5635SKent Overstreet 1845cafe5635SKent Overstreet for_each_bucket(b, ca) { 1846cafe5635SKent Overstreet c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1847cafe5635SKent Overstreet 18484fe6a816SKent Overstreet if (atomic_read(&b->pin)) 18494fe6a816SKent Overstreet continue; 18504fe6a816SKent Overstreet 18514fe6a816SKent Overstreet BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); 18524fe6a816SKent Overstreet 18534fe6a816SKent Overstreet if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) 1854d44c2f9eSTang Junhui c->avail_nbuckets++; 1855cafe5635SKent Overstreet } 1856cafe5635SKent Overstreet } 1857cafe5635SKent Overstreet 1858cafe5635SKent Overstreet mutex_unlock(&c->bucket_lock); 1859cafe5635SKent Overstreet } 1860cafe5635SKent Overstreet 186172a44517SKent Overstreet static void bch_btree_gc(struct cache_set *c) 1862cafe5635SKent Overstreet { 1863cafe5635SKent Overstreet int ret; 1864cafe5635SKent Overstreet struct gc_stat stats; 1865cafe5635SKent Overstreet struct closure writes; 1866cafe5635SKent Overstreet struct btree_op op; 1867cafe5635SKent Overstreet uint64_t start_time = local_clock(); 186857943511SKent Overstreet 1869c37511b8SKent Overstreet trace_bcache_gc_start(c); 1870cafe5635SKent Overstreet 1871cafe5635SKent Overstreet memset(&stats, 0, sizeof(struct gc_stat)); 1872cafe5635SKent Overstreet closure_init_stack(&writes); 1873b54d6934SKent Overstreet bch_btree_op_init(&op, SHRT_MAX); 1874cafe5635SKent Overstreet 1875cafe5635SKent Overstreet btree_gc_start(c); 1876cafe5635SKent Overstreet 1877771f393eSColy Li /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ 1878a1f0358bSKent Overstreet do { 1879cafe5635SKent Overstreet ret = btree_root(gc_root, c, &op, &writes, &stats); 1880cafe5635SKent Overstreet closure_sync(&writes); 1881c5f1e5adSKent Overstreet cond_resched(); 1882cafe5635SKent Overstreet 18835c25c4fcSTang Junhui if (ret == -EAGAIN) 18845c25c4fcSTang Junhui schedule_timeout_interruptible(msecs_to_jiffies 18855c25c4fcSTang Junhui (GC_SLEEP_MS)); 18865c25c4fcSTang Junhui else if (ret) 1887cafe5635SKent Overstreet pr_warn("gc failed!"); 1888771f393eSColy Li } while (ret && 
!test_bit(CACHE_SET_IO_DISABLE, &c->flags)); 1889cafe5635SKent Overstreet 1890d44c2f9eSTang Junhui bch_btree_gc_finish(c); 189157943511SKent Overstreet wake_up_allocators(c); 189257943511SKent Overstreet 1893169ef1cfSKent Overstreet bch_time_stats_update(&c->btree_gc_time, start_time); 1894cafe5635SKent Overstreet 1895cafe5635SKent Overstreet stats.key_bytes *= sizeof(uint64_t); 1896cafe5635SKent Overstreet stats.data <<= 9; 1897d44c2f9eSTang Junhui bch_update_bucket_in_use(c, &stats); 1898cafe5635SKent Overstreet memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1899cafe5635SKent Overstreet 1900c37511b8SKent Overstreet trace_bcache_gc_end(c); 1901cafe5635SKent Overstreet 190272a44517SKent Overstreet bch_moving_gc(c); 1903cafe5635SKent Overstreet } 1904cafe5635SKent Overstreet 1905be628be0SKent Overstreet static bool gc_should_run(struct cache_set *c) 1906cafe5635SKent Overstreet { 1907a1f0358bSKent Overstreet struct cache *ca; 19086f10f7d1SColy Li unsigned int i; 190972a44517SKent Overstreet 1910be628be0SKent Overstreet for_each_cache(ca, c, i) 1911be628be0SKent Overstreet if (ca->invalidate_needs_gc) 1912be628be0SKent Overstreet return true; 191372a44517SKent Overstreet 1914be628be0SKent Overstreet if (atomic_read(&c->sectors_to_gc) < 0) 1915be628be0SKent Overstreet return true; 1916be628be0SKent Overstreet 1917be628be0SKent Overstreet return false; 1918be628be0SKent Overstreet } 1919be628be0SKent Overstreet 1920be628be0SKent Overstreet static int bch_gc_thread(void *arg) 1921be628be0SKent Overstreet { 1922be628be0SKent Overstreet struct cache_set *c = arg; 1923be628be0SKent Overstreet 1924be628be0SKent Overstreet while (1) { 1925be628be0SKent Overstreet wait_event_interruptible(c->gc_wait, 1926771f393eSColy Li kthread_should_stop() || 1927771f393eSColy Li test_bit(CACHE_SET_IO_DISABLE, &c->flags) || 1928771f393eSColy Li gc_should_run(c)); 1929be628be0SKent Overstreet 1930771f393eSColy Li if (kthread_should_stop() || 1931771f393eSColy Li test_bit(CACHE_SET_IO_DISABLE, &c->flags)) 193272a44517SKent Overstreet break; 193372a44517SKent Overstreet 1934be628be0SKent Overstreet set_gc_sectors(c); 1935be628be0SKent Overstreet bch_btree_gc(c); 193672a44517SKent Overstreet } 193772a44517SKent Overstreet 1938771f393eSColy Li wait_for_kthread_stop(); 193972a44517SKent Overstreet return 0; 194072a44517SKent Overstreet } 194172a44517SKent Overstreet 194272a44517SKent Overstreet int bch_gc_thread_start(struct cache_set *c) 194372a44517SKent Overstreet { 1944be628be0SKent Overstreet c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); 19459d134117SVasyl Gomonovych return PTR_ERR_OR_ZERO(c->gc_thread); 1946cafe5635SKent Overstreet } 1947cafe5635SKent Overstreet 1948cafe5635SKent Overstreet /* Initial partial gc */ 1949cafe5635SKent Overstreet 1950487dded8SKent Overstreet static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) 1951cafe5635SKent Overstreet { 195250310164SKent Overstreet int ret = 0; 195350310164SKent Overstreet struct bkey *k, *p = NULL; 1954cafe5635SKent Overstreet struct btree_iter iter; 1955cafe5635SKent Overstreet 1956487dded8SKent Overstreet for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) 1957487dded8SKent Overstreet bch_initial_mark_key(b->c, b->level, k); 1958cafe5635SKent Overstreet 1959487dded8SKent Overstreet bch_initial_mark_key(b->c, b->level + 1, &b->key); 1960cafe5635SKent Overstreet 1961cafe5635SKent Overstreet if (b->level) { 1962c052dd9aSKent Overstreet bch_btree_iter_init(&b->keys, &iter, NULL); 1963cafe5635SKent Overstreet 
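/*
 * Walk the children pairwise: start the read for child k with
 * btree_node_prefetch(), then recurse into the previously fetched
 * child p, so the next node's IO is in flight while the current
 * one is being checked.
 */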
196450310164SKent Overstreet do { 1965a85e968eSKent Overstreet k = bch_btree_iter_next_filter(&iter, &b->keys, 1966a85e968eSKent Overstreet bch_ptr_bad); 19677f4a59deSTang Junhui if (k) { 19682452cc89SSlava Pestov btree_node_prefetch(b, k); 19697f4a59deSTang Junhui /* 19707f4a59deSTang Junhui * initialize c->gc_stats.nodes 19717f4a59deSTang Junhui * for incremental GC 19727f4a59deSTang Junhui */ 19737f4a59deSTang Junhui b->c->gc_stats.nodes++; 19747f4a59deSTang Junhui } 197550310164SKent Overstreet 1976cafe5635SKent Overstreet if (p) 1977487dded8SKent Overstreet ret = btree(check_recurse, p, b, op); 1978cafe5635SKent Overstreet 197950310164SKent Overstreet p = k; 198050310164SKent Overstreet } while (p && !ret); 1981cafe5635SKent Overstreet } 1982cafe5635SKent Overstreet 1983487dded8SKent Overstreet return ret; 1984cafe5635SKent Overstreet } 1985cafe5635SKent Overstreet 1986c18536a7SKent Overstreet int bch_btree_check(struct cache_set *c) 1987cafe5635SKent Overstreet { 1988c18536a7SKent Overstreet struct btree_op op; 1989cafe5635SKent Overstreet 1990b54d6934SKent Overstreet bch_btree_op_init(&op, SHRT_MAX); 1991cafe5635SKent Overstreet 1992487dded8SKent Overstreet return btree_root(check_recurse, c, &op); 1993cafe5635SKent Overstreet } 1994cafe5635SKent Overstreet 19952531d9eeSKent Overstreet void bch_initial_gc_finish(struct cache_set *c) 19962531d9eeSKent Overstreet { 19972531d9eeSKent Overstreet struct cache *ca; 19982531d9eeSKent Overstreet struct bucket *b; 19996f10f7d1SColy Li unsigned int i; 20002531d9eeSKent Overstreet 20012531d9eeSKent Overstreet bch_btree_gc_finish(c); 20022531d9eeSKent Overstreet 20032531d9eeSKent Overstreet mutex_lock(&c->bucket_lock); 20042531d9eeSKent Overstreet 20052531d9eeSKent Overstreet /* 20062531d9eeSKent Overstreet * We need to put some unused buckets directly on the prio freelist in 20072531d9eeSKent Overstreet * order to get the allocator thread started - it needs freed buckets in 20082531d9eeSKent Overstreet * order to rewrite the prios and gens, and it needs to rewrite prios 20092531d9eeSKent Overstreet * and gens in order to free buckets. 20102531d9eeSKent Overstreet * 20112531d9eeSKent Overstreet * This is only safe for buckets that have no live data in them, which 20122531d9eeSKent Overstreet * there should always be some of.
20132531d9eeSKent Overstreet */ 20142531d9eeSKent Overstreet for_each_cache(ca, c, i) { 20152531d9eeSKent Overstreet for_each_bucket(b, ca) { 2016682811b3STang Junhui if (fifo_full(&ca->free[RESERVE_PRIO]) && 2017682811b3STang Junhui fifo_full(&ca->free[RESERVE_BTREE])) 20182531d9eeSKent Overstreet break; 20192531d9eeSKent Overstreet 20202531d9eeSKent Overstreet if (bch_can_invalidate_bucket(ca, b) && 20212531d9eeSKent Overstreet !GC_MARK(b)) { 20222531d9eeSKent Overstreet __bch_invalidate_one_bucket(ca, b); 2023682811b3STang Junhui if (!fifo_push(&ca->free[RESERVE_PRIO], 2024682811b3STang Junhui b - ca->buckets)) 2025682811b3STang Junhui fifo_push(&ca->free[RESERVE_BTREE], 20262531d9eeSKent Overstreet b - ca->buckets); 20272531d9eeSKent Overstreet } 20282531d9eeSKent Overstreet } 20292531d9eeSKent Overstreet } 20302531d9eeSKent Overstreet 20312531d9eeSKent Overstreet mutex_unlock(&c->bucket_lock); 20322531d9eeSKent Overstreet } 20332531d9eeSKent Overstreet 2034cafe5635SKent Overstreet /* Btree insertion */ 2035cafe5635SKent Overstreet 2036829a60b9SKent Overstreet static bool btree_insert_key(struct btree *b, struct bkey *k, 20371b207d80SKent Overstreet struct bkey *replace_key) 2038cafe5635SKent Overstreet { 20396f10f7d1SColy Li unsigned int status; 2040cafe5635SKent Overstreet 2041cafe5635SKent Overstreet BUG_ON(bkey_cmp(k, &b->key) > 0); 2042cafe5635SKent Overstreet 2043829a60b9SKent Overstreet status = bch_btree_insert_key(&b->keys, k, replace_key); 2044829a60b9SKent Overstreet if (status != BTREE_INSERT_STATUS_NO_INSERT) { 2045dc9d98d6SKent Overstreet bch_check_keys(&b->keys, "%u for %s", status, 20461b207d80SKent Overstreet replace_key ? "replace" : "insert"); 2047cafe5635SKent Overstreet 2048829a60b9SKent Overstreet trace_bcache_btree_insert_key(b, k, replace_key != NULL, 2049829a60b9SKent Overstreet status); 2050cafe5635SKent Overstreet return true; 2051829a60b9SKent Overstreet } else 2052829a60b9SKent Overstreet return false; 2053cafe5635SKent Overstreet } 2054cafe5635SKent Overstreet 205559158fdeSKent Overstreet static size_t insert_u64s_remaining(struct btree *b) 205659158fdeSKent Overstreet { 20573572324aSKent Overstreet long ret = bch_btree_keys_u64s_remaining(&b->keys); 205859158fdeSKent Overstreet 205959158fdeSKent Overstreet /* 206059158fdeSKent Overstreet * Might land in the middle of an existing extent and have to split it 206159158fdeSKent Overstreet */ 206259158fdeSKent Overstreet if (b->keys.ops->is_extents) 206359158fdeSKent Overstreet ret -= KEY_MAX_U64S; 206459158fdeSKent Overstreet 206559158fdeSKent Overstreet return max(ret, 0L); 206659158fdeSKent Overstreet } 206759158fdeSKent Overstreet 206826c949f8SKent Overstreet static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 20691b207d80SKent Overstreet struct keylist *insert_keys, 20701b207d80SKent Overstreet struct bkey *replace_key) 2071cafe5635SKent Overstreet { 2072cafe5635SKent Overstreet bool ret = false; 2073dc9d98d6SKent Overstreet int oldsize = bch_count_data(&b->keys); 2074cafe5635SKent Overstreet 207526c949f8SKent Overstreet while (!bch_keylist_empty(insert_keys)) { 2076c2f95ae2SKent Overstreet struct bkey *k = insert_keys->keys; 207726c949f8SKent Overstreet 207859158fdeSKent Overstreet if (bkey_u64s(k) > insert_u64s_remaining(b)) 2079403b6cdeSKent Overstreet break; 2080403b6cdeSKent Overstreet 2081403b6cdeSKent Overstreet if (bkey_cmp(k, &b->key) <= 0) { 20823a3b6a4eSKent Overstreet if (!b->level) 20833a3b6a4eSKent Overstreet bkey_put(b->c, k); 208426c949f8SKent Overstreet 
2085829a60b9SKent Overstreet ret |= btree_insert_key(b, k, replace_key); 208626c949f8SKent Overstreet bch_keylist_pop_front(insert_keys); 208726c949f8SKent Overstreet } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 208826c949f8SKent Overstreet BKEY_PADDED(key) temp; 2089c2f95ae2SKent Overstreet bkey_copy(&temp.key, insert_keys->keys); 209026c949f8SKent Overstreet 209126c949f8SKent Overstreet bch_cut_back(&b->key, &temp.key); 2092c2f95ae2SKent Overstreet bch_cut_front(&b->key, insert_keys->keys); 209326c949f8SKent Overstreet 2094829a60b9SKent Overstreet ret |= btree_insert_key(b, &temp.key, replace_key); 209526c949f8SKent Overstreet break; 209626c949f8SKent Overstreet } else { 209726c949f8SKent Overstreet break; 209826c949f8SKent Overstreet } 2099cafe5635SKent Overstreet } 2100cafe5635SKent Overstreet 2101829a60b9SKent Overstreet if (!ret) 2102829a60b9SKent Overstreet op->insert_collision = true; 2103829a60b9SKent Overstreet 2104403b6cdeSKent Overstreet BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 2105403b6cdeSKent Overstreet 2106dc9d98d6SKent Overstreet BUG_ON(bch_count_data(&b->keys) < oldsize); 2107cafe5635SKent Overstreet return ret; 2108cafe5635SKent Overstreet } 2109cafe5635SKent Overstreet 211026c949f8SKent Overstreet static int btree_split(struct btree *b, struct btree_op *op, 211126c949f8SKent Overstreet struct keylist *insert_keys, 21121b207d80SKent Overstreet struct bkey *replace_key) 2113cafe5635SKent Overstreet { 2114d6fd3b11SKent Overstreet bool split; 2115cafe5635SKent Overstreet struct btree *n1, *n2 = NULL, *n3 = NULL; 2116cafe5635SKent Overstreet uint64_t start_time = local_clock(); 2117b54d6934SKent Overstreet struct closure cl; 211817e21a9fSKent Overstreet struct keylist parent_keys; 2119b54d6934SKent Overstreet 2120b54d6934SKent Overstreet closure_init_stack(&cl); 212117e21a9fSKent Overstreet bch_keylist_init(&parent_keys); 2122cafe5635SKent Overstreet 21230a63b66dSKent Overstreet if (btree_check_reserve(b, op)) { 21240a63b66dSKent Overstreet if (!b->level) 212578365411SKent Overstreet return -EINTR; 21260a63b66dSKent Overstreet else 21270a63b66dSKent Overstreet WARN(1, "insufficient reserve for split\n"); 21280a63b66dSKent Overstreet } 212978365411SKent Overstreet 21300a63b66dSKent Overstreet n1 = btree_node_alloc_replacement(b, op); 2131cafe5635SKent Overstreet if (IS_ERR(n1)) 2132cafe5635SKent Overstreet goto err; 2133cafe5635SKent Overstreet 2134ee811287SKent Overstreet split = set_blocks(btree_bset_first(n1), 2135ee811287SKent Overstreet block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; 2136cafe5635SKent Overstreet 2137cafe5635SKent Overstreet if (split) { 21386f10f7d1SColy Li unsigned int keys = 0; 2139cafe5635SKent Overstreet 2140ee811287SKent Overstreet trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 2141c37511b8SKent Overstreet 21422452cc89SSlava Pestov n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); 2143cafe5635SKent Overstreet if (IS_ERR(n2)) 2144cafe5635SKent Overstreet goto err_free1; 2145cafe5635SKent Overstreet 2146d6fd3b11SKent Overstreet if (!b->parent) { 21472452cc89SSlava Pestov n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); 2148cafe5635SKent Overstreet if (IS_ERR(n3)) 2149cafe5635SKent Overstreet goto err_free2; 2150cafe5635SKent Overstreet } 2151cafe5635SKent Overstreet 21522a285686SKent Overstreet mutex_lock(&n1->write_lock); 21532a285686SKent Overstreet mutex_lock(&n2->write_lock); 21542a285686SKent Overstreet 21551b207d80SKent Overstreet bch_btree_insert_keys(n1, op, insert_keys, replace_key); 
2156cafe5635SKent Overstreet 2157d6fd3b11SKent Overstreet /* 2158d6fd3b11SKent Overstreet * Has to be a linear search because we don't have an auxiliary 2159cafe5635SKent Overstreet * search tree yet 2160cafe5635SKent Overstreet */ 2161cafe5635SKent Overstreet 2162ee811287SKent Overstreet while (keys < (btree_bset_first(n1)->keys * 3) / 5) 2163ee811287SKent Overstreet keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), 2164fafff81cSKent Overstreet keys)); 2165cafe5635SKent Overstreet 2166fafff81cSKent Overstreet bkey_copy_key(&n1->key, 2167ee811287SKent Overstreet bset_bkey_idx(btree_bset_first(n1), keys)); 2168ee811287SKent Overstreet keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); 2169cafe5635SKent Overstreet 2170ee811287SKent Overstreet btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; 2171ee811287SKent Overstreet btree_bset_first(n1)->keys = keys; 2172cafe5635SKent Overstreet 2173ee811287SKent Overstreet memcpy(btree_bset_first(n2)->start, 2174ee811287SKent Overstreet bset_bkey_last(btree_bset_first(n1)), 2175ee811287SKent Overstreet btree_bset_first(n2)->keys * sizeof(uint64_t)); 2176cafe5635SKent Overstreet 2177cafe5635SKent Overstreet bkey_copy_key(&n2->key, &b->key); 2178cafe5635SKent Overstreet 217917e21a9fSKent Overstreet bch_keylist_add(&parent_keys, &n2->key); 2180b54d6934SKent Overstreet bch_btree_node_write(n2, &cl); 21812a285686SKent Overstreet mutex_unlock(&n2->write_lock); 2182cafe5635SKent Overstreet rw_unlock(true, n2); 2183c37511b8SKent Overstreet } else { 2184ee811287SKent Overstreet trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); 2185c37511b8SKent Overstreet 21862a285686SKent Overstreet mutex_lock(&n1->write_lock); 21871b207d80SKent Overstreet bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2188c37511b8SKent Overstreet } 2189cafe5635SKent Overstreet 219017e21a9fSKent Overstreet bch_keylist_add(&parent_keys, &n1->key); 2191b54d6934SKent Overstreet bch_btree_node_write(n1, &cl); 21922a285686SKent Overstreet mutex_unlock(&n1->write_lock); 2193cafe5635SKent Overstreet 2194cafe5635SKent Overstreet if (n3) { 2195d6fd3b11SKent Overstreet /* Depth increases, make a new root */ 21962a285686SKent Overstreet mutex_lock(&n3->write_lock); 2197cafe5635SKent Overstreet bkey_copy_key(&n3->key, &MAX_KEY); 219817e21a9fSKent Overstreet bch_btree_insert_keys(n3, op, &parent_keys, NULL); 2199b54d6934SKent Overstreet bch_btree_node_write(n3, &cl); 22002a285686SKent Overstreet mutex_unlock(&n3->write_lock); 2201cafe5635SKent Overstreet 2202b54d6934SKent Overstreet closure_sync(&cl); 2203cafe5635SKent Overstreet bch_btree_set_root(n3); 2204cafe5635SKent Overstreet rw_unlock(true, n3); 2205d6fd3b11SKent Overstreet } else if (!b->parent) { 2206d6fd3b11SKent Overstreet /* Root filled up but didn't need to be split */ 2207b54d6934SKent Overstreet closure_sync(&cl); 2208cafe5635SKent Overstreet bch_btree_set_root(n1); 2209cafe5635SKent Overstreet } else { 221017e21a9fSKent Overstreet /* Split a non root node */ 2211b54d6934SKent Overstreet closure_sync(&cl); 221217e21a9fSKent Overstreet make_btree_freeing_key(b, parent_keys.top); 221317e21a9fSKent Overstreet bch_keylist_push(&parent_keys); 221417e21a9fSKent Overstreet 221517e21a9fSKent Overstreet bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); 221617e21a9fSKent Overstreet BUG_ON(!bch_keylist_empty(&parent_keys)); 2217cafe5635SKent Overstreet } 2218cafe5635SKent Overstreet 221905335cffSKent Overstreet btree_node_free(b); 2220cafe5635SKent Overstreet rw_unlock(true, n1); 
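/*
 * Note the ordering above: the keys for the replacement node(s) were
 * inserted into the parent (or a new root was set) before the old node
 * was freed, so readers never see the key range disappear.
 */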
2221cafe5635SKent Overstreet 2222169ef1cfSKent Overstreet bch_time_stats_update(&b->c->btree_split_time, start_time); 2223cafe5635SKent Overstreet 2224cafe5635SKent Overstreet return 0; 2225cafe5635SKent Overstreet err_free2: 22265f5837d2SKent Overstreet bkey_put(b->c, &n2->key); 2227e8e1d468SKent Overstreet btree_node_free(n2); 2228cafe5635SKent Overstreet rw_unlock(true, n2); 2229cafe5635SKent Overstreet err_free1: 22305f5837d2SKent Overstreet bkey_put(b->c, &n1->key); 2231e8e1d468SKent Overstreet btree_node_free(n1); 2232cafe5635SKent Overstreet rw_unlock(true, n1); 2233cafe5635SKent Overstreet err: 22340a63b66dSKent Overstreet WARN(1, "bcache: btree split failed (level %u)", b->level); 22355f5837d2SKent Overstreet 2236cafe5635SKent Overstreet if (n3 == ERR_PTR(-EAGAIN) || 2237cafe5635SKent Overstreet n2 == ERR_PTR(-EAGAIN) || 2238cafe5635SKent Overstreet n1 == ERR_PTR(-EAGAIN)) 2239cafe5635SKent Overstreet return -EAGAIN; 2240cafe5635SKent Overstreet 2241cafe5635SKent Overstreet return -ENOMEM; 2242cafe5635SKent Overstreet } 2243cafe5635SKent Overstreet 224426c949f8SKent Overstreet static int bch_btree_insert_node(struct btree *b, struct btree_op *op, 2245c18536a7SKent Overstreet struct keylist *insert_keys, 22461b207d80SKent Overstreet atomic_t *journal_ref, 22471b207d80SKent Overstreet struct bkey *replace_key) 224826c949f8SKent Overstreet { 22492a285686SKent Overstreet struct closure cl; 22502a285686SKent Overstreet 22511b207d80SKent Overstreet BUG_ON(b->level && replace_key); 22521b207d80SKent Overstreet 22532a285686SKent Overstreet closure_init_stack(&cl); 22542a285686SKent Overstreet 22552a285686SKent Overstreet mutex_lock(&b->write_lock); 22562a285686SKent Overstreet 22572a285686SKent Overstreet if (write_block(b) != btree_bset_last(b) && 22582a285686SKent Overstreet b->keys.last_set_unwritten) 22592a285686SKent Overstreet bch_btree_init_next(b); /* just wrote a set */ 22602a285686SKent Overstreet 226159158fdeSKent Overstreet if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { 22622a285686SKent Overstreet mutex_unlock(&b->write_lock); 22632a285686SKent Overstreet goto split; 22642a285686SKent Overstreet } 22652a285686SKent Overstreet 22662a285686SKent Overstreet BUG_ON(write_block(b) != btree_bset_last(b)); 22672a285686SKent Overstreet 22682a285686SKent Overstreet if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 22692a285686SKent Overstreet if (!b->level) 22702a285686SKent Overstreet bch_btree_leaf_dirty(b, journal_ref); 22712a285686SKent Overstreet else 22722a285686SKent Overstreet bch_btree_node_write(b, &cl); 22732a285686SKent Overstreet } 22742a285686SKent Overstreet 22752a285686SKent Overstreet mutex_unlock(&b->write_lock); 22762a285686SKent Overstreet 22772a285686SKent Overstreet /* wait for btree node write if necessary, after unlock */ 22782a285686SKent Overstreet closure_sync(&cl); 22792a285686SKent Overstreet 22802a285686SKent Overstreet return 0; 22812a285686SKent Overstreet split: 228226c949f8SKent Overstreet if (current->bio_list) { 228326c949f8SKent Overstreet op->lock = b->c->root->level + 1; 228417e21a9fSKent Overstreet return -EAGAIN; 228526c949f8SKent Overstreet } else if (op->lock <= b->c->root->level) { 228626c949f8SKent Overstreet op->lock = b->c->root->level + 1; 228717e21a9fSKent Overstreet return -EINTR; 228826c949f8SKent Overstreet } else { 228917e21a9fSKent Overstreet /* Invalidated all iterators */ 22903b3e9e50SKent Overstreet int ret = btree_split(b, op, insert_keys, replace_key); 22913b3e9e50SKent Overstreet 
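/*
 * btree_split() may have consumed part or all of the keylist: if it is
 * now empty, every key made it in, so report success regardless of ret;
 * otherwise map success to -EINTR so the caller retries the traversal.
 */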
22922a285686SKent Overstreet if (bch_keylist_empty(insert_keys)) 229317e21a9fSKent Overstreet return 0; 22942a285686SKent Overstreet else if (!ret) 22952a285686SKent Overstreet return -EINTR; 22962a285686SKent Overstreet return ret; 229717e21a9fSKent Overstreet } 229826c949f8SKent Overstreet } 229926c949f8SKent Overstreet 2300e7c590ebSKent Overstreet int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 2301e7c590ebSKent Overstreet struct bkey *check_key) 2302e7c590ebSKent Overstreet { 2303e7c590ebSKent Overstreet int ret = -EINTR; 2304e7c590ebSKent Overstreet uint64_t btree_ptr = b->key.ptr[0]; 2305e7c590ebSKent Overstreet unsigned long seq = b->seq; 2306e7c590ebSKent Overstreet struct keylist insert; 2307e7c590ebSKent Overstreet bool upgrade = op->lock == -1; 2308e7c590ebSKent Overstreet 2309e7c590ebSKent Overstreet bch_keylist_init(&insert); 2310e7c590ebSKent Overstreet 2311e7c590ebSKent Overstreet if (upgrade) { 2312e7c590ebSKent Overstreet rw_unlock(false, b); 2313e7c590ebSKent Overstreet rw_lock(true, b, b->level); 2314e7c590ebSKent Overstreet 2315e7c590ebSKent Overstreet if (b->key.ptr[0] != btree_ptr || 23162ef9ccbfSZheng Liu b->seq != seq + 1) { 23172ef9ccbfSZheng Liu op->lock = b->level; 2318e7c590ebSKent Overstreet goto out; 2319e7c590ebSKent Overstreet } 23202ef9ccbfSZheng Liu } 2321e7c590ebSKent Overstreet 2322e7c590ebSKent Overstreet SET_KEY_PTRS(check_key, 1); 2323e7c590ebSKent Overstreet get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); 2324e7c590ebSKent Overstreet 2325e7c590ebSKent Overstreet SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); 2326e7c590ebSKent Overstreet 2327e7c590ebSKent Overstreet bch_keylist_add(&insert, check_key); 2328e7c590ebSKent Overstreet 23291b207d80SKent Overstreet ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); 2330e7c590ebSKent Overstreet 2331e7c590ebSKent Overstreet BUG_ON(!ret && !bch_keylist_empty(&insert)); 2332e7c590ebSKent Overstreet out: 2333e7c590ebSKent Overstreet if (upgrade) 2334e7c590ebSKent Overstreet downgrade_write(&b->lock); 2335e7c590ebSKent Overstreet return ret; 2336e7c590ebSKent Overstreet } 2337e7c590ebSKent Overstreet 2338cc7b8819SKent Overstreet struct btree_insert_op { 2339cc7b8819SKent Overstreet struct btree_op op; 2340cc7b8819SKent Overstreet struct keylist *keys; 2341cc7b8819SKent Overstreet atomic_t *journal_ref; 2342cc7b8819SKent Overstreet struct bkey *replace_key; 2343cc7b8819SKent Overstreet }; 2344cc7b8819SKent Overstreet 234508239ca2SWei Yongjun static int btree_insert_fn(struct btree_op *b_op, struct btree *b) 2346cafe5635SKent Overstreet { 2347cc7b8819SKent Overstreet struct btree_insert_op *op = container_of(b_op, 2348cc7b8819SKent Overstreet struct btree_insert_op, op); 2349403b6cdeSKent Overstreet 2350cc7b8819SKent Overstreet int ret = bch_btree_insert_node(b, &op->op, op->keys, 2351cc7b8819SKent Overstreet op->journal_ref, op->replace_key); 2352cc7b8819SKent Overstreet if (ret && !bch_keylist_empty(op->keys)) 2353cc7b8819SKent Overstreet return ret; 2354cc7b8819SKent Overstreet else 2355cc7b8819SKent Overstreet return MAP_DONE; 2356cafe5635SKent Overstreet } 2357cafe5635SKent Overstreet 2358cc7b8819SKent Overstreet int bch_btree_insert(struct cache_set *c, struct keylist *keys, 2359cc7b8819SKent Overstreet atomic_t *journal_ref, struct bkey *replace_key) 2360cafe5635SKent Overstreet { 2361cc7b8819SKent Overstreet struct btree_insert_op op; 2362cafe5635SKent Overstreet int ret = 0; 2363cafe5635SKent Overstreet 2364cc7b8819SKent Overstreet BUG_ON(current->bio_list); 23654f3d4014SKent 
2338cc7b8819SKent Overstreet struct btree_insert_op {
2339cc7b8819SKent Overstreet 	struct btree_op	op;
2340cc7b8819SKent Overstreet 	struct keylist	*keys;
2341cc7b8819SKent Overstreet 	atomic_t	*journal_ref;
2342cc7b8819SKent Overstreet 	struct bkey	*replace_key;
2343cc7b8819SKent Overstreet };
2344cc7b8819SKent Overstreet 
234508239ca2SWei Yongjun static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2346cafe5635SKent Overstreet {
2347cc7b8819SKent Overstreet 	struct btree_insert_op *op = container_of(b_op,
2348cc7b8819SKent Overstreet 					struct btree_insert_op, op);
2349403b6cdeSKent Overstreet 
2350cc7b8819SKent Overstreet 	int ret = bch_btree_insert_node(b, &op->op, op->keys,
2351cc7b8819SKent Overstreet 					op->journal_ref, op->replace_key);
2352cc7b8819SKent Overstreet 	if (ret && !bch_keylist_empty(op->keys))
2353cc7b8819SKent Overstreet 		return ret;
2354cc7b8819SKent Overstreet 	else
2355cc7b8819SKent Overstreet 		return MAP_DONE;
2356cafe5635SKent Overstreet }
2357cafe5635SKent Overstreet 
2358cc7b8819SKent Overstreet int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2359cc7b8819SKent Overstreet 		     atomic_t *journal_ref, struct bkey *replace_key)
2360cafe5635SKent Overstreet {
2361cc7b8819SKent Overstreet 	struct btree_insert_op op;
2362cafe5635SKent Overstreet 	int ret = 0;
2363cafe5635SKent Overstreet 
2364cc7b8819SKent Overstreet 	BUG_ON(current->bio_list);
23654f3d4014SKent Overstreet 	BUG_ON(bch_keylist_empty(keys));
2366cafe5635SKent Overstreet 
2367cc7b8819SKent Overstreet 	bch_btree_op_init(&op.op, 0);
2368cc7b8819SKent Overstreet 	op.keys = keys;
2369cc7b8819SKent Overstreet 	op.journal_ref = journal_ref;
2370cc7b8819SKent Overstreet 	op.replace_key = replace_key;
2371cafe5635SKent Overstreet 
2372cc7b8819SKent Overstreet 	while (!ret && !bch_keylist_empty(keys)) {
2373cc7b8819SKent Overstreet 		op.op.lock = 0;
2374cc7b8819SKent Overstreet 		ret = bch_btree_map_leaf_nodes(&op.op, c,
2375cc7b8819SKent Overstreet 					       &START_KEY(keys->keys),
2376cc7b8819SKent Overstreet 					       btree_insert_fn);
2377cc7b8819SKent Overstreet 	}
2378cc7b8819SKent Overstreet 
2379cc7b8819SKent Overstreet 	if (ret) {
2380cafe5635SKent Overstreet 		struct bkey *k;
2381cafe5635SKent Overstreet 
23821b207d80SKent Overstreet 		pr_err("error %i", ret);
2383cafe5635SKent Overstreet 
23844f3d4014SKent Overstreet 		while ((k = bch_keylist_pop(keys)))
23853a3b6a4eSKent Overstreet 			bkey_put(c, k);
2386cc7b8819SKent Overstreet 	} else if (op.op.insert_collision)
2387cc7b8819SKent Overstreet 		ret = -ESRCH;
23886054c6d4SKent Overstreet 
2389cafe5635SKent Overstreet 	return ret;
2390cafe5635SKent Overstreet }
2391cafe5635SKent Overstreet 
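/*
 * Illustrative usage sketch (editor's addition): callers batch keys in
 * a keylist and hand the whole list to bch_btree_insert(), which walks
 * leaf nodes until the list drains. On failure the remaining keys are
 * dropped with bkey_put(); -ESRCH means a replace_key insert collided
 * (the key being replaced was no longer there). new_key and journal_ref
 * below are hypothetical:
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	bkey_copy(keys.top, &new_key);
 *	bch_keylist_push(&keys);
 *
 *	ret = bch_btree_insert(c, &keys, journal_ref, NULL);
 */
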
2392cafe5635SKent Overstreet void bch_btree_set_root(struct btree *b)
2393cafe5635SKent Overstreet {
23946f10f7d1SColy Li 	unsigned int i;
2395e49c7c37SKent Overstreet 	struct closure cl;
2396e49c7c37SKent Overstreet 
2397e49c7c37SKent Overstreet 	closure_init_stack(&cl);
2398cafe5635SKent Overstreet 
2399c37511b8SKent Overstreet 	trace_bcache_btree_set_root(b);
2400c37511b8SKent Overstreet 
2401cafe5635SKent Overstreet 	BUG_ON(!b->written);
2402cafe5635SKent Overstreet 
2403cafe5635SKent Overstreet 	for (i = 0; i < KEY_PTRS(&b->key); i++)
2404cafe5635SKent Overstreet 		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2405cafe5635SKent Overstreet 
2406cafe5635SKent Overstreet 	mutex_lock(&b->c->bucket_lock);
2407cafe5635SKent Overstreet 	list_del_init(&b->list);
2408cafe5635SKent Overstreet 	mutex_unlock(&b->c->bucket_lock);
2409cafe5635SKent Overstreet 
2410cafe5635SKent Overstreet 	b->c->root = b;
2411cafe5635SKent Overstreet 
2412e49c7c37SKent Overstreet 	bch_journal_meta(b->c, &cl);
2413e49c7c37SKent Overstreet 	closure_sync(&cl);
2414cafe5635SKent Overstreet }
2415cafe5635SKent Overstreet 
241648dad8baSKent Overstreet /* Map across nodes or keys */
241748dad8baSKent Overstreet 
241848dad8baSKent Overstreet static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
241948dad8baSKent Overstreet 				       struct bkey *from,
242048dad8baSKent Overstreet 				       btree_map_nodes_fn *fn, int flags)
242148dad8baSKent Overstreet {
242248dad8baSKent Overstreet 	int ret = MAP_CONTINUE;
242348dad8baSKent Overstreet 
242448dad8baSKent Overstreet 	if (b->level) {
242548dad8baSKent Overstreet 		struct bkey *k;
242648dad8baSKent Overstreet 		struct btree_iter iter;
242748dad8baSKent Overstreet 
2428c052dd9aSKent Overstreet 		bch_btree_iter_init(&b->keys, &iter, from);
242948dad8baSKent Overstreet 
2430a85e968eSKent Overstreet 		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
243148dad8baSKent Overstreet 						       bch_ptr_bad))) {
243248dad8baSKent Overstreet 			ret = btree(map_nodes_recurse, k, b,
243348dad8baSKent Overstreet 				    op, from, fn, flags);
243448dad8baSKent Overstreet 			from = NULL;
243548dad8baSKent Overstreet 
243648dad8baSKent Overstreet 			if (ret != MAP_CONTINUE)
243748dad8baSKent Overstreet 				return ret;
243848dad8baSKent Overstreet 		}
243948dad8baSKent Overstreet 	}
244048dad8baSKent Overstreet 
244148dad8baSKent Overstreet 	if (!b->level || flags == MAP_ALL_NODES)
244248dad8baSKent Overstreet 		ret = fn(op, b);
244348dad8baSKent Overstreet 
244448dad8baSKent Overstreet 	return ret;
244548dad8baSKent Overstreet }
244648dad8baSKent Overstreet 
244748dad8baSKent Overstreet int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
244848dad8baSKent Overstreet 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
244948dad8baSKent Overstreet {
2450b54d6934SKent Overstreet 	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
245148dad8baSKent Overstreet }
245248dad8baSKent Overstreet 
245348dad8baSKent Overstreet static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
245448dad8baSKent Overstreet 				      struct bkey *from, btree_map_keys_fn *fn,
245548dad8baSKent Overstreet 				      int flags)
245648dad8baSKent Overstreet {
245748dad8baSKent Overstreet 	int ret = MAP_CONTINUE;
245848dad8baSKent Overstreet 	struct bkey *k;
245948dad8baSKent Overstreet 	struct btree_iter iter;
246048dad8baSKent Overstreet 
2461c052dd9aSKent Overstreet 	bch_btree_iter_init(&b->keys, &iter, from);
246248dad8baSKent Overstreet 
2463a85e968eSKent Overstreet 	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
246448dad8baSKent Overstreet 		ret = !b->level
246548dad8baSKent Overstreet 			? fn(op, b, k)
246648dad8baSKent Overstreet 			: btree(map_keys_recurse, k, b, op, from, fn, flags);
246748dad8baSKent Overstreet 		from = NULL;
246848dad8baSKent Overstreet 
246948dad8baSKent Overstreet 		if (ret != MAP_CONTINUE)
247048dad8baSKent Overstreet 			return ret;
247148dad8baSKent Overstreet 	}
247248dad8baSKent Overstreet 
247348dad8baSKent Overstreet 	if (!b->level && (flags & MAP_END_KEY))
247448dad8baSKent Overstreet 		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
247548dad8baSKent Overstreet 				     KEY_OFFSET(&b->key), 0));
247648dad8baSKent Overstreet 
247748dad8baSKent Overstreet 	return ret;
247848dad8baSKent Overstreet }
247948dad8baSKent Overstreet 
248048dad8baSKent Overstreet int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
248148dad8baSKent Overstreet 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
248248dad8baSKent Overstreet {
2483b54d6934SKent Overstreet 	return btree_root(map_keys_recurse, c, op, from, fn, flags);
248448dad8baSKent Overstreet }
248548dad8baSKent Overstreet 
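/*
 * Illustrative sketch (editor's addition): the map framework above
 * drives a callback over every good key (or node) starting from a given
 * key; the callback's return value, MAP_CONTINUE or MAP_DONE, steers
 * the traversal. Embedding the btree_op in a wrapper struct is the
 * idiomatic way to pass extra state (compare struct refill below).
 * count_op/count_fn here are hypothetical:
 *
 *	struct count_op {
 *		struct btree_op	op;
 *		unsigned int	nr;
 *	};
 *
 *	static int count_fn(struct btree_op *b_op, struct btree *b,
 *			    struct bkey *k)
 *	{
 *		struct count_op *op = container_of(b_op,
 *						   struct count_op, op);
 *
 *		op->nr++;
 *		return MAP_CONTINUE;
 *	}
 *
 *	struct count_op op = { .nr = 0 };
 *
 *	bch_btree_op_init(&op.op, -1);	// -1: read locks all the way down
 *	bch_btree_map_keys(&op.op, c, &ZERO_KEY, count_fn, 0);
 */
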
2486cafe5635SKent Overstreet /* Keybuf code */
2487cafe5635SKent Overstreet 
2488cafe5635SKent Overstreet static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2489cafe5635SKent Overstreet {
2490cafe5635SKent Overstreet 	/* Overlapping keys compare equal */
2491cafe5635SKent Overstreet 	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2492cafe5635SKent Overstreet 		return -1;
2493cafe5635SKent Overstreet 	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2494cafe5635SKent Overstreet 		return 1;
2495cafe5635SKent Overstreet 	return 0;
2496cafe5635SKent Overstreet }
2497cafe5635SKent Overstreet 
2498cafe5635SKent Overstreet static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2499cafe5635SKent Overstreet 					    struct keybuf_key *r)
2500cafe5635SKent Overstreet {
2501cafe5635SKent Overstreet 	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2502cafe5635SKent Overstreet }
2503cafe5635SKent Overstreet 
250448dad8baSKent Overstreet struct refill {
250548dad8baSKent Overstreet 	struct btree_op	op;
25066f10f7d1SColy Li 	unsigned int	nr_found;
250748dad8baSKent Overstreet 	struct keybuf	*buf;
250848dad8baSKent Overstreet 	struct bkey	*end;
250948dad8baSKent Overstreet 	keybuf_pred_fn	*pred;
251048dad8baSKent Overstreet };
251148dad8baSKent Overstreet 
251248dad8baSKent Overstreet static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
251348dad8baSKent Overstreet 			    struct bkey *k)
2514cafe5635SKent Overstreet {
251548dad8baSKent Overstreet 	struct refill *refill = container_of(op, struct refill, op);
251648dad8baSKent Overstreet 	struct keybuf *buf = refill->buf;
251748dad8baSKent Overstreet 	int ret = MAP_CONTINUE;
2518cafe5635SKent Overstreet 
25192d6cb6edSTang Junhui 	if (bkey_cmp(k, refill->end) > 0) {
252048dad8baSKent Overstreet 		ret = MAP_DONE;
252148dad8baSKent Overstreet 		goto out;
2522cafe5635SKent Overstreet 	}
2523cafe5635SKent Overstreet 
252448dad8baSKent Overstreet 	if (!KEY_SIZE(k))	/* end key */
252548dad8baSKent Overstreet 		goto out;
2526cafe5635SKent Overstreet 
252748dad8baSKent Overstreet 	if (refill->pred(buf, k)) {
2528cafe5635SKent Overstreet 		struct keybuf_key *w;
2529cafe5635SKent Overstreet 
2530cafe5635SKent Overstreet 		spin_lock(&buf->lock);
2531cafe5635SKent Overstreet 
2532cafe5635SKent Overstreet 		w = array_alloc(&buf->freelist);
253348dad8baSKent Overstreet 		if (!w) {
253448dad8baSKent Overstreet 			spin_unlock(&buf->lock);
253548dad8baSKent Overstreet 			return MAP_DONE;
253648dad8baSKent Overstreet 		}
2537cafe5635SKent Overstreet 
2538cafe5635SKent Overstreet 		w->private = NULL;
2539cafe5635SKent Overstreet 		bkey_copy(&w->key, k);
2540cafe5635SKent Overstreet 
2541cafe5635SKent Overstreet 		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2542cafe5635SKent Overstreet 			array_free(&buf->freelist, w);
254348a915a8SKent Overstreet 		else
254348a915a8SKent Overstreet 			refill->nr_found++;
2545cafe5635SKent Overstreet 
254648dad8baSKent Overstreet 		if (array_freelist_empty(&buf->freelist))
254748dad8baSKent Overstreet 			ret = MAP_DONE;
254848dad8baSKent Overstreet 
2549cafe5635SKent Overstreet 		spin_unlock(&buf->lock);
2550cafe5635SKent Overstreet 	}
255148dad8baSKent Overstreet out:
255248dad8baSKent Overstreet 	buf->last_scanned = *k;
255348dad8baSKent Overstreet 	return ret;
2554cafe5635SKent Overstreet }
2555cafe5635SKent Overstreet 
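/*
 * Illustrative sketch (editor's addition): the pred hook is what makes
 * a keybuf selective; refill_keybuf_fn() above only buffers keys the
 * predicate accepts. The writeback code uses a predicate of this shape
 * to pick dirty extents; this simplified stand-in just checks the dirty
 * bit:
 *
 *	static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 *	{
 *		return KEY_DIRTY(k);
 *	}
 *
 *	bch_refill_keybuf(c, buf, &MAX_KEY, dirty_pred);
 */
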
2556cafe5635SKent Overstreet void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
255772c27061SKent Overstreet 		       struct bkey *end, keybuf_pred_fn *pred)
2558cafe5635SKent Overstreet {
2559cafe5635SKent Overstreet 	struct bkey start = buf->last_scanned;
256048dad8baSKent Overstreet 	struct refill refill;
2561cafe5635SKent Overstreet 
2562cafe5635SKent Overstreet 	cond_resched();
2563cafe5635SKent Overstreet 
2564b54d6934SKent Overstreet 	bch_btree_op_init(&refill.op, -1);
256548a915a8SKent Overstreet 	refill.nr_found	= 0;
256648dad8baSKent Overstreet 	refill.buf	= buf;
256748dad8baSKent Overstreet 	refill.end	= end;
256848dad8baSKent Overstreet 	refill.pred	= pred;
256948dad8baSKent Overstreet 
257048dad8baSKent Overstreet 	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
257148dad8baSKent Overstreet 			   refill_keybuf_fn, MAP_END_KEY);
2572cafe5635SKent Overstreet 
257348a915a8SKent Overstreet 	trace_bcache_keyscan(refill.nr_found,
2574cafe5635SKent Overstreet 			     KEY_INODE(&start), KEY_OFFSET(&start),
257548a915a8SKent Overstreet 			     KEY_INODE(&buf->last_scanned),
257648a915a8SKent Overstreet 			     KEY_OFFSET(&buf->last_scanned));
2577cafe5635SKent Overstreet 
2578cafe5635SKent Overstreet 	spin_lock(&buf->lock);
2579cafe5635SKent Overstreet 
2580cafe5635SKent Overstreet 	if (!RB_EMPTY_ROOT(&buf->keys)) {
2581cafe5635SKent Overstreet 		struct keybuf_key *w;
25821fae7cf0SColy Li 
2583cafe5635SKent Overstreet 		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2584cafe5635SKent Overstreet 		buf->start	= START_KEY(&w->key);
2585cafe5635SKent Overstreet 
2586cafe5635SKent Overstreet 		w = RB_LAST(&buf->keys, struct keybuf_key, node);
2587cafe5635SKent Overstreet 		buf->end	= w->key;
2588cafe5635SKent Overstreet 	} else {
2589cafe5635SKent Overstreet 		buf->start	= MAX_KEY;
2590cafe5635SKent Overstreet 		buf->end	= MAX_KEY;
2591cafe5635SKent Overstreet 	}
2592cafe5635SKent Overstreet 
2593cafe5635SKent Overstreet 	spin_unlock(&buf->lock);
2594cafe5635SKent Overstreet }
2595cafe5635SKent Overstreet 
2596cafe5635SKent Overstreet static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2597cafe5635SKent Overstreet {
2598cafe5635SKent Overstreet 	rb_erase(&w->node, &buf->keys);
2599cafe5635SKent Overstreet 	array_free(&buf->freelist, w);
2600cafe5635SKent Overstreet }
2601cafe5635SKent Overstreet 
2602cafe5635SKent Overstreet void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2603cafe5635SKent Overstreet {
2604cafe5635SKent Overstreet 	spin_lock(&buf->lock);
2605cafe5635SKent Overstreet 	__bch_keybuf_del(buf, w);
2606cafe5635SKent Overstreet 	spin_unlock(&buf->lock);
2607cafe5635SKent Overstreet }
2608cafe5635SKent Overstreet 
2609cafe5635SKent Overstreet bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2610cafe5635SKent Overstreet 				  struct bkey *end)
2611cafe5635SKent Overstreet {
2612cafe5635SKent Overstreet 	bool ret = false;
2613cafe5635SKent Overstreet 	struct keybuf_key *p, *w, s;
26141fae7cf0SColy Li 
2615cafe5635SKent Overstreet 	s.key = *start;
2616cafe5635SKent Overstreet 
2617cafe5635SKent Overstreet 	if (bkey_cmp(end, &buf->start) <= 0 ||
2618cafe5635SKent Overstreet 	    bkey_cmp(start, &buf->end) >= 0)
2619cafe5635SKent Overstreet 		return false;
2620cafe5635SKent Overstreet 
2621cafe5635SKent Overstreet 	spin_lock(&buf->lock);
2622cafe5635SKent Overstreet 	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2623cafe5635SKent Overstreet 
2624cafe5635SKent Overstreet 	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2625cafe5635SKent Overstreet 		p = w;
2626cafe5635SKent Overstreet 		w = RB_NEXT(w, node);
2627cafe5635SKent Overstreet 
2628cafe5635SKent Overstreet 		if (p->private)
2629cafe5635SKent Overstreet 			ret = true;
2630cafe5635SKent Overstreet 		else
2631cafe5635SKent Overstreet 			__bch_keybuf_del(buf, p);
2632cafe5635SKent Overstreet 	}
2633cafe5635SKent Overstreet 
2634cafe5635SKent Overstreet 	spin_unlock(&buf->lock);
2635cafe5635SKent Overstreet 	return ret;
2636cafe5635SKent Overstreet }
2637cafe5635SKent Overstreet 
2638cafe5635SKent Overstreet struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2639cafe5635SKent Overstreet {
2640cafe5635SKent Overstreet 	struct keybuf_key *w;
26411fae7cf0SColy Li 
2642cafe5635SKent Overstreet 	spin_lock(&buf->lock);
2643cafe5635SKent Overstreet 
2644cafe5635SKent Overstreet 	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2645cafe5635SKent Overstreet 
2646cafe5635SKent Overstreet 	while (w && w->private)
2647cafe5635SKent Overstreet 		w = RB_NEXT(w, node);
2648cafe5635SKent Overstreet 
2649cafe5635SKent Overstreet 	if (w)
2650cafe5635SKent Overstreet 		w->private = ERR_PTR(-EINTR);
2651cafe5635SKent Overstreet 
2652cafe5635SKent Overstreet 	spin_unlock(&buf->lock);
2653cafe5635SKent Overstreet 	return w;
2654cafe5635SKent Overstreet }
2655cafe5635SKent Overstreet 
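/*
 * Illustrative sketch (editor's addition): writers call
 * bch_keybuf_check_overlapping() before stomping on a range. Idle
 * overlapping entries are simply dropped from the buffer; a true return
 * means some overlapping key is already in flight (w->private set), so
 * the caller has to cope, e.g. by forcing the write through writeback
 * instead of bypassing the cache. start/end here are hypothetical bkeys
 * bracketing the write:
 *
 *	if (bch_keybuf_check_overlapping(&dc->writeback_keys,
 *					 &start, &end))
 *		// an in-flight writeback overlaps; don't bypass
 */
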
2656cafe5635SKent Overstreet struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2657cafe5635SKent Overstreet 					  struct keybuf *buf,
265872c27061SKent Overstreet 					  struct bkey *end,
265972c27061SKent Overstreet 					  keybuf_pred_fn *pred)
2660cafe5635SKent Overstreet {
2661cafe5635SKent Overstreet 	struct keybuf_key *ret;
2662cafe5635SKent Overstreet 
2663cafe5635SKent Overstreet 	while (1) {
2664cafe5635SKent Overstreet 		ret = bch_keybuf_next(buf);
2665cafe5635SKent Overstreet 		if (ret)
2666cafe5635SKent Overstreet 			break;
2667cafe5635SKent Overstreet 
2668cafe5635SKent Overstreet 		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2669cafe5635SKent Overstreet 			pr_debug("scan finished");
2670cafe5635SKent Overstreet 			break;
2671cafe5635SKent Overstreet 		}
2672cafe5635SKent Overstreet 
267372c27061SKent Overstreet 		bch_refill_keybuf(c, buf, end, pred);
2674cafe5635SKent Overstreet 	}
2675cafe5635SKent Overstreet 
2676cafe5635SKent Overstreet 	return ret;
2677cafe5635SKent Overstreet }
2678cafe5635SKent Overstreet 
267972c27061SKent Overstreet void bch_keybuf_init(struct keybuf *buf)
2680cafe5635SKent Overstreet {
2681cafe5635SKent Overstreet 	buf->last_scanned	= MAX_KEY;
2682cafe5635SKent Overstreet 	buf->keys		= RB_ROOT;
2683cafe5635SKent Overstreet 
2684cafe5635SKent Overstreet 	spin_lock_init(&buf->lock);
2685cafe5635SKent Overstreet 	array_allocator_init(&buf->freelist);
2686cafe5635SKent Overstreet }
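/*
 * Illustrative consumer loop (editor's addition): this is the shape of
 * the scan that writeback and moving GC run on top of the keybuf API.
 * bch_keybuf_next_rescan() transparently refills the buffer from the
 * btree until last_scanned passes `end`; pred and process() are
 * hypothetical:
 *
 *	struct keybuf buf;
 *	struct keybuf_key *w;
 *
 *	bch_keybuf_init(&buf);
 *	buf.last_scanned = ZERO_KEY;	// start from the beginning
 *
 *	while ((w = bch_keybuf_next_rescan(c, &buf, &MAX_KEY, pred))) {
 *		process(w);	// bch_keybuf_next() marked w in use
 *		bch_keybuf_del(&buf, w);
 *	}
 */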