11c6fdbd8SKent Overstreet // SPDX-License-Identifier: GPL-2.0 21c6fdbd8SKent Overstreet 31c6fdbd8SKent Overstreet #include "bcachefs.h" 4*1dd7f9d9SKent Overstreet #include "btree_iter.h" 5*1dd7f9d9SKent Overstreet #include "eytzinger.h" 61c6fdbd8SKent Overstreet #include "journal_seq_blacklist.h" 7*1dd7f9d9SKent Overstreet #include "super-io.h" 81c6fdbd8SKent Overstreet 91c6fdbd8SKent Overstreet /* 101c6fdbd8SKent Overstreet * journal_seq_blacklist machinery: 111c6fdbd8SKent Overstreet * 121c6fdbd8SKent Overstreet * To guarantee order of btree updates after a crash, we need to detect when a 131c6fdbd8SKent Overstreet * btree node entry (bset) is newer than the newest journal entry that was 141c6fdbd8SKent Overstreet * successfully written, and ignore it - effectively ignoring any btree updates 151c6fdbd8SKent Overstreet * that didn't make it into the journal. 161c6fdbd8SKent Overstreet * 171c6fdbd8SKent Overstreet * If we didn't do this, we might have two btree nodes, a and b, both with 181c6fdbd8SKent Overstreet * updates that weren't written to the journal yet: if b was updated after a, 191c6fdbd8SKent Overstreet * but b was flushed and not a - oops; on recovery we'll find that the updates 201c6fdbd8SKent Overstreet * to b happened, but not the updates to a that happened before it. 211c6fdbd8SKent Overstreet * 221c6fdbd8SKent Overstreet * Ignoring bsets that are newer than the newest journal entry is always safe, 231c6fdbd8SKent Overstreet * because everything they contain will also have been journalled - and must 241c6fdbd8SKent Overstreet * still be present in the journal on disk until a journal entry has been 251c6fdbd8SKent Overstreet * written _after_ that bset was written. 261c6fdbd8SKent Overstreet * 271c6fdbd8SKent Overstreet * To accomplish this, bsets record the newest journal sequence number they 281c6fdbd8SKent Overstreet * contain updates for; then, on startup, the btree code queries the journal 291c6fdbd8SKent Overstreet * code to ask "Is this sequence number newer than the newest journal entry? If 301c6fdbd8SKent Overstreet * so, ignore it." 311c6fdbd8SKent Overstreet * 321c6fdbd8SKent Overstreet * When this happens, we must blacklist that journal sequence number: the 331c6fdbd8SKent Overstreet * journal must not write any entries with that sequence number, and it must 341c6fdbd8SKent Overstreet * record that it was blacklisted so that a) on recovery we don't think we have 351c6fdbd8SKent Overstreet * missing journal entries and b) so that the btree code continues to ignore 361c6fdbd8SKent Overstreet * that bset, until that btree node is rewritten. 371c6fdbd8SKent Overstreet */ 381c6fdbd8SKent Overstreet 39*1dd7f9d9SKent Overstreet static unsigned 40*1dd7f9d9SKent Overstreet blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) 411c6fdbd8SKent Overstreet { 42*1dd7f9d9SKent Overstreet return bl 43*1dd7f9d9SKent Overstreet ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / 44*1dd7f9d9SKent Overstreet sizeof(struct journal_seq_blacklist_entry)) 45*1dd7f9d9SKent Overstreet : 0; 46*1dd7f9d9SKent Overstreet } 471c6fdbd8SKent Overstreet 48*1dd7f9d9SKent Overstreet static unsigned sb_blacklist_u64s(unsigned nr) 49*1dd7f9d9SKent Overstreet { 50*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl; 511c6fdbd8SKent Overstreet 52*1dd7f9d9SKent Overstreet return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); 53*1dd7f9d9SKent Overstreet } 54*1dd7f9d9SKent Overstreet 55*1dd7f9d9SKent Overstreet static struct bch_sb_field_journal_seq_blacklist * 56*1dd7f9d9SKent Overstreet blacklist_entry_try_merge(struct bch_fs *c, 57*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl, 58*1dd7f9d9SKent Overstreet unsigned i) 59*1dd7f9d9SKent Overstreet { 60*1dd7f9d9SKent Overstreet unsigned nr = blacklist_nr_entries(bl); 61*1dd7f9d9SKent Overstreet 62*1dd7f9d9SKent Overstreet if (le64_to_cpu(bl->start[i].end) >= 63*1dd7f9d9SKent Overstreet le64_to_cpu(bl->start[i + 1].start)) { 64*1dd7f9d9SKent Overstreet bl->start[i].end = bl->start[i + 1].end; 65*1dd7f9d9SKent Overstreet --nr; 66*1dd7f9d9SKent Overstreet memmove(&bl->start[i], 67*1dd7f9d9SKent Overstreet &bl->start[i + 1], 68*1dd7f9d9SKent Overstreet sizeof(bl->start[0]) * (nr - i)); 69*1dd7f9d9SKent Overstreet 70*1dd7f9d9SKent Overstreet bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 71*1dd7f9d9SKent Overstreet sb_blacklist_u64s(nr)); 72*1dd7f9d9SKent Overstreet BUG_ON(!bl); 73*1dd7f9d9SKent Overstreet } 74*1dd7f9d9SKent Overstreet 75*1dd7f9d9SKent Overstreet return bl; 76*1dd7f9d9SKent Overstreet } 77*1dd7f9d9SKent Overstreet 78*1dd7f9d9SKent Overstreet int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) 79*1dd7f9d9SKent Overstreet { 80*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl; 81*1dd7f9d9SKent Overstreet unsigned i, nr; 82*1dd7f9d9SKent Overstreet int ret = 0; 83*1dd7f9d9SKent Overstreet 84*1dd7f9d9SKent Overstreet mutex_lock(&c->sb_lock); 85*1dd7f9d9SKent Overstreet bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); 86*1dd7f9d9SKent Overstreet nr = blacklist_nr_entries(bl); 87*1dd7f9d9SKent Overstreet 88*1dd7f9d9SKent Overstreet if (bl) { 89*1dd7f9d9SKent Overstreet for (i = 0; i < nr; i++) { 90*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_entry *e = 91*1dd7f9d9SKent Overstreet bl->start + i; 92*1dd7f9d9SKent Overstreet 93*1dd7f9d9SKent Overstreet if (start == le64_to_cpu(e->start) && 94*1dd7f9d9SKent Overstreet end == le64_to_cpu(e->end)) 95*1dd7f9d9SKent Overstreet goto out; 96*1dd7f9d9SKent Overstreet 97*1dd7f9d9SKent Overstreet if (start <= le64_to_cpu(e->start) && 98*1dd7f9d9SKent Overstreet end >= le64_to_cpu(e->end)) { 99*1dd7f9d9SKent Overstreet e->start = cpu_to_le64(start); 100*1dd7f9d9SKent Overstreet e->end = cpu_to_le64(end); 101*1dd7f9d9SKent Overstreet 102*1dd7f9d9SKent Overstreet if (i + 1 < nr) 103*1dd7f9d9SKent Overstreet bl = blacklist_entry_try_merge(c, 104*1dd7f9d9SKent Overstreet bl, i); 105*1dd7f9d9SKent Overstreet if (i) 106*1dd7f9d9SKent Overstreet bl = blacklist_entry_try_merge(c, 107*1dd7f9d9SKent Overstreet bl, i - 1); 108*1dd7f9d9SKent Overstreet goto out_write_sb; 109*1dd7f9d9SKent Overstreet } 110*1dd7f9d9SKent Overstreet } 111*1dd7f9d9SKent Overstreet } 112*1dd7f9d9SKent Overstreet 113*1dd7f9d9SKent Overstreet bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 114*1dd7f9d9SKent Overstreet sb_blacklist_u64s(nr + 1)); 115*1dd7f9d9SKent Overstreet if (!bl) { 116*1dd7f9d9SKent Overstreet ret = -ENOMEM; 117*1dd7f9d9SKent Overstreet goto out; 118*1dd7f9d9SKent Overstreet } 119*1dd7f9d9SKent Overstreet 120*1dd7f9d9SKent Overstreet bl->start[nr].start = cpu_to_le64(start); 121*1dd7f9d9SKent Overstreet bl->start[nr].end = cpu_to_le64(end); 122*1dd7f9d9SKent Overstreet out_write_sb: 123*1dd7f9d9SKent Overstreet c->disk_sb.sb->features[0] |= 124*1dd7f9d9SKent Overstreet 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; 125*1dd7f9d9SKent Overstreet 126*1dd7f9d9SKent Overstreet ret = bch2_write_super(c); 127*1dd7f9d9SKent Overstreet out: 128*1dd7f9d9SKent Overstreet mutex_unlock(&c->sb_lock); 129*1dd7f9d9SKent Overstreet 130*1dd7f9d9SKent Overstreet return ret; 131*1dd7f9d9SKent Overstreet } 132*1dd7f9d9SKent Overstreet 133*1dd7f9d9SKent Overstreet static int journal_seq_blacklist_table_cmp(const void *_l, 134*1dd7f9d9SKent Overstreet const void *_r, size_t size) 135*1dd7f9d9SKent Overstreet { 136*1dd7f9d9SKent Overstreet const struct journal_seq_blacklist_table_entry *l = _l; 137*1dd7f9d9SKent Overstreet const struct journal_seq_blacklist_table_entry *r = _r; 138*1dd7f9d9SKent Overstreet 139*1dd7f9d9SKent Overstreet return (l->start > r->start) - (l->start < r->start); 140*1dd7f9d9SKent Overstreet } 141*1dd7f9d9SKent Overstreet 142*1dd7f9d9SKent Overstreet bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, 143*1dd7f9d9SKent Overstreet bool dirty) 144*1dd7f9d9SKent Overstreet { 145*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; 146*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_table_entry search = { .start = seq }; 147*1dd7f9d9SKent Overstreet int idx; 148*1dd7f9d9SKent Overstreet 149*1dd7f9d9SKent Overstreet if (!t) 150*1dd7f9d9SKent Overstreet return false; 151*1dd7f9d9SKent Overstreet 152*1dd7f9d9SKent Overstreet idx = eytzinger0_find_le(t->entries, t->nr, 153*1dd7f9d9SKent Overstreet sizeof(t->entries[0]), 154*1dd7f9d9SKent Overstreet journal_seq_blacklist_table_cmp, 155*1dd7f9d9SKent Overstreet &search); 156*1dd7f9d9SKent Overstreet if (idx < 0) 157*1dd7f9d9SKent Overstreet return false; 158*1dd7f9d9SKent Overstreet 159*1dd7f9d9SKent Overstreet BUG_ON(t->entries[idx].start > seq); 160*1dd7f9d9SKent Overstreet 161*1dd7f9d9SKent Overstreet if (seq >= t->entries[idx].end) 162*1dd7f9d9SKent Overstreet return false; 163*1dd7f9d9SKent Overstreet 164*1dd7f9d9SKent Overstreet if (dirty) 165*1dd7f9d9SKent Overstreet t->entries[idx].dirty = true; 166*1dd7f9d9SKent Overstreet return true; 167*1dd7f9d9SKent Overstreet } 168*1dd7f9d9SKent Overstreet 169*1dd7f9d9SKent Overstreet int bch2_blacklist_table_initialize(struct bch_fs *c) 170*1dd7f9d9SKent Overstreet { 171*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl = 172*1dd7f9d9SKent Overstreet bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); 173*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_table *t; 174*1dd7f9d9SKent Overstreet unsigned i, nr = blacklist_nr_entries(bl); 175*1dd7f9d9SKent Overstreet 176*1dd7f9d9SKent Overstreet BUG_ON(c->journal_seq_blacklist_table); 177*1dd7f9d9SKent Overstreet 178*1dd7f9d9SKent Overstreet if (!bl) 179*1dd7f9d9SKent Overstreet return 0; 180*1dd7f9d9SKent Overstreet 181*1dd7f9d9SKent Overstreet t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, 182*1dd7f9d9SKent Overstreet GFP_KERNEL); 183*1dd7f9d9SKent Overstreet if (!t) 184*1dd7f9d9SKent Overstreet return -ENOMEM; 185*1dd7f9d9SKent Overstreet 186*1dd7f9d9SKent Overstreet t->nr = nr; 187*1dd7f9d9SKent Overstreet 188*1dd7f9d9SKent Overstreet for (i = 0; i < nr; i++) { 189*1dd7f9d9SKent Overstreet t->entries[i].start = le64_to_cpu(bl->start[i].start); 190*1dd7f9d9SKent Overstreet t->entries[i].end = le64_to_cpu(bl->start[i].end); 191*1dd7f9d9SKent Overstreet } 192*1dd7f9d9SKent Overstreet 193*1dd7f9d9SKent Overstreet eytzinger0_sort(t->entries, 194*1dd7f9d9SKent Overstreet t->nr, 195*1dd7f9d9SKent Overstreet sizeof(t->entries[0]), 196*1dd7f9d9SKent Overstreet journal_seq_blacklist_table_cmp, 197*1dd7f9d9SKent Overstreet NULL); 198*1dd7f9d9SKent Overstreet 199*1dd7f9d9SKent Overstreet c->journal_seq_blacklist_table = t; 200*1dd7f9d9SKent Overstreet return 0; 201*1dd7f9d9SKent Overstreet } 202*1dd7f9d9SKent Overstreet 203*1dd7f9d9SKent Overstreet static const char * 204*1dd7f9d9SKent Overstreet bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, 205*1dd7f9d9SKent Overstreet struct bch_sb_field *f) 206*1dd7f9d9SKent Overstreet { 207*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl = 208*1dd7f9d9SKent Overstreet field_to_type(f, journal_seq_blacklist); 209*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_entry *i; 210*1dd7f9d9SKent Overstreet unsigned nr = blacklist_nr_entries(bl); 211*1dd7f9d9SKent Overstreet 212*1dd7f9d9SKent Overstreet for (i = bl->start; i < bl->start + nr; i++) { 213*1dd7f9d9SKent Overstreet if (le64_to_cpu(i->start) >= 214*1dd7f9d9SKent Overstreet le64_to_cpu(i->end)) 215*1dd7f9d9SKent Overstreet return "entry start >= end"; 216*1dd7f9d9SKent Overstreet 217*1dd7f9d9SKent Overstreet if (i + 1 < bl->start + nr && 218*1dd7f9d9SKent Overstreet le64_to_cpu(i[0].end) > 219*1dd7f9d9SKent Overstreet le64_to_cpu(i[1].start)) 220*1dd7f9d9SKent Overstreet return "entries out of order"; 221*1dd7f9d9SKent Overstreet } 222*1dd7f9d9SKent Overstreet 223*1dd7f9d9SKent Overstreet return NULL; 224*1dd7f9d9SKent Overstreet } 225*1dd7f9d9SKent Overstreet 226*1dd7f9d9SKent Overstreet static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, 227*1dd7f9d9SKent Overstreet struct bch_sb *sb, 228*1dd7f9d9SKent Overstreet struct bch_sb_field *f) 229*1dd7f9d9SKent Overstreet { 230*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl = 231*1dd7f9d9SKent Overstreet field_to_type(f, journal_seq_blacklist); 232*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_entry *i; 233*1dd7f9d9SKent Overstreet unsigned nr = blacklist_nr_entries(bl); 234*1dd7f9d9SKent Overstreet 235*1dd7f9d9SKent Overstreet for (i = bl->start; i < bl->start + nr; i++) { 236*1dd7f9d9SKent Overstreet if (i != bl->start) 237*1dd7f9d9SKent Overstreet pr_buf(out, " "); 238*1dd7f9d9SKent Overstreet 239*1dd7f9d9SKent Overstreet pr_buf(out, "%llu-%llu", 240*1dd7f9d9SKent Overstreet le64_to_cpu(i->start), 241*1dd7f9d9SKent Overstreet le64_to_cpu(i->end)); 242*1dd7f9d9SKent Overstreet } 243*1dd7f9d9SKent Overstreet } 244*1dd7f9d9SKent Overstreet 245*1dd7f9d9SKent Overstreet const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { 246*1dd7f9d9SKent Overstreet .validate = bch2_sb_journal_seq_blacklist_validate, 247*1dd7f9d9SKent Overstreet .to_text = bch2_sb_journal_seq_blacklist_to_text 248*1dd7f9d9SKent Overstreet }; 249*1dd7f9d9SKent Overstreet 250*1dd7f9d9SKent Overstreet void bch2_blacklist_entries_gc(struct work_struct *work) 251*1dd7f9d9SKent Overstreet { 252*1dd7f9d9SKent Overstreet struct bch_fs *c = container_of(work, struct bch_fs, 253*1dd7f9d9SKent Overstreet journal_seq_blacklist_gc_work); 254*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_table *t; 255*1dd7f9d9SKent Overstreet struct bch_sb_field_journal_seq_blacklist *bl; 256*1dd7f9d9SKent Overstreet struct journal_seq_blacklist_entry *src, *dst; 257424eb881SKent Overstreet struct btree_trans trans; 258*1dd7f9d9SKent Overstreet unsigned i, nr, new_nr; 259*1dd7f9d9SKent Overstreet int ret; 2601c6fdbd8SKent Overstreet 261424eb881SKent Overstreet bch2_trans_init(&trans, c); 262424eb881SKent Overstreet 263*1dd7f9d9SKent Overstreet for (i = 0; i < BTREE_ID_NR; i++) { 264*1dd7f9d9SKent Overstreet struct btree_iter *iter; 265*1dd7f9d9SKent Overstreet struct btree *b; 2661c6fdbd8SKent Overstreet 267*1dd7f9d9SKent Overstreet for_each_btree_node(&trans, iter, i, POS_MIN, 268*1dd7f9d9SKent Overstreet BTREE_ITER_PREFETCH, b) 269*1dd7f9d9SKent Overstreet if (test_bit(BCH_FS_STOPPING, &c->flags)) { 270424eb881SKent Overstreet bch2_trans_exit(&trans); 2711c6fdbd8SKent Overstreet return; 2721c6fdbd8SKent Overstreet } 273*1dd7f9d9SKent Overstreet bch2_trans_iter_free(&trans, iter); 2741c6fdbd8SKent Overstreet } 2751c6fdbd8SKent Overstreet 276*1dd7f9d9SKent Overstreet ret = bch2_trans_exit(&trans); 2771c6fdbd8SKent Overstreet if (ret) 2781c6fdbd8SKent Overstreet return; 2791c6fdbd8SKent Overstreet 280*1dd7f9d9SKent Overstreet mutex_lock(&c->sb_lock); 281*1dd7f9d9SKent Overstreet bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); 282*1dd7f9d9SKent Overstreet if (!bl) 283*1dd7f9d9SKent Overstreet goto out; 2841c6fdbd8SKent Overstreet 285*1dd7f9d9SKent Overstreet nr = blacklist_nr_entries(bl); 286*1dd7f9d9SKent Overstreet dst = bl->start; 2871c6fdbd8SKent Overstreet 288*1dd7f9d9SKent Overstreet t = c->journal_seq_blacklist_table; 289*1dd7f9d9SKent Overstreet BUG_ON(nr != t->nr); 2901c6fdbd8SKent Overstreet 291*1dd7f9d9SKent Overstreet for (src = bl->start, i = eytzinger0_first(t->nr); 292*1dd7f9d9SKent Overstreet src < bl->start + nr; 293*1dd7f9d9SKent Overstreet src++, i = eytzinger0_next(i, nr)) { 294*1dd7f9d9SKent Overstreet BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); 295*1dd7f9d9SKent Overstreet BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); 296*1dd7f9d9SKent Overstreet 297*1dd7f9d9SKent Overstreet if (t->entries[i].dirty) 298*1dd7f9d9SKent Overstreet *dst++ = *src; 299*1dd7f9d9SKent Overstreet } 300*1dd7f9d9SKent Overstreet 301*1dd7f9d9SKent Overstreet new_nr = dst - bl->start; 302*1dd7f9d9SKent Overstreet 303*1dd7f9d9SKent Overstreet bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); 304*1dd7f9d9SKent Overstreet 305*1dd7f9d9SKent Overstreet if (new_nr != nr) { 306*1dd7f9d9SKent Overstreet bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, 307*1dd7f9d9SKent Overstreet new_nr ? sb_blacklist_u64s(new_nr) : 0); 308*1dd7f9d9SKent Overstreet BUG_ON(new_nr && !bl); 309*1dd7f9d9SKent Overstreet 310*1dd7f9d9SKent Overstreet if (!new_nr) 311*1dd7f9d9SKent Overstreet c->disk_sb.sb->features[0] &= 312*1dd7f9d9SKent Overstreet ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); 313*1dd7f9d9SKent Overstreet 314*1dd7f9d9SKent Overstreet bch2_write_super(c); 315*1dd7f9d9SKent Overstreet } 316*1dd7f9d9SKent Overstreet out: 317*1dd7f9d9SKent Overstreet mutex_unlock(&c->sb_lock); 3181c6fdbd8SKent Overstreet } 319