// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
#include "movinggc.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/preempt.h>

static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
					      enum bch_data_type data_type,
					      s64 sectors)
{
	switch (data_type) {
	case BCH_DATA_btree:
		fs_usage->btree += sectors;
		break;
	case BCH_DATA_user:
	case BCH_DATA_parity:
		fs_usage->data += sectors;
		break;
	case BCH_DATA_cached:
		fs_usage->cached += sectors;
		break;
	default:
		break;
	}
}

void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	struct bch_dev *ca;
	unsigned i;

	percpu_down_write(&c->mark_lock);
	usage = c->usage_base;

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
	}

	for_each_member_device(ca, c, i) {
		struct bch_dev_usage dev = bch2_dev_usage_read(ca);

		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
				  dev.d[BCH_DATA_journal].buckets) *
			ca->mi.bucket_size;
	}

	percpu_up_write(&c->mark_lock);
}

static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
						  unsigned journal_seq,
						  bool gc)
{
	BUG_ON(!gc && !journal_seq);

	return this_cpu_ptr(gc
			    ?
ca->usage_gc 89 : ca->usage[journal_seq & JOURNAL_BUF_MASK]); 90 } 91 92 void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) 93 { 94 struct bch_fs *c = ca->fs; 95 unsigned seq, i, u64s = dev_usage_u64s(); 96 97 do { 98 seq = read_seqcount_begin(&c->usage_lock); 99 memcpy(usage, ca->usage_base, u64s * sizeof(u64)); 100 for (i = 0; i < ARRAY_SIZE(ca->usage); i++) 101 acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); 102 } while (read_seqcount_retry(&c->usage_lock, seq)); 103 } 104 105 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) 106 { 107 ssize_t offset = v - (u64 *) c->usage_base; 108 unsigned i, seq; 109 u64 ret; 110 111 BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); 112 percpu_rwsem_assert_held(&c->mark_lock); 113 114 do { 115 seq = read_seqcount_begin(&c->usage_lock); 116 ret = *v; 117 118 for (i = 0; i < ARRAY_SIZE(c->usage); i++) 119 ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); 120 } while (read_seqcount_retry(&c->usage_lock, seq)); 121 122 return ret; 123 } 124 125 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) 126 { 127 struct bch_fs_usage_online *ret; 128 unsigned nr_replicas = READ_ONCE(c->replicas.nr); 129 unsigned seq, i; 130 retry: 131 ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); 132 if (unlikely(!ret)) 133 return NULL; 134 135 percpu_down_read(&c->mark_lock); 136 137 if (nr_replicas != c->replicas.nr) { 138 nr_replicas = c->replicas.nr; 139 percpu_up_read(&c->mark_lock); 140 kfree(ret); 141 goto retry; 142 } 143 144 ret->online_reserved = percpu_u64_get(c->online_reserved); 145 146 do { 147 seq = read_seqcount_begin(&c->usage_lock); 148 unsafe_memcpy(&ret->u, c->usage_base, 149 __fs_usage_u64s(nr_replicas) * sizeof(u64), 150 "embedded variable length struct"); 151 for (i = 0; i < ARRAY_SIZE(c->usage); i++) 152 acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], 153 __fs_usage_u64s(nr_replicas)); 154 } while (read_seqcount_retry(&c->usage_lock, seq)); 155 156 return ret; 157 } 158 159 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) 160 { 161 struct bch_dev *ca; 162 unsigned i, u64s = fs_usage_u64s(c); 163 164 BUG_ON(idx >= ARRAY_SIZE(c->usage)); 165 166 preempt_disable(); 167 write_seqcount_begin(&c->usage_lock); 168 169 acc_u64s_percpu((u64 *) c->usage_base, 170 (u64 __percpu *) c->usage[idx], u64s); 171 percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); 172 173 rcu_read_lock(); 174 for_each_member_device_rcu(ca, c, i, NULL) { 175 u64s = dev_usage_u64s(); 176 177 acc_u64s_percpu((u64 *) ca->usage_base, 178 (u64 __percpu *) ca->usage[idx], u64s); 179 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); 180 } 181 rcu_read_unlock(); 182 183 write_seqcount_end(&c->usage_lock); 184 preempt_enable(); 185 } 186 187 void bch2_fs_usage_to_text(struct printbuf *out, 188 struct bch_fs *c, 189 struct bch_fs_usage_online *fs_usage) 190 { 191 unsigned i; 192 193 prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); 194 195 prt_printf(out, "hidden:\t\t\t\t%llu\n", 196 fs_usage->u.hidden); 197 prt_printf(out, "data:\t\t\t\t%llu\n", 198 fs_usage->u.data); 199 prt_printf(out, "cached:\t\t\t\t%llu\n", 200 fs_usage->u.cached); 201 prt_printf(out, "reserved:\t\t\t%llu\n", 202 fs_usage->u.reserved); 203 prt_printf(out, "nr_inodes:\t\t\t%llu\n", 204 fs_usage->u.nr_inodes); 205 prt_printf(out, "online reserved:\t\t%llu\n", 206 fs_usage->online_reserved); 207 208 for (i = 0; 209 i < ARRAY_SIZE(fs_usage->u.persistent_reserved); 210 i++) { 211 
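		/*
		 * Note: each slot of persistent_reserved[] holds space reserved
		 * with nr_replicas == i + 1 (see __mark_reservation() below),
		 * hence the "i + 1 replicas" label printed here:
		 */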
prt_printf(out, "%u replicas:\n", i + 1); 212 prt_printf(out, "\treserved:\t\t%llu\n", 213 fs_usage->u.persistent_reserved[i]); 214 } 215 216 for (i = 0; i < c->replicas.nr; i++) { 217 struct bch_replicas_entry *e = 218 cpu_replicas_entry(&c->replicas, i); 219 220 prt_printf(out, "\t"); 221 bch2_replicas_entry_to_text(out, e); 222 prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); 223 } 224 } 225 226 static u64 reserve_factor(u64 r) 227 { 228 return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); 229 } 230 231 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) 232 { 233 return min(fs_usage->u.hidden + 234 fs_usage->u.btree + 235 fs_usage->u.data + 236 reserve_factor(fs_usage->u.reserved + 237 fs_usage->online_reserved), 238 c->capacity); 239 } 240 241 static struct bch_fs_usage_short 242 __bch2_fs_usage_read_short(struct bch_fs *c) 243 { 244 struct bch_fs_usage_short ret; 245 u64 data, reserved; 246 247 ret.capacity = c->capacity - 248 bch2_fs_usage_read_one(c, &c->usage_base->hidden); 249 250 data = bch2_fs_usage_read_one(c, &c->usage_base->data) + 251 bch2_fs_usage_read_one(c, &c->usage_base->btree); 252 reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + 253 percpu_u64_get(c->online_reserved); 254 255 ret.used = min(ret.capacity, data + reserve_factor(reserved)); 256 ret.free = ret.capacity - ret.used; 257 258 ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); 259 260 return ret; 261 } 262 263 struct bch_fs_usage_short 264 bch2_fs_usage_read_short(struct bch_fs *c) 265 { 266 struct bch_fs_usage_short ret; 267 268 percpu_down_read(&c->mark_lock); 269 ret = __bch2_fs_usage_read_short(c); 270 percpu_up_read(&c->mark_lock); 271 272 return ret; 273 } 274 275 void bch2_dev_usage_init(struct bch_dev *ca) 276 { 277 ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; 278 } 279 280 static inline int bucket_sectors_fragmented(struct bch_dev *ca, 281 struct bch_alloc_v4 a) 282 { 283 return a.dirty_sectors 284 ? 
max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) 285 : 0; 286 } 287 288 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, 289 struct bch_alloc_v4 old, 290 struct bch_alloc_v4 new, 291 u64 journal_seq, bool gc) 292 { 293 struct bch_fs_usage *fs_usage; 294 struct bch_dev_usage *u; 295 296 preempt_disable(); 297 fs_usage = fs_usage_ptr(c, journal_seq, gc); 298 299 if (data_type_is_hidden(old.data_type)) 300 fs_usage->hidden -= ca->mi.bucket_size; 301 if (data_type_is_hidden(new.data_type)) 302 fs_usage->hidden += ca->mi.bucket_size; 303 304 u = dev_usage_ptr(ca, journal_seq, gc); 305 306 u->d[old.data_type].buckets--; 307 u->d[new.data_type].buckets++; 308 309 u->buckets_ec -= (int) !!old.stripe; 310 u->buckets_ec += (int) !!new.stripe; 311 312 u->d[old.data_type].sectors -= old.dirty_sectors; 313 u->d[new.data_type].sectors += new.dirty_sectors; 314 315 u->d[BCH_DATA_cached].sectors += new.cached_sectors; 316 u->d[BCH_DATA_cached].sectors -= old.cached_sectors; 317 318 u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); 319 u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); 320 321 preempt_enable(); 322 } 323 324 static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, 325 struct bucket old, struct bucket new, 326 u64 journal_seq, bool gc) 327 { 328 struct bch_alloc_v4 old_a = { 329 .gen = old.gen, 330 .data_type = old.data_type, 331 .dirty_sectors = old.dirty_sectors, 332 .cached_sectors = old.cached_sectors, 333 .stripe = old.stripe, 334 }; 335 struct bch_alloc_v4 new_a = { 336 .gen = new.gen, 337 .data_type = new.data_type, 338 .dirty_sectors = new.dirty_sectors, 339 .cached_sectors = new.cached_sectors, 340 .stripe = new.stripe, 341 }; 342 343 bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); 344 } 345 346 static inline int __update_replicas(struct bch_fs *c, 347 struct bch_fs_usage *fs_usage, 348 struct bch_replicas_entry *r, 349 s64 sectors) 350 { 351 int idx = bch2_replicas_entry_idx(c, r); 352 353 if (idx < 0) 354 return -1; 355 356 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); 357 fs_usage->replicas[idx] += sectors; 358 return 0; 359 } 360 361 static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, 362 struct bch_replicas_entry *r, s64 sectors, 363 unsigned journal_seq, bool gc) 364 { 365 struct bch_fs_usage *fs_usage; 366 int idx, ret = 0; 367 struct printbuf buf = PRINTBUF; 368 369 percpu_down_read(&c->mark_lock); 370 371 idx = bch2_replicas_entry_idx(c, r); 372 if (idx < 0 && 373 fsck_err(c, ptr_to_missing_replicas_entry, 374 "no replicas entry\n while marking %s", 375 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 376 percpu_up_read(&c->mark_lock); 377 ret = bch2_mark_replicas(c, r); 378 percpu_down_read(&c->mark_lock); 379 380 if (ret) 381 goto err; 382 idx = bch2_replicas_entry_idx(c, r); 383 } 384 if (idx < 0) { 385 ret = -1; 386 goto err; 387 } 388 389 preempt_disable(); 390 fs_usage = fs_usage_ptr(c, journal_seq, gc); 391 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); 392 fs_usage->replicas[idx] += sectors; 393 preempt_enable(); 394 err: 395 fsck_err: 396 percpu_up_read(&c->mark_lock); 397 printbuf_exit(&buf); 398 return ret; 399 } 400 401 static inline int update_cached_sectors(struct bch_fs *c, 402 struct bkey_s_c k, 403 unsigned dev, s64 sectors, 404 unsigned journal_seq, bool gc) 405 { 406 struct bch_replicas_padded r; 407 408 bch2_replicas_entry_cached(&r.e, dev); 409 410 return update_replicas(c, k, &r.e, sectors, journal_seq, 
gc);
}

static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
				     gfp_t gfp)
{
	struct replicas_delta_list *d = trans->fs_usage_deltas;
	unsigned new_size = d ? (d->size + more) * 2 : 128;
	unsigned alloc_size = sizeof(*d) + new_size;

	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);

	if (!d || d->used + more > d->size) {
		d = krealloc(d, alloc_size, gfp|__GFP_ZERO);

		if (unlikely(!d)) {
			if (alloc_size > REPLICAS_DELTA_LIST_MAX)
				return -ENOMEM;

			d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
			if (!d)
				return -ENOMEM;

			memset(d, 0, REPLICAS_DELTA_LIST_MAX);

			if (trans->fs_usage_deltas)
				memcpy(d, trans->fs_usage_deltas,
				       trans->fs_usage_deltas->size + sizeof(*d));

			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
			kfree(trans->fs_usage_deltas);
		}

		d->size = new_size;
		trans->fs_usage_deltas = d;
	}

	return 0;
}

int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
{
	return allocate_dropping_locks_errcode(trans,
				__replicas_deltas_realloc(trans, more, _gfp));
}

static inline int update_replicas_list(struct btree_trans *trans,
				       struct bch_replicas_entry *r,
				       s64 sectors)
{
	struct replicas_delta_list *d;
	struct replicas_delta *n;
	unsigned b;
	int ret;

	if (!sectors)
		return 0;

	b = replicas_entry_bytes(r) + 8;
	ret = bch2_replicas_deltas_realloc(trans, b);
	if (ret)
		return ret;

	d = trans->fs_usage_deltas;
	n = (void *) d->d + d->used;
	n->delta = sectors;
	unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
		      r, replicas_entry_bytes(r),
		      "flexible array member embedded in struct with padding");
	bch2_replicas_entry_sort(&n->r);
	d->used += b;
	return 0;
}

static inline int update_cached_sectors_list(struct btree_trans *trans,
					     unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return update_replicas_list(trans, &r.e, sectors);
}

int bch2_mark_alloc(struct btree_trans *trans,
		    enum btree_id btree, unsigned level,
		    struct bkey_s_c old, struct bkey_s_c new,
		    unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	u64 journal_seq = trans->journal_res.seq;
	u64 bucket_journal_seq;
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 old_a_convert, new_a_convert;
	const struct bch_alloc_v4 *old_a, *new_a;
	struct bch_dev *ca;
	int ret = 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if ((flags & BTREE_TRIGGER_GC) &&
	    !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
		return 0;

	if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
				       "alloc key for invalid device or bucket"))
		return -EIO;

	ca = bch_dev_bkey_exists(c, new.k->p.inode);

	old_a = bch2_alloc_to_v4(old, &old_a_convert);
	new_a = bch2_alloc_to_v4(new, &new_a_convert);

	bucket_journal_seq = new_a->journal_seq;

	if ((flags & BTREE_TRIGGER_INSERT) &&
	    data_type_is_empty(old_a->data_type) !=
	    data_type_is_empty(new_a->data_type) &&
	    new.k->type == KEY_TYPE_alloc_v4) {
		struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;

		EBUG_ON(!journal_seq);

		/*
		 * If the btree updates referring to a bucket weren't flushed
		 * before the bucket became empty again, then we don't have
* to wait on a journal flush before we can reuse the bucket: 538 */ 539 v->journal_seq = bucket_journal_seq = 540 data_type_is_empty(new_a->data_type) && 541 (journal_seq == v->journal_seq || 542 bch2_journal_noflush_seq(&c->journal, v->journal_seq)) 543 ? 0 : journal_seq; 544 } 545 546 if (!data_type_is_empty(old_a->data_type) && 547 data_type_is_empty(new_a->data_type) && 548 bucket_journal_seq) { 549 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 550 c->journal.flushed_seq_ondisk, 551 new.k->p.inode, new.k->p.offset, 552 bucket_journal_seq); 553 if (ret) { 554 bch2_fs_fatal_error(c, 555 "error setting bucket_needs_journal_commit: %i", ret); 556 return ret; 557 } 558 } 559 560 percpu_down_read(&c->mark_lock); 561 if (!gc && new_a->gen != old_a->gen) 562 *bucket_gen(ca, new.k->p.offset) = new_a->gen; 563 564 bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); 565 566 if (gc) { 567 struct bucket *g = gc_bucket(ca, new.k->p.offset); 568 569 bucket_lock(g); 570 571 g->gen_valid = 1; 572 g->gen = new_a->gen; 573 g->data_type = new_a->data_type; 574 g->stripe = new_a->stripe; 575 g->stripe_redundancy = new_a->stripe_redundancy; 576 g->dirty_sectors = new_a->dirty_sectors; 577 g->cached_sectors = new_a->cached_sectors; 578 579 bucket_unlock(g); 580 } 581 percpu_up_read(&c->mark_lock); 582 583 /* 584 * need to know if we're getting called from the invalidate path or 585 * not: 586 */ 587 588 if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && 589 old_a->cached_sectors) { 590 ret = update_cached_sectors(c, new, ca->dev_idx, 591 -((s64) old_a->cached_sectors), 592 journal_seq, gc); 593 if (ret) { 594 bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", 595 __func__); 596 return ret; 597 } 598 } 599 600 if (new_a->data_type == BCH_DATA_free && 601 (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) 602 closure_wake_up(&c->freelist_wait); 603 604 if (new_a->data_type == BCH_DATA_need_discard && 605 (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) 606 bch2_do_discards(c); 607 608 if (old_a->data_type != BCH_DATA_cached && 609 new_a->data_type == BCH_DATA_cached && 610 should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) 611 bch2_do_invalidates(c); 612 613 if (new_a->data_type == BCH_DATA_need_gc_gens) 614 bch2_do_gc_gens(c); 615 616 return 0; 617 } 618 619 int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, 620 size_t b, enum bch_data_type data_type, 621 unsigned sectors, struct gc_pos pos, 622 unsigned flags) 623 { 624 struct bucket old, new, *g; 625 int ret = 0; 626 627 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 628 BUG_ON(data_type != BCH_DATA_sb && 629 data_type != BCH_DATA_journal); 630 631 /* 632 * Backup superblock might be past the end of our normal usable space: 633 */ 634 if (b >= ca->mi.nbuckets) 635 return 0; 636 637 percpu_down_read(&c->mark_lock); 638 g = gc_bucket(ca, b); 639 640 bucket_lock(g); 641 old = *g; 642 643 if (bch2_fs_inconsistent_on(g->data_type && 644 g->data_type != data_type, c, 645 "different types of data in same bucket: %s, %s", 646 bch2_data_types[g->data_type], 647 bch2_data_types[data_type])) { 648 ret = -EIO; 649 goto err; 650 } 651 652 if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, 653 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", 654 ca->dev_idx, b, g->gen, 655 bch2_data_types[g->data_type ?: data_type], 656 g->dirty_sectors, sectors)) { 657 ret = -EIO; 658 
goto err; 659 } 660 661 662 g->data_type = data_type; 663 g->dirty_sectors += sectors; 664 new = *g; 665 err: 666 bucket_unlock(g); 667 if (!ret) 668 bch2_dev_usage_update_m(c, ca, old, new, 0, true); 669 percpu_up_read(&c->mark_lock); 670 return ret; 671 } 672 673 static int check_bucket_ref(struct btree_trans *trans, 674 struct bkey_s_c k, 675 const struct bch_extent_ptr *ptr, 676 s64 sectors, enum bch_data_type ptr_data_type, 677 u8 b_gen, u8 bucket_data_type, 678 u32 dirty_sectors, u32 cached_sectors) 679 { 680 struct bch_fs *c = trans->c; 681 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 682 size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); 683 u32 bucket_sectors = !ptr->cached 684 ? dirty_sectors 685 : cached_sectors; 686 struct printbuf buf = PRINTBUF; 687 int ret = 0; 688 689 if (bucket_data_type == BCH_DATA_cached) 690 bucket_data_type = BCH_DATA_user; 691 692 if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || 693 (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) 694 bucket_data_type = ptr_data_type = BCH_DATA_stripe; 695 696 if (gen_after(ptr->gen, b_gen)) { 697 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 698 BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, 699 "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" 700 "while marking %s", 701 ptr->dev, bucket_nr, b_gen, 702 bch2_data_types[bucket_data_type ?: ptr_data_type], 703 ptr->gen, 704 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 705 ret = -EIO; 706 goto err; 707 } 708 709 if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { 710 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 711 BCH_FSCK_ERR_ptr_too_stale, 712 "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" 713 "while marking %s", 714 ptr->dev, bucket_nr, b_gen, 715 bch2_data_types[bucket_data_type ?: ptr_data_type], 716 ptr->gen, 717 (printbuf_reset(&buf), 718 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 719 ret = -EIO; 720 goto err; 721 } 722 723 if (b_gen != ptr->gen && !ptr->cached) { 724 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 725 BCH_FSCK_ERR_stale_dirty_ptr, 726 "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" 727 "while marking %s", 728 ptr->dev, bucket_nr, b_gen, 729 *bucket_gen(ca, bucket_nr), 730 bch2_data_types[bucket_data_type ?: ptr_data_type], 731 ptr->gen, 732 (printbuf_reset(&buf), 733 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 734 ret = -EIO; 735 goto err; 736 } 737 738 if (b_gen != ptr->gen) { 739 ret = 1; 740 goto out; 741 } 742 743 if (!data_type_is_empty(bucket_data_type) && 744 ptr_data_type && 745 bucket_data_type != ptr_data_type) { 746 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 747 BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, 748 "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" 749 "while marking %s", 750 ptr->dev, bucket_nr, b_gen, 751 bch2_data_types[bucket_data_type], 752 bch2_data_types[ptr_data_type], 753 (printbuf_reset(&buf), 754 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 755 ret = -EIO; 756 goto err; 757 } 758 759 if ((u64) bucket_sectors + sectors > U32_MAX) { 760 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 761 BCH_FSCK_ERR_bucket_sector_count_overflow, 762 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" 763 "while marking %s", 764 ptr->dev, bucket_nr, b_gen, 765 bch2_data_types[bucket_data_type ?: ptr_data_type], 766 bucket_sectors, sectors, 767 (printbuf_reset(&buf), 768 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 769 ret = -EIO; 770 goto err; 
}
out:
	printbuf_exit(&buf);
	return ret;
err:
	bch2_dump_trans_updates(trans);
	goto out;
}

static int mark_stripe_bucket(struct btree_trans *trans,
			      struct bkey_s_c k,
			      unsigned ptr_idx,
			      unsigned flags)
{
	struct bch_fs *c = trans->c;
	u64 journal_seq = trans->journal_res.seq;
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned nr_data = s->nr_blocks - s->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
	s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	struct bucket old, new, *g;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	/* XXX doesn't handle deletion */

	percpu_down_read(&c->mark_lock);
	g = PTR_GC_BUCKET(ca, ptr);

	if (g->dirty_sectors ||
	    (g->stripe && g->stripe != k.k->p.offset)) {
		bch2_fs_inconsistent(c,
			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EINVAL;
		goto err;
	}

	bucket_lock(g);
	old = *g;

	ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
			       g->gen, g->data_type,
			       g->dirty_sectors, g->cached_sectors);
	if (ret)
		goto err;

	g->data_type = data_type;
	g->dirty_sectors += sectors;

	g->stripe = k.k->p.offset;
	g->stripe_redundancy = s->nr_redundant;
	new = *g;
err:
	bucket_unlock(g);
	if (!ret)
		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static int __mark_pointer(struct btree_trans *trans,
			  struct bkey_s_c k,
			  const struct bch_extent_ptr *ptr,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  u8 bucket_gen, u8 *bucket_data_type,
			  u32 *dirty_sectors, u32 *cached_sectors)
{
	u32 *dst_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
				   bucket_gen, *bucket_data_type,
				   *dirty_sectors, *cached_sectors);

	if (ret)
		return ret;

	*dst_sectors += sectors;
	*bucket_data_type = *dirty_sectors || *cached_sectors
		?
ptr_data_type : 0; 859 return 0; 860 } 861 862 static int bch2_mark_pointer(struct btree_trans *trans, 863 enum btree_id btree_id, unsigned level, 864 struct bkey_s_c k, 865 struct extent_ptr_decoded p, 866 s64 sectors, 867 unsigned flags) 868 { 869 u64 journal_seq = trans->journal_res.seq; 870 struct bch_fs *c = trans->c; 871 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); 872 struct bucket old, new, *g; 873 enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); 874 u8 bucket_data_type; 875 int ret = 0; 876 877 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 878 879 percpu_down_read(&c->mark_lock); 880 g = PTR_GC_BUCKET(ca, &p.ptr); 881 bucket_lock(g); 882 old = *g; 883 884 bucket_data_type = g->data_type; 885 ret = __mark_pointer(trans, k, &p.ptr, sectors, 886 data_type, g->gen, 887 &bucket_data_type, 888 &g->dirty_sectors, 889 &g->cached_sectors); 890 if (!ret) 891 g->data_type = bucket_data_type; 892 893 new = *g; 894 bucket_unlock(g); 895 if (!ret) 896 bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); 897 percpu_up_read(&c->mark_lock); 898 899 return ret; 900 } 901 902 static int bch2_mark_stripe_ptr(struct btree_trans *trans, 903 struct bkey_s_c k, 904 struct bch_extent_stripe_ptr p, 905 enum bch_data_type data_type, 906 s64 sectors, 907 unsigned flags) 908 { 909 struct bch_fs *c = trans->c; 910 struct bch_replicas_padded r; 911 struct gc_stripe *m; 912 913 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 914 915 m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); 916 if (!m) { 917 bch_err(c, "error allocating memory for gc_stripes, idx %llu", 918 (u64) p.idx); 919 return -BCH_ERR_ENOMEM_mark_stripe_ptr; 920 } 921 922 mutex_lock(&c->ec_stripes_heap_lock); 923 924 if (!m || !m->alive) { 925 mutex_unlock(&c->ec_stripes_heap_lock); 926 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", 927 (u64) p.idx); 928 bch2_inconsistent_error(c); 929 return -EIO; 930 } 931 932 m->block_sectors[p.block] += sectors; 933 934 r = m->r; 935 mutex_unlock(&c->ec_stripes_heap_lock); 936 937 r.e.data_type = data_type; 938 update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); 939 940 return 0; 941 } 942 943 static int __mark_extent(struct btree_trans *trans, 944 enum btree_id btree_id, unsigned level, 945 struct bkey_s_c k, unsigned flags) 946 { 947 u64 journal_seq = trans->journal_res.seq; 948 struct bch_fs *c = trans->c; 949 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 950 const union bch_extent_entry *entry; 951 struct extent_ptr_decoded p; 952 struct bch_replicas_padded r; 953 enum bch_data_type data_type = bkey_is_btree_ptr(k.k) 954 ? BCH_DATA_btree 955 : BCH_DATA_user; 956 s64 sectors = bkey_is_btree_ptr(k.k) 957 ? 
btree_sectors(c) 958 : k.k->size; 959 s64 dirty_sectors = 0; 960 bool stale; 961 int ret; 962 963 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 964 965 r.e.data_type = data_type; 966 r.e.nr_devs = 0; 967 r.e.nr_required = 1; 968 969 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 970 s64 disk_sectors = ptr_disk_sectors(sectors, p); 971 972 if (flags & BTREE_TRIGGER_OVERWRITE) 973 disk_sectors = -disk_sectors; 974 975 ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); 976 if (ret < 0) 977 return ret; 978 979 stale = ret > 0; 980 981 if (p.ptr.cached) { 982 if (!stale) { 983 ret = update_cached_sectors(c, k, p.ptr.dev, 984 disk_sectors, journal_seq, true); 985 if (ret) { 986 bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", 987 __func__); 988 return ret; 989 } 990 } 991 } else if (!p.has_ec) { 992 dirty_sectors += disk_sectors; 993 r.e.devs[r.e.nr_devs++] = p.ptr.dev; 994 } else { 995 ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, 996 disk_sectors, flags); 997 if (ret) 998 return ret; 999 1000 /* 1001 * There may be other dirty pointers in this extent, but 1002 * if so they're not required for mounting if we have an 1003 * erasure coded pointer in this extent: 1004 */ 1005 r.e.nr_required = 0; 1006 } 1007 } 1008 1009 if (r.e.nr_devs) { 1010 ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); 1011 if (ret) { 1012 struct printbuf buf = PRINTBUF; 1013 1014 bch2_bkey_val_to_text(&buf, c, k); 1015 bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); 1016 printbuf_exit(&buf); 1017 return ret; 1018 } 1019 } 1020 1021 return 0; 1022 } 1023 1024 int bch2_mark_extent(struct btree_trans *trans, 1025 enum btree_id btree_id, unsigned level, 1026 struct bkey_s_c old, struct bkey_s_c new, 1027 unsigned flags) 1028 { 1029 return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); 1030 } 1031 1032 int bch2_mark_stripe(struct btree_trans *trans, 1033 enum btree_id btree_id, unsigned level, 1034 struct bkey_s_c old, struct bkey_s_c new, 1035 unsigned flags) 1036 { 1037 bool gc = flags & BTREE_TRIGGER_GC; 1038 u64 journal_seq = trans->journal_res.seq; 1039 struct bch_fs *c = trans->c; 1040 u64 idx = new.k->p.offset; 1041 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe 1042 ? bkey_s_c_to_stripe(old).v : NULL; 1043 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe 1044 ? 
bkey_s_c_to_stripe(new).v : NULL; 1045 unsigned i; 1046 int ret; 1047 1048 BUG_ON(gc && old_s); 1049 1050 if (!gc) { 1051 struct stripe *m = genradix_ptr(&c->stripes, idx); 1052 1053 if (!m) { 1054 struct printbuf buf1 = PRINTBUF; 1055 struct printbuf buf2 = PRINTBUF; 1056 1057 bch2_bkey_val_to_text(&buf1, c, old); 1058 bch2_bkey_val_to_text(&buf2, c, new); 1059 bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" 1060 "old %s\n" 1061 "new %s", idx, buf1.buf, buf2.buf); 1062 printbuf_exit(&buf2); 1063 printbuf_exit(&buf1); 1064 bch2_inconsistent_error(c); 1065 return -1; 1066 } 1067 1068 if (!new_s) { 1069 bch2_stripes_heap_del(c, m, idx); 1070 1071 memset(m, 0, sizeof(*m)); 1072 } else { 1073 m->sectors = le16_to_cpu(new_s->sectors); 1074 m->algorithm = new_s->algorithm; 1075 m->nr_blocks = new_s->nr_blocks; 1076 m->nr_redundant = new_s->nr_redundant; 1077 m->blocks_nonempty = 0; 1078 1079 for (i = 0; i < new_s->nr_blocks; i++) 1080 m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); 1081 1082 if (!old_s) 1083 bch2_stripes_heap_insert(c, m, idx); 1084 else 1085 bch2_stripes_heap_update(c, m, idx); 1086 } 1087 } else { 1088 struct gc_stripe *m = 1089 genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); 1090 1091 if (!m) { 1092 bch_err(c, "error allocating memory for gc_stripes, idx %llu", 1093 idx); 1094 return -BCH_ERR_ENOMEM_mark_stripe; 1095 } 1096 /* 1097 * This will be wrong when we bring back runtime gc: we should 1098 * be unmarking the old key and then marking the new key 1099 */ 1100 m->alive = true; 1101 m->sectors = le16_to_cpu(new_s->sectors); 1102 m->nr_blocks = new_s->nr_blocks; 1103 m->nr_redundant = new_s->nr_redundant; 1104 1105 for (i = 0; i < new_s->nr_blocks; i++) 1106 m->ptrs[i] = new_s->ptrs[i]; 1107 1108 bch2_bkey_to_replicas(&m->r.e, new); 1109 1110 /* 1111 * gc recalculates this field from stripe ptr 1112 * references: 1113 */ 1114 memset(m->block_sectors, 0, sizeof(m->block_sectors)); 1115 1116 for (i = 0; i < new_s->nr_blocks; i++) { 1117 ret = mark_stripe_bucket(trans, new, i, flags); 1118 if (ret) 1119 return ret; 1120 } 1121 1122 ret = update_replicas(c, new, &m->r.e, 1123 ((s64) m->sectors * m->nr_redundant), 1124 journal_seq, gc); 1125 if (ret) { 1126 struct printbuf buf = PRINTBUF; 1127 1128 bch2_bkey_val_to_text(&buf, c, new); 1129 bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); 1130 printbuf_exit(&buf); 1131 return ret; 1132 } 1133 } 1134 1135 return 0; 1136 } 1137 1138 static int __mark_reservation(struct btree_trans *trans, 1139 enum btree_id btree_id, unsigned level, 1140 struct bkey_s_c k, unsigned flags) 1141 { 1142 struct bch_fs *c = trans->c; 1143 struct bch_fs_usage *fs_usage; 1144 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; 1145 s64 sectors = (s64) k.k->size; 1146 1147 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 1148 1149 if (flags & BTREE_TRIGGER_OVERWRITE) 1150 sectors = -sectors; 1151 sectors *= replicas; 1152 1153 percpu_down_read(&c->mark_lock); 1154 preempt_disable(); 1155 1156 fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); 1157 replicas = clamp_t(unsigned, replicas, 1, 1158 ARRAY_SIZE(fs_usage->persistent_reserved)); 1159 1160 fs_usage->reserved += sectors; 1161 fs_usage->persistent_reserved[replicas - 1] += sectors; 1162 1163 preempt_enable(); 1164 percpu_up_read(&c->mark_lock); 1165 1166 return 0; 1167 } 1168 1169 int bch2_mark_reservation(struct btree_trans *trans, 1170 enum btree_id btree_id, unsigned level, 1171 struct bkey_s_c old, struct bkey_s_c 
new, 1172 unsigned flags) 1173 { 1174 return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); 1175 } 1176 1177 static s64 __bch2_mark_reflink_p(struct btree_trans *trans, 1178 struct bkey_s_c_reflink_p p, 1179 u64 start, u64 end, 1180 u64 *idx, unsigned flags, size_t r_idx) 1181 { 1182 struct bch_fs *c = trans->c; 1183 struct reflink_gc *r; 1184 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; 1185 u64 next_idx = end; 1186 s64 ret = 0; 1187 struct printbuf buf = PRINTBUF; 1188 1189 if (r_idx >= c->reflink_gc_nr) 1190 goto not_found; 1191 1192 r = genradix_ptr(&c->reflink_gc_table, r_idx); 1193 next_idx = min(next_idx, r->offset - r->size); 1194 if (*idx < next_idx) 1195 goto not_found; 1196 1197 BUG_ON((s64) r->refcount + add < 0); 1198 1199 r->refcount += add; 1200 *idx = r->offset; 1201 return 0; 1202 not_found: 1203 if (fsck_err(c, reflink_p_to_missing_reflink_v, 1204 "pointer to missing indirect extent\n" 1205 " %s\n" 1206 " missing range %llu-%llu", 1207 (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), 1208 *idx, next_idx)) { 1209 struct bkey_i_error *new; 1210 1211 new = bch2_trans_kmalloc(trans, sizeof(*new)); 1212 ret = PTR_ERR_OR_ZERO(new); 1213 if (ret) 1214 goto err; 1215 1216 bkey_init(&new->k); 1217 new->k.type = KEY_TYPE_error; 1218 new->k.p = bkey_start_pos(p.k); 1219 new->k.p.offset += *idx - start; 1220 bch2_key_resize(&new->k, next_idx - *idx); 1221 ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, 1222 BTREE_TRIGGER_NORUN); 1223 } 1224 1225 *idx = next_idx; 1226 err: 1227 fsck_err: 1228 printbuf_exit(&buf); 1229 return ret; 1230 } 1231 1232 static int __mark_reflink_p(struct btree_trans *trans, 1233 enum btree_id btree_id, unsigned level, 1234 struct bkey_s_c k, unsigned flags) 1235 { 1236 struct bch_fs *c = trans->c; 1237 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); 1238 struct reflink_gc *ref; 1239 size_t l, r, m; 1240 u64 idx = le64_to_cpu(p.v->idx), start = idx; 1241 u64 end = le64_to_cpu(p.v->idx) + p.k->size; 1242 int ret = 0; 1243 1244 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 1245 1246 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { 1247 idx -= le32_to_cpu(p.v->front_pad); 1248 end += le32_to_cpu(p.v->back_pad); 1249 } 1250 1251 l = 0; 1252 r = c->reflink_gc_nr; 1253 while (l < r) { 1254 m = l + (r - l) / 2; 1255 1256 ref = genradix_ptr(&c->reflink_gc_table, m); 1257 if (ref->offset <= idx) 1258 l = m + 1; 1259 else 1260 r = m; 1261 } 1262 1263 while (idx < end && !ret) 1264 ret = __bch2_mark_reflink_p(trans, p, start, end, 1265 &idx, flags, l++); 1266 1267 return ret; 1268 } 1269 1270 int bch2_mark_reflink_p(struct btree_trans *trans, 1271 enum btree_id btree_id, unsigned level, 1272 struct bkey_s_c old, struct bkey_s_c new, 1273 unsigned flags) 1274 { 1275 return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); 1276 } 1277 1278 void bch2_trans_fs_usage_revert(struct btree_trans *trans, 1279 struct replicas_delta_list *deltas) 1280 { 1281 struct bch_fs *c = trans->c; 1282 struct bch_fs_usage *dst; 1283 struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; 1284 s64 added = 0; 1285 unsigned i; 1286 1287 percpu_down_read(&c->mark_lock); 1288 preempt_disable(); 1289 dst = fs_usage_ptr(c, trans->journal_res.seq, false); 1290 1291 /* revert changes: */ 1292 for (d = deltas->d; d != top; d = replicas_delta_next(d)) { 1293 switch (d->r.data_type) { 1294 case BCH_DATA_btree: 1295 case 
BCH_DATA_user: 1296 case BCH_DATA_parity: 1297 added += d->delta; 1298 } 1299 BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); 1300 } 1301 1302 dst->nr_inodes -= deltas->nr_inodes; 1303 1304 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1305 added -= deltas->persistent_reserved[i]; 1306 dst->reserved -= deltas->persistent_reserved[i]; 1307 dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; 1308 } 1309 1310 if (added > 0) { 1311 trans->disk_res->sectors += added; 1312 this_cpu_add(*c->online_reserved, added); 1313 } 1314 1315 preempt_enable(); 1316 percpu_up_read(&c->mark_lock); 1317 } 1318 1319 int bch2_trans_fs_usage_apply(struct btree_trans *trans, 1320 struct replicas_delta_list *deltas) 1321 { 1322 struct bch_fs *c = trans->c; 1323 static int warned_disk_usage = 0; 1324 bool warn = false; 1325 u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; 1326 struct replicas_delta *d, *d2; 1327 struct replicas_delta *top = (void *) deltas->d + deltas->used; 1328 struct bch_fs_usage *dst; 1329 s64 added = 0, should_not_have_added; 1330 unsigned i; 1331 1332 percpu_down_read(&c->mark_lock); 1333 preempt_disable(); 1334 dst = fs_usage_ptr(c, trans->journal_res.seq, false); 1335 1336 for (d = deltas->d; d != top; d = replicas_delta_next(d)) { 1337 switch (d->r.data_type) { 1338 case BCH_DATA_btree: 1339 case BCH_DATA_user: 1340 case BCH_DATA_parity: 1341 added += d->delta; 1342 } 1343 1344 if (__update_replicas(c, dst, &d->r, d->delta)) 1345 goto need_mark; 1346 } 1347 1348 dst->nr_inodes += deltas->nr_inodes; 1349 1350 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1351 added += deltas->persistent_reserved[i]; 1352 dst->reserved += deltas->persistent_reserved[i]; 1353 dst->persistent_reserved[i] += deltas->persistent_reserved[i]; 1354 } 1355 1356 /* 1357 * Not allowed to reduce sectors_available except by getting a 1358 * reservation: 1359 */ 1360 should_not_have_added = added - (s64) disk_res_sectors; 1361 if (unlikely(should_not_have_added > 0)) { 1362 u64 old, new, v = atomic64_read(&c->sectors_available); 1363 1364 do { 1365 old = v; 1366 new = max_t(s64, 0, old - should_not_have_added); 1367 } while ((v = atomic64_cmpxchg(&c->sectors_available, 1368 old, new)) != old); 1369 1370 added -= should_not_have_added; 1371 warn = true; 1372 } 1373 1374 if (added > 0) { 1375 trans->disk_res->sectors -= added; 1376 this_cpu_sub(*c->online_reserved, added); 1377 } 1378 1379 preempt_enable(); 1380 percpu_up_read(&c->mark_lock); 1381 1382 if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) 1383 bch2_trans_inconsistent(trans, 1384 "disk usage increased %lli more than %llu sectors reserved)", 1385 should_not_have_added, disk_res_sectors); 1386 return 0; 1387 need_mark: 1388 /* revert changes: */ 1389 for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) 1390 BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); 1391 1392 preempt_enable(); 1393 percpu_up_read(&c->mark_lock); 1394 return -1; 1395 } 1396 1397 /* trans_mark: */ 1398 1399 static inline int bch2_trans_mark_pointer(struct btree_trans *trans, 1400 enum btree_id btree_id, unsigned level, 1401 struct bkey_s_c k, struct extent_ptr_decoded p, 1402 unsigned flags) 1403 { 1404 bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); 1405 struct btree_iter iter; 1406 struct bkey_i_alloc_v4 *a; 1407 struct bpos bucket; 1408 struct bch_backpointer bp; 1409 s64 sectors; 1410 int ret; 1411 1412 bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); 1413 sectors = bp.bucket_len; 1414 if (!insert) 1415 sectors = -sectors; 
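	/*
	 * Transactional counterpart of bch2_mark_pointer(): rather than
	 * touching in-memory gc buckets, look up the bucket's alloc key and
	 * update it within the transaction, then (for non-cached pointers)
	 * add or remove the matching backpointer entry:
	 */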
1416 1417 a = bch2_trans_start_alloc_update(trans, &iter, bucket); 1418 if (IS_ERR(a)) 1419 return PTR_ERR(a); 1420 1421 ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, 1422 a->v.gen, &a->v.data_type, 1423 &a->v.dirty_sectors, &a->v.cached_sectors) ?: 1424 bch2_trans_update(trans, &iter, &a->k_i, 0); 1425 bch2_trans_iter_exit(trans, &iter); 1426 1427 if (ret) 1428 return ret; 1429 1430 if (!p.ptr.cached) { 1431 ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); 1432 if (ret) 1433 return ret; 1434 } 1435 1436 return 0; 1437 } 1438 1439 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, 1440 struct extent_ptr_decoded p, 1441 s64 sectors, enum bch_data_type data_type) 1442 { 1443 struct btree_iter iter; 1444 struct bkey_i_stripe *s; 1445 struct bch_replicas_padded r; 1446 int ret = 0; 1447 1448 s = bch2_bkey_get_mut_typed(trans, &iter, 1449 BTREE_ID_stripes, POS(0, p.ec.idx), 1450 BTREE_ITER_WITH_UPDATES, stripe); 1451 ret = PTR_ERR_OR_ZERO(s); 1452 if (unlikely(ret)) { 1453 bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, 1454 "pointer to nonexistent stripe %llu", 1455 (u64) p.ec.idx); 1456 goto err; 1457 } 1458 1459 if (!bch2_ptr_matches_stripe(&s->v, p)) { 1460 bch2_trans_inconsistent(trans, 1461 "stripe pointer doesn't match stripe %llu", 1462 (u64) p.ec.idx); 1463 ret = -EIO; 1464 goto err; 1465 } 1466 1467 stripe_blockcount_set(&s->v, p.ec.block, 1468 stripe_blockcount_get(&s->v, p.ec.block) + 1469 sectors); 1470 1471 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); 1472 r.e.data_type = data_type; 1473 ret = update_replicas_list(trans, &r.e, sectors); 1474 err: 1475 bch2_trans_iter_exit(trans, &iter); 1476 return ret; 1477 } 1478 1479 static int __trans_mark_extent(struct btree_trans *trans, 1480 enum btree_id btree_id, unsigned level, 1481 struct bkey_s_c k, unsigned flags) 1482 { 1483 struct bch_fs *c = trans->c; 1484 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1485 const union bch_extent_entry *entry; 1486 struct extent_ptr_decoded p; 1487 struct bch_replicas_padded r; 1488 enum bch_data_type data_type = bkey_is_btree_ptr(k.k) 1489 ? BCH_DATA_btree 1490 : BCH_DATA_user; 1491 s64 sectors = bkey_is_btree_ptr(k.k) 1492 ? 
btree_sectors(c) 1493 : k.k->size; 1494 s64 dirty_sectors = 0; 1495 bool stale; 1496 int ret = 0; 1497 1498 r.e.data_type = data_type; 1499 r.e.nr_devs = 0; 1500 r.e.nr_required = 1; 1501 1502 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1503 s64 disk_sectors = ptr_disk_sectors(sectors, p); 1504 1505 if (flags & BTREE_TRIGGER_OVERWRITE) 1506 disk_sectors = -disk_sectors; 1507 1508 ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); 1509 if (ret < 0) 1510 return ret; 1511 1512 stale = ret > 0; 1513 1514 if (p.ptr.cached) { 1515 if (!stale) { 1516 ret = update_cached_sectors_list(trans, p.ptr.dev, 1517 disk_sectors); 1518 if (ret) 1519 return ret; 1520 } 1521 } else if (!p.has_ec) { 1522 dirty_sectors += disk_sectors; 1523 r.e.devs[r.e.nr_devs++] = p.ptr.dev; 1524 } else { 1525 ret = bch2_trans_mark_stripe_ptr(trans, p, 1526 disk_sectors, data_type); 1527 if (ret) 1528 return ret; 1529 1530 r.e.nr_required = 0; 1531 } 1532 } 1533 1534 if (r.e.nr_devs) 1535 ret = update_replicas_list(trans, &r.e, dirty_sectors); 1536 1537 return ret; 1538 } 1539 1540 int bch2_trans_mark_extent(struct btree_trans *trans, 1541 enum btree_id btree_id, unsigned level, 1542 struct bkey_s_c old, struct bkey_i *new, 1543 unsigned flags) 1544 { 1545 struct bch_fs *c = trans->c; 1546 int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - 1547 (int) bch2_bkey_needs_rebalance(c, old); 1548 1549 if (mod) { 1550 int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); 1551 if (ret) 1552 return ret; 1553 } 1554 1555 return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); 1556 } 1557 1558 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, 1559 struct bkey_s_c_stripe s, 1560 unsigned idx, bool deleting) 1561 { 1562 struct bch_fs *c = trans->c; 1563 const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; 1564 struct btree_iter iter; 1565 struct bkey_i_alloc_v4 *a; 1566 enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant 1567 ? BCH_DATA_parity : 0; 1568 s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; 1569 int ret = 0; 1570 1571 if (deleting) 1572 sectors = -sectors; 1573 1574 a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); 1575 if (IS_ERR(a)) 1576 return PTR_ERR(a); 1577 1578 ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, 1579 a->v.gen, a->v.data_type, 1580 a->v.dirty_sectors, a->v.cached_sectors); 1581 if (ret) 1582 goto err; 1583 1584 if (!deleting) { 1585 if (bch2_trans_inconsistent_on(a->v.stripe || 1586 a->v.stripe_redundancy, trans, 1587 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", 1588 iter.pos.inode, iter.pos.offset, a->v.gen, 1589 bch2_data_types[a->v.data_type], 1590 a->v.dirty_sectors, 1591 a->v.stripe, s.k->p.offset)) { 1592 ret = -EIO; 1593 goto err; 1594 } 1595 1596 if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, 1597 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", 1598 iter.pos.inode, iter.pos.offset, a->v.gen, 1599 bch2_data_types[a->v.data_type], 1600 a->v.dirty_sectors, 1601 s.k->p.offset)) { 1602 ret = -EIO; 1603 goto err; 1604 } 1605 1606 a->v.stripe = s.k->p.offset; 1607 a->v.stripe_redundancy = s.v->nr_redundant; 1608 a->v.data_type = BCH_DATA_stripe; 1609 } else { 1610 if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || 1611 a->v.stripe_redundancy != s.v->nr_redundant, trans, 1612 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", 1613 iter.pos.inode, iter.pos.offset, a->v.gen, 1614 s.k->p.offset, a->v.stripe)) { 1615 ret = -EIO; 1616 goto err; 1617 } 1618 1619 a->v.stripe = 0; 1620 a->v.stripe_redundancy = 0; 1621 a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); 1622 } 1623 1624 a->v.dirty_sectors += sectors; 1625 if (data_type) 1626 a->v.data_type = !deleting ? data_type : 0; 1627 1628 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1629 if (ret) 1630 goto err; 1631 err: 1632 bch2_trans_iter_exit(trans, &iter); 1633 return ret; 1634 } 1635 1636 int bch2_trans_mark_stripe(struct btree_trans *trans, 1637 enum btree_id btree_id, unsigned level, 1638 struct bkey_s_c old, struct bkey_i *new, 1639 unsigned flags) 1640 { 1641 const struct bch_stripe *old_s = NULL; 1642 struct bch_stripe *new_s = NULL; 1643 struct bch_replicas_padded r; 1644 unsigned i, nr_blocks; 1645 int ret = 0; 1646 1647 if (old.k->type == KEY_TYPE_stripe) 1648 old_s = bkey_s_c_to_stripe(old).v; 1649 if (new->k.type == KEY_TYPE_stripe) 1650 new_s = &bkey_i_to_stripe(new)->v; 1651 1652 /* 1653 * If the pointers aren't changing, we don't need to do anything: 1654 */ 1655 if (new_s && old_s && 1656 new_s->nr_blocks == old_s->nr_blocks && 1657 new_s->nr_redundant == old_s->nr_redundant && 1658 !memcmp(old_s->ptrs, new_s->ptrs, 1659 new_s->nr_blocks * sizeof(struct bch_extent_ptr))) 1660 return 0; 1661 1662 BUG_ON(new_s && old_s && 1663 (new_s->nr_blocks != old_s->nr_blocks || 1664 new_s->nr_redundant != old_s->nr_redundant)); 1665 1666 nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; 1667 1668 if (new_s) { 1669 s64 sectors = le16_to_cpu(new_s->sectors); 1670 1671 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); 1672 ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); 1673 if (ret) 1674 return ret; 1675 } 1676 1677 if (old_s) { 1678 s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); 1679 1680 bch2_bkey_to_replicas(&r.e, old); 1681 ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); 1682 if (ret) 1683 return ret; 1684 } 1685 1686 for (i = 0; i < nr_blocks; i++) { 1687 if (new_s && old_s && 1688 !memcmp(&new_s->ptrs[i], 1689 &old_s->ptrs[i], 1690 sizeof(new_s->ptrs[i]))) 1691 continue; 1692 1693 if (new_s) { 1694 ret = bch2_trans_mark_stripe_bucket(trans, 1695 bkey_i_to_s_c_stripe(new), i, false); 1696 if (ret) 1697 break; 1698 } 1699 1700 if (old_s) { 1701 ret = bch2_trans_mark_stripe_bucket(trans, 1702 bkey_s_c_to_stripe(old), i, true); 1703 if (ret) 1704 break; 1705 } 1706 } 1707 1708 return ret; 1709 } 1710 1711 static int __trans_mark_reservation(struct btree_trans *trans, 1712 enum btree_id btree_id, unsigned level, 1713 struct bkey_s_c k, unsigned flags) 1714 { 1715 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; 1716 s64 sectors = (s64) k.k->size; 1717 struct replicas_delta_list *d; 1718 int ret; 1719 1720 if (flags & BTREE_TRIGGER_OVERWRITE) 1721 sectors = -sectors; 1722 sectors *= replicas; 1723 1724 ret = bch2_replicas_deltas_realloc(trans, 0); 1725 if (ret) 1726 return ret; 1727 1728 d = trans->fs_usage_deltas; 1729 replicas = clamp_t(unsigned, replicas, 1, 1730 ARRAY_SIZE(d->persistent_reserved)); 1731 1732 d->persistent_reserved[replicas - 1] += sectors; 1733 return 0; 1734 } 1735 1736 int bch2_trans_mark_reservation(struct btree_trans *trans, 1737 enum btree_id btree_id, unsigned level, 1738 struct bkey_s_c old, 1739 struct bkey_i *new, 1740 unsigned flags) 1741 { 1742 return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); 1743 } 1744 1745 static int trans_mark_reflink_p_segment(struct btree_trans *trans, 1746 struct bkey_s_c_reflink_p p, 1747 u64 *idx, unsigned flags) 1748 { 1749 struct bch_fs *c = trans->c; 1750 struct btree_iter iter; 1751 struct bkey_i *k; 1752 __le64 *refcount; 1753 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; 1754 struct printbuf buf = PRINTBUF; 1755 int ret; 1756 1757 k = bch2_bkey_get_mut_noupdate(trans, &iter, 1758 BTREE_ID_reflink, POS(0, *idx), 1759 BTREE_ITER_WITH_UPDATES); 1760 ret = PTR_ERR_OR_ZERO(k); 1761 if (ret) 1762 goto err; 1763 1764 refcount = bkey_refcount(k); 1765 if (!refcount) { 1766 bch2_bkey_val_to_text(&buf, c, p.s_c); 1767 bch2_trans_inconsistent(trans, 1768 "nonexistent indirect extent at %llu while marking\n %s", 1769 *idx, buf.buf); 1770 ret = -EIO; 1771 goto err; 1772 } 1773 1774 if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { 1775 bch2_bkey_val_to_text(&buf, c, p.s_c); 1776 bch2_trans_inconsistent(trans, 1777 "indirect extent refcount underflow at %llu while marking\n %s", 1778 *idx, buf.buf); 1779 ret = -EIO; 1780 goto err; 1781 } 1782 1783 if (flags & BTREE_TRIGGER_INSERT) { 1784 struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; 1785 u64 pad; 1786 1787 pad = max_t(s64, le32_to_cpu(v->front_pad), 1788 le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); 1789 BUG_ON(pad > U32_MAX); 1790 v->front_pad = cpu_to_le32(pad); 1791 1792 pad = max_t(s64, le32_to_cpu(v->back_pad), 1793 k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); 1794 BUG_ON(pad > U32_MAX); 1795 v->back_pad = cpu_to_le32(pad); 1796 } 1797 1798 le64_add_cpu(refcount, add); 1799 1800 bch2_btree_iter_set_pos_to_extent_start(&iter); 1801 ret = bch2_trans_update(trans, &iter, k, 0); 1802 if (ret) 1803 goto err; 1804 1805 *idx = k->k.p.offset; 1806 err: 1807 bch2_trans_iter_exit(trans, &iter); 1808 printbuf_exit(&buf); 1809 return ret; 1810 } 1811 1812 static int __trans_mark_reflink_p(struct btree_trans *trans, 1813 enum btree_id btree_id, unsigned level, 1814 struct bkey_s_c k, unsigned flags) 1815 { 1816 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); 1817 u64 idx, end_idx; 1818 int ret = 0; 1819 1820 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); 1821 end_idx = le64_to_cpu(p.v->idx) + p.k->size + 1822 le32_to_cpu(p.v->back_pad); 1823 1824 while (idx < end_idx && !ret) 1825 ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); 1826 return ret; 1827 } 1828 1829 int bch2_trans_mark_reflink_p(struct btree_trans *trans, 1830 enum btree_id btree_id, unsigned level, 1831 struct bkey_s_c old, 1832 struct bkey_i *new, 1833 unsigned flags) 1834 { 1835 if (flags & BTREE_TRIGGER_INSERT) { 1836 struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; 1837 1838 v->front_pad = v->back_pad = 0; 1839 } 1840 1841 return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); 1842 } 1843 1844 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, 1845 struct bch_dev *ca, size_t b, 1846 enum bch_data_type type, 1847 unsigned sectors) 1848 { 1849 struct bch_fs *c = trans->c; 1850 struct btree_iter iter; 1851 struct bkey_i_alloc_v4 *a; 1852 int ret = 0; 1853 1854 /* 1855 * Backup superblock might be past the end of our normal usable space: 1856 */ 1857 if (b >= ca->mi.nbuckets) 1858 return 0; 1859 1860 a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); 1861 if (IS_ERR(a)) 1862 return PTR_ERR(a); 1863 1864 if (a->v.data_type && type && a->v.data_type != type) { 1865 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 1866 BCH_FSCK_ERR_bucket_metadata_type_mismatch, 1867 "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" 1868 "while marking %s", 1869 iter.pos.inode, iter.pos.offset, a->v.gen, 1870 bch2_data_types[a->v.data_type], 1871 bch2_data_types[type], 1872 bch2_data_types[type]); 1873 
ret = -EIO; 1874 goto err; 1875 } 1876 1877 if (a->v.data_type != type || 1878 a->v.dirty_sectors != sectors) { 1879 a->v.data_type = type; 1880 a->v.dirty_sectors = sectors; 1881 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1882 } 1883 err: 1884 bch2_trans_iter_exit(trans, &iter); 1885 return ret; 1886 } 1887 1888 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, 1889 struct bch_dev *ca, size_t b, 1890 enum bch_data_type type, 1891 unsigned sectors) 1892 { 1893 return commit_do(trans, NULL, NULL, 0, 1894 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); 1895 } 1896 1897 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, 1898 struct bch_dev *ca, 1899 u64 start, u64 end, 1900 enum bch_data_type type, 1901 u64 *bucket, unsigned *bucket_sectors) 1902 { 1903 do { 1904 u64 b = sector_to_bucket(ca, start); 1905 unsigned sectors = 1906 min_t(u64, bucket_to_sector(ca, b + 1), end) - start; 1907 1908 if (b != *bucket && *bucket_sectors) { 1909 int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, 1910 type, *bucket_sectors); 1911 if (ret) 1912 return ret; 1913 1914 *bucket_sectors = 0; 1915 } 1916 1917 *bucket = b; 1918 *bucket_sectors += sectors; 1919 start += sectors; 1920 } while (start < end); 1921 1922 return 0; 1923 } 1924 1925 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, 1926 struct bch_dev *ca) 1927 { 1928 struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; 1929 u64 bucket = 0; 1930 unsigned i, bucket_sectors = 0; 1931 int ret; 1932 1933 for (i = 0; i < layout->nr_superblocks; i++) { 1934 u64 offset = le64_to_cpu(layout->sb_offset[i]); 1935 1936 if (offset == BCH_SB_SECTOR) { 1937 ret = bch2_trans_mark_metadata_sectors(trans, ca, 1938 0, BCH_SB_SECTOR, 1939 BCH_DATA_sb, &bucket, &bucket_sectors); 1940 if (ret) 1941 return ret; 1942 } 1943 1944 ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, 1945 offset + (1 << layout->sb_max_size_bits), 1946 BCH_DATA_sb, &bucket, &bucket_sectors); 1947 if (ret) 1948 return ret; 1949 } 1950 1951 if (bucket_sectors) { 1952 ret = bch2_trans_mark_metadata_bucket(trans, ca, 1953 bucket, BCH_DATA_sb, bucket_sectors); 1954 if (ret) 1955 return ret; 1956 } 1957 1958 for (i = 0; i < ca->journal.nr; i++) { 1959 ret = bch2_trans_mark_metadata_bucket(trans, ca, 1960 ca->journal.buckets[i], 1961 BCH_DATA_journal, ca->mi.bucket_size); 1962 if (ret) 1963 return ret; 1964 } 1965 1966 return 0; 1967 } 1968 1969 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) 1970 { 1971 int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); 1972 1973 if (ret) 1974 bch_err_fn(c, ret); 1975 return ret; 1976 } 1977 1978 int bch2_trans_mark_dev_sbs(struct bch_fs *c) 1979 { 1980 struct bch_dev *ca; 1981 unsigned i; 1982 1983 for_each_online_member(ca, c, i) { 1984 int ret = bch2_trans_mark_dev_sb(c, ca); 1985 if (ret) { 1986 percpu_ref_put(&ca->ref); 1987 return ret; 1988 } 1989 } 1990 1991 return 0; 1992 } 1993 1994 /* Disk reservations: */ 1995 1996 #define SECTORS_CACHE 1024 1997 1998 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, 1999 u64 sectors, int flags) 2000 { 2001 struct bch_fs_pcpu *pcpu; 2002 u64 old, v, get; 2003 s64 sectors_available; 2004 int ret; 2005 2006 percpu_down_read(&c->mark_lock); 2007 preempt_disable(); 2008 pcpu = this_cpu_ptr(c->pcpu); 2009 2010 if (sectors <= pcpu->sectors_available) 2011 goto out; 2012 2013 v = atomic64_read(&c->sectors_available); 2014 do { 2015 old = v; 2016 get = min((u64) sectors + 
SECTORS_CACHE, old); 2017 2018 if (get < sectors) { 2019 preempt_enable(); 2020 goto recalculate; 2021 } 2022 } while ((v = atomic64_cmpxchg(&c->sectors_available, 2023 old, old - get)) != old); 2024 2025 pcpu->sectors_available += get; 2026 2027 out: 2028 pcpu->sectors_available -= sectors; 2029 this_cpu_add(*c->online_reserved, sectors); 2030 res->sectors += sectors; 2031 2032 preempt_enable(); 2033 percpu_up_read(&c->mark_lock); 2034 return 0; 2035 2036 recalculate: 2037 mutex_lock(&c->sectors_available_lock); 2038 2039 percpu_u64_set(&c->pcpu->sectors_available, 0); 2040 sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); 2041 2042 if (sectors <= sectors_available || 2043 (flags & BCH_DISK_RESERVATION_NOFAIL)) { 2044 atomic64_set(&c->sectors_available, 2045 max_t(s64, 0, sectors_available - sectors)); 2046 this_cpu_add(*c->online_reserved, sectors); 2047 res->sectors += sectors; 2048 ret = 0; 2049 } else { 2050 atomic64_set(&c->sectors_available, sectors_available); 2051 ret = -BCH_ERR_ENOSPC_disk_reservation; 2052 } 2053 2054 mutex_unlock(&c->sectors_available_lock); 2055 percpu_up_read(&c->mark_lock); 2056 2057 return ret; 2058 } 2059 2060 /* Startup/shutdown: */ 2061 2062 static void bucket_gens_free_rcu(struct rcu_head *rcu) 2063 { 2064 struct bucket_gens *buckets = 2065 container_of(rcu, struct bucket_gens, rcu); 2066 2067 kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); 2068 } 2069 2070 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) 2071 { 2072 struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; 2073 unsigned long *buckets_nouse = NULL; 2074 bool resize = ca->bucket_gens != NULL; 2075 int ret; 2076 2077 if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, 2078 GFP_KERNEL|__GFP_ZERO))) { 2079 ret = -BCH_ERR_ENOMEM_bucket_gens; 2080 goto err; 2081 } 2082 2083 if ((c->opts.buckets_nouse && 2084 !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * 2085 sizeof(unsigned long), 2086 GFP_KERNEL|__GFP_ZERO)))) { 2087 ret = -BCH_ERR_ENOMEM_buckets_nouse; 2088 goto err; 2089 } 2090 2091 bucket_gens->first_bucket = ca->mi.first_bucket; 2092 bucket_gens->nbuckets = nbuckets; 2093 2094 bch2_copygc_stop(c); 2095 2096 if (resize) { 2097 down_write(&c->gc_lock); 2098 down_write(&ca->bucket_lock); 2099 percpu_down_write(&c->mark_lock); 2100 } 2101 2102 old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); 2103 2104 if (resize) { 2105 size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); 2106 2107 memcpy(bucket_gens->b, 2108 old_bucket_gens->b, 2109 n); 2110 if (buckets_nouse) 2111 memcpy(buckets_nouse, 2112 ca->buckets_nouse, 2113 BITS_TO_LONGS(n) * sizeof(unsigned long)); 2114 } 2115 2116 rcu_assign_pointer(ca->bucket_gens, bucket_gens); 2117 bucket_gens = old_bucket_gens; 2118 2119 swap(ca->buckets_nouse, buckets_nouse); 2120 2121 nbuckets = ca->mi.nbuckets; 2122 2123 if (resize) { 2124 percpu_up_write(&c->mark_lock); 2125 up_write(&ca->bucket_lock); 2126 up_write(&c->gc_lock); 2127 } 2128 2129 ret = 0; 2130 err: 2131 kvpfree(buckets_nouse, 2132 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); 2133 if (bucket_gens) 2134 call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); 2135 2136 return ret; 2137 } 2138 2139 void bch2_dev_buckets_free(struct bch_dev *ca) 2140 { 2141 unsigned i; 2142 2143 kvpfree(ca->buckets_nouse, 2144 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); 2145 kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), 2146 sizeof(struct bucket_gens) + ca->mi.nbuckets); 
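	/*
	 * The per-cpu usage counters and their base copy were allocated in
	 * bch2_dev_buckets_alloc(); free them after the bucket gen/nouse
	 * arrays:
	 */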

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
		free_percpu(ca->usage[i]);
	kfree(ca->usage_base);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
	if (!ca->usage_base)
		return -BCH_ERR_ENOMEM_usage_init;

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[i])
			return -BCH_ERR_ENOMEM_usage_init;
	}

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
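
/*
 * Rough lifecycle sketch (derived from the code above, not a normative
 * comment from upstream): bch2_dev_buckets_alloc() sizes ca->usage[] and
 * ca->usage_base, then calls bch2_dev_buckets_resize(), which doubles as
 * the initial allocation (ca->bucket_gens is still NULL, so resize == false)
 * and as the grow path, where the old bucket gens are copied under
 * c->mark_lock before the RCU pointer swap.
 */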