1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Code for manipulating bucket marks for garbage collection. 4 * 5 * Copyright 2014 Datera, Inc. 6 */ 7 8 #include "bcachefs.h" 9 #include "alloc_background.h" 10 #include "backpointers.h" 11 #include "bset.h" 12 #include "btree_gc.h" 13 #include "btree_update.h" 14 #include "buckets.h" 15 #include "buckets_waiting_for_journal.h" 16 #include "ec.h" 17 #include "error.h" 18 #include "inode.h" 19 #include "movinggc.h" 20 #include "recovery.h" 21 #include "reflink.h" 22 #include "replicas.h" 23 #include "subvolume.h" 24 #include "trace.h" 25 26 #include <linux/preempt.h> 27 28 static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, 29 enum bch_data_type data_type, 30 s64 sectors) 31 { 32 switch (data_type) { 33 case BCH_DATA_btree: 34 fs_usage->btree += sectors; 35 break; 36 case BCH_DATA_user: 37 case BCH_DATA_parity: 38 fs_usage->data += sectors; 39 break; 40 case BCH_DATA_cached: 41 fs_usage->cached += sectors; 42 break; 43 default: 44 break; 45 } 46 } 47 48 void bch2_fs_usage_initialize(struct bch_fs *c) 49 { 50 struct bch_fs_usage *usage; 51 struct bch_dev *ca; 52 unsigned i; 53 54 percpu_down_write(&c->mark_lock); 55 usage = c->usage_base; 56 57 for (i = 0; i < ARRAY_SIZE(c->usage); i++) 58 bch2_fs_usage_acc_to_base(c, i); 59 60 for (i = 0; i < BCH_REPLICAS_MAX; i++) 61 usage->reserved += usage->persistent_reserved[i]; 62 63 for (i = 0; i < c->replicas.nr; i++) { 64 struct bch_replicas_entry *e = 65 cpu_replicas_entry(&c->replicas, i); 66 67 fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); 68 } 69 70 for_each_member_device(ca, c, i) { 71 struct bch_dev_usage dev = bch2_dev_usage_read(ca); 72 73 usage->hidden += (dev.d[BCH_DATA_sb].buckets + 74 dev.d[BCH_DATA_journal].buckets) * 75 ca->mi.bucket_size; 76 } 77 78 percpu_up_write(&c->mark_lock); 79 } 80 81 static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, 82 unsigned journal_seq, 83 bool gc) 84 { 85 BUG_ON(!gc && !journal_seq); 86 87 return this_cpu_ptr(gc 88 ? ca->usage_gc 89 : ca->usage[journal_seq & JOURNAL_BUF_MASK]); 90 } 91 92 void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) 93 { 94 struct bch_fs *c = ca->fs; 95 unsigned seq, i, u64s = dev_usage_u64s(); 96 97 do { 98 seq = read_seqcount_begin(&c->usage_lock); 99 memcpy(usage, ca->usage_base, u64s * sizeof(u64)); 100 for (i = 0; i < ARRAY_SIZE(ca->usage); i++) 101 acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); 102 } while (read_seqcount_retry(&c->usage_lock, seq)); 103 } 104 105 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) 106 { 107 ssize_t offset = v - (u64 *) c->usage_base; 108 unsigned i, seq; 109 u64 ret; 110 111 BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); 112 percpu_rwsem_assert_held(&c->mark_lock); 113 114 do { 115 seq = read_seqcount_begin(&c->usage_lock); 116 ret = *v; 117 118 for (i = 0; i < ARRAY_SIZE(c->usage); i++) 119 ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); 120 } while (read_seqcount_retry(&c->usage_lock, seq)); 121 122 return ret; 123 } 124 125 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) 126 { 127 struct bch_fs_usage_online *ret; 128 unsigned nr_replicas = READ_ONCE(c->replicas.nr); 129 unsigned seq, i; 130 retry: 131 ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); 132 if (unlikely(!ret)) 133 return NULL; 134 135 percpu_down_read(&c->mark_lock); 136 137 if (nr_replicas != c->replicas.nr) { 138 nr_replicas = c->replicas.nr; 139 percpu_up_read(&c->mark_lock); 140 kfree(ret); 141 goto retry; 142 } 143 144 ret->online_reserved = percpu_u64_get(c->online_reserved); 145 146 do { 147 seq = read_seqcount_begin(&c->usage_lock); 148 unsafe_memcpy(&ret->u, c->usage_base, 149 __fs_usage_u64s(nr_replicas) * sizeof(u64), 150 "embedded variable length struct"); 151 for (i = 0; i < ARRAY_SIZE(c->usage); i++) 152 acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], 153 __fs_usage_u64s(nr_replicas)); 154 } while (read_seqcount_retry(&c->usage_lock, seq)); 155 156 return ret; 157 } 158 159 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) 160 { 161 struct bch_dev *ca; 162 unsigned i, u64s = fs_usage_u64s(c); 163 164 BUG_ON(idx >= ARRAY_SIZE(c->usage)); 165 166 preempt_disable(); 167 write_seqcount_begin(&c->usage_lock); 168 169 acc_u64s_percpu((u64 *) c->usage_base, 170 (u64 __percpu *) c->usage[idx], u64s); 171 percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); 172 173 rcu_read_lock(); 174 for_each_member_device_rcu(ca, c, i, NULL) { 175 u64s = dev_usage_u64s(); 176 177 acc_u64s_percpu((u64 *) ca->usage_base, 178 (u64 __percpu *) ca->usage[idx], u64s); 179 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); 180 } 181 rcu_read_unlock(); 182 183 write_seqcount_end(&c->usage_lock); 184 preempt_enable(); 185 } 186 187 void bch2_fs_usage_to_text(struct printbuf *out, 188 struct bch_fs *c, 189 struct bch_fs_usage_online *fs_usage) 190 { 191 unsigned i; 192 193 prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); 194 195 prt_printf(out, "hidden:\t\t\t\t%llu\n", 196 fs_usage->u.hidden); 197 prt_printf(out, "data:\t\t\t\t%llu\n", 198 fs_usage->u.data); 199 prt_printf(out, "cached:\t\t\t\t%llu\n", 200 fs_usage->u.cached); 201 prt_printf(out, "reserved:\t\t\t%llu\n", 202 fs_usage->u.reserved); 203 prt_printf(out, "nr_inodes:\t\t\t%llu\n", 204 fs_usage->u.nr_inodes); 205 prt_printf(out, "online reserved:\t\t%llu\n", 206 fs_usage->online_reserved); 207 208 for (i = 0; 209 i < ARRAY_SIZE(fs_usage->u.persistent_reserved); 210 i++) { 211 prt_printf(out, "%u replicas:\n", i + 1); 212 prt_printf(out, "\treserved:\t\t%llu\n", 213 fs_usage->u.persistent_reserved[i]); 214 } 215 216 for (i = 0; i < c->replicas.nr; i++) { 217 struct bch_replicas_entry *e = 218 cpu_replicas_entry(&c->replicas, i); 219 220 prt_printf(out, "\t"); 221 bch2_replicas_entry_to_text(out, e); 222 prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); 223 } 224 } 225 226 static u64 reserve_factor(u64 r) 227 { 228 return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); 229 } 230 231 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) 232 { 233 return min(fs_usage->u.hidden + 234 fs_usage->u.btree + 235 fs_usage->u.data + 236 reserve_factor(fs_usage->u.reserved + 237 fs_usage->online_reserved), 238 c->capacity); 239 } 240 241 static struct bch_fs_usage_short 242 __bch2_fs_usage_read_short(struct bch_fs *c) 243 { 244 struct bch_fs_usage_short ret; 245 u64 data, reserved; 246 247 ret.capacity = c->capacity - 248 bch2_fs_usage_read_one(c, &c->usage_base->hidden); 249 250 data = bch2_fs_usage_read_one(c, &c->usage_base->data) + 251 bch2_fs_usage_read_one(c, &c->usage_base->btree); 252 reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + 253 percpu_u64_get(c->online_reserved); 254 255 ret.used = min(ret.capacity, data + reserve_factor(reserved)); 256 ret.free = ret.capacity - ret.used; 257 258 ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); 259 260 return ret; 261 } 262 263 struct bch_fs_usage_short 264 bch2_fs_usage_read_short(struct bch_fs *c) 265 { 266 struct bch_fs_usage_short ret; 267 268 percpu_down_read(&c->mark_lock); 269 ret = __bch2_fs_usage_read_short(c); 270 percpu_up_read(&c->mark_lock); 271 272 return ret; 273 } 274 275 void bch2_dev_usage_init(struct bch_dev *ca) 276 { 277 ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; 278 } 279 280 static inline int bucket_sectors_fragmented(struct bch_dev *ca, 281 struct bch_alloc_v4 a) 282 { 283 return a.dirty_sectors 284 ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) 285 : 0; 286 } 287 288 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, 289 struct bch_alloc_v4 old, 290 struct bch_alloc_v4 new, 291 u64 journal_seq, bool gc) 292 { 293 struct bch_fs_usage *fs_usage; 294 struct bch_dev_usage *u; 295 296 preempt_disable(); 297 fs_usage = fs_usage_ptr(c, journal_seq, gc); 298 299 if (data_type_is_hidden(old.data_type)) 300 fs_usage->hidden -= ca->mi.bucket_size; 301 if (data_type_is_hidden(new.data_type)) 302 fs_usage->hidden += ca->mi.bucket_size; 303 304 u = dev_usage_ptr(ca, journal_seq, gc); 305 306 u->d[old.data_type].buckets--; 307 u->d[new.data_type].buckets++; 308 309 u->buckets_ec -= (int) !!old.stripe; 310 u->buckets_ec += (int) !!new.stripe; 311 312 u->d[old.data_type].sectors -= old.dirty_sectors; 313 u->d[new.data_type].sectors += new.dirty_sectors; 314 315 u->d[BCH_DATA_cached].sectors += new.cached_sectors; 316 u->d[BCH_DATA_cached].sectors -= old.cached_sectors; 317 318 u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); 319 u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); 320 321 preempt_enable(); 322 } 323 324 static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, 325 struct bucket old, struct bucket new, 326 u64 journal_seq, bool gc) 327 { 328 struct bch_alloc_v4 old_a = { 329 .gen = old.gen, 330 .data_type = old.data_type, 331 .dirty_sectors = old.dirty_sectors, 332 .cached_sectors = old.cached_sectors, 333 .stripe = old.stripe, 334 }; 335 struct bch_alloc_v4 new_a = { 336 .gen = new.gen, 337 .data_type = new.data_type, 338 .dirty_sectors = new.dirty_sectors, 339 .cached_sectors = new.cached_sectors, 340 .stripe = new.stripe, 341 }; 342 343 bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); 344 } 345 346 static inline int __update_replicas(struct bch_fs *c, 347 struct bch_fs_usage *fs_usage, 348 struct bch_replicas_entry *r, 349 s64 sectors) 350 { 351 int idx = bch2_replicas_entry_idx(c, r); 352 353 if (idx < 0) 354 return -1; 355 356 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); 357 fs_usage->replicas[idx] += sectors; 358 return 0; 359 } 360 361 static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, 362 struct bch_replicas_entry *r, s64 sectors, 363 unsigned journal_seq, bool gc) 364 { 365 struct bch_fs_usage *fs_usage; 366 int idx, ret = 0; 367 struct printbuf buf = PRINTBUF; 368 369 percpu_down_read(&c->mark_lock); 370 371 idx = bch2_replicas_entry_idx(c, r); 372 if (idx < 0 && 373 fsck_err(c, ptr_to_missing_replicas_entry, 374 "no replicas entry\n while marking %s", 375 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 376 percpu_up_read(&c->mark_lock); 377 ret = bch2_mark_replicas(c, r); 378 percpu_down_read(&c->mark_lock); 379 380 if (ret) 381 goto err; 382 idx = bch2_replicas_entry_idx(c, r); 383 } 384 if (idx < 0) { 385 ret = -1; 386 goto err; 387 } 388 389 preempt_disable(); 390 fs_usage = fs_usage_ptr(c, journal_seq, gc); 391 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); 392 fs_usage->replicas[idx] += sectors; 393 preempt_enable(); 394 err: 395 fsck_err: 396 percpu_up_read(&c->mark_lock); 397 printbuf_exit(&buf); 398 return ret; 399 } 400 401 static inline int update_cached_sectors(struct bch_fs *c, 402 struct bkey_s_c k, 403 unsigned dev, s64 sectors, 404 unsigned journal_seq, bool gc) 405 { 406 struct bch_replicas_padded r; 407 408 bch2_replicas_entry_cached(&r.e, dev); 409 410 return update_replicas(c, k, &r.e, sectors, journal_seq, gc); 411 } 412 413 static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, 414 gfp_t gfp) 415 { 416 struct replicas_delta_list *d = trans->fs_usage_deltas; 417 unsigned new_size = d ? (d->size + more) * 2 : 128; 418 unsigned alloc_size = sizeof(*d) + new_size; 419 420 WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); 421 422 if (!d || d->used + more > d->size) { 423 d = krealloc(d, alloc_size, gfp|__GFP_ZERO); 424 425 if (unlikely(!d)) { 426 if (alloc_size > REPLICAS_DELTA_LIST_MAX) 427 return -ENOMEM; 428 429 d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); 430 if (!d) 431 return -ENOMEM; 432 433 memset(d, 0, REPLICAS_DELTA_LIST_MAX); 434 435 if (trans->fs_usage_deltas) 436 memcpy(d, trans->fs_usage_deltas, 437 trans->fs_usage_deltas->size + sizeof(*d)); 438 439 new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); 440 kfree(trans->fs_usage_deltas); 441 } 442 443 d->size = new_size; 444 trans->fs_usage_deltas = d; 445 } 446 447 return 0; 448 } 449 450 int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) 451 { 452 return allocate_dropping_locks_errcode(trans, 453 __replicas_deltas_realloc(trans, more, _gfp)); 454 } 455 456 static inline int update_replicas_list(struct btree_trans *trans, 457 struct bch_replicas_entry *r, 458 s64 sectors) 459 { 460 struct replicas_delta_list *d; 461 struct replicas_delta *n; 462 unsigned b; 463 int ret; 464 465 if (!sectors) 466 return 0; 467 468 b = replicas_entry_bytes(r) + 8; 469 ret = bch2_replicas_deltas_realloc(trans, b); 470 if (ret) 471 return ret; 472 473 d = trans->fs_usage_deltas; 474 n = (void *) d->d + d->used; 475 n->delta = sectors; 476 unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), 477 r, replicas_entry_bytes(r), 478 "flexible array member embedded in strcuct with padding"); 479 bch2_replicas_entry_sort(&n->r); 480 d->used += b; 481 return 0; 482 } 483 484 static inline int update_cached_sectors_list(struct btree_trans *trans, 485 unsigned dev, s64 sectors) 486 { 487 struct bch_replicas_padded r; 488 489 bch2_replicas_entry_cached(&r.e, dev); 490 491 return update_replicas_list(trans, &r.e, sectors); 492 } 493 494 int bch2_mark_alloc(struct btree_trans *trans, 495 enum btree_id btree, unsigned level, 496 struct bkey_s_c old, struct bkey_s_c new, 497 unsigned flags) 498 { 499 bool gc = flags & BTREE_TRIGGER_GC; 500 u64 journal_seq = trans->journal_res.seq; 501 u64 bucket_journal_seq; 502 struct bch_fs *c = trans->c; 503 struct bch_alloc_v4 old_a_convert, new_a_convert; 504 const struct bch_alloc_v4 *old_a, *new_a; 505 struct bch_dev *ca; 506 int ret = 0; 507 508 /* 509 * alloc btree is read in by bch2_alloc_read, not gc: 510 */ 511 if ((flags & BTREE_TRIGGER_GC) && 512 !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) 513 return 0; 514 515 if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, 516 "alloc key for invalid device or bucket")) 517 return -EIO; 518 519 ca = bch_dev_bkey_exists(c, new.k->p.inode); 520 521 old_a = bch2_alloc_to_v4(old, &old_a_convert); 522 new_a = bch2_alloc_to_v4(new, &new_a_convert); 523 524 bucket_journal_seq = new_a->journal_seq; 525 526 if ((flags & BTREE_TRIGGER_INSERT) && 527 data_type_is_empty(old_a->data_type) != 528 data_type_is_empty(new_a->data_type) && 529 new.k->type == KEY_TYPE_alloc_v4) { 530 struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; 531 532 EBUG_ON(!journal_seq); 533 534 /* 535 * If the btree updates referring to a bucket weren't flushed 536 * before the bucket became empty again, then the we don't have 537 * to wait on a journal flush before we can reuse the bucket: 538 */ 539 v->journal_seq = bucket_journal_seq = 540 data_type_is_empty(new_a->data_type) && 541 (journal_seq == v->journal_seq || 542 bch2_journal_noflush_seq(&c->journal, v->journal_seq)) 543 ? 0 : journal_seq; 544 } 545 546 if (!data_type_is_empty(old_a->data_type) && 547 data_type_is_empty(new_a->data_type) && 548 bucket_journal_seq) { 549 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 550 c->journal.flushed_seq_ondisk, 551 new.k->p.inode, new.k->p.offset, 552 bucket_journal_seq); 553 if (ret) { 554 bch2_fs_fatal_error(c, 555 "error setting bucket_needs_journal_commit: %i", ret); 556 return ret; 557 } 558 } 559 560 percpu_down_read(&c->mark_lock); 561 if (!gc && new_a->gen != old_a->gen) 562 *bucket_gen(ca, new.k->p.offset) = new_a->gen; 563 564 bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); 565 566 if (gc) { 567 struct bucket *g = gc_bucket(ca, new.k->p.offset); 568 569 bucket_lock(g); 570 571 g->gen_valid = 1; 572 g->gen = new_a->gen; 573 g->data_type = new_a->data_type; 574 g->stripe = new_a->stripe; 575 g->stripe_redundancy = new_a->stripe_redundancy; 576 g->dirty_sectors = new_a->dirty_sectors; 577 g->cached_sectors = new_a->cached_sectors; 578 579 bucket_unlock(g); 580 } 581 percpu_up_read(&c->mark_lock); 582 583 /* 584 * need to know if we're getting called from the invalidate path or 585 * not: 586 */ 587 588 if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && 589 old_a->cached_sectors) { 590 ret = update_cached_sectors(c, new, ca->dev_idx, 591 -((s64) old_a->cached_sectors), 592 journal_seq, gc); 593 if (ret) { 594 bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", 595 __func__); 596 return ret; 597 } 598 } 599 600 if (new_a->data_type == BCH_DATA_free && 601 (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) 602 closure_wake_up(&c->freelist_wait); 603 604 if (new_a->data_type == BCH_DATA_need_discard && 605 (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) 606 bch2_do_discards(c); 607 608 if (old_a->data_type != BCH_DATA_cached && 609 new_a->data_type == BCH_DATA_cached && 610 should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) 611 bch2_do_invalidates(c); 612 613 if (new_a->data_type == BCH_DATA_need_gc_gens) 614 bch2_do_gc_gens(c); 615 616 return 0; 617 } 618 619 int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, 620 size_t b, enum bch_data_type data_type, 621 unsigned sectors, struct gc_pos pos, 622 unsigned flags) 623 { 624 struct bucket old, new, *g; 625 int ret = 0; 626 627 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 628 BUG_ON(data_type != BCH_DATA_sb && 629 data_type != BCH_DATA_journal); 630 631 /* 632 * Backup superblock might be past the end of our normal usable space: 633 */ 634 if (b >= ca->mi.nbuckets) 635 return 0; 636 637 percpu_down_read(&c->mark_lock); 638 g = gc_bucket(ca, b); 639 640 bucket_lock(g); 641 old = *g; 642 643 if (bch2_fs_inconsistent_on(g->data_type && 644 g->data_type != data_type, c, 645 "different types of data in same bucket: %s, %s", 646 bch2_data_types[g->data_type], 647 bch2_data_types[data_type])) { 648 ret = -EIO; 649 goto err; 650 } 651 652 if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, 653 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", 654 ca->dev_idx, b, g->gen, 655 bch2_data_types[g->data_type ?: data_type], 656 g->dirty_sectors, sectors)) { 657 ret = -EIO; 658 goto err; 659 } 660 661 662 g->data_type = data_type; 663 g->dirty_sectors += sectors; 664 new = *g; 665 err: 666 bucket_unlock(g); 667 if (!ret) 668 bch2_dev_usage_update_m(c, ca, old, new, 0, true); 669 percpu_up_read(&c->mark_lock); 670 return ret; 671 } 672 673 static int check_bucket_ref(struct btree_trans *trans, 674 struct bkey_s_c k, 675 const struct bch_extent_ptr *ptr, 676 s64 sectors, enum bch_data_type ptr_data_type, 677 u8 b_gen, u8 bucket_data_type, 678 u32 dirty_sectors, u32 cached_sectors) 679 { 680 struct bch_fs *c = trans->c; 681 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 682 size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); 683 u32 bucket_sectors = !ptr->cached 684 ? dirty_sectors 685 : cached_sectors; 686 struct printbuf buf = PRINTBUF; 687 int ret = 0; 688 689 if (bucket_data_type == BCH_DATA_cached) 690 bucket_data_type = BCH_DATA_user; 691 692 if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || 693 (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) 694 bucket_data_type = ptr_data_type = BCH_DATA_stripe; 695 696 if (gen_after(ptr->gen, b_gen)) { 697 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 698 BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, 699 "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" 700 "while marking %s", 701 ptr->dev, bucket_nr, b_gen, 702 bch2_data_types[bucket_data_type ?: ptr_data_type], 703 ptr->gen, 704 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 705 ret = -EIO; 706 goto err; 707 } 708 709 if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { 710 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 711 BCH_FSCK_ERR_ptr_too_stale, 712 "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" 713 "while marking %s", 714 ptr->dev, bucket_nr, b_gen, 715 bch2_data_types[bucket_data_type ?: ptr_data_type], 716 ptr->gen, 717 (printbuf_reset(&buf), 718 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 719 ret = -EIO; 720 goto err; 721 } 722 723 if (b_gen != ptr->gen && !ptr->cached) { 724 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 725 BCH_FSCK_ERR_stale_dirty_ptr, 726 "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" 727 "while marking %s", 728 ptr->dev, bucket_nr, b_gen, 729 *bucket_gen(ca, bucket_nr), 730 bch2_data_types[bucket_data_type ?: ptr_data_type], 731 ptr->gen, 732 (printbuf_reset(&buf), 733 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 734 ret = -EIO; 735 goto err; 736 } 737 738 if (b_gen != ptr->gen) { 739 ret = 1; 740 goto out; 741 } 742 743 if (!data_type_is_empty(bucket_data_type) && 744 ptr_data_type && 745 bucket_data_type != ptr_data_type) { 746 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 747 BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, 748 "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" 749 "while marking %s", 750 ptr->dev, bucket_nr, b_gen, 751 bch2_data_types[bucket_data_type], 752 bch2_data_types[ptr_data_type], 753 (printbuf_reset(&buf), 754 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 755 ret = -EIO; 756 goto err; 757 } 758 759 if ((u64) bucket_sectors + sectors > U32_MAX) { 760 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 761 BCH_FSCK_ERR_bucket_sector_count_overflow, 762 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" 763 "while marking %s", 764 ptr->dev, bucket_nr, b_gen, 765 bch2_data_types[bucket_data_type ?: ptr_data_type], 766 bucket_sectors, sectors, 767 (printbuf_reset(&buf), 768 bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 769 ret = -EIO; 770 goto err; 771 } 772 out: 773 printbuf_exit(&buf); 774 return ret; 775 err: 776 bch2_dump_trans_updates(trans); 777 goto out; 778 } 779 780 static int mark_stripe_bucket(struct btree_trans *trans, 781 struct bkey_s_c k, 782 unsigned ptr_idx, 783 unsigned flags) 784 { 785 struct bch_fs *c = trans->c; 786 u64 journal_seq = trans->journal_res.seq; 787 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 788 unsigned nr_data = s->nr_blocks - s->nr_redundant; 789 bool parity = ptr_idx >= nr_data; 790 enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; 791 s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; 792 const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; 793 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 794 struct bucket old, new, *g; 795 struct printbuf buf = PRINTBUF; 796 int ret = 0; 797 798 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 799 800 /* * XXX doesn't handle deletion */ 801 802 percpu_down_read(&c->mark_lock); 803 g = PTR_GC_BUCKET(ca, ptr); 804 805 if (g->dirty_sectors || 806 (g->stripe && g->stripe != k.k->p.offset)) { 807 bch2_fs_inconsistent(c, 808 "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", 809 ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, 810 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 811 ret = -EINVAL; 812 goto err; 813 } 814 815 bucket_lock(g); 816 old = *g; 817 818 ret = check_bucket_ref(trans, k, ptr, sectors, data_type, 819 g->gen, g->data_type, 820 g->dirty_sectors, g->cached_sectors); 821 if (ret) 822 goto err; 823 824 g->data_type = data_type; 825 g->dirty_sectors += sectors; 826 827 g->stripe = k.k->p.offset; 828 g->stripe_redundancy = s->nr_redundant; 829 new = *g; 830 err: 831 bucket_unlock(g); 832 if (!ret) 833 bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); 834 percpu_up_read(&c->mark_lock); 835 printbuf_exit(&buf); 836 return ret; 837 } 838 839 static int __mark_pointer(struct btree_trans *trans, 840 struct bkey_s_c k, 841 const struct bch_extent_ptr *ptr, 842 s64 sectors, enum bch_data_type ptr_data_type, 843 u8 bucket_gen, u8 *bucket_data_type, 844 u32 *dirty_sectors, u32 *cached_sectors) 845 { 846 u32 *dst_sectors = !ptr->cached 847 ? dirty_sectors 848 : cached_sectors; 849 int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, 850 bucket_gen, *bucket_data_type, 851 *dirty_sectors, *cached_sectors); 852 853 if (ret) 854 return ret; 855 856 *dst_sectors += sectors; 857 858 if (!*dirty_sectors && !*cached_sectors) 859 *bucket_data_type = 0; 860 else if (*bucket_data_type != BCH_DATA_stripe) 861 *bucket_data_type = ptr_data_type; 862 863 return 0; 864 } 865 866 static int bch2_mark_pointer(struct btree_trans *trans, 867 enum btree_id btree_id, unsigned level, 868 struct bkey_s_c k, 869 struct extent_ptr_decoded p, 870 s64 sectors, 871 unsigned flags) 872 { 873 u64 journal_seq = trans->journal_res.seq; 874 struct bch_fs *c = trans->c; 875 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); 876 struct bucket old, new, *g; 877 enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); 878 u8 bucket_data_type; 879 int ret = 0; 880 881 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 882 883 percpu_down_read(&c->mark_lock); 884 g = PTR_GC_BUCKET(ca, &p.ptr); 885 bucket_lock(g); 886 old = *g; 887 888 bucket_data_type = g->data_type; 889 ret = __mark_pointer(trans, k, &p.ptr, sectors, 890 data_type, g->gen, 891 &bucket_data_type, 892 &g->dirty_sectors, 893 &g->cached_sectors); 894 if (!ret) 895 g->data_type = bucket_data_type; 896 897 new = *g; 898 bucket_unlock(g); 899 if (!ret) 900 bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); 901 percpu_up_read(&c->mark_lock); 902 903 return ret; 904 } 905 906 static int bch2_mark_stripe_ptr(struct btree_trans *trans, 907 struct bkey_s_c k, 908 struct bch_extent_stripe_ptr p, 909 enum bch_data_type data_type, 910 s64 sectors, 911 unsigned flags) 912 { 913 struct bch_fs *c = trans->c; 914 struct bch_replicas_padded r; 915 struct gc_stripe *m; 916 917 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 918 919 m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); 920 if (!m) { 921 bch_err(c, "error allocating memory for gc_stripes, idx %llu", 922 (u64) p.idx); 923 return -BCH_ERR_ENOMEM_mark_stripe_ptr; 924 } 925 926 mutex_lock(&c->ec_stripes_heap_lock); 927 928 if (!m || !m->alive) { 929 mutex_unlock(&c->ec_stripes_heap_lock); 930 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", 931 (u64) p.idx); 932 bch2_inconsistent_error(c); 933 return -EIO; 934 } 935 936 m->block_sectors[p.block] += sectors; 937 938 r = m->r; 939 mutex_unlock(&c->ec_stripes_heap_lock); 940 941 r.e.data_type = data_type; 942 update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); 943 944 return 0; 945 } 946 947 static int __mark_extent(struct btree_trans *trans, 948 enum btree_id btree_id, unsigned level, 949 struct bkey_s_c k, unsigned flags) 950 { 951 u64 journal_seq = trans->journal_res.seq; 952 struct bch_fs *c = trans->c; 953 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 954 const union bch_extent_entry *entry; 955 struct extent_ptr_decoded p; 956 struct bch_replicas_padded r; 957 enum bch_data_type data_type = bkey_is_btree_ptr(k.k) 958 ? BCH_DATA_btree 959 : BCH_DATA_user; 960 s64 sectors = bkey_is_btree_ptr(k.k) 961 ? btree_sectors(c) 962 : k.k->size; 963 s64 dirty_sectors = 0; 964 bool stale; 965 int ret; 966 967 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 968 969 r.e.data_type = data_type; 970 r.e.nr_devs = 0; 971 r.e.nr_required = 1; 972 973 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 974 s64 disk_sectors = ptr_disk_sectors(sectors, p); 975 976 if (flags & BTREE_TRIGGER_OVERWRITE) 977 disk_sectors = -disk_sectors; 978 979 ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); 980 if (ret < 0) 981 return ret; 982 983 stale = ret > 0; 984 985 if (p.ptr.cached) { 986 if (!stale) { 987 ret = update_cached_sectors(c, k, p.ptr.dev, 988 disk_sectors, journal_seq, true); 989 if (ret) { 990 bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", 991 __func__); 992 return ret; 993 } 994 } 995 } else if (!p.has_ec) { 996 dirty_sectors += disk_sectors; 997 r.e.devs[r.e.nr_devs++] = p.ptr.dev; 998 } else { 999 ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, 1000 disk_sectors, flags); 1001 if (ret) 1002 return ret; 1003 1004 /* 1005 * There may be other dirty pointers in this extent, but 1006 * if so they're not required for mounting if we have an 1007 * erasure coded pointer in this extent: 1008 */ 1009 r.e.nr_required = 0; 1010 } 1011 } 1012 1013 if (r.e.nr_devs) { 1014 ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); 1015 if (ret) { 1016 struct printbuf buf = PRINTBUF; 1017 1018 bch2_bkey_val_to_text(&buf, c, k); 1019 bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); 1020 printbuf_exit(&buf); 1021 return ret; 1022 } 1023 } 1024 1025 return 0; 1026 } 1027 1028 int bch2_mark_extent(struct btree_trans *trans, 1029 enum btree_id btree_id, unsigned level, 1030 struct bkey_s_c old, struct bkey_s_c new, 1031 unsigned flags) 1032 { 1033 return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); 1034 } 1035 1036 int bch2_mark_stripe(struct btree_trans *trans, 1037 enum btree_id btree_id, unsigned level, 1038 struct bkey_s_c old, struct bkey_s_c new, 1039 unsigned flags) 1040 { 1041 bool gc = flags & BTREE_TRIGGER_GC; 1042 u64 journal_seq = trans->journal_res.seq; 1043 struct bch_fs *c = trans->c; 1044 u64 idx = new.k->p.offset; 1045 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe 1046 ? bkey_s_c_to_stripe(old).v : NULL; 1047 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe 1048 ? bkey_s_c_to_stripe(new).v : NULL; 1049 unsigned i; 1050 int ret; 1051 1052 BUG_ON(gc && old_s); 1053 1054 if (!gc) { 1055 struct stripe *m = genradix_ptr(&c->stripes, idx); 1056 1057 if (!m) { 1058 struct printbuf buf1 = PRINTBUF; 1059 struct printbuf buf2 = PRINTBUF; 1060 1061 bch2_bkey_val_to_text(&buf1, c, old); 1062 bch2_bkey_val_to_text(&buf2, c, new); 1063 bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" 1064 "old %s\n" 1065 "new %s", idx, buf1.buf, buf2.buf); 1066 printbuf_exit(&buf2); 1067 printbuf_exit(&buf1); 1068 bch2_inconsistent_error(c); 1069 return -1; 1070 } 1071 1072 if (!new_s) { 1073 bch2_stripes_heap_del(c, m, idx); 1074 1075 memset(m, 0, sizeof(*m)); 1076 } else { 1077 m->sectors = le16_to_cpu(new_s->sectors); 1078 m->algorithm = new_s->algorithm; 1079 m->nr_blocks = new_s->nr_blocks; 1080 m->nr_redundant = new_s->nr_redundant; 1081 m->blocks_nonempty = 0; 1082 1083 for (i = 0; i < new_s->nr_blocks; i++) 1084 m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); 1085 1086 if (!old_s) 1087 bch2_stripes_heap_insert(c, m, idx); 1088 else 1089 bch2_stripes_heap_update(c, m, idx); 1090 } 1091 } else { 1092 struct gc_stripe *m = 1093 genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); 1094 1095 if (!m) { 1096 bch_err(c, "error allocating memory for gc_stripes, idx %llu", 1097 idx); 1098 return -BCH_ERR_ENOMEM_mark_stripe; 1099 } 1100 /* 1101 * This will be wrong when we bring back runtime gc: we should 1102 * be unmarking the old key and then marking the new key 1103 */ 1104 m->alive = true; 1105 m->sectors = le16_to_cpu(new_s->sectors); 1106 m->nr_blocks = new_s->nr_blocks; 1107 m->nr_redundant = new_s->nr_redundant; 1108 1109 for (i = 0; i < new_s->nr_blocks; i++) 1110 m->ptrs[i] = new_s->ptrs[i]; 1111 1112 bch2_bkey_to_replicas(&m->r.e, new); 1113 1114 /* 1115 * gc recalculates this field from stripe ptr 1116 * references: 1117 */ 1118 memset(m->block_sectors, 0, sizeof(m->block_sectors)); 1119 1120 for (i = 0; i < new_s->nr_blocks; i++) { 1121 ret = mark_stripe_bucket(trans, new, i, flags); 1122 if (ret) 1123 return ret; 1124 } 1125 1126 ret = update_replicas(c, new, &m->r.e, 1127 ((s64) m->sectors * m->nr_redundant), 1128 journal_seq, gc); 1129 if (ret) { 1130 struct printbuf buf = PRINTBUF; 1131 1132 bch2_bkey_val_to_text(&buf, c, new); 1133 bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); 1134 printbuf_exit(&buf); 1135 return ret; 1136 } 1137 } 1138 1139 return 0; 1140 } 1141 1142 static int __mark_reservation(struct btree_trans *trans, 1143 enum btree_id btree_id, unsigned level, 1144 struct bkey_s_c k, unsigned flags) 1145 { 1146 struct bch_fs *c = trans->c; 1147 struct bch_fs_usage *fs_usage; 1148 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; 1149 s64 sectors = (s64) k.k->size; 1150 1151 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 1152 1153 if (flags & BTREE_TRIGGER_OVERWRITE) 1154 sectors = -sectors; 1155 sectors *= replicas; 1156 1157 percpu_down_read(&c->mark_lock); 1158 preempt_disable(); 1159 1160 fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); 1161 replicas = clamp_t(unsigned, replicas, 1, 1162 ARRAY_SIZE(fs_usage->persistent_reserved)); 1163 1164 fs_usage->reserved += sectors; 1165 fs_usage->persistent_reserved[replicas - 1] += sectors; 1166 1167 preempt_enable(); 1168 percpu_up_read(&c->mark_lock); 1169 1170 return 0; 1171 } 1172 1173 int bch2_mark_reservation(struct btree_trans *trans, 1174 enum btree_id btree_id, unsigned level, 1175 struct bkey_s_c old, struct bkey_s_c new, 1176 unsigned flags) 1177 { 1178 return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); 1179 } 1180 1181 static s64 __bch2_mark_reflink_p(struct btree_trans *trans, 1182 struct bkey_s_c_reflink_p p, 1183 u64 start, u64 end, 1184 u64 *idx, unsigned flags, size_t r_idx) 1185 { 1186 struct bch_fs *c = trans->c; 1187 struct reflink_gc *r; 1188 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; 1189 u64 next_idx = end; 1190 s64 ret = 0; 1191 struct printbuf buf = PRINTBUF; 1192 1193 if (r_idx >= c->reflink_gc_nr) 1194 goto not_found; 1195 1196 r = genradix_ptr(&c->reflink_gc_table, r_idx); 1197 next_idx = min(next_idx, r->offset - r->size); 1198 if (*idx < next_idx) 1199 goto not_found; 1200 1201 BUG_ON((s64) r->refcount + add < 0); 1202 1203 r->refcount += add; 1204 *idx = r->offset; 1205 return 0; 1206 not_found: 1207 if (fsck_err(c, reflink_p_to_missing_reflink_v, 1208 "pointer to missing indirect extent\n" 1209 " %s\n" 1210 " missing range %llu-%llu", 1211 (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), 1212 *idx, next_idx)) { 1213 struct bkey_i_error *new; 1214 1215 new = bch2_trans_kmalloc(trans, sizeof(*new)); 1216 ret = PTR_ERR_OR_ZERO(new); 1217 if (ret) 1218 goto err; 1219 1220 bkey_init(&new->k); 1221 new->k.type = KEY_TYPE_error; 1222 new->k.p = bkey_start_pos(p.k); 1223 new->k.p.offset += *idx - start; 1224 bch2_key_resize(&new->k, next_idx - *idx); 1225 ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, 1226 BTREE_TRIGGER_NORUN); 1227 } 1228 1229 *idx = next_idx; 1230 err: 1231 fsck_err: 1232 printbuf_exit(&buf); 1233 return ret; 1234 } 1235 1236 static int __mark_reflink_p(struct btree_trans *trans, 1237 enum btree_id btree_id, unsigned level, 1238 struct bkey_s_c k, unsigned flags) 1239 { 1240 struct bch_fs *c = trans->c; 1241 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); 1242 struct reflink_gc *ref; 1243 size_t l, r, m; 1244 u64 idx = le64_to_cpu(p.v->idx), start = idx; 1245 u64 end = le64_to_cpu(p.v->idx) + p.k->size; 1246 int ret = 0; 1247 1248 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 1249 1250 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { 1251 idx -= le32_to_cpu(p.v->front_pad); 1252 end += le32_to_cpu(p.v->back_pad); 1253 } 1254 1255 l = 0; 1256 r = c->reflink_gc_nr; 1257 while (l < r) { 1258 m = l + (r - l) / 2; 1259 1260 ref = genradix_ptr(&c->reflink_gc_table, m); 1261 if (ref->offset <= idx) 1262 l = m + 1; 1263 else 1264 r = m; 1265 } 1266 1267 while (idx < end && !ret) 1268 ret = __bch2_mark_reflink_p(trans, p, start, end, 1269 &idx, flags, l++); 1270 1271 return ret; 1272 } 1273 1274 int bch2_mark_reflink_p(struct btree_trans *trans, 1275 enum btree_id btree_id, unsigned level, 1276 struct bkey_s_c old, struct bkey_s_c new, 1277 unsigned flags) 1278 { 1279 return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); 1280 } 1281 1282 void bch2_trans_fs_usage_revert(struct btree_trans *trans, 1283 struct replicas_delta_list *deltas) 1284 { 1285 struct bch_fs *c = trans->c; 1286 struct bch_fs_usage *dst; 1287 struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; 1288 s64 added = 0; 1289 unsigned i; 1290 1291 percpu_down_read(&c->mark_lock); 1292 preempt_disable(); 1293 dst = fs_usage_ptr(c, trans->journal_res.seq, false); 1294 1295 /* revert changes: */ 1296 for (d = deltas->d; d != top; d = replicas_delta_next(d)) { 1297 switch (d->r.data_type) { 1298 case BCH_DATA_btree: 1299 case BCH_DATA_user: 1300 case BCH_DATA_parity: 1301 added += d->delta; 1302 } 1303 BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); 1304 } 1305 1306 dst->nr_inodes -= deltas->nr_inodes; 1307 1308 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1309 added -= deltas->persistent_reserved[i]; 1310 dst->reserved -= deltas->persistent_reserved[i]; 1311 dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; 1312 } 1313 1314 if (added > 0) { 1315 trans->disk_res->sectors += added; 1316 this_cpu_add(*c->online_reserved, added); 1317 } 1318 1319 preempt_enable(); 1320 percpu_up_read(&c->mark_lock); 1321 } 1322 1323 int bch2_trans_fs_usage_apply(struct btree_trans *trans, 1324 struct replicas_delta_list *deltas) 1325 { 1326 struct bch_fs *c = trans->c; 1327 static int warned_disk_usage = 0; 1328 bool warn = false; 1329 u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; 1330 struct replicas_delta *d, *d2; 1331 struct replicas_delta *top = (void *) deltas->d + deltas->used; 1332 struct bch_fs_usage *dst; 1333 s64 added = 0, should_not_have_added; 1334 unsigned i; 1335 1336 percpu_down_read(&c->mark_lock); 1337 preempt_disable(); 1338 dst = fs_usage_ptr(c, trans->journal_res.seq, false); 1339 1340 for (d = deltas->d; d != top; d = replicas_delta_next(d)) { 1341 switch (d->r.data_type) { 1342 case BCH_DATA_btree: 1343 case BCH_DATA_user: 1344 case BCH_DATA_parity: 1345 added += d->delta; 1346 } 1347 1348 if (__update_replicas(c, dst, &d->r, d->delta)) 1349 goto need_mark; 1350 } 1351 1352 dst->nr_inodes += deltas->nr_inodes; 1353 1354 for (i = 0; i < BCH_REPLICAS_MAX; i++) { 1355 added += deltas->persistent_reserved[i]; 1356 dst->reserved += deltas->persistent_reserved[i]; 1357 dst->persistent_reserved[i] += deltas->persistent_reserved[i]; 1358 } 1359 1360 /* 1361 * Not allowed to reduce sectors_available except by getting a 1362 * reservation: 1363 */ 1364 should_not_have_added = added - (s64) disk_res_sectors; 1365 if (unlikely(should_not_have_added > 0)) { 1366 u64 old, new, v = atomic64_read(&c->sectors_available); 1367 1368 do { 1369 old = v; 1370 new = max_t(s64, 0, old - should_not_have_added); 1371 } while ((v = atomic64_cmpxchg(&c->sectors_available, 1372 old, new)) != old); 1373 1374 added -= should_not_have_added; 1375 warn = true; 1376 } 1377 1378 if (added > 0) { 1379 trans->disk_res->sectors -= added; 1380 this_cpu_sub(*c->online_reserved, added); 1381 } 1382 1383 preempt_enable(); 1384 percpu_up_read(&c->mark_lock); 1385 1386 if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) 1387 bch2_trans_inconsistent(trans, 1388 "disk usage increased %lli more than %llu sectors reserved)", 1389 should_not_have_added, disk_res_sectors); 1390 return 0; 1391 need_mark: 1392 /* revert changes: */ 1393 for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) 1394 BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); 1395 1396 preempt_enable(); 1397 percpu_up_read(&c->mark_lock); 1398 return -1; 1399 } 1400 1401 /* trans_mark: */ 1402 1403 static inline int bch2_trans_mark_pointer(struct btree_trans *trans, 1404 enum btree_id btree_id, unsigned level, 1405 struct bkey_s_c k, struct extent_ptr_decoded p, 1406 unsigned flags) 1407 { 1408 bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); 1409 struct btree_iter iter; 1410 struct bkey_i_alloc_v4 *a; 1411 struct bpos bucket; 1412 struct bch_backpointer bp; 1413 s64 sectors; 1414 int ret; 1415 1416 bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); 1417 sectors = bp.bucket_len; 1418 if (!insert) 1419 sectors = -sectors; 1420 1421 a = bch2_trans_start_alloc_update(trans, &iter, bucket); 1422 if (IS_ERR(a)) 1423 return PTR_ERR(a); 1424 1425 ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, 1426 a->v.gen, &a->v.data_type, 1427 &a->v.dirty_sectors, &a->v.cached_sectors) ?: 1428 bch2_trans_update(trans, &iter, &a->k_i, 0); 1429 bch2_trans_iter_exit(trans, &iter); 1430 1431 if (ret) 1432 return ret; 1433 1434 if (!p.ptr.cached) { 1435 ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); 1436 if (ret) 1437 return ret; 1438 } 1439 1440 return 0; 1441 } 1442 1443 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, 1444 struct extent_ptr_decoded p, 1445 s64 sectors, enum bch_data_type data_type) 1446 { 1447 struct btree_iter iter; 1448 struct bkey_i_stripe *s; 1449 struct bch_replicas_padded r; 1450 int ret = 0; 1451 1452 s = bch2_bkey_get_mut_typed(trans, &iter, 1453 BTREE_ID_stripes, POS(0, p.ec.idx), 1454 BTREE_ITER_WITH_UPDATES, stripe); 1455 ret = PTR_ERR_OR_ZERO(s); 1456 if (unlikely(ret)) { 1457 bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, 1458 "pointer to nonexistent stripe %llu", 1459 (u64) p.ec.idx); 1460 goto err; 1461 } 1462 1463 if (!bch2_ptr_matches_stripe(&s->v, p)) { 1464 bch2_trans_inconsistent(trans, 1465 "stripe pointer doesn't match stripe %llu", 1466 (u64) p.ec.idx); 1467 ret = -EIO; 1468 goto err; 1469 } 1470 1471 stripe_blockcount_set(&s->v, p.ec.block, 1472 stripe_blockcount_get(&s->v, p.ec.block) + 1473 sectors); 1474 1475 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); 1476 r.e.data_type = data_type; 1477 ret = update_replicas_list(trans, &r.e, sectors); 1478 err: 1479 bch2_trans_iter_exit(trans, &iter); 1480 return ret; 1481 } 1482 1483 static int __trans_mark_extent(struct btree_trans *trans, 1484 enum btree_id btree_id, unsigned level, 1485 struct bkey_s_c k, unsigned flags) 1486 { 1487 struct bch_fs *c = trans->c; 1488 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1489 const union bch_extent_entry *entry; 1490 struct extent_ptr_decoded p; 1491 struct bch_replicas_padded r; 1492 enum bch_data_type data_type = bkey_is_btree_ptr(k.k) 1493 ? BCH_DATA_btree 1494 : BCH_DATA_user; 1495 s64 sectors = bkey_is_btree_ptr(k.k) 1496 ? btree_sectors(c) 1497 : k.k->size; 1498 s64 dirty_sectors = 0; 1499 bool stale; 1500 int ret = 0; 1501 1502 r.e.data_type = data_type; 1503 r.e.nr_devs = 0; 1504 r.e.nr_required = 1; 1505 1506 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1507 s64 disk_sectors = ptr_disk_sectors(sectors, p); 1508 1509 if (flags & BTREE_TRIGGER_OVERWRITE) 1510 disk_sectors = -disk_sectors; 1511 1512 ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); 1513 if (ret < 0) 1514 return ret; 1515 1516 stale = ret > 0; 1517 1518 if (p.ptr.cached) { 1519 if (!stale) { 1520 ret = update_cached_sectors_list(trans, p.ptr.dev, 1521 disk_sectors); 1522 if (ret) 1523 return ret; 1524 } 1525 } else if (!p.has_ec) { 1526 dirty_sectors += disk_sectors; 1527 r.e.devs[r.e.nr_devs++] = p.ptr.dev; 1528 } else { 1529 ret = bch2_trans_mark_stripe_ptr(trans, p, 1530 disk_sectors, data_type); 1531 if (ret) 1532 return ret; 1533 1534 r.e.nr_required = 0; 1535 } 1536 } 1537 1538 if (r.e.nr_devs) 1539 ret = update_replicas_list(trans, &r.e, dirty_sectors); 1540 1541 return ret; 1542 } 1543 1544 int bch2_trans_mark_extent(struct btree_trans *trans, 1545 enum btree_id btree_id, unsigned level, 1546 struct bkey_s_c old, struct bkey_i *new, 1547 unsigned flags) 1548 { 1549 struct bch_fs *c = trans->c; 1550 int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - 1551 (int) bch2_bkey_needs_rebalance(c, old); 1552 1553 if (mod) { 1554 int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); 1555 if (ret) 1556 return ret; 1557 } 1558 1559 return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); 1560 } 1561 1562 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, 1563 struct bkey_s_c_stripe s, 1564 unsigned idx, bool deleting) 1565 { 1566 struct bch_fs *c = trans->c; 1567 const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; 1568 struct btree_iter iter; 1569 struct bkey_i_alloc_v4 *a; 1570 enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant 1571 ? BCH_DATA_parity : 0; 1572 s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; 1573 int ret = 0; 1574 1575 if (deleting) 1576 sectors = -sectors; 1577 1578 a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); 1579 if (IS_ERR(a)) 1580 return PTR_ERR(a); 1581 1582 ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, 1583 a->v.gen, a->v.data_type, 1584 a->v.dirty_sectors, a->v.cached_sectors); 1585 if (ret) 1586 goto err; 1587 1588 if (!deleting) { 1589 if (bch2_trans_inconsistent_on(a->v.stripe || 1590 a->v.stripe_redundancy, trans, 1591 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", 1592 iter.pos.inode, iter.pos.offset, a->v.gen, 1593 bch2_data_types[a->v.data_type], 1594 a->v.dirty_sectors, 1595 a->v.stripe, s.k->p.offset)) { 1596 ret = -EIO; 1597 goto err; 1598 } 1599 1600 if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, 1601 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", 1602 iter.pos.inode, iter.pos.offset, a->v.gen, 1603 bch2_data_types[a->v.data_type], 1604 a->v.dirty_sectors, 1605 s.k->p.offset)) { 1606 ret = -EIO; 1607 goto err; 1608 } 1609 1610 a->v.stripe = s.k->p.offset; 1611 a->v.stripe_redundancy = s.v->nr_redundant; 1612 a->v.data_type = BCH_DATA_stripe; 1613 } else { 1614 if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || 1615 a->v.stripe_redundancy != s.v->nr_redundant, trans, 1616 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", 1617 iter.pos.inode, iter.pos.offset, a->v.gen, 1618 s.k->p.offset, a->v.stripe)) { 1619 ret = -EIO; 1620 goto err; 1621 } 1622 1623 a->v.stripe = 0; 1624 a->v.stripe_redundancy = 0; 1625 a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); 1626 } 1627 1628 a->v.dirty_sectors += sectors; 1629 if (data_type) 1630 a->v.data_type = !deleting ? data_type : 0; 1631 1632 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1633 if (ret) 1634 goto err; 1635 err: 1636 bch2_trans_iter_exit(trans, &iter); 1637 return ret; 1638 } 1639 1640 int bch2_trans_mark_stripe(struct btree_trans *trans, 1641 enum btree_id btree_id, unsigned level, 1642 struct bkey_s_c old, struct bkey_i *new, 1643 unsigned flags) 1644 { 1645 const struct bch_stripe *old_s = NULL; 1646 struct bch_stripe *new_s = NULL; 1647 struct bch_replicas_padded r; 1648 unsigned i, nr_blocks; 1649 int ret = 0; 1650 1651 if (old.k->type == KEY_TYPE_stripe) 1652 old_s = bkey_s_c_to_stripe(old).v; 1653 if (new->k.type == KEY_TYPE_stripe) 1654 new_s = &bkey_i_to_stripe(new)->v; 1655 1656 /* 1657 * If the pointers aren't changing, we don't need to do anything: 1658 */ 1659 if (new_s && old_s && 1660 new_s->nr_blocks == old_s->nr_blocks && 1661 new_s->nr_redundant == old_s->nr_redundant && 1662 !memcmp(old_s->ptrs, new_s->ptrs, 1663 new_s->nr_blocks * sizeof(struct bch_extent_ptr))) 1664 return 0; 1665 1666 BUG_ON(new_s && old_s && 1667 (new_s->nr_blocks != old_s->nr_blocks || 1668 new_s->nr_redundant != old_s->nr_redundant)); 1669 1670 nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; 1671 1672 if (new_s) { 1673 s64 sectors = le16_to_cpu(new_s->sectors); 1674 1675 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); 1676 ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); 1677 if (ret) 1678 return ret; 1679 } 1680 1681 if (old_s) { 1682 s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); 1683 1684 bch2_bkey_to_replicas(&r.e, old); 1685 ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); 1686 if (ret) 1687 return ret; 1688 } 1689 1690 for (i = 0; i < nr_blocks; i++) { 1691 if (new_s && old_s && 1692 !memcmp(&new_s->ptrs[i], 1693 &old_s->ptrs[i], 1694 sizeof(new_s->ptrs[i]))) 1695 continue; 1696 1697 if (new_s) { 1698 ret = bch2_trans_mark_stripe_bucket(trans, 1699 bkey_i_to_s_c_stripe(new), i, false); 1700 if (ret) 1701 break; 1702 } 1703 1704 if (old_s) { 1705 ret = bch2_trans_mark_stripe_bucket(trans, 1706 bkey_s_c_to_stripe(old), i, true); 1707 if (ret) 1708 break; 1709 } 1710 } 1711 1712 return ret; 1713 } 1714 1715 static int __trans_mark_reservation(struct btree_trans *trans, 1716 enum btree_id btree_id, unsigned level, 1717 struct bkey_s_c k, unsigned flags) 1718 { 1719 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; 1720 s64 sectors = (s64) k.k->size; 1721 struct replicas_delta_list *d; 1722 int ret; 1723 1724 if (flags & BTREE_TRIGGER_OVERWRITE) 1725 sectors = -sectors; 1726 sectors *= replicas; 1727 1728 ret = bch2_replicas_deltas_realloc(trans, 0); 1729 if (ret) 1730 return ret; 1731 1732 d = trans->fs_usage_deltas; 1733 replicas = clamp_t(unsigned, replicas, 1, 1734 ARRAY_SIZE(d->persistent_reserved)); 1735 1736 d->persistent_reserved[replicas - 1] += sectors; 1737 return 0; 1738 } 1739 1740 int bch2_trans_mark_reservation(struct btree_trans *trans, 1741 enum btree_id btree_id, unsigned level, 1742 struct bkey_s_c old, 1743 struct bkey_i *new, 1744 unsigned flags) 1745 { 1746 return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); 1747 } 1748 1749 static int trans_mark_reflink_p_segment(struct btree_trans *trans, 1750 struct bkey_s_c_reflink_p p, 1751 u64 *idx, unsigned flags) 1752 { 1753 struct bch_fs *c = trans->c; 1754 struct btree_iter iter; 1755 struct bkey_i *k; 1756 __le64 *refcount; 1757 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; 1758 struct printbuf buf = PRINTBUF; 1759 int ret; 1760 1761 k = bch2_bkey_get_mut_noupdate(trans, &iter, 1762 BTREE_ID_reflink, POS(0, *idx), 1763 BTREE_ITER_WITH_UPDATES); 1764 ret = PTR_ERR_OR_ZERO(k); 1765 if (ret) 1766 goto err; 1767 1768 refcount = bkey_refcount(k); 1769 if (!refcount) { 1770 bch2_bkey_val_to_text(&buf, c, p.s_c); 1771 bch2_trans_inconsistent(trans, 1772 "nonexistent indirect extent at %llu while marking\n %s", 1773 *idx, buf.buf); 1774 ret = -EIO; 1775 goto err; 1776 } 1777 1778 if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { 1779 bch2_bkey_val_to_text(&buf, c, p.s_c); 1780 bch2_trans_inconsistent(trans, 1781 "indirect extent refcount underflow at %llu while marking\n %s", 1782 *idx, buf.buf); 1783 ret = -EIO; 1784 goto err; 1785 } 1786 1787 if (flags & BTREE_TRIGGER_INSERT) { 1788 struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; 1789 u64 pad; 1790 1791 pad = max_t(s64, le32_to_cpu(v->front_pad), 1792 le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); 1793 BUG_ON(pad > U32_MAX); 1794 v->front_pad = cpu_to_le32(pad); 1795 1796 pad = max_t(s64, le32_to_cpu(v->back_pad), 1797 k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); 1798 BUG_ON(pad > U32_MAX); 1799 v->back_pad = cpu_to_le32(pad); 1800 } 1801 1802 le64_add_cpu(refcount, add); 1803 1804 bch2_btree_iter_set_pos_to_extent_start(&iter); 1805 ret = bch2_trans_update(trans, &iter, k, 0); 1806 if (ret) 1807 goto err; 1808 1809 *idx = k->k.p.offset; 1810 err: 1811 bch2_trans_iter_exit(trans, &iter); 1812 printbuf_exit(&buf); 1813 return ret; 1814 } 1815 1816 static int __trans_mark_reflink_p(struct btree_trans *trans, 1817 enum btree_id btree_id, unsigned level, 1818 struct bkey_s_c k, unsigned flags) 1819 { 1820 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); 1821 u64 idx, end_idx; 1822 int ret = 0; 1823 1824 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); 1825 end_idx = le64_to_cpu(p.v->idx) + p.k->size + 1826 le32_to_cpu(p.v->back_pad); 1827 1828 while (idx < end_idx && !ret) 1829 ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); 1830 return ret; 1831 } 1832 1833 int bch2_trans_mark_reflink_p(struct btree_trans *trans, 1834 enum btree_id btree_id, unsigned level, 1835 struct bkey_s_c old, 1836 struct bkey_i *new, 1837 unsigned flags) 1838 { 1839 if (flags & BTREE_TRIGGER_INSERT) { 1840 struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; 1841 1842 v->front_pad = v->back_pad = 0; 1843 } 1844 1845 return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); 1846 } 1847 1848 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, 1849 struct bch_dev *ca, size_t b, 1850 enum bch_data_type type, 1851 unsigned sectors) 1852 { 1853 struct bch_fs *c = trans->c; 1854 struct btree_iter iter; 1855 struct bkey_i_alloc_v4 *a; 1856 int ret = 0; 1857 1858 /* 1859 * Backup superblock might be past the end of our normal usable space: 1860 */ 1861 if (b >= ca->mi.nbuckets) 1862 return 0; 1863 1864 a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); 1865 if (IS_ERR(a)) 1866 return PTR_ERR(a); 1867 1868 if (a->v.data_type && type && a->v.data_type != type) { 1869 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, 1870 BCH_FSCK_ERR_bucket_metadata_type_mismatch, 1871 "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" 1872 "while marking %s", 1873 iter.pos.inode, iter.pos.offset, a->v.gen, 1874 bch2_data_types[a->v.data_type], 1875 bch2_data_types[type], 1876 bch2_data_types[type]); 1877 ret = -EIO; 1878 goto err; 1879 } 1880 1881 if (a->v.data_type != type || 1882 a->v.dirty_sectors != sectors) { 1883 a->v.data_type = type; 1884 a->v.dirty_sectors = sectors; 1885 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1886 } 1887 err: 1888 bch2_trans_iter_exit(trans, &iter); 1889 return ret; 1890 } 1891 1892 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, 1893 struct bch_dev *ca, size_t b, 1894 enum bch_data_type type, 1895 unsigned sectors) 1896 { 1897 return commit_do(trans, NULL, NULL, 0, 1898 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); 1899 } 1900 1901 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, 1902 struct bch_dev *ca, 1903 u64 start, u64 end, 1904 enum bch_data_type type, 1905 u64 *bucket, unsigned *bucket_sectors) 1906 { 1907 do { 1908 u64 b = sector_to_bucket(ca, start); 1909 unsigned sectors = 1910 min_t(u64, bucket_to_sector(ca, b + 1), end) - start; 1911 1912 if (b != *bucket && *bucket_sectors) { 1913 int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, 1914 type, *bucket_sectors); 1915 if (ret) 1916 return ret; 1917 1918 *bucket_sectors = 0; 1919 } 1920 1921 *bucket = b; 1922 *bucket_sectors += sectors; 1923 start += sectors; 1924 } while (start < end); 1925 1926 return 0; 1927 } 1928 1929 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, 1930 struct bch_dev *ca) 1931 { 1932 struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; 1933 u64 bucket = 0; 1934 unsigned i, bucket_sectors = 0; 1935 int ret; 1936 1937 for (i = 0; i < layout->nr_superblocks; i++) { 1938 u64 offset = le64_to_cpu(layout->sb_offset[i]); 1939 1940 if (offset == BCH_SB_SECTOR) { 1941 ret = bch2_trans_mark_metadata_sectors(trans, ca, 1942 0, BCH_SB_SECTOR, 1943 BCH_DATA_sb, &bucket, &bucket_sectors); 1944 if (ret) 1945 return ret; 1946 } 1947 1948 ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, 1949 offset + (1 << layout->sb_max_size_bits), 1950 BCH_DATA_sb, &bucket, &bucket_sectors); 1951 if (ret) 1952 return ret; 1953 } 1954 1955 if (bucket_sectors) { 1956 ret = bch2_trans_mark_metadata_bucket(trans, ca, 1957 bucket, BCH_DATA_sb, bucket_sectors); 1958 if (ret) 1959 return ret; 1960 } 1961 1962 for (i = 0; i < ca->journal.nr; i++) { 1963 ret = bch2_trans_mark_metadata_bucket(trans, ca, 1964 ca->journal.buckets[i], 1965 BCH_DATA_journal, ca->mi.bucket_size); 1966 if (ret) 1967 return ret; 1968 } 1969 1970 return 0; 1971 } 1972 1973 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) 1974 { 1975 int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); 1976 1977 if (ret) 1978 bch_err_fn(c, ret); 1979 return ret; 1980 } 1981 1982 int bch2_trans_mark_dev_sbs(struct bch_fs *c) 1983 { 1984 struct bch_dev *ca; 1985 unsigned i; 1986 1987 for_each_online_member(ca, c, i) { 1988 int ret = bch2_trans_mark_dev_sb(c, ca); 1989 if (ret) { 1990 percpu_ref_put(&ca->ref); 1991 return ret; 1992 } 1993 } 1994 1995 return 0; 1996 } 1997 1998 /* Disk reservations: */ 1999 2000 #define SECTORS_CACHE 1024 2001 2002 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, 2003 u64 sectors, int flags) 2004 { 2005 struct bch_fs_pcpu *pcpu; 2006 u64 old, v, get; 2007 s64 sectors_available; 2008 int ret; 2009 2010 percpu_down_read(&c->mark_lock); 2011 preempt_disable(); 2012 pcpu = this_cpu_ptr(c->pcpu); 2013 2014 if (sectors <= pcpu->sectors_available) 2015 goto out; 2016 2017 v = atomic64_read(&c->sectors_available); 2018 do { 2019 old = v; 2020 get = min((u64) sectors + SECTORS_CACHE, old); 2021 2022 if (get < sectors) { 2023 preempt_enable(); 2024 goto recalculate; 2025 } 2026 } while ((v = atomic64_cmpxchg(&c->sectors_available, 2027 old, old - get)) != old); 2028 2029 pcpu->sectors_available += get; 2030 2031 out: 2032 pcpu->sectors_available -= sectors; 2033 this_cpu_add(*c->online_reserved, sectors); 2034 res->sectors += sectors; 2035 2036 preempt_enable(); 2037 percpu_up_read(&c->mark_lock); 2038 return 0; 2039 2040 recalculate: 2041 mutex_lock(&c->sectors_available_lock); 2042 2043 percpu_u64_set(&c->pcpu->sectors_available, 0); 2044 sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); 2045 2046 if (sectors <= sectors_available || 2047 (flags & BCH_DISK_RESERVATION_NOFAIL)) { 2048 atomic64_set(&c->sectors_available, 2049 max_t(s64, 0, sectors_available - sectors)); 2050 this_cpu_add(*c->online_reserved, sectors); 2051 res->sectors += sectors; 2052 ret = 0; 2053 } else { 2054 atomic64_set(&c->sectors_available, sectors_available); 2055 ret = -BCH_ERR_ENOSPC_disk_reservation; 2056 } 2057 2058 mutex_unlock(&c->sectors_available_lock); 2059 percpu_up_read(&c->mark_lock); 2060 2061 return ret; 2062 } 2063 2064 /* Startup/shutdown: */ 2065 2066 static void bucket_gens_free_rcu(struct rcu_head *rcu) 2067 { 2068 struct bucket_gens *buckets = 2069 container_of(rcu, struct bucket_gens, rcu); 2070 2071 kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); 2072 } 2073 2074 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) 2075 { 2076 struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; 2077 unsigned long *buckets_nouse = NULL; 2078 bool resize = ca->bucket_gens != NULL; 2079 int ret; 2080 2081 if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, 2082 GFP_KERNEL|__GFP_ZERO))) { 2083 ret = -BCH_ERR_ENOMEM_bucket_gens; 2084 goto err; 2085 } 2086 2087 if ((c->opts.buckets_nouse && 2088 !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * 2089 sizeof(unsigned long), 2090 GFP_KERNEL|__GFP_ZERO)))) { 2091 ret = -BCH_ERR_ENOMEM_buckets_nouse; 2092 goto err; 2093 } 2094 2095 bucket_gens->first_bucket = ca->mi.first_bucket; 2096 bucket_gens->nbuckets = nbuckets; 2097 2098 if (resize) { 2099 down_write(&c->gc_lock); 2100 down_write(&ca->bucket_lock); 2101 percpu_down_write(&c->mark_lock); 2102 } 2103 2104 old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); 2105 2106 if (resize) { 2107 size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); 2108 2109 memcpy(bucket_gens->b, 2110 old_bucket_gens->b, 2111 n); 2112 if (buckets_nouse) 2113 memcpy(buckets_nouse, 2114 ca->buckets_nouse, 2115 BITS_TO_LONGS(n) * sizeof(unsigned long)); 2116 } 2117 2118 rcu_assign_pointer(ca->bucket_gens, bucket_gens); 2119 bucket_gens = old_bucket_gens; 2120 2121 swap(ca->buckets_nouse, buckets_nouse); 2122 2123 nbuckets = ca->mi.nbuckets; 2124 2125 if (resize) { 2126 percpu_up_write(&c->mark_lock); 2127 up_write(&ca->bucket_lock); 2128 up_write(&c->gc_lock); 2129 } 2130 2131 ret = 0; 2132 err: 2133 kvpfree(buckets_nouse, 2134 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); 2135 if (bucket_gens) 2136 call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); 2137 2138 return ret; 2139 } 2140 2141 void bch2_dev_buckets_free(struct bch_dev *ca) 2142 { 2143 unsigned i; 2144 2145 kvpfree(ca->buckets_nouse, 2146 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); 2147 kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), 2148 sizeof(struct bucket_gens) + ca->mi.nbuckets); 2149 2150 for (i = 0; i < ARRAY_SIZE(ca->usage); i++) 2151 free_percpu(ca->usage[i]); 2152 kfree(ca->usage_base); 2153 } 2154 2155 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) 2156 { 2157 unsigned i; 2158 2159 ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); 2160 if (!ca->usage_base) 2161 return -BCH_ERR_ENOMEM_usage_init; 2162 2163 for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { 2164 ca->usage[i] = alloc_percpu(struct bch_dev_usage); 2165 if (!ca->usage[i]) 2166 return -BCH_ERR_ENOMEM_usage_init; 2167 } 2168 2169 return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); 2170 } 2171