// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
#include "movinggc.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/preempt.h>

static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
					      enum bch_data_type data_type,
					      s64 sectors)
{
	switch (data_type) {
	case BCH_DATA_btree:
		fs_usage->btree += sectors;
		break;
	case BCH_DATA_user:
	case BCH_DATA_parity:
		fs_usage->data += sectors;
		break;
	case BCH_DATA_cached:
		fs_usage->cached += sectors;
		break;
	default:
		break;
	}
}

void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	struct bch_dev *ca;
	unsigned i;

	percpu_down_write(&c->mark_lock);
	usage = c->usage_base;

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
	}

	for_each_member_device(ca, c, i) {
		struct bch_dev_usage dev = bch2_dev_usage_read(ca);

		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
				  dev.d[BCH_DATA_journal].buckets) *
			ca->mi.bucket_size;
	}

	percpu_up_write(&c->mark_lock);
}

static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
						  unsigned journal_seq,
						  bool gc)
{
	BUG_ON(!gc && !journal_seq);

	return this_cpu_ptr(gc
			    ? ca->usage_gc
			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}

void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
	struct bch_fs *c = ca->fs;
	unsigned seq, i, u64s = dev_usage_u64s();

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		memcpy(usage, ca->usage_base, u64s * sizeof(u64));
		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
			acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
	} while (read_seqcount_retry(&c->usage_lock, seq));
}

u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
	ssize_t offset = v - (u64 *) c->usage_base;
	unsigned i, seq;
	u64 ret;

	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
	percpu_rwsem_assert_held(&c->mark_lock);

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		ret = *v;

		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}

struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage_online *ret;
	unsigned nr_replicas = READ_ONCE(c->replicas.nr);
	unsigned seq, i;
retry:
	ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read(&c->mark_lock);

	if (nr_replicas != c->replicas.nr) {
		nr_replicas = c->replicas.nr;
		percpu_up_read(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	ret->online_reserved = percpu_u64_get(c->online_reserved);

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		unsafe_memcpy(&ret->u, c->usage_base,
			      __fs_usage_u64s(nr_replicas) * sizeof(u64),
			      "embedded variable length struct");
		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
					__fs_usage_u64s(nr_replicas));
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}

void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
	struct bch_dev *ca;
	unsigned i, u64s = fs_usage_u64s(c);

	BUG_ON(idx >= ARRAY_SIZE(c->usage));

	preempt_disable();
	write_seqcount_begin(&c->usage_lock);

	acc_u64s_percpu((u64 *) c->usage_base,
			(u64 __percpu *) c->usage[idx], u64s);
	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i, NULL) {
		u64s = dev_usage_u64s();

		acc_u64s_percpu((u64 *) ca->usage_base,
				(u64 __percpu *) ca->usage[idx], u64s);
		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
	}
	rcu_read_unlock();

	write_seqcount_end(&c->usage_lock);
	preempt_enable();
}

void bch2_fs_usage_to_text(struct printbuf *out,
			   struct bch_fs *c,
			   struct bch_fs_usage_online *fs_usage)
{
	unsigned i;

	prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);

	prt_printf(out, "hidden:\t\t\t\t%llu\n",
		   fs_usage->u.hidden);
	prt_printf(out, "data:\t\t\t\t%llu\n",
		   fs_usage->u.data);
	prt_printf(out, "cached:\t\t\t\t%llu\n",
		   fs_usage->u.cached);
	prt_printf(out, "reserved:\t\t\t%llu\n",
		   fs_usage->u.reserved);
	prt_printf(out, "nr_inodes:\t\t\t%llu\n",
		   fs_usage->u.nr_inodes);
	prt_printf(out, "online reserved:\t\t%llu\n",
		   fs_usage->online_reserved);

	for (i = 0;
	     i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
	     i++) {
		prt_printf(out, "%u replicas:\n", i + 1);
		prt_printf(out, "\treserved:\t\t%llu\n",
			   fs_usage->u.persistent_reserved[i]);
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		prt_printf(out, "\t");
		bch2_replicas_entry_to_text(out, e);
		prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
	}
}

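/*
 * reserve_factor() inflates a reservation when it is counted as used space.
 * For illustration, assuming RESERVE_FACTOR is 6, the padding works out to
 * roughly 1/64th of the request, rounded up to a multiple of 64:
 *
 *	r = 1000: round_up(1000, 64) = 1024, 1024 >> 6 = 16, result 1016
 *	r =   64: round_up(64, 64)   =   64,   64 >> 6 =  1, result   65
 *
 * so bch2_fs_sectors_used() below reports reserved space scaled up by
 * about 1.6%.
 */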
static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
	return min(fs_usage->u.hidden +
		   fs_usage->u.btree +
		   fs_usage->u.data +
		   reserve_factor(fs_usage->u.reserved +
				  fs_usage->online_reserved),
		   c->capacity);
}

static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;
	u64 data, reserved;

	ret.capacity = c->capacity -
		bch2_fs_usage_read_one(c, &c->usage_base->hidden);

	data		= bch2_fs_usage_read_one(c, &c->usage_base->data) +
		bch2_fs_usage_read_one(c, &c->usage_base->btree);
	reserved	= bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
		percpu_u64_get(c->online_reserved);

	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
	ret.free	= ret.capacity - ret.used;

	ret.nr_inodes	= bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);

	return ret;
}

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;

	percpu_down_read(&c->mark_lock);
	ret = __bch2_fs_usage_read_short(c);
	percpu_up_read(&c->mark_lock);

	return ret;
}

void bch2_dev_usage_init(struct bch_dev *ca)
{
	ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}

static inline int bucket_sectors_fragmented(struct bch_dev *ca,
					    struct bch_alloc_v4 a)
{
	return a.dirty_sectors
		? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
		: 0;
}

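/*
 * A partially filled bucket counts its unused space as "fragmented" in the
 * per-device usage totals.  For example (numbers are illustrative only):
 * with a 512 sector bucket holding 200 dirty sectors,
 * bucket_sectors_fragmented() reports 512 - 200 = 312 fragmented sectors,
 * while a completely empty bucket (dirty_sectors == 0) reports 0.
 *
 * bch2_dev_usage_update() below applies the old/new delta of this value,
 * together with the bucket, sector and erasure coding counts, to the
 * per-device usage accumulators.
 */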
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
				  struct bch_alloc_v4 old,
				  struct bch_alloc_v4 new,
				  u64 journal_seq, bool gc)
{
	struct bch_fs_usage *fs_usage;
	struct bch_dev_usage *u;

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);

	if (data_type_is_hidden(old.data_type))
		fs_usage->hidden -= ca->mi.bucket_size;
	if (data_type_is_hidden(new.data_type))
		fs_usage->hidden += ca->mi.bucket_size;

	u = dev_usage_ptr(ca, journal_seq, gc);

	u->d[old.data_type].buckets--;
	u->d[new.data_type].buckets++;

	u->buckets_ec -= (int) !!old.stripe;
	u->buckets_ec += (int) !!new.stripe;

	u->d[old.data_type].sectors -= old.dirty_sectors;
	u->d[new.data_type].sectors += new.dirty_sectors;

	u->d[BCH_DATA_cached].sectors += new.cached_sectors;
	u->d[BCH_DATA_cached].sectors -= old.cached_sectors;

	u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
	u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);

	preempt_enable();
}

static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
				    struct bucket old, struct bucket new,
				    u64 journal_seq, bool gc)
{
	struct bch_alloc_v4 old_a = {
		.gen		= old.gen,
		.data_type	= old.data_type,
		.dirty_sectors	= old.dirty_sectors,
		.cached_sectors	= old.cached_sectors,
		.stripe		= old.stripe,
	};
	struct bch_alloc_v4 new_a = {
		.gen		= new.gen,
		.data_type	= new.data_type,
		.dirty_sectors	= new.dirty_sectors,
		.cached_sectors	= new.cached_sectors,
		.stripe		= new.stripe,
	};

	bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
}

static inline int __update_replicas(struct bch_fs *c,
				    struct bch_fs_usage *fs_usage,
				    struct bch_replicas_entry *r,
				    s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0)
		return -1;

	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	return 0;
}

static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
				  struct bch_replicas_entry *r, s64 sectors,
				  unsigned journal_seq, bool gc)
{
	struct bch_fs_usage *fs_usage;
	int idx, ret = 0;
	struct printbuf buf = PRINTBUF;

	percpu_down_read(&c->mark_lock);

	idx = bch2_replicas_entry_idx(c, r);
	if (idx < 0 &&
	    fsck_err(c, "no replicas entry\n"
		     "  while marking %s",
		     (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		percpu_up_read(&c->mark_lock);
		ret = bch2_mark_replicas(c, r);
		percpu_down_read(&c->mark_lock);

		if (ret)
			goto err;
		idx = bch2_replicas_entry_idx(c, r);
	}
	if (idx < 0) {
		ret = -1;
		goto err;
	}

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);
	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	preempt_enable();
err:
fsck_err:
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static inline int update_cached_sectors(struct bch_fs *c,
					struct bkey_s_c k,
					unsigned dev, s64 sectors,
					unsigned journal_seq, bool gc)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}

static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
				     gfp_t gfp)
{
	struct replicas_delta_list *d = trans->fs_usage_deltas;
	unsigned new_size = d ? (d->size + more) * 2 : 128;
	unsigned alloc_size = sizeof(*d) + new_size;

	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);

	if (!d || d->used + more > d->size) {
		d = krealloc(d, alloc_size, gfp|__GFP_ZERO);

		if (unlikely(!d)) {
			if (alloc_size > REPLICAS_DELTA_LIST_MAX)
				return -ENOMEM;

			d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
			if (!d)
				return -ENOMEM;

			memset(d, 0, REPLICAS_DELTA_LIST_MAX);

			if (trans->fs_usage_deltas)
				memcpy(d, trans->fs_usage_deltas,
				       trans->fs_usage_deltas->size + sizeof(*d));

			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
			kfree(trans->fs_usage_deltas);
		}

		d->size = new_size;
		trans->fs_usage_deltas = d;
	}

	return 0;
}

int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
{
	return allocate_dropping_locks_errcode(trans,
				__replicas_deltas_realloc(trans, more, _gfp));
}

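/*
 * Transactional triggers don't update the in-memory usage counters
 * directly; they append deltas to trans->fs_usage_deltas, which
 * bch2_trans_fs_usage_apply() folds in at commit time.  Each entry is a
 * struct replicas_delta - a sector delta followed by the (variable length)
 * replicas entry it applies to - packed back to back and walked with
 * replicas_delta_next().
 *
 * A rough sketch of what update_replicas_list() below appends for a user
 * extent with replicas on devices 0 and 2 (layout is illustrative, not a
 * byte-exact description):
 *
 *	{ .delta = +128, .r = { .data_type = BCH_DATA_user,
 *				.nr_devs = 2, .devs = { 0, 2 } } }
 */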
static inline int update_replicas_list(struct btree_trans *trans,
				       struct bch_replicas_entry *r,
				       s64 sectors)
{
	struct replicas_delta_list *d;
	struct replicas_delta *n;
	unsigned b;
	int ret;

	if (!sectors)
		return 0;

	b = replicas_entry_bytes(r) + 8;
	ret = bch2_replicas_deltas_realloc(trans, b);
	if (ret)
		return ret;

	d = trans->fs_usage_deltas;
	n = (void *) d->d + d->used;
	n->delta = sectors;
	unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
		      r, replicas_entry_bytes(r),
		      "flexible array member embedded in struct with padding");
	bch2_replicas_entry_sort(&n->r);
	d->used += b;
	return 0;
}

static inline int update_cached_sectors_list(struct btree_trans *trans,
					     unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return update_replicas_list(trans, &r.e, sectors);
}

int bch2_mark_alloc(struct btree_trans *trans,
		    enum btree_id btree, unsigned level,
		    struct bkey_s_c old, struct bkey_s_c new,
		    unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	u64 journal_seq = trans->journal_res.seq;
	u64 bucket_journal_seq;
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 old_a_convert, new_a_convert;
	const struct bch_alloc_v4 *old_a, *new_a;
	struct bch_dev *ca;
	int ret = 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if ((flags & BTREE_TRIGGER_GC) &&
	    !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
		return 0;

	if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
				       "alloc key for invalid device or bucket"))
		return -EIO;

	ca = bch_dev_bkey_exists(c, new.k->p.inode);

	old_a = bch2_alloc_to_v4(old, &old_a_convert);
	new_a = bch2_alloc_to_v4(new, &new_a_convert);

	bucket_journal_seq = new_a->journal_seq;

	if ((flags & BTREE_TRIGGER_INSERT) &&
	    data_type_is_empty(old_a->data_type) !=
	    data_type_is_empty(new_a->data_type) &&
	    new.k->type == KEY_TYPE_alloc_v4) {
		struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;

		EBUG_ON(!journal_seq);

		/*
		 * If the btree updates referring to a bucket weren't flushed
		 * before the bucket became empty again, then we don't have to
		 * wait on a journal flush before we can reuse the bucket:
		 */
		v->journal_seq = bucket_journal_seq =
			data_type_is_empty(new_a->data_type) &&
			(journal_seq == v->journal_seq ||
			 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
			? 0 : journal_seq;
	}

	if (!data_type_is_empty(old_a->data_type) &&
	    data_type_is_empty(new_a->data_type) &&
	    bucket_journal_seq) {
		ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
				c->journal.flushed_seq_ondisk,
				new.k->p.inode, new.k->p.offset,
				bucket_journal_seq);
		if (ret) {
			bch2_fs_fatal_error(c,
				"error setting bucket_needs_journal_commit: %i", ret);
			return ret;
		}
	}

	percpu_down_read(&c->mark_lock);
	if (!gc && new_a->gen != old_a->gen)
		*bucket_gen(ca, new.k->p.offset) = new_a->gen;

	bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);

	if (gc) {
		struct bucket *g = gc_bucket(ca, new.k->p.offset);

		bucket_lock(g);

		g->gen_valid		= 1;
		g->gen			= new_a->gen;
		g->data_type		= new_a->data_type;
		g->stripe		= new_a->stripe;
		g->stripe_redundancy	= new_a->stripe_redundancy;
		g->dirty_sectors	= new_a->dirty_sectors;
		g->cached_sectors	= new_a->cached_sectors;

		bucket_unlock(g);
	}
	percpu_up_read(&c->mark_lock);

	/*
	 * need to know if we're getting called from the invalidate path or
	 * not:
	 */

	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
	    old_a->cached_sectors) {
		ret = update_cached_sectors(c, new, ca->dev_idx,
					    -((s64) old_a->cached_sectors),
					    journal_seq, gc);
		if (ret) {
			bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
					    __func__);
			return ret;
		}
	}

	if (new_a->data_type == BCH_DATA_free &&
	    (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
		closure_wake_up(&c->freelist_wait);

	if (new_a->data_type == BCH_DATA_need_discard &&
	    (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
		bch2_do_discards(c);

	if (old_a->data_type != BCH_DATA_cached &&
	    new_a->data_type == BCH_DATA_cached &&
	    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
		bch2_do_invalidates(c);

	if (new_a->data_type == BCH_DATA_need_gc_gens)
		bch2_do_gc_gens(c);

	return 0;
}

int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			      size_t b, enum bch_data_type data_type,
			      unsigned sectors, struct gc_pos pos,
			      unsigned flags)
{
	struct bucket old, new, *g;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));
	BUG_ON(data_type != BCH_DATA_sb &&
	       data_type != BCH_DATA_journal);

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	percpu_down_read(&c->mark_lock);
	g = gc_bucket(ca, b);

	bucket_lock(g);
	old = *g;

	if (bch2_fs_inconsistent_on(g->data_type &&
			g->data_type != data_type, c,
			"different types of data in same bucket: %s, %s",
			bch2_data_types[g->data_type],
			bch2_data_types[data_type])) {
		ret = -EIO;
		goto err;
	}

	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
			ca->dev_idx, b, g->gen,
			bch2_data_types[g->data_type ?: data_type],
			g->dirty_sectors, sectors)) {
		ret = -EIO;
		goto err;
	}

	g->data_type = data_type;
	g->dirty_sectors += sectors;
	new = *g;
err:
	bucket_unlock(g);
	if (!ret)
		bch2_dev_usage_update_m(c, ca, old, new, 0, true);
	percpu_up_read(&c->mark_lock);
	return ret;
}

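/*
 * Sanity checks shared by the gc and transactional marking paths: verify
 * that a pointer's generation number is consistent with the bucket it
 * points into and that sector counts won't overflow.
 *
 * Return convention, as used by the callers below: 0 on success, -EIO if
 * the key and bucket are inconsistent, and 1 if the pointer is a stale
 * cached pointer that should simply be skipped.  For example, a cached
 * pointer with gen 3 into a bucket whose gen has since advanced to 5 is
 * not an error: the data it pointed at has merely been invalidated.
 */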
static int check_bucket_ref(struct btree_trans *trans,
			    struct bkey_s_c k,
			    const struct bch_extent_ptr *ptr,
			    s64 sectors, enum bch_data_type ptr_data_type,
			    u8 b_gen, u8 bucket_data_type,
			    u32 dirty_sectors, u32 cached_sectors)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
	u32 bucket_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bucket_data_type == BCH_DATA_cached)
		bucket_data_type = BCH_DATA_user;

	if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
	    (bucket_data_type == BCH_DATA_user   && ptr_data_type == BCH_DATA_stripe))
		bucket_data_type = ptr_data_type = BCH_DATA_stripe;

	if (gen_after(ptr->gen, b_gen)) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
			"while marking %s",
			ptr->dev, bucket_nr, b_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			"while marking %s",
			ptr->dev, bucket_nr, b_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (b_gen != ptr->gen && !ptr->cached) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
			"while marking %s",
			ptr->dev, bucket_nr, b_gen,
			*bucket_gen(ca, bucket_nr),
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (b_gen != ptr->gen) {
		ret = 1;
		goto out;
	}

	if (!data_type_is_empty(bucket_data_type) &&
	    ptr_data_type &&
	    bucket_data_type != ptr_data_type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			"while marking %s",
			ptr->dev, bucket_nr, b_gen,
			bch2_data_types[bucket_data_type],
			bch2_data_types[ptr_data_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if ((u64) bucket_sectors + sectors > U32_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
			"while marking %s",
			ptr->dev, bucket_nr, b_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			bucket_sectors, sectors,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}
out:
	printbuf_exit(&buf);
	return ret;
err:
	bch2_dump_trans_updates(trans);
	goto out;
}

static int mark_stripe_bucket(struct btree_trans *trans,
			      struct bkey_s_c k,
			      unsigned ptr_idx,
			      unsigned flags)
{
	struct bch_fs *c = trans->c;
	u64 journal_seq = trans->journal_res.seq;
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned nr_data = s->nr_blocks - s->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
	s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	struct bucket old, new, *g;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	/* XXX doesn't handle deletion */

	percpu_down_read(&c->mark_lock);
	g = PTR_GC_BUCKET(ca, ptr);

	if (g->dirty_sectors ||
	    (g->stripe && g->stripe != k.k->p.offset)) {
		bch2_fs_inconsistent(c,
			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EINVAL;
		goto err;
	}

	bucket_lock(g);
	old = *g;

	ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
			       g->gen, g->data_type,
			       g->dirty_sectors, g->cached_sectors);
	if (ret)
		goto err_unlock;

	g->data_type = data_type;
	g->dirty_sectors += sectors;

	g->stripe		= k.k->p.offset;
	g->stripe_redundancy	= s->nr_redundant;
	new = *g;
err_unlock:
	bucket_unlock(g);
	if (!ret)
		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
err:
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static int __mark_pointer(struct btree_trans *trans,
			  struct bkey_s_c k,
			  const struct bch_extent_ptr *ptr,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  u8 bucket_gen, u8 *bucket_data_type,
			  u32 *dirty_sectors, u32 *cached_sectors)
{
	u32 *dst_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
				   bucket_gen, *bucket_data_type,
				   *dirty_sectors, *cached_sectors);

	if (ret)
		return ret;

	*dst_sectors += sectors;
	*bucket_data_type = *dirty_sectors || *cached_sectors
		? ptr_data_type : 0;
	return 0;
}

static int bch2_mark_pointer(struct btree_trans *trans,
			     enum btree_id btree_id, unsigned level,
			     struct bkey_s_c k,
			     struct extent_ptr_decoded p,
			     s64 sectors,
			     unsigned flags)
{
	u64 journal_seq = trans->journal_res.seq;
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	struct bucket old, new, *g;
	enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
	u8 bucket_data_type;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	percpu_down_read(&c->mark_lock);
	g = PTR_GC_BUCKET(ca, &p.ptr);
	bucket_lock(g);
	old = *g;

	bucket_data_type = g->data_type;
	ret = __mark_pointer(trans, k, &p.ptr, sectors,
			     data_type, g->gen,
			     &bucket_data_type,
			     &g->dirty_sectors,
			     &g->cached_sectors);
	if (!ret)
		g->data_type = bucket_data_type;

	new = *g;
	bucket_unlock(g);
	if (!ret)
		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
	percpu_up_read(&c->mark_lock);

	return ret;
}

static int bch2_mark_stripe_ptr(struct btree_trans *trans,
				struct bkey_s_c k,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				s64 sectors,
				unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bch_replicas_padded r;
	struct gc_stripe *m;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
	if (!m) {
		bch_err(c, "error allocating memory for gc_stripes, idx %llu",
			(u64) p.idx);
		return -BCH_ERR_ENOMEM_mark_stripe_ptr;
	}

	mutex_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		mutex_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		bch2_inconsistent_error(c);
		return -EIO;
	}

	m->block_sectors[p.block] += sectors;

	r = m->r;
	mutex_unlock(&c->ec_stripes_heap_lock);

	r.e.data_type = data_type;
	update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);

	return 0;
}

int bch2_mark_extent(struct btree_trans *trans,
		     enum btree_id btree_id, unsigned level,
		     struct bkey_s_c old, struct bkey_s_c new,
		     unsigned flags)
{
	u64 journal_seq = trans->journal_res.seq;
	struct bch_fs *c = trans->c;
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	s64 sectors = bkey_is_btree_ptr(k.k)
		? btree_sectors(c)
		: k.k->size;
	s64 dirty_sectors = 0;
	bool stale;
	int ret;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = ptr_disk_sectors(sectors, p);

		if (flags & BTREE_TRIGGER_OVERWRITE)
			disk_sectors = -disk_sectors;

		ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
		if (ret < 0)
			return ret;

		stale = ret > 0;

		if (p.ptr.cached) {
			if (!stale) {
				ret = update_cached_sectors(c, k, p.ptr.dev,
						disk_sectors, journal_seq, true);
				if (ret) {
					bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
							    __func__);
					return ret;
				}
			}
		} else if (!p.has_ec) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
					disk_sectors, flags);
			if (ret)
				return ret;

			/*
			 * There may be other dirty pointers in this extent, but
			 * if so they're not required for mounting if we have an
			 * erasure coded pointer in this extent:
			 */
			r.e.nr_required = 0;
		}
	}

	if (r.e.nr_devs) {
		ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
		if (ret) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
			printbuf_exit(&buf);
			return ret;
		}
	}

	return 0;
}

int bch2_mark_stripe(struct btree_trans *trans,
		     enum btree_id btree_id, unsigned level,
		     struct bkey_s_c old, struct bkey_s_c new,
		     unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	u64 journal_seq = trans->journal_res.seq;
	struct bch_fs *c = trans->c;
	u64 idx = new.k->p.offset;
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;
	unsigned i;
	int ret;

	BUG_ON(gc && old_s);

	if (!gc) {
		struct stripe *m = genradix_ptr(&c->stripes, idx);

		if (!m) {
			struct printbuf buf1 = PRINTBUF;
			struct printbuf buf2 = PRINTBUF;

			bch2_bkey_val_to_text(&buf1, c, old);
			bch2_bkey_val_to_text(&buf2, c, new);
			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
					    "old %s\n"
					    "new %s", idx, buf1.buf, buf2.buf);
			printbuf_exit(&buf2);
			printbuf_exit(&buf1);
			bch2_inconsistent_error(c);
			return -1;
		}

		if (!new_s) {
			bch2_stripes_heap_del(c, m, idx);

			memset(m, 0, sizeof(*m));
		} else {
			m->sectors	= le16_to_cpu(new_s->sectors);
			m->algorithm	= new_s->algorithm;
			m->nr_blocks	= new_s->nr_blocks;
			m->nr_redundant	= new_s->nr_redundant;
			m->blocks_nonempty = 0;

			for (i = 0; i < new_s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);

			if (!old_s)
				bch2_stripes_heap_insert(c, m, idx);
			else
				bch2_stripes_heap_update(c, m, idx);
		}
	} else {
		struct gc_stripe *m =
			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				idx);
			return -BCH_ERR_ENOMEM_mark_stripe;
		}
		/*
		 * This will be wrong when we bring back runtime gc: we should
		 * be unmarking the old key and then marking the new key
		 */
		m->alive	= true;
		m->sectors	= le16_to_cpu(new_s->sectors);
		m->nr_blocks	= new_s->nr_blocks;
		m->nr_redundant	= new_s->nr_redundant;

		for (i = 0; i < new_s->nr_blocks; i++)
			m->ptrs[i] = new_s->ptrs[i];

		bch2_bkey_to_replicas(&m->r.e, new);

		/*
		 * gc recalculates this field from stripe ptr
		 * references:
		 */
		memset(m->block_sectors, 0, sizeof(m->block_sectors));

		for (i = 0; i < new_s->nr_blocks; i++) {
			ret = mark_stripe_bucket(trans, new, i, flags);
			if (ret)
				return ret;
		}

		ret = update_replicas(c, new, &m->r.e,
				      ((s64) m->sectors * m->nr_redundant),
				      journal_seq, gc);
		if (ret) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, new);
			bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
			printbuf_exit(&buf);
			return ret;
		}
	}

	return 0;
}

int bch2_mark_reservation(struct btree_trans *trans,
			  enum btree_id btree_id, unsigned level,
			  struct bkey_s_c old, struct bkey_s_c new,
			  unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
	struct bch_fs_usage *fs_usage;
	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
	s64 sectors = (s64) k.k->size;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	if (flags & BTREE_TRIGGER_OVERWRITE)
		sectors = -sectors;
	sectors *= replicas;

	percpu_down_read(&c->mark_lock);
	preempt_disable();

	fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
	replicas = clamp_t(unsigned, replicas, 1,
			   ARRAY_SIZE(fs_usage->persistent_reserved));

	fs_usage->reserved				+= sectors;
	fs_usage->persistent_reserved[replicas - 1]	+= sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);

	return 0;
}

static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
				 struct bkey_s_c_reflink_p p,
				 u64 start, u64 end,
				 u64 *idx, unsigned flags, size_t r_idx)
{
	struct bch_fs *c = trans->c;
	struct reflink_gc *r;
	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
	u64 next_idx = end;
	s64 ret = 0;
	struct printbuf buf = PRINTBUF;

	if (r_idx >= c->reflink_gc_nr)
		goto not_found;

	r = genradix_ptr(&c->reflink_gc_table, r_idx);
	next_idx = min(next_idx, r->offset - r->size);
	if (*idx < next_idx)
		goto not_found;

	BUG_ON((s64) r->refcount + add < 0);

	r->refcount += add;
	*idx = r->offset;
	return 0;
not_found:
	if (fsck_err(c, "pointer to missing indirect extent\n"
		     "  %s\n"
		     "  missing range %llu-%llu",
		     (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
		     *idx, next_idx)) {
		struct bkey_i_error *new;

		new = bch2_trans_kmalloc(trans, sizeof(*new));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto err;

		bkey_init(&new->k);
		new->k.type	= KEY_TYPE_error;
		new->k.p	= bkey_start_pos(p.k);
		new->k.p.offset += *idx - start;
		bch2_key_resize(&new->k, next_idx - *idx);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
					      BTREE_TRIGGER_NORUN);
	}

	*idx = next_idx;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_mark_reflink_p(struct btree_trans *trans,
			enum btree_id btree_id, unsigned level,
			struct bkey_s_c old, struct bkey_s_c new,
			unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
	struct reflink_gc *ref;
	size_t l, r, m;
	u64 idx = le64_to_cpu(p.v->idx), start = idx;
	u64 end = le64_to_cpu(p.v->idx) + p.k->size;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
		idx -= le32_to_cpu(p.v->front_pad);
		end += le32_to_cpu(p.v->back_pad);
	}

	l = 0;
	r = c->reflink_gc_nr;
	while (l < r) {
		m = l + (r - l) / 2;

		ref = genradix_ptr(&c->reflink_gc_table, m);
		if (ref->offset <= idx)
			l = m + 1;
		else
			r = m;
	}

	while (idx < end && !ret)
		ret = __bch2_mark_reflink_p(trans, p, start, end,
					    &idx, flags, l++);

	return ret;
}

void bch2_trans_fs_usage_revert(struct btree_trans *trans,
				struct replicas_delta_list *deltas)
{
	struct bch_fs *c = trans->c;
	struct bch_fs_usage *dst;
	struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
	s64 added = 0;
	unsigned i;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	dst = fs_usage_ptr(c, trans->journal_res.seq, false);

	/* revert changes: */
	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
		switch (d->r.data_type) {
		case BCH_DATA_btree:
		case BCH_DATA_user:
		case BCH_DATA_parity:
			added += d->delta;
		}
		BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
	}

	dst->nr_inodes -= deltas->nr_inodes;

	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
		added				-= deltas->persistent_reserved[i];
		dst->reserved			-= deltas->persistent_reserved[i];
		dst->persistent_reserved[i]	-= deltas->persistent_reserved[i];
	}

	if (added > 0) {
		trans->disk_res->sectors += added;
		this_cpu_add(*c->online_reserved, added);
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);
}

int bch2_trans_fs_usage_apply(struct btree_trans *trans,
			      struct replicas_delta_list *deltas)
{
	struct bch_fs *c = trans->c;
	static int warned_disk_usage = 0;
	bool warn = false;
	unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	struct replicas_delta *d, *d2;
	struct replicas_delta *top = (void *) deltas->d + deltas->used;
	struct bch_fs_usage *dst;
	s64 added = 0, should_not_have_added;
	unsigned i;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	dst = fs_usage_ptr(c, trans->journal_res.seq, false);

	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
		switch (d->r.data_type) {
		case BCH_DATA_btree:
		case BCH_DATA_user:
		case BCH_DATA_parity:
			added += d->delta;
		}

		if (__update_replicas(c, dst, &d->r, d->delta))
			goto need_mark;
	}

	dst->nr_inodes += deltas->nr_inodes;

	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
		added				+= deltas->persistent_reserved[i];
		dst->reserved			+= deltas->persistent_reserved[i];
		dst->persistent_reserved[i]	+= deltas->persistent_reserved[i];
	}

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) disk_res_sectors;
	if (unlikely(should_not_have_added > 0)) {
		u64 old, new, v = atomic64_read(&c->sectors_available);

		do {
			old = v;
			new = max_t(s64, 0, old - should_not_have_added);
		} while ((v = atomic64_cmpxchg(&c->sectors_available,
					       old, new)) != old);

		added -= should_not_have_added;
		warn = true;
	}

	if (added > 0) {
		trans->disk_res->sectors -= added;
		this_cpu_sub(*c->online_reserved, added);
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);

	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
		bch2_trans_inconsistent(trans,
					"disk usage increased %lli more than %u sectors reserved",
					should_not_have_added, disk_res_sectors);
	return 0;
need_mark:
	/* revert changes: */
	for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
		BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return -1;
}

/* trans_mark: */

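/*
 * The bch2_trans_mark_*() triggers below are, roughly, the transactional
 * counterparts of the bch2_mark_*() functions above: instead of updating
 * in-memory gc state they operate on btree keys (alloc, stripes, reflink)
 * through the btree_trans machinery, and accumulate filesystem usage
 * changes in trans->fs_usage_deltas, which bch2_trans_fs_usage_apply()
 * folds in at commit time.
 */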
static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
				enum btree_id btree_id, unsigned level,
				struct bkey_s_c k, struct extent_ptr_decoded p,
				unsigned flags)
{
	bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	struct bpos bucket;
	struct bch_backpointer bp;
	s64 sectors;
	int ret;

	bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
	sectors = bp.bucket_len;
	if (!insert)
		sectors = -sectors;

	a = bch2_trans_start_alloc_update(trans, &iter, bucket);
	if (IS_ERR(a))
		return PTR_ERR(a);

	ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
			     a->v.gen, &a->v.data_type,
			     &a->v.dirty_sectors, &a->v.cached_sectors) ?:
		bch2_trans_update(trans, &iter, &a->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!p.ptr.cached) {
		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
		if (ret)
			return ret;
	}

	return 0;
}

static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
			struct extent_ptr_decoded p,
			s64 sectors, enum bch_data_type data_type)
{
	struct btree_iter iter;
	struct bkey_i_stripe *s;
	struct bch_replicas_padded r;
	int ret = 0;

	s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_stripes, POS(0, p.ec.idx),
			BTREE_ITER_WITH_UPDATES, stripe);
	ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
			"pointer to nonexistent stripe %llu",
			(u64) p.ec.idx);
		goto err;
	}

	if (!bch2_ptr_matches_stripe(&s->v, p)) {
		bch2_trans_inconsistent(trans,
			"stripe pointer doesn't match stripe %llu",
			(u64) p.ec.idx);
		ret = -EIO;
		goto err;
	}

	stripe_blockcount_set(&s->v, p.ec.block,
		stripe_blockcount_get(&s->v, p.ec.block) +
		sectors);

	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
	r.e.data_type = data_type;
	ret = update_replicas_list(trans, &r.e, sectors);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_extent(struct btree_trans *trans,
			   enum btree_id btree_id, unsigned level,
			   struct bkey_s_c old, struct bkey_i *new,
			   unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
		? old
		: bkey_i_to_s_c(new);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	s64 sectors = bkey_is_btree_ptr(k.k)
		? btree_sectors(c)
		: k.k->size;
	s64 dirty_sectors = 0;
	bool stale;
	int ret = 0;

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = ptr_disk_sectors(sectors, p);

		if (flags & BTREE_TRIGGER_OVERWRITE)
			disk_sectors = -disk_sectors;

		ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
		if (ret < 0)
			return ret;

		stale = ret > 0;

		if (p.ptr.cached) {
			if (!stale) {
				ret = update_cached_sectors_list(trans, p.ptr.dev,
								 disk_sectors);
				if (ret)
					return ret;
			}
		} else if (!p.has_ec) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			ret = bch2_trans_mark_stripe_ptr(trans, p,
					disk_sectors, data_type);
			if (ret)
				return ret;

			r.e.nr_required = 0;
		}
	}

	if (r.e.nr_devs)
		ret = update_replicas_list(trans, &r.e, dirty_sectors);

	return ret;
}

static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
					 struct bkey_s_c_stripe s,
					 unsigned idx, bool deleting)
{
	struct bch_fs *c = trans->c;
	const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
		? BCH_DATA_parity : 0;
	s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
	int ret = 0;

	if (deleting)
		sectors = -sectors;

	a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
	if (IS_ERR(a))
		return PTR_ERR(a);

	ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
			       a->v.gen, a->v.data_type,
			       a->v.dirty_sectors, a->v.cached_sectors);
	if (ret)
		goto err;

	if (!deleting) {
		if (bch2_trans_inconsistent_on(a->v.stripe ||
					       a->v.stripe_redundancy, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				bch2_data_types[a->v.data_type],
				a->v.dirty_sectors,
				a->v.stripe, s.k->p.offset)) {
			ret = -EIO;
			goto err;
		}

		if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				bch2_data_types[a->v.data_type],
				a->v.dirty_sectors,
				s.k->p.offset)) {
			ret = -EIO;
			goto err;
		}

		a->v.stripe		= s.k->p.offset;
		a->v.stripe_redundancy	= s.v->nr_redundant;
		a->v.data_type		= BCH_DATA_stripe;
	} else {
		if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
					       a->v.stripe_redundancy != s.v->nr_redundant, trans,
				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				s.k->p.offset, a->v.stripe)) {
			ret = -EIO;
			goto err;
		}

		a->v.stripe		= 0;
		a->v.stripe_redundancy	= 0;
		a->v.data_type		= alloc_data_type(a->v, BCH_DATA_user);
	}

	a->v.dirty_sectors += sectors;
	if (data_type)
		a->v.data_type = !deleting ? data_type : 0;

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	if (ret)
		goto err;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_stripe(struct btree_trans *trans,
			   enum btree_id btree_id, unsigned level,
			   struct bkey_s_c old, struct bkey_i *new,
			   unsigned flags)
{
	const struct bch_stripe *old_s = NULL;
	struct bch_stripe *new_s = NULL;
	struct bch_replicas_padded r;
	unsigned i, nr_blocks;
	int ret = 0;

	if (old.k->type == KEY_TYPE_stripe)
		old_s = bkey_s_c_to_stripe(old).v;
	if (new->k.type == KEY_TYPE_stripe)
		new_s = &bkey_i_to_stripe(new)->v;

	/*
	 * If the pointers aren't changing, we don't need to do anything:
	 */
	if (new_s && old_s &&
	    new_s->nr_blocks	== old_s->nr_blocks &&
	    new_s->nr_redundant	== old_s->nr_redundant &&
	    !memcmp(old_s->ptrs, new_s->ptrs,
		    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
		return 0;

	BUG_ON(new_s && old_s &&
	       (new_s->nr_blocks	!= old_s->nr_blocks ||
		new_s->nr_redundant	!= old_s->nr_redundant));

	nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;

	if (new_s) {
		s64 sectors = le16_to_cpu(new_s->sectors);

		bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
		ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
		if (ret)
			return ret;
	}

	if (old_s) {
		s64 sectors = -((s64) le16_to_cpu(old_s->sectors));

		bch2_bkey_to_replicas(&r.e, old);
		ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
		if (ret)
			return ret;
	}

	for (i = 0; i < nr_blocks; i++) {
		if (new_s && old_s &&
		    !memcmp(&new_s->ptrs[i],
			    &old_s->ptrs[i],
			    sizeof(new_s->ptrs[i])))
			continue;

		if (new_s) {
			ret = bch2_trans_mark_stripe_bucket(trans,
					bkey_i_to_s_c_stripe(new), i, false);
			if (ret)
				break;
		}

		if (old_s) {
			ret = bch2_trans_mark_stripe_bucket(trans,
					bkey_s_c_to_stripe(old), i, true);
			if (ret)
				break;
		}
	}

	return ret;
}

int bch2_trans_mark_reservation(struct btree_trans *trans,
				enum btree_id btree_id, unsigned level,
				struct bkey_s_c old,
				struct bkey_i *new,
				unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
		? old
		: bkey_i_to_s_c(new);
	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
	s64 sectors = (s64) k.k->size;
	struct replicas_delta_list *d;
	int ret;

	if (flags & BTREE_TRIGGER_OVERWRITE)
		sectors = -sectors;
	sectors *= replicas;

	ret = bch2_replicas_deltas_realloc(trans, 0);
	if (ret)
		return ret;

	d = trans->fs_usage_deltas;
	replicas = clamp_t(unsigned, replicas, 1,
			   ARRAY_SIZE(d->persistent_reserved));

	d->persistent_reserved[replicas - 1] += sectors;
	return 0;
}

static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
			struct bkey_s_c_reflink_p p,
			u64 *idx, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i *k;
	__le64 *refcount;
	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
	struct printbuf buf = PRINTBUF;
	int ret;

	k = bch2_bkey_get_mut_noupdate(trans, &iter,
			BTREE_ID_reflink, POS(0, *idx),
			BTREE_ITER_WITH_UPDATES);
	ret = PTR_ERR_OR_ZERO(k);
	if (ret)
		goto err;

	refcount = bkey_refcount(k);
	if (!refcount) {
		bch2_bkey_val_to_text(&buf, c, p.s_c);
		bch2_trans_inconsistent(trans,
			"nonexistent indirect extent at %llu while marking\n  %s",
			*idx, buf.buf);
		ret = -EIO;
		goto err;
	}

	if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
		bch2_bkey_val_to_text(&buf, c, p.s_c);
		bch2_trans_inconsistent(trans,
			"indirect extent refcount underflow at %llu while marking\n  %s",
			*idx, buf.buf);
		ret = -EIO;
		goto err;
	}

	if (flags & BTREE_TRIGGER_INSERT) {
		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
		u64 pad;

		pad = max_t(s64, le32_to_cpu(v->front_pad),
			    le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
		BUG_ON(pad > U32_MAX);
		v->front_pad = cpu_to_le32(pad);

		pad = max_t(s64, le32_to_cpu(v->back_pad),
			    k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
		BUG_ON(pad > U32_MAX);
		v->back_pad = cpu_to_le32(pad);
	}

	le64_add_cpu(refcount, add);

	bch2_btree_iter_set_pos_to_extent_start(&iter);
	ret = bch2_trans_update(trans, &iter, k, 0);
	if (ret)
		goto err;

	*idx = k->k.p.offset;
err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

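/*
 * front_pad/back_pad on a reflink pointer record how far the indirect
 * extents it took references on extend past the range [idx, idx + size):
 * __bch2_trans_mark_reflink_p() grows them on insert to cover the whole of
 * the first and last indirect extents it touched, and the overwrite path
 * below walks the padded range so every reference is dropped again.
 *
 * A worked example (hypothetical numbers): a reflink_p with idx 100 and
 * size 8 whose references land on indirect extents spanning [96, 112)
 * ends up with front_pad 4 and back_pad 4; deleting it then walks
 * idx 96..112 and decrements each indirect extent's refcount exactly once.
 */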
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
			      enum btree_id btree_id, unsigned level,
			      struct bkey_s_c old,
			      struct bkey_i *new,
			      unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
		? old
		: bkey_i_to_s_c(new);
	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
	u64 idx, end_idx;
	int ret = 0;

	if (flags & BTREE_TRIGGER_INSERT) {
		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;

		v->front_pad = v->back_pad = 0;
	}

	idx	= le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
	end_idx	= le64_to_cpu(p.v->idx) + p.k->size +
		le32_to_cpu(p.v->back_pad);

	while (idx < end_idx && !ret)
		ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);

	return ret;
}

static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, size_t b,
				    enum bch_data_type type,
				    unsigned sectors)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	int ret = 0;

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
	if (IS_ERR(a))
		return PTR_ERR(a);

	if (a->v.data_type && type && a->v.data_type != type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
			"while marking %s",
			iter.pos.inode, iter.pos.offset, a->v.gen,
			bch2_data_types[a->v.data_type],
			bch2_data_types[type],
			bch2_data_types[type]);
		ret = -EIO;
		goto out;
	}

	a->v.data_type		= type;
	a->v.dirty_sectors	= sectors;

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	if (ret)
		goto out;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, size_t b,
				    enum bch_data_type type,
				    unsigned sectors)
{
	return commit_do(trans, NULL, NULL, 0,
			__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}

static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
					    struct bch_dev *ca,
					    u64 start, u64 end,
					    enum bch_data_type type,
					    u64 *bucket, unsigned *bucket_sectors)
{
	do {
		u64 b = sector_to_bucket(ca, start);
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		if (b != *bucket && *bucket_sectors) {
			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
								  type, *bucket_sectors);
			if (ret)
				return ret;

			*bucket_sectors = 0;
		}

		*bucket		= b;
		*bucket_sectors	+= sectors;
		start += sectors;
	} while (start < end);

	return 0;
}

static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
				    struct bch_dev *ca)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 bucket = 0;
	unsigned i, bucket_sectors = 0;
	int ret;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR) {
			ret = bch2_trans_mark_metadata_sectors(trans, ca,
						0, BCH_SB_SECTOR,
						BCH_DATA_sb, &bucket, &bucket_sectors);
			if (ret)
				return ret;
		}

		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_sb, &bucket, &bucket_sectors);
		if (ret)
			return ret;
	}

	if (bucket_sectors) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
				bucket, BCH_DATA_sb, bucket_sectors);
		if (ret)
			return ret;
	}

	for (i = 0; i < ca->journal.nr; i++) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
				ca->journal.buckets[i],
				BCH_DATA_journal, ca->mi.bucket_size);
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
	int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));

	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

/* Disk reservations: */

#define SECTORS_CACHE	1024

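/*
 * Disk reservations are served from a per-cpu cache when possible: the
 * slow path grabs the requested sectors plus SECTORS_CACHE from the global
 * c->sectors_available counter, so that (for illustration) a stream of
 * small 8-sector reservations on one cpu touches the shared atomic only
 * about once per 1024 sectors instead of on every call.  Only when the
 * global counter can't cover the request do we fall back to recalculating
 * free space under sectors_available_lock (see "recalculate" below).
 */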
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
				u64 sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			preempt_enable();
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(*c->online_reserved, sectors);
	res->sectors			+= sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	mutex_lock(&c->sectors_available_lock);

	percpu_u64_set(&c->pcpu->sectors_available, 0);
	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(*c->online_reserved, sectors);
		res->sectors += sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -BCH_ERR_ENOSPC_disk_reservation;
	}

	mutex_unlock(&c->sectors_available_lock);
	percpu_up_read(&c->mark_lock);

	return ret;
}

/* Startup/shutdown: */

static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
	struct bucket_gens *buckets =
		container_of(rcu, struct bucket_gens, rcu);

	kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
}

int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
	unsigned long *buckets_nouse = NULL;
	bool resize = ca->bucket_gens != NULL;
	int ret;

	if (!(bucket_gens	= kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
					    GFP_KERNEL|__GFP_ZERO))) {
		ret = -BCH_ERR_ENOMEM_bucket_gens;
		goto err;
	}

	if ((c->opts.buckets_nouse &&
	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)))) {
		ret = -BCH_ERR_ENOMEM_buckets_nouse;
		goto err;
	}

	bucket_gens->first_bucket = ca->mi.first_bucket;
	bucket_gens->nbuckets	= nbuckets;

	bch2_copygc_stop(c);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);

	if (resize) {
		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);

		memcpy(bucket_gens->b,
		       old_bucket_gens->b,
		       n);
		if (buckets_nouse)
			memcpy(buckets_nouse,
			       ca->buckets_nouse,
			       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
	bucket_gens	= old_bucket_gens;

	swap(ca->buckets_nouse, buckets_nouse);

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		percpu_up_write(&c->mark_lock);
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	ret = 0;
err:
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	if (bucket_gens)
		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);

	return ret;
}

void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
		sizeof(struct bucket_gens) + ca->mi.nbuckets);

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
		free_percpu(ca->usage[i]);
	kfree(ca->usage_base);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
	if (!ca->usage_base)
		return -BCH_ERR_ENOMEM_usage_init;

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[i])
			return -BCH_ERR_ENOMEM_usage_init;
	}

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}