// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}

static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}

static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
	struct bch_dev		*ca;
	struct ec_stripe_buf	*buf;
	size_t			idx;
	struct bio		bio;
};

/* Stripes btree keys: */

int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
			enum bkey_invalid_flags flags,
			struct printbuf *err)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;

	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
			 stripe_pos_bad,
			 "stripe at bad pos");

	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
			 stripe_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(k.k), stripe_val_u64s(s));

	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
	return ret;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

	prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
		   s->algorithm,
		   le16_to_cpu(s->sectors),
		   nr_data,
		   s->nr_redundant,
		   s->csum_type,
		   1U << s->csum_granularity_bits);

	for (i = 0; i < s->nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = s->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		u32 offset;
		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);

		prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
		if (i < nr_data)
			prt_printf(out, "#%u", stripe_blockcount_get(s, i));
		prt_printf(out, " gen %u", ptr->gen);
		if (ptr_stale(ca, ptr))
			prt_printf(out, " stale");
	}
}

/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
						struct bkey_s_c k, unsigned *block)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

	bkey_for_each_ptr(ptrs, ptr)
		for (i = 0; i < nr_data; i++)
			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
						      le16_to_cpu(s->sectors))) {
				*block = i;
				return ptr;
			}

	return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
	switch (k.k->type) {
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;

		extent_for_each_entry(e, entry)
			if (extent_entry_type(entry) ==
			    BCH_EXTENT_ENTRY_stripe_ptr &&
			    entry->stripe_ptr.idx == idx)
				return true;

		break;
	}
	}

	return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
	if (buf->key.k.type == KEY_TYPE_stripe) {
		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
		unsigned i;

		for (i = 0; i < s->v.nr_blocks; i++) {
			kvpfree(buf->data[i], buf->size << 9);
			buf->data[i] = NULL;
		}
	}
}

/* XXX: this is a non-mempoolified memory allocation: */
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
			      unsigned offset, unsigned size)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1U << v->csum_granularity_bits;
	unsigned end = offset + size;
	unsigned i;

	BUG_ON(end > le16_to_cpu(v->sectors));

	offset = round_down(offset, csum_granularity);
	end = min_t(unsigned, le16_to_cpu(v->sectors),
		    round_up(end, csum_granularity));

	buf->offset = offset;
	buf->size = end - offset;

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
		if (!buf->data[i])
			goto err;
	}

	return 0;
err:
	ec_stripe_buf_exit(buf);
	return -BCH_ERR_ENOMEM_stripe_buf;
}

/* Checksumming: */

static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
					 unsigned block, unsigned offset)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned end = buf->offset + buf->size;
	unsigned len = min(csum_granularity, end - offset);

	BUG_ON(offset >= end);
	BUG_ON(offset < buf->offset);
	BUG_ON(offset & (csum_granularity - 1));
	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
	       (len & (csum_granularity - 1)));

	return bch2_checksum(NULL, v->csum_type,
			     null_nonce(),
			     buf->data[block] + ((offset - buf->offset) << 9),
			     len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, j, csums_per_device = stripe_csums_per_device(v);

	if (!v->csum_type)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++)
		for (j = 0; j < csums_per_device; j++)
			stripe_csum_set(v, i, j,
				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned i;

	if (!v->csum_type)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum want = stripe_csum_get(v, i, j);
			struct bch_csum got = ec_block_checksum(buf, i, offset);

			if (bch2_crc_cmp(want, got)) {
				struct printbuf err = PRINTBUF;
				struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);

				prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
					   want.hi, want.lo,
					   got.hi, got.lo,
					   bch2_csum_types[v->csum_type]);
				prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
				bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
				bch_err_ratelimited(ca, "%s", err.buf);
				printbuf_exit(&err);

				clear_bit(i, buf->valid);

				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
				break;
			}

			offset += len;
		}
	}
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca,
			       bio_data_dir(bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
			       bio_data_dir(bio) ? "write" : "read",
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	if (ptr_stale(ca, ptr)) {
		bch_err_ratelimited(ca->fs,
				    "error %s stripe: stale pointer after io",
				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
	}

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}

static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			blk_opf_t opf, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
		? BCH_DATA_user
		: BCH_DATA_parity;
	int rw = op_is_write(opf);

	if (ptr_stale(ca, ptr)) {
		bch_err_ratelimited(c,
				    "error %s stripe: stale pointer",
				    rw == READ ? "reading from" : "writing to");
		clear_bit(idx, buf->valid);
		return;
	}

	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);
		return;
	}

	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       opf,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca = ca;
		ec_bio->buf = buf;
		ec_bio->idx = idx;

		ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io = ec_block_endio;
		ec_bio->bio.bi_private = cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
				struct ec_stripe_buf *stripe)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       POS(0, idx), BTREE_ITER_SLOTS);
	ret = bkey_err(k);
	if (ret)
		goto err;
	if (k.k->type != KEY_TYPE_stripe) {
		ret = -ENOENT;
		goto err;
	}
	bkey_reassemble(&stripe->key, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* recovery read path: */
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bch_stripe *v;
	unsigned i, offset;
	int ret = 0;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.has_ec);

	buf = kzalloc(sizeof(*buf), GFP_NOFS);
	if (!buf)
		return -BCH_ERR_ENOMEM_ec_read_extent;

	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
	if (ret) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: error %i looking up stripe", ret);
		kfree(buf);
		return -EIO;
	}

	v = &bkey_i_to_stripe(&buf->key)->v;

	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: pointer doesn't match stripe");
		ret = -EIO;
		goto err;
	}

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: read is bigger than stripe");
		ret = -EIO;
		goto err;
	}

	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
	if (ret)
		goto err;

	for (i = 0; i < v->nr_blocks; i++)
		ec_block_io(c, buf, REQ_OP_READ, i, &cl);

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
	ec_stripe_buf_exit(buf);
	kfree(buf);
	return ret;
}

/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

		mutex_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		mutex_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	return 0;
}

static int ec_stripe_mem_alloc(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	return allocate_dropping_locks_errcode(trans,
			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
	struct ec_stripe_new *s;

	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
		if (s->idx == idx)
			return true;
	return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	bool ret = false;

	spin_lock(&c->ec_stripes_new_lock);
	ret = __bch2_stripe_is_open(c, idx);
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
				 struct ec_stripe_new *s,
				 u64 idx)
{
	bool ret;

	spin_lock(&c->ec_stripes_new_lock);
	ret = !__bch2_stripe_is_open(c, idx);
	if (ret) {
		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

		s->idx = idx;
		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
	}
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
	BUG_ON(!s->idx);

	spin_lock(&c->ec_stripes_new_lock);
	hlist_del_init(&s->hash);
	spin_unlock(&c->ec_stripes_new_lock);

	s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

static u64 stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	lockdep_assert_held(&c->ec_stripes_heap_lock);

	if (h->used &&
	    h->data[0].blocks_nonempty == 0 &&
	    !bch2_stripe_is_open(c, h->data[0].idx))
		return h->data[0].idx;

	return 0;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes, idx);

	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	bool do_deletes;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	heap_sift_up(h, i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	do_deletes = stripe_idx_to_delete(c) != 0;
	mutex_unlock(&c->ec_stripes_heap_lock);

	if (do_deletes)
		bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_stripe s;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
			       BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
		ret = -EINVAL;
		goto err;
	}

	s = bkey_s_c_to_stripe(k);
	for (unsigned i = 0; i < s.v->nr_blocks; i++)
		if (stripe_blockcount_get(s.v, i)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
			printbuf_exit(&buf);
			ret = -EINVAL;
			goto err;
		}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;
	u64 idx;

	while (1) {
		mutex_lock(&c->ec_stripes_heap_lock);
		idx = stripe_idx_to_delete(c);
		mutex_unlock(&c->ec_stripes_heap_lock);

		if (!idx)
			break;

		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
				ec_stripe_delete(trans, idx));
		if (ret) {
			bch_err_fn(c, ret);
			break;
		}
	}

	bch2_trans_put(trans);

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

static int ec_stripe_key_update(struct btree_trans *trans,
				struct bkey_i_stripe *new,
				bool create)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       new->k.p, BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
				     create ? "creating" : "updating",
				     bch2_bkey_types[k.k->type]);
		ret = -EINVAL;
		goto err;
	}

	if (k.k->type == KEY_TYPE_stripe) {
		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
		unsigned i;

		if (old->nr_blocks != new->v.nr_blocks) {
			bch_err(c, "error updating stripe: nr_blocks does not match");
			ret = -EINVAL;
			goto err;
		}

		for (i = 0; i < new->v.nr_blocks; i++) {
			unsigned v = stripe_blockcount_get(old, i);

			BUG_ON(v &&
			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
				old->ptrs[i].gen != new->v.ptrs[i].gen ||
				old->ptrs[i].offset != new->v.ptrs[i].offset));

			stripe_blockcount_set(&new->v, i, v);
		}
	}

	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_extent(struct btree_trans *trans,
				   struct bpos bucket, u8 gen,
				   struct ec_stripe_buf *s,
				   struct bpos *bp_pos)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_fs *c = trans->c;
	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
	struct bch_extent_ptr *ptr, *ec_ptr = NULL;
	struct bch_extent_stripe_ptr stripe_ptr;
	struct bkey_i *n;
	int ret, dev, block;

	ret = bch2_get_next_backpointer(trans, bucket, gen,
				bp_pos, &bp, BTREE_ITER_CACHED);
	if (ret)
		return ret;
	if (bpos_eq(*bp_pos, SPOS_MAX))
		return 0;

	if (bp.level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;

		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
		bch2_trans_iter_exit(trans, &node_iter);

		if (!b)
			return 0;

		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
		bch2_backpointer_to_text(&buf, &bp);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}

	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		return ret;
	if (!k.k) {
		/*
		 * extent no longer exists - we could flush the btree
		 * write buffer and retry to verify, but no need:
		 */
		return 0;
	}

	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
		goto out;

	ptr_c = bkey_matches_stripe(v, k, &block);
	/*
	 * It doesn't generally make sense to erasure code cached ptrs:
	 * XXX: should we be incrementing a counter?
	 */
	if (!ptr_c || ptr_c->cached)
		goto out;

	dev = v->ptrs[block].dev;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		goto out;

	bkey_reassemble(n, k);

	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
	BUG_ON(!ec_ptr);

	stripe_ptr = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block = block,
		.redundancy = v->nr_redundant,
		.idx = s->key.k.p.offset,
	};

	__extent_entry_insert(n,
			(union bch_extent_entry *) ec_ptr,
			(union bch_extent_entry *) &stripe_ptr);

	ret = bch2_trans_update(trans, &iter, n, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
				   unsigned block)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_extent_ptr bucket = v->ptrs[block];
	struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	while (1) {
		ret = commit_do(trans, NULL, NULL,
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_NOFAIL,
			ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
						s, &bp_pos));
		if (ret)
			break;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	return ret;
}

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush(trans);
	if (ret)
		goto err;

	for (i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);

	return ret;
}

static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
				       struct ec_stripe_new *s,
				       unsigned block,
				       struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
	int ret;

	if (!bch2_dev_get_ioref(ca, WRITE)) {
		s->err = -BCH_ERR_erofs_no_writes;
		return;
	}

	memset(s->new_stripe.data[block] + (offset << 9),
	       0,
	       ob->sectors_free << 9);

	ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
			ob->bucket * ca->mi.bucket_size + offset,
			ob->sectors_free,
			GFP_KERNEL, 0);

	percpu_ref_put(&ca->io_ref);

	if (ret)
		s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
	if (s->idx)
		bch2_stripe_close(c, s);
	kfree(s);
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret;

	BUG_ON(s->h->s == s);

	closure_sync(&s->iodone);

	if (!s->err) {
		for (i = 0; i < nr_data; i++)
			if (s->blocks[i]) {
				ob = c->open_buckets + s->blocks[i];

				if (ob->sectors_free)
					zero_out_rest_of_ec_bucket(c, s, i, ob);
			}
	}

	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (s->have_existing_stripe) {
		ec_validate_checksums(c, &s->existing_stripe);

		if (ec_do_recov(c, &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe");
			goto err;
		}

		for (i = 0; i < nr_data; i++)
			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
				swap(s->new_stripe.data[i],
				     s->existing_stripe.data[i]);

		ec_stripe_buf_exit(&s->existing_stripe);
	}

	BUG_ON(!s->allocated);
	BUG_ON(!s->idx);

	ec_generate_ec(&s->new_stripe);

	ec_generate_checksums(&s->new_stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
	closure_sync(&s->iodone);

	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
		goto err;
	}

	ret = bch2_trans_do(c, &s->res, NULL,
			    BTREE_INSERT_NOCHECK_RW|
			    BTREE_INSERT_NOFAIL,
			    ec_stripe_key_update(trans,
					bkey_i_to_stripe(&s->new_stripe.key),
					!s->have_existing_stripe));
	if (ret) {
		bch_err(c, "error creating stripe: error creating stripe key");
		goto err;
	}

	ret = ec_stripe_update_extents(c, &s->new_stripe);
	if (ret) {
		bch_err_msg(c, ret, "creating stripe: error updating pointers");
		goto err;
	}
err:
	bch2_disk_reservation_put(c, &s->res);

	for (i = 0; i < v->nr_blocks; i++)
		if (s->blocks[i]) {
			ob = c->open_buckets + s->blocks[i];

			if (i < nr_data) {
				ob->ec = NULL;
				__bch2_open_bucket_put(c, ob);
			} else {
				bch2_open_bucket_put(c, ob);
			}
		}

	mutex_lock(&c->ec_stripe_new_lock);
	list_del(&s->list);
	mutex_unlock(&c->ec_stripe_new_lock);
	wake_up(&c->ec_stripe_new_wait);

	ec_stripe_buf_exit(&s->existing_stripe);
	ec_stripe_buf_exit(&s->new_stripe);
	closure_debug_destroy(&s->iodone);

	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}

static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
{
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		if (!atomic_read(&s->ref[STRIPE_REF_io]))
			goto out;
	s = NULL;
out:
	mutex_unlock(&c->ec_stripe_new_lock);

	return s;
}

static void ec_stripe_create_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work,
		struct bch_fs, ec_stripe_create_work);
	struct ec_stripe_new *s;

	while ((s = get_pending_stripe(c)))
		ec_stripe_create(s);

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);

	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	BUG_ON(!s->allocated && !s->err);

	h->s = NULL;
	s->pending = true;

	mutex_lock(&c->ec_stripe_new_lock);
	list_add(&s->list, &c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	ec_stripe_new_put(c, s, STRIPE_REF_io);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct bch_dev *ca;
	unsigned offset;

	if (!ob)
		return NULL;

	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

	ca = bch_dev_bkey_exists(c, ob->dev);
	offset = ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	struct bch_dev *ca;
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
	return false;
}

static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i *k,
			       unsigned nr_data,
			       unsigned nr_parity,
			       unsigned stripe_size)
{
	struct bkey_i_stripe *s = bkey_stripe_init(k);
	unsigned u64s;

	s->v.sectors = cpu_to_le16(stripe_size);
	s->v.algorithm = 0;
	s->v.nr_blocks = nr_data + nr_parity;
	s->v.nr_redundant = nr_parity;
	s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
	s->v.csum_type = BCH_CSUM_crc32c;
	s->v.pad = 0;

	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(&s->k, u64s);
}

static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;

	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

	mutex_init(&s->lock);
	closure_init(&s->iodone, NULL);
	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
	atomic_set(&s->ref[STRIPE_REF_io], 1);
	s->c = c;
	s->h = h;
	s->nr_data = min_t(unsigned, h->nr_active_devs,
			   BCH_BKEY_PTRS_MAX) - h->redundancy;
	s->nr_parity = h->redundancy;

	ec_stripe_key_init(c, &s->new_stripe.key,
			   s->nr_data, s->nr_parity, h->blocksize);

	h->s = s;
	return 0;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy,
			 enum bch_watermark watermark)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	BUG_ON(!mutex_trylock(&h->lock));

	h->target = target;
	h->algo = algo;
	h->redundancy = redundancy;
	h->watermark = watermark;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_user, target);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();

	/*
	 * If we only have redundancy + 1 devices, we're better off with just
	 * replication:
	 */
	if (h->nr_active_devs < h->redundancy + 2)
		bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
			h->nr_active_devs, h->redundancy + 2);

	list_add(&h->list, &c->ec_stripe_head_list);
	return h;
}

void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
	if (h->s &&
	    h->s->allocated &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->nr_data) == h->s->nr_data)
		ec_stripe_set_pending(c, h);

	mutex_unlock(&h->lock);
}

static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
			  unsigned target,
			  unsigned algo,
			  unsigned redundancy,
			  enum bch_watermark watermark)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	int ret;

	if (!redundancy)
		return NULL;

	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
	if (ret)
		return ERR_PTR(ret);

	if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
		goto found;
	}

	list_for_each_entry(h, &c->ec_stripe_head_list, list)
		if (h->target == target &&
		    h->algo == algo &&
		    h->redundancy == redundancy &&
		    h->watermark == watermark) {
			ret = bch2_trans_mutex_lock(trans, &h->lock);
			if (ret)
				h = ERR_PTR(ret);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
	if (!IS_ERR_OR_NULL(h) &&
	    h->nr_active_devs < h->redundancy + 2) {
		mutex_unlock(&h->lock);
		h = NULL;
	}
	mutex_unlock(&c->ec_stripe_head_lock);
	return h;
}

static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
				    enum bch_watermark watermark, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs = h->devs;
	struct open_bucket *ob;
	struct open_buckets buckets;
	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
	bool have_cache = true;
	int ret = 0;

	BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
	BUG_ON(v->nr_redundant != h->s->nr_parity);

	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
		__clear_bit(v->ptrs[i].dev, devs.d);
		if (i < h->s->nr_data)
			nr_have_data++;
		else
			nr_have_parity++;
	}

	BUG_ON(nr_have_data > h->s->nr_data);
	BUG_ON(nr_have_parity > h->s->nr_parity);

	buckets.nr = 0;
	if (nr_have_parity < h->s->nr_parity) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->parity_stripe,
					    &devs,
					    h->s->nr_parity,
					    &nr_have_parity,
					    &have_cache, 0,
					    BCH_DATA_parity,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data + h->s->nr_parity,
					       h->s->nr_data);
			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	buckets.nr = 0;
	if (nr_have_data < h->s->nr_data) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->block_stripe,
					    &devs,
					    h->s->nr_data,
					    &nr_have_data,
					    &have_cache, 0,
					    BCH_DATA_user,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data, 0);
			BUG_ON(j >= h->s->nr_data);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	return 0;
}

/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
			       struct ec_stripe_head *head)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t heap_idx;
	u64 stripe_idx;
	s64 ret = -1;

	if (may_create_new_stripe(c))
		return -1;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
		/* No blocks worth reusing, stripe will just be deleted: */
		if (!h->data[heap_idx].blocks_nonempty)
			continue;

		stripe_idx = h->data[heap_idx].idx;

		m = genradix_ptr(&c->stripes, stripe_idx);

		if (m->algorithm == head->algo &&
		    m->nr_redundant == head->redundancy &&
		    m->sectors == head->blocksize &&
		    m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
			ret = stripe_idx;
			break;
		}
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
	return ret;
}

static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	struct bch_stripe *existing_v;
	unsigned i;
	s64 idx;
	int ret;

	/*
	 * If we can't allocate a new stripe, and there's no stripes with empty
	 * blocks for us to reuse, that means we have to wait on copygc:
	 */
	idx = get_existing_stripe(c, h);
	if (idx < 0)
		return -BCH_ERR_stripe_alloc_blocked;

	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
	if (ret) {
		bch2_stripe_close(c, h->s);
		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
			bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
		return ret;
	}

	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;

	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
	h->s->nr_data = existing_v->nr_blocks -
		existing_v->nr_redundant;

	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
	if (ret) {
		bch2_stripe_close(c, h->s);
		return ret;
	}

	BUG_ON(h->s->existing_stripe.size != h->blocksize);
	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));

	/*
	 * Free buckets we initially allocated - they might conflict with
	 * blocks from the stripe we're reusing:
	 */
	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
		h->s->blocks[i] = 0;
	}
	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));

	for (i = 0; i < existing_v->nr_blocks; i++) {
		if (stripe_blockcount_get(existing_v, i)) {
			__set_bit(i, h->s->blocks_gotten);
			__set_bit(i, h->s->blocks_allocated);
		}

		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
	}

	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
	h->s->have_existing_stripe = true;

	return 0;
}

static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bpos min_pos = POS(0, 1);
	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
	int ret;

	if (!h->s->res.sectors) {
		ret = bch2_disk_reservation_get(c, &h->s->res,
					h->blocksize,
					h->s->nr_parity,
					BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			return ret;
	}

	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
			if (start_pos.offset) {
				start_pos = min_pos;
				bch2_btree_iter_set_pos(&iter, start_pos);
				continue;
			}

			ret = -BCH_ERR_ENOSPC_stripe_create;
			break;
		}

		if (bkey_deleted(k.k) &&
		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
			break;
	}

	c->ec_stripe_hint = iter.pos.offset;

	if (ret)
		goto err;

	ret = ec_stripe_mem_alloc(trans, &iter);
	if (ret) {
		bch2_stripe_close(c, h->s);
		goto err;
	}

	h->s->new_stripe.key.k.p = iter.pos;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
err:
	bch2_disk_reservation_put(c, &h->s->res);
	goto out;
}

struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy,
					       enum bch_watermark watermark,
					       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	bool waiting = false;
	int ret;

	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
	if (IS_ERR_OR_NULL(h))
		return h;

	if (!h->s) {
		ret = ec_new_stripe_alloc(c, h);
		if (ret) {
			bch_err(c, "failed to allocate new stripe");
			goto err;
		}
	}

	if (h->s->allocated)
		goto allocated;

	if (h->s->have_existing_stripe)
		goto alloc_existing;

	/* First, try to allocate a full stripe: */
	ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
		__bch2_ec_stripe_head_reserve(trans, h);
	if (!ret)
		goto allocate_buf;
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, ENOMEM))
		goto err;

	/*
	 * Not enough buckets available for a full stripe: we must reuse an
	 * existing stripe:
	 */
	while (1) {
		ret = __bch2_ec_stripe_head_reuse(trans, h);
		if (!ret)
			break;
		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
			goto err;

		if (watermark == BCH_WATERMARK_copygc) {
			ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
				__bch2_ec_stripe_head_reserve(trans, h);
			if (ret)
				goto err;
			goto allocate_buf;
		}

		/* XXX freelist_wait? */
		closure_wait(&c->freelist_wait, cl);
		waiting = true;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc_existing:
	/*
	 * Retry allocating buckets, with the watermark for this
	 * particular write:
	 */
	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
	if (ret)
		goto err;

allocate_buf:
	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
	if (ret)
		goto err;

	h->s->allocated = true;
allocated:
	BUG_ON(!h->s->idx);
	BUG_ON(!h->s->new_stripe.data[0]);
	BUG_ON(trans->restarted);
	return h;
err:
	bch2_ec_stripe_head_put(c, h);
	return ERR_PTR(ret);
}

static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		mutex_lock(&h->lock);
		if (!h->s)
			goto unlock;

		if (!ca)
			goto found;

		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
			if (!h->s->blocks[i])
				continue;

			ob = c->open_buckets + h->s->blocks[i];
			if (ob->dev == ca->dev_idx)
				goto found;
		}
		goto unlock;
found:
		h->s->err = -BCH_ERR_erofs_no_writes;
		ec_stripe_set_pending(c, h);
unlock:
		mutex_unlock(&h->lock);
	}
	mutex_unlock(&c->ec_stripe_head_lock);
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	__bch2_ec_stop(c, ca);
}

void bch2_fs_ec_stop(struct bch_fs *c)
{
	__bch2_ec_stop(c, NULL);
}

static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->ec_stripe_new_lock);
	ret = list_empty(&c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	return ret;
}

void bch2_fs_ec_flush(struct bch_fs *c)
{
	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}

int bch2_stripes_read(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_stripe *s;
	struct stripe *m;
	unsigned i;
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		if (k.k->type != KEY_TYPE_stripe)
			continue;

		ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
		if (ret)
			break;

		s = bkey_s_c_to_stripe(k).v;

		m = genradix_ptr(&c->stripes, k.k->p.offset);
		m->sectors = le16_to_cpu(s->sectors);
		m->algorithm = s->algorithm;
		m->nr_blocks = s->nr_blocks;
		m->nr_redundant = s->nr_redundant;
		m->blocks_nonempty = 0;

		for (i = 0; i < s->nr_blocks; i++)
			m->blocks_nonempty += !!stripe_blockcount_get(s, i);

		bch2_stripes_heap_insert(c, m, k.k->p.offset);
	}
	bch2_trans_iter_exit(trans, &iter);

	bch2_trans_put(trans);

	if (ret)
		bch_err_fn(c, ret);

	return ret;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (i = 0; i < min_t(size_t, h->used, 50); i++) {
		m = genradix_ptr(&c->stripes, h->data[i].idx);

		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
			   h->data[i].blocks_nonempty,
			   m->nr_blocks - m->nr_redundant,
			   m->nr_redundant);
		if (bch2_stripe_is_open(c, h->data[i].idx))
			prt_str(out, " open");
		prt_newline(out);
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct ec_stripe_head *h;
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
			   h->target, h->algo, h->redundancy,
			   bch2_watermarks[h->watermark]);

		if (h->s)
			prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
				   h->s->idx, h->s->nr_data, h->s->nr_parity,
				   bitmap_weight(h->s->blocks_allocated,
						 h->s->nr_data));
	}
	mutex_unlock(&c->ec_stripe_head_lock);

	prt_printf(out, "in flight:\n");

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
			   s->idx, s->nr_data, s->nr_parity,
			   atomic_read(&s->ref[STRIPE_REF_io]),
			   atomic_read(&s->ref[STRIPE_REF_stripe]),
			   bch2_watermarks[s->h->watermark]);
	}
	mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;
	unsigned i;

	while (1) {
		mutex_lock(&c->ec_stripe_head_lock);
		h = list_first_entry_or_null(&c->ec_stripe_head_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_stripe_head_lock);
		if (!h)
			break;

		if (h->s) {
			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
				BUG_ON(h->s->blocks[i]);

			kfree(h->s);
		}
		kfree(h);
	}

	BUG_ON(!list_empty(&c->ec_stripe_new_list));

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes);
	bioset_exit(&c->ec_bioset);
}

void bch2_fs_ec_init_early(struct bch_fs *c)
{
	spin_lock_init(&c->ec_stripes_new_lock);
	mutex_init(&c->ec_stripes_heap_lock);

	INIT_LIST_HEAD(&c->ec_stripe_head_list);
	mutex_init(&c->ec_stripe_head_lock);

	INIT_LIST_HEAD(&c->ec_stripe_new_list);
	mutex_init(&c->ec_stripe_new_lock);
	init_waitqueue_head(&c->ec_stripe_new_wait);

	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}