// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}

static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}

static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
	struct bch_dev		*ca;
	struct ec_stripe_buf	*buf;
	size_t			idx;
	struct bio		bio;
};

/* Stripes btree keys: */

int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
			enum bch_validate_flags flags,
			struct printbuf *err)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;

	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
			 stripe_pos_bad,
			 "stripe at bad pos");

	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
			 stripe_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(k.k), stripe_val_u64s(s));

	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
	return ret;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
	struct bch_stripe s = {};

	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));

	unsigned nr_data = s.nr_blocks - s.nr_redundant;

	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
		   s.algorithm,
		   le16_to_cpu(s.sectors),
		   nr_data,
		   s.nr_redundant);
	bch2_prt_csum_type(out, s.csum_type);
	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);

	for (unsigned i = 0; i < s.nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = sp->ptrs + i;

		if ((void *) ptr >= bkey_val_end(k))
			break;

		bch2_extent_ptr_to_text(out, c, ptr);

		if (s.csum_type < BCH_CSUM_NR &&
		    i < nr_data &&
		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
			prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
	}
}

/* Triggers: */

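/*
 * Update a single bucket's alloc info for a stripe block being created or
 * deleted: check for inconsistencies, adjust dirty_sectors for parity
 * blocks, and set or clear the bucket's stripe backpointer fields.
 */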
static int __mark_stripe_bucket(struct btree_trans *trans,
				struct bch_dev *ca,
				struct bkey_s_c_stripe s,
				unsigned ptr_idx, bool deleting,
				struct bpos bucket,
				struct bch_alloc_v4 *a,
				enum btree_iter_update_trigger_flags flags)
{
	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
	unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
	s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_fs *c = trans->c;
	if (deleting)
		sectors = -sectors;

	if (!deleting) {
		if (bch2_trans_inconsistent_on(a->stripe ||
					       a->stripe_redundancy, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				a->dirty_sectors,
				a->stripe, s.k->p.offset,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -EIO;
			goto err;
		}

		if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				a->dirty_sectors,
				a->cached_sectors,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -EIO;
			goto err;
		}
	} else {
		if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
					       a->stripe_redundancy != s.v->nr_redundant, trans,
				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
				bucket.inode, bucket.offset, a->gen,
				a->stripe,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -EIO;
			goto err;
		}

		if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
				"bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				bch2_data_type_str(data_type),
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -EIO;
			goto err;
		}

		if (bch2_trans_inconsistent_on(parity &&
					       (a->dirty_sectors != -sectors ||
						a->cached_sectors), trans,
				"bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
				bucket.inode, bucket.offset, a->gen,
				a->dirty_sectors,
				a->cached_sectors,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -EIO;
			goto err;
		}
	}

	if (sectors) {
		ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
					     a->gen, a->data_type, &a->dirty_sectors);
		if (ret)
			goto err;
	}

	if (!deleting) {
		a->stripe		= s.k->p.offset;
		a->stripe_redundancy	= s.v->nr_redundant;
	} else {
		a->stripe		= 0;
		a->stripe_redundancy	= 0;
	}

	alloc_data_type_set(a, data_type);
err:
	printbuf_exit(&buf);
	return ret;
}

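/*
 * Look up the device for one stripe block and apply __mark_stripe_bucket(),
 * either through a transactional alloc key update or directly against the
 * in-memory gc bucket state.
 */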
static int mark_stripe_bucket(struct btree_trans *trans,
			      struct bkey_s_c_stripe s,
			      unsigned ptr_idx, bool deleting,
			      enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
	if (unlikely(!ca)) {
		if (!(flags & BTREE_TRIGGER_overwrite))
			ret = -EIO;
		goto err;
	}

	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

	if (flags & BTREE_TRIGGER_transactional) {
		struct bkey_i_alloc_v4 *a =
			bch2_trans_start_alloc_update(trans, bucket);
		ret = PTR_ERR_OR_ZERO(a) ?:
			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
	}

	if (flags & BTREE_TRIGGER_gc) {
		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, bucket.offset);
		bucket_lock(g);
		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
		if (!ret) {
			alloc_to_bucket(g, new);
			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
		}
		bucket_unlock(g);
		percpu_up_read(&c->mark_lock);
	}
err:
	bch2_dev_put(ca);
	return ret;
}

static int mark_stripe_buckets(struct btree_trans *trans,
			       struct bkey_s_c old, struct bkey_s_c new,
			       enum btree_iter_update_trigger_flags flags)
{
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;

	BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);

	unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;

	for (unsigned i = 0; i < nr_blocks; i++) {
		if (new_s && old_s &&
		    !memcmp(&new_s->ptrs[i],
			    &old_s->ptrs[i],
			    sizeof(new_s->ptrs[i])))
			continue;

		if (new_s) {
			int ret = mark_stripe_bucket(trans,
					bkey_s_c_to_stripe(new), i, false, flags);
			if (ret)
				return ret;
		}

		if (old_s) {
			int ret = mark_stripe_bucket(trans,
					bkey_s_c_to_stripe(old), i, true, flags);
			if (ret)
				return ret;
		}
	}

	return 0;
}

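/*
 * Trigger for stripe keys: keeps replicas accounting, bucket marks and the
 * in-memory stripes heap (or the gc stripe state) in sync with the btree key.
 */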
int bch2_trigger_stripe(struct btree_trans *trans,
			enum btree_id btree, unsigned level,
			struct bkey_s_c old, struct bkey_s _new,
			enum btree_iter_update_trigger_flags flags)
{
	struct bkey_s_c new = _new.s_c;
	struct bch_fs *c = trans->c;
	u64 idx = new.k->p.offset;
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;

	if (unlikely(flags & BTREE_TRIGGER_check_repair))
		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);

	if (flags & BTREE_TRIGGER_transactional) {
		/*
		 * If the pointers aren't changing, we don't need to do anything:
		 */
		if (new_s && old_s &&
		    new_s->nr_blocks	== old_s->nr_blocks &&
		    new_s->nr_redundant	== old_s->nr_redundant &&
		    !memcmp(old_s->ptrs, new_s->ptrs,
			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
			return 0;

		BUG_ON(new_s && old_s &&
		       (new_s->nr_blocks	!= old_s->nr_blocks ||
			new_s->nr_redundant	!= old_s->nr_redundant));

		if (new_s) {
			s64 sectors = le16_to_cpu(new_s->sectors);

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, new);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
			if (ret)
				return ret;
		}

		if (old_s) {
			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, old);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
			if (ret)
				return ret;
		}

		int ret = mark_stripe_buckets(trans, old, new, flags);
		if (ret)
			return ret;
	}

	if (flags & BTREE_TRIGGER_atomic) {
		struct stripe *m = genradix_ptr(&c->stripes, idx);

		if (!m) {
			struct printbuf buf1 = PRINTBUF;
			struct printbuf buf2 = PRINTBUF;

			bch2_bkey_val_to_text(&buf1, c, old);
			bch2_bkey_val_to_text(&buf2, c, new);
			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
					    "old %s\n"
					    "new %s", idx, buf1.buf, buf2.buf);
			printbuf_exit(&buf2);
			printbuf_exit(&buf1);
			bch2_inconsistent_error(c);
			return -1;
		}

		if (!new_s) {
			bch2_stripes_heap_del(c, m, idx);

			memset(m, 0, sizeof(*m));
		} else {
			m->sectors	= le16_to_cpu(new_s->sectors);
			m->algorithm	= new_s->algorithm;
			m->nr_blocks	= new_s->nr_blocks;
			m->nr_redundant	= new_s->nr_redundant;
			m->blocks_nonempty = 0;

			for (unsigned i = 0; i < new_s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);

			if (!old_s)
				bch2_stripes_heap_insert(c, m, idx);
			else
				bch2_stripes_heap_update(c, m, idx);
		}
	}

	if (flags & BTREE_TRIGGER_gc) {
		struct gc_stripe *m =
			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				idx);
			return -BCH_ERR_ENOMEM_mark_stripe;
		}
		/*
		 * This will be wrong when we bring back runtime gc: we should
		 * be unmarking the old key and then marking the new key
		 */
		m->alive	= true;
		m->sectors	= le16_to_cpu(new_s->sectors);
		m->nr_blocks	= new_s->nr_blocks;
		m->nr_redundant	= new_s->nr_redundant;

		for (unsigned i = 0; i < new_s->nr_blocks; i++)
			m->ptrs[i] = new_s->ptrs[i];

		bch2_bkey_to_replicas(&m->r.e, new);

		/*
		 * gc recalculates this field from stripe ptr
		 * references:
		 */
		memset(m->block_sectors, 0, sizeof(m->block_sectors));

		int ret = mark_stripe_buckets(trans, old, new, flags);
		if (ret)
			return ret;

		ret = bch2_update_replicas(c, new, &m->r.e,
					   ((s64) m->sectors * m->nr_redundant),
					   0, true);
		if (ret) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, new);
			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
			printbuf_exit(&buf);
			return ret;
		}
	}

	return 0;
}

/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
						struct bkey_s_c k, unsigned *block)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

	bkey_for_each_ptr(ptrs, ptr)
		for (i = 0; i < nr_data; i++)
			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
						      le16_to_cpu(s->sectors))) {
				*block = i;
				return ptr;
			}

	return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
	switch (k.k->type) {
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;

		extent_for_each_entry(e, entry)
			if (extent_entry_type(entry) ==
			    BCH_EXTENT_ENTRY_stripe_ptr &&
			    entry->stripe_ptr.idx == idx)
				return true;

		break;
	}
	}

	return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
	if (buf->key.k.type == KEY_TYPE_stripe) {
		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
		unsigned i;

		for (i = 0; i < s->v.nr_blocks; i++) {
			kvfree(buf->data[i]);
			buf->data[i] = NULL;
		}
	}
}

/* XXX: this is a non-mempoolified memory allocation: */
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
			      unsigned offset, unsigned size)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1U << v->csum_granularity_bits;
	unsigned end = offset + size;
	unsigned i;

	BUG_ON(end > le16_to_cpu(v->sectors));

	offset	= round_down(offset, csum_granularity);
	end	= min_t(unsigned, le16_to_cpu(v->sectors),
			round_up(end, csum_granularity));

	buf->offset	= offset;
	buf->size	= end - offset;

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
		if (!buf->data[i])
			goto err;
	}

	return 0;
err:
	ec_stripe_buf_exit(buf);
	return -BCH_ERR_ENOMEM_stripe_buf;
}

/* Checksumming: */

static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
					 unsigned block, unsigned offset)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned end = buf->offset + buf->size;
	unsigned len = min(csum_granularity, end - offset);

	BUG_ON(offset >= end);
	BUG_ON(offset < buf->offset);
	BUG_ON(offset & (csum_granularity - 1));
	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
	       (len & (csum_granularity - 1)));

	return bch2_checksum(NULL, v->csum_type,
			     null_nonce(),
			     buf->data[block] + ((offset - buf->offset) << 9),
			     len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, j, csums_per_device = stripe_csums_per_device(v);

	if (!v->csum_type)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++)
		for (j = 0; j < csums_per_device; j++)
			stripe_csum_set(v, i, j,
				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}

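/*
 * Verify each block we managed to read against the checksums stored in the
 * stripe key; blocks that fail verification have their valid bit cleared so
 * that reconstruction treats them as failed.
 */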
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned i;

	if (!v->csum_type)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum want = stripe_csum_get(v, i, j);
			struct bch_csum got = ec_block_checksum(buf, i, offset);

			if (bch2_crc_cmp(want, got)) {
				struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
				if (ca) {
					struct printbuf err = PRINTBUF;

					prt_str(&err, "stripe ");
					bch2_csum_err_msg(&err, v->csum_type, want, got);
					prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
					bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
					bch_err_ratelimited(ca, "%s", err.buf);
					printbuf_exit(&err);

					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
				}

				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca,
			       bio_data_dir(bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
			       bio_data_dir(bio) ? "write" : "read",
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	if (dev_ptr_stale(ca, ptr)) {
		bch_err_ratelimited(ca->fs,
				    "error %s stripe: stale pointer after io",
				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
	}

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}

static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			blk_opf_t opf, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
		? BCH_DATA_user
		: BCH_DATA_parity;
	int rw = op_is_write(opf);

	struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
	if (!ca) {
		clear_bit(idx, buf->valid);
		return;
	}

	if (dev_ptr_stale(ca, ptr)) {
		bch_err_ratelimited(c,
				    "error %s stripe: stale pointer",
				    rw == READ ? "reading from" : "writing to");
		clear_bit(idx, buf->valid);
		return;
	}

	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       opf,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca			= ca;
		ec_bio->buf			= buf;
		ec_bio->idx			= idx;

		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io		= ec_block_endio;
		ec_bio->bio.bi_private		= cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
				struct ec_stripe_buf *stripe)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       POS(0, idx), BTREE_ITER_slots);
	ret = bkey_err(k);
	if (ret)
		goto err;
	if (k.k->type != KEY_TYPE_stripe) {
		ret = -ENOENT;
		goto err;
	}
	bkey_reassemble(&stripe->key, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* recovery read path: */
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bch_stripe *v;
	unsigned i, offset;
	int ret = 0;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.has_ec);

	buf = kzalloc(sizeof(*buf), GFP_NOFS);
	if (!buf)
		return -BCH_ERR_ENOMEM_ec_read_extent;

	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
	if (ret) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: error %i looking up stripe", ret);
		kfree(buf);
		return -EIO;
	}

	v = &bkey_i_to_stripe(&buf->key)->v;

	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: pointer doesn't match stripe");
		ret = -EIO;
		goto err;
	}

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: read is bigger than stripe");
		ret = -EIO;
		goto err;
	}

	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
	if (ret)
		goto err;

	for (i = 0; i < v->nr_blocks; i++)
		ec_block_io(c, buf, REQ_OP_READ, i, &cl);

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
	ec_stripe_buf_exit(buf);
	kfree(buf);
	return ret;
}

/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

		mutex_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		mutex_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	return 0;
}

static int ec_stripe_mem_alloc(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	return allocate_dropping_locks_errcode(trans,
			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
	struct ec_stripe_new *s;

	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
		if (s->idx == idx)
			return true;
	return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	bool ret = false;

	spin_lock(&c->ec_stripes_new_lock);
	ret = __bch2_stripe_is_open(c, idx);
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
				 struct ec_stripe_new *s,
				 u64 idx)
{
	bool ret;

	spin_lock(&c->ec_stripes_new_lock);
	ret = !__bch2_stripe_is_open(c, idx);
	if (ret) {
		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

		s->idx = idx;
		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
	}
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
	BUG_ON(!s->idx);

	spin_lock(&c->ec_stripes_new_lock);
	hlist_del_init(&s->hash);
	spin_unlock(&c->ec_stripes_new_lock);

	s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

static u64 stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	lockdep_assert_held(&c->ec_stripes_heap_lock);

	if (h->used &&
	    h->data[0].blocks_nonempty == 0 &&
	    !bch2_stripe_is_open(c, h->data[0].idx))
		return h->data[0].idx;

	return 0;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes, idx);

	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

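/*
 * A stripe's blocks_nonempty count changed: fix up its position in the heap,
 * and kick off the deletion worker if an empty, unopened stripe is now at the
 * top of the heap.
 */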
void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	bool do_deletes;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	do_deletes = stripe_idx_to_delete(c) != 0;
	mutex_unlock(&c->ec_stripes_heap_lock);

	if (do_deletes)
		bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_stripe s;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
			       BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
		ret = -EINVAL;
		goto err;
	}

	s = bkey_s_c_to_stripe(k);
	for (unsigned i = 0; i < s.v->nr_blocks; i++)
		if (stripe_blockcount_get(s.v, i)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
			printbuf_exit(&buf);
			ret = -EINVAL;
			goto err;
		}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);

	while (1) {
		mutex_lock(&c->ec_stripes_heap_lock);
		u64 idx = stripe_idx_to_delete(c);
		mutex_unlock(&c->ec_stripes_heap_lock);

		if (!idx)
			break;

		int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					ec_stripe_delete(trans, idx));
		bch_err_fn(c, ret);
		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

static int ec_stripe_key_update(struct btree_trans *trans,
				struct bkey_i_stripe *new,
				bool create)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       new->k.p, BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
				     create ? "creating" : "updating",
				     bch2_bkey_types[k.k->type]);
		ret = -EINVAL;
		goto err;
	}

	if (k.k->type == KEY_TYPE_stripe) {
		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
		unsigned i;

		if (old->nr_blocks != new->v.nr_blocks) {
			bch_err(c, "error updating stripe: nr_blocks does not match");
			ret = -EINVAL;
			goto err;
		}

		for (i = 0; i < new->v.nr_blocks; i++) {
			unsigned v = stripe_blockcount_get(old, i);

			BUG_ON(v &&
			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
				old->ptrs[i].gen != new->v.ptrs[i].gen ||
				old->ptrs[i].offset != new->v.ptrs[i].offset));

			stripe_blockcount_set(&new->v, i, v);
		}
	}

	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_extent(struct btree_trans *trans,
				   struct bch_dev *ca,
				   struct bpos bucket, u8 gen,
				   struct ec_stripe_buf *s,
				   struct bpos *bp_pos)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_fs *c = trans->c;
	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
	struct bch_extent_ptr *ec_ptr = NULL;
	struct bch_extent_stripe_ptr stripe_ptr;
	struct bkey_i *n;
	int ret, dev, block;

	ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
					bp_pos, &bp, BTREE_ITER_cached);
	if (ret)
		return ret;
	if (bpos_eq(*bp_pos, SPOS_MAX))
		return 0;

	if (bp.level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;

		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
		bch2_trans_iter_exit(trans, &node_iter);

		if (!b)
			return 0;

		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
		bch2_backpointer_to_text(&buf, &bp);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}

	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		return ret;
	if (!k.k) {
		/*
		 * extent no longer exists - we could flush the btree
		 * write buffer and retry to verify, but no need:
		 */
		return 0;
	}

	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
		goto out;

	ptr_c = bkey_matches_stripe(v, k, &block);
	/*
	 * It doesn't generally make sense to erasure code cached ptrs:
	 * XXX: should we be incrementing a counter?
	 */
	if (!ptr_c || ptr_c->cached)
		goto out;

	dev = v->ptrs[block].dev;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		goto out;

	bkey_reassemble(n, k);

	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
	BUG_ON(!ec_ptr);

	stripe_ptr = (struct bch_extent_stripe_ptr) {
		.type		= 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block		= block,
		.redundancy	= v->nr_redundant,
		.idx		= s->key.k.p.offset,
	};

	__extent_entry_insert(n,
			(union bch_extent_entry *) ec_ptr,
			(union bch_extent_entry *) &stripe_ptr);

	ret = bch2_trans_update(trans, &iter, n, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
				   unsigned block)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_extent_ptr ptr = v->ptrs[block];
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
	if (!ca)
		return -EIO;

	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);

	while (1) {
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc,
			ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
		if (ret)
			break;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	bch2_dev_put(ca);
	return ret;
}

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	for (i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);

	return ret;
}

static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
				       struct ec_stripe_new *s,
				       unsigned block,
				       struct open_bucket *ob)
{
	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
	if (!ca) {
		s->err = -BCH_ERR_erofs_no_writes;
		return;
	}

	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
	memset(s->new_stripe.data[block] + (offset << 9),
	       0,
	       ob->sectors_free << 9);

	int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
			ob->bucket * ca->mi.bucket_size + offset,
			ob->sectors_free,
			GFP_KERNEL, 0);

	percpu_ref_put(&ca->io_ref);

	if (ret)
		s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
	if (s->idx)
		bch2_stripe_close(c, s);
	kfree(s);
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret;

	BUG_ON(s->h->s == s);

	closure_sync(&s->iodone);

	if (!s->err) {
		for (i = 0; i < nr_data; i++)
			if (s->blocks[i]) {
				ob = c->open_buckets + s->blocks[i];

				if (ob->sectors_free)
					zero_out_rest_of_ec_bucket(c, s, i, ob);
			}
	}

	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (s->have_existing_stripe) {
		ec_validate_checksums(c, &s->existing_stripe);

		if (ec_do_recov(c, &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe");
			goto err;
		}

		for (i = 0; i < nr_data; i++)
			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
				swap(s->new_stripe.data[i],
				     s->existing_stripe.data[i]);

		ec_stripe_buf_exit(&s->existing_stripe);
	}

	BUG_ON(!s->allocated);
	BUG_ON(!s->idx);

	ec_generate_ec(&s->new_stripe);

	ec_generate_checksums(&s->new_stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
	closure_sync(&s->iodone);

	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
		goto err;
	}

	ret = bch2_trans_do(c, &s->res, NULL,
			    BCH_TRANS_COMMIT_no_check_rw|
			    BCH_TRANS_COMMIT_no_enospc,
			    ec_stripe_key_update(trans,
					bkey_i_to_stripe(&s->new_stripe.key),
					!s->have_existing_stripe));
	bch_err_msg(c, ret, "creating stripe key");
	if (ret)
		goto err;

	ret = ec_stripe_update_extents(c, &s->new_stripe);
	bch_err_msg(c, ret, "error updating extents");
	if (ret)
		goto err;
err:
	bch2_disk_reservation_put(c, &s->res);

	for (i = 0; i < v->nr_blocks; i++)
		if (s->blocks[i]) {
			ob = c->open_buckets + s->blocks[i];

			if (i < nr_data) {
				ob->ec = NULL;
				__bch2_open_bucket_put(c, ob);
			} else {
				bch2_open_bucket_put(c, ob);
			}
		}

	mutex_lock(&c->ec_stripe_new_lock);
	list_del(&s->list);
	mutex_unlock(&c->ec_stripe_new_lock);
	wake_up(&c->ec_stripe_new_wait);

	ec_stripe_buf_exit(&s->existing_stripe);
	ec_stripe_buf_exit(&s->new_stripe);
	closure_debug_destroy(&s->iodone);

	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}

static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
{
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		if (!atomic_read(&s->ref[STRIPE_REF_io]))
			goto out;
	s = NULL;
out:
	mutex_unlock(&c->ec_stripe_new_lock);

	return s;
}

static void ec_stripe_create_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work,
		struct bch_fs, ec_stripe_create_work);
	struct ec_stripe_new *s;

	while ((s = get_pending_stripe(c)))
		ec_stripe_create(s);

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);

	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

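/*
 * The stripe is fully allocated (or errored out): detach it from the stripe
 * head and move it to the pending-create list, then drop the io ref taken
 * when the stripe was allocated.
 */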
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	BUG_ON(!s->allocated && !s->err);

	h->s		= NULL;
	s->pending	= true;

	mutex_lock(&c->ec_stripe_new_lock);
	list_add(&s->list, &c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	ec_stripe_new_put(c, s, STRIPE_REF_io);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	if (!ob)
		return NULL;

	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

	struct bch_dev *ca	= ob_dev(c, ob);
	unsigned offset		= ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(c, ca, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (unsigned i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
	return false;
}

static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i *k,
			       unsigned nr_data,
			       unsigned nr_parity,
			       unsigned stripe_size)
{
	struct bkey_i_stripe *s = bkey_stripe_init(k);
	unsigned u64s;

	s->v.sectors			= cpu_to_le16(stripe_size);
	s->v.algorithm			= 0;
	s->v.nr_blocks			= nr_data + nr_parity;
	s->v.nr_redundant		= nr_parity;
	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
	s->v.csum_type			= BCH_CSUM_crc32c;
	s->v.pad			= 0;

	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(&s->k, u64s);
}

static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;

	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

	mutex_init(&s->lock);
	closure_init(&s->iodone, NULL);
	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
	atomic_set(&s->ref[STRIPE_REF_io], 1);
	s->c		= c;
	s->h		= h;
	s->nr_data	= min_t(unsigned, h->nr_active_devs,
				BCH_BKEY_PTRS_MAX) - h->redundancy;
	s->nr_parity	= h->redundancy;

	ec_stripe_key_init(c, &s->new_stripe.key,
			   s->nr_data, s->nr_parity, h->blocksize);

	h->s = s;
	return 0;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy,
			 enum bch_watermark watermark)
{
	struct ec_stripe_head *h;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	BUG_ON(!mutex_trylock(&h->lock));

	h->target	= target;
	h->algo		= algo;
	h->redundancy	= redundancy;
	h->watermark	= watermark;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_user, target);

	for_each_member_device_rcu(c, ca, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(ca->dev_idx, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(c, ca, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();

	/*
	 * If we only have redundancy + 1 devices, we're better off with just
	 * replication:
	 */
	if (h->nr_active_devs < h->redundancy + 2)
		bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
			h->nr_active_devs, h->redundancy + 2);

	list_add(&h->list, &c->ec_stripe_head_list);
	return h;
}

void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
	if (h->s &&
	    h->s->allocated &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->nr_data) == h->s->nr_data)
		ec_stripe_set_pending(c, h);

	mutex_unlock(&h->lock);
}

static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
			  unsigned target,
			  unsigned algo,
			  unsigned redundancy,
			  enum bch_watermark watermark)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	int ret;

	if (!redundancy)
		return NULL;

	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
	if (ret)
		return ERR_PTR(ret);

	if (test_bit(BCH_FS_going_ro, &c->flags)) {
		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
		goto found;
	}

	list_for_each_entry(h, &c->ec_stripe_head_list, list)
		if (h->target		== target &&
		    h->algo		== algo &&
		    h->redundancy	== redundancy &&
		    h->watermark	== watermark) {
			ret = bch2_trans_mutex_lock(trans, &h->lock);
			if (ret)
				h = ERR_PTR(ret);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
	if (!IS_ERR_OR_NULL(h) &&
	    h->nr_active_devs < h->redundancy + 2) {
		mutex_unlock(&h->lock);
		h = NULL;
	}
	mutex_unlock(&c->ec_stripe_head_lock);
	return h;
}

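/*
 * Allocate buckets for the stripe blocks that haven't been gotten yet:
 * parity blocks first, then data blocks, skipping devices already used by
 * the stripe.
 */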
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
				    enum bch_watermark watermark, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs = h->devs;
	struct open_bucket *ob;
	struct open_buckets buckets;
	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
	bool have_cache = true;
	int ret = 0;

	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
	BUG_ON(v->nr_redundant	!= h->s->nr_parity);

	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
		__clear_bit(v->ptrs[i].dev, devs.d);
		if (i < h->s->nr_data)
			nr_have_data++;
		else
			nr_have_parity++;
	}

	BUG_ON(nr_have_data	> h->s->nr_data);
	BUG_ON(nr_have_parity	> h->s->nr_parity);

	buckets.nr = 0;
	if (nr_have_parity < h->s->nr_parity) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->parity_stripe,
					    &devs,
					    h->s->nr_parity,
					    &nr_have_parity,
					    &have_cache, 0,
					    BCH_DATA_parity,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data + h->s->nr_parity,
					       h->s->nr_data);
			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	buckets.nr = 0;
	if (nr_have_data < h->s->nr_data) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->block_stripe,
					    &devs,
					    h->s->nr_data,
					    &nr_have_data,
					    &have_cache, 0,
					    BCH_DATA_user,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data, 0);
			BUG_ON(j >= h->s->nr_data);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	return 0;
}

/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
			       struct ec_stripe_head *head)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t heap_idx;
	u64 stripe_idx;
	s64 ret = -1;

	if (may_create_new_stripe(c))
		return -1;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
		/* No blocks worth reusing, stripe will just be deleted: */
		if (!h->data[heap_idx].blocks_nonempty)
			continue;

		stripe_idx = h->data[heap_idx].idx;

		m = genradix_ptr(&c->stripes, stripe_idx);

		if (m->algorithm	== head->algo &&
		    m->nr_redundant	== head->redundancy &&
		    m->sectors		== head->blocksize &&
		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
			ret = stripe_idx;
			break;
		}
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
	return ret;
}

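/*
 * Reuse an existing stripe that still has empty blocks: read it in, take over
 * its layout for the new stripe, and free any buckets we had already
 * allocated that would conflict with it.
 */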
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	struct bch_stripe *existing_v;
	unsigned i;
	s64 idx;
	int ret;

	/*
	 * If we can't allocate a new stripe, and there's no stripes with empty
	 * blocks for us to reuse, that means we have to wait on copygc:
	 */
	idx = get_existing_stripe(c, h);
	if (idx < 0)
		return -BCH_ERR_stripe_alloc_blocked;

	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
			     "reading stripe key: %s", bch2_err_str(ret));
	if (ret) {
		bch2_stripe_close(c, h->s);
		return ret;
	}

	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;

	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
	h->s->nr_data = existing_v->nr_blocks -
		existing_v->nr_redundant;

	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
	if (ret) {
		bch2_stripe_close(c, h->s);
		return ret;
	}

	BUG_ON(h->s->existing_stripe.size != h->blocksize);
	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));

	/*
	 * Free buckets we initially allocated - they might conflict with
	 * blocks from the stripe we're reusing:
	 */
	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
		h->s->blocks[i] = 0;
	}
	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));

	for (i = 0; i < existing_v->nr_blocks; i++) {
		if (stripe_blockcount_get(existing_v, i)) {
			__set_bit(i, h->s->blocks_gotten);
			__set_bit(i, h->s->blocks_allocated);
		}

		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
	}

	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
	h->s->have_existing_stripe = true;

	return 0;
}

static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bpos min_pos = POS(0, 1);
	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
	int ret;

	if (!h->s->res.sectors) {
		ret = bch2_disk_reservation_get(c, &h->s->res,
					h->blocksize,
					h->s->nr_parity,
					BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			return ret;
	}

	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
			if (start_pos.offset) {
				start_pos = min_pos;
				bch2_btree_iter_set_pos(&iter, start_pos);
				continue;
			}

			ret = -BCH_ERR_ENOSPC_stripe_create;
			break;
		}

		if (bkey_deleted(k.k) &&
		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
			break;
	}

	c->ec_stripe_hint = iter.pos.offset;

	if (ret)
		goto err;

	ret = ec_stripe_mem_alloc(trans, &iter);
	if (ret) {
		bch2_stripe_close(c, h->s);
		goto err;
	}

	h->s->new_stripe.key.k.p = iter.pos;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
err:
	bch2_disk_reservation_put(c, &h->s->res);
	goto out;
}

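/*
 * Main entry point for stripe allocation: get (or create) the stripe head for
 * this target/algorithm/redundancy and make sure it has a fully allocated new
 * stripe, either by allocating fresh buckets or by reusing an existing stripe
 * with empty blocks.
 */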
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy,
					       enum bch_watermark watermark,
					       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	bool waiting = false;
	int ret;

	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
	if (IS_ERR_OR_NULL(h))
		return h;

	if (!h->s) {
		ret = ec_new_stripe_alloc(c, h);
		if (ret) {
			bch_err(c, "failed to allocate new stripe");
			goto err;
		}
	}

	if (h->s->allocated)
		goto allocated;

	if (h->s->have_existing_stripe)
		goto alloc_existing;

	/* First, try to allocate a full stripe: */
	ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
		__bch2_ec_stripe_head_reserve(trans, h);
	if (!ret)
		goto allocate_buf;
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, ENOMEM))
		goto err;

	/*
	 * Not enough buckets available for a full stripe: we must reuse an
	 * existing stripe:
	 */
	while (1) {
		ret = __bch2_ec_stripe_head_reuse(trans, h);
		if (!ret)
			break;
		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
			goto err;

		if (watermark == BCH_WATERMARK_copygc) {
			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
				__bch2_ec_stripe_head_reserve(trans, h);
			if (ret)
				goto err;
			goto allocate_buf;
		}

		/* XXX freelist_wait? */
		closure_wait(&c->freelist_wait, cl);
		waiting = true;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc_existing:
	/*
	 * Retry allocating buckets, with the watermark for this
	 * particular write:
	 */
	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
	if (ret)
		goto err;

allocate_buf:
	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
	if (ret)
		goto err;

	h->s->allocated = true;
allocated:
	BUG_ON(!h->s->idx);
	BUG_ON(!h->s->new_stripe.data[0]);
	BUG_ON(trans->restarted);
	return h;
err:
	bch2_ec_stripe_head_put(c, h);
	return ERR_PTR(ret);
}

static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		mutex_lock(&h->lock);
		if (!h->s)
			goto unlock;

		if (!ca)
			goto found;

		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
			if (!h->s->blocks[i])
				continue;

			ob = c->open_buckets + h->s->blocks[i];
			if (ob->dev == ca->dev_idx)
				goto found;
		}
		goto unlock;
found:
		h->s->err = -BCH_ERR_erofs_no_writes;
		ec_stripe_set_pending(c, h);
unlock:
		mutex_unlock(&h->lock);
	}
	mutex_unlock(&c->ec_stripe_head_lock);
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	__bch2_ec_stop(c, ca);
}

void bch2_fs_ec_stop(struct bch_fs *c)
{
	__bch2_ec_stop(c, NULL);
}

static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->ec_stripe_new_lock);
	ret = list_empty(&c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	return ret;
}

void bch2_fs_ec_flush(struct bch_fs *c)
{
	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}

int bch2_stripes_read(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			if (k.k->type != KEY_TYPE_stripe)
				continue;

			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
			if (ret)
				break;

			const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
			m->sectors	= le16_to_cpu(s->sectors);
			m->algorithm	= s->algorithm;
			m->nr_blocks	= s->nr_blocks;
			m->nr_redundant	= s->nr_redundant;
			m->blocks_nonempty = 0;

			for (unsigned i = 0; i < s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(s, i);

			bch2_stripes_heap_insert(c, m, k.k->p.offset);
			0;
		})));
	bch_err_fn(c, ret);
	return ret;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (i = 0; i < min_t(size_t, h->used, 50); i++) {
		m = genradix_ptr(&c->stripes, h->data[i].idx);

		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
			   h->data[i].blocks_nonempty,
			   m->nr_blocks - m->nr_redundant,
			   m->nr_redundant);
		if (bch2_stripe_is_open(c, h->data[i].idx))
			prt_str(out, " open");
		prt_newline(out);
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct ec_stripe_head *h;
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
			   h->target, h->algo, h->redundancy,
			   bch2_watermarks[h->watermark]);

		if (h->s)
			prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
				   h->s->idx, h->s->nr_data, h->s->nr_parity,
				   bitmap_weight(h->s->blocks_allocated,
						 h->s->nr_data));
	}
	mutex_unlock(&c->ec_stripe_head_lock);

	prt_printf(out, "in flight:\n");

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
			   s->idx, s->nr_data, s->nr_parity,
			   atomic_read(&s->ref[STRIPE_REF_io]),
			   atomic_read(&s->ref[STRIPE_REF_stripe]),
			   bch2_watermarks[s->h->watermark]);
	}
	mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;
	unsigned i;

	while (1) {
		mutex_lock(&c->ec_stripe_head_lock);
		h = list_first_entry_or_null(&c->ec_stripe_head_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_stripe_head_lock);
		if (!h)
			break;

		if (h->s) {
			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
				BUG_ON(h->s->blocks[i]);

			kfree(h->s);
		}
		kfree(h);
	}

	BUG_ON(!list_empty(&c->ec_stripe_new_list));

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes);
	bioset_exit(&c->ec_bioset);
}

void bch2_fs_ec_init_early(struct bch_fs *c)
{
	spin_lock_init(&c->ec_stripes_new_lock);
	mutex_init(&c->ec_stripes_heap_lock);

	INIT_LIST_HEAD(&c->ec_stripe_head_list);
	mutex_init(&c->ec_stripe_head_lock);

	INIT_LIST_HEAD(&c->ec_stripe_new_list);
	mutex_init(&c->ec_stripe_new_lock);
	init_waitqueue_head(&c->ec_stripe_new_wait);

	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}