1 // SPDX-License-Identifier: GPL-2.0 2 3 /* erasure coding */ 4 5 #include "bcachefs.h" 6 #include "alloc_background.h" 7 #include "alloc_foreground.h" 8 #include "backpointers.h" 9 #include "bkey_buf.h" 10 #include "bset.h" 11 #include "btree_gc.h" 12 #include "btree_update.h" 13 #include "btree_write_buffer.h" 14 #include "buckets.h" 15 #include "checksum.h" 16 #include "disk_accounting.h" 17 #include "disk_groups.h" 18 #include "ec.h" 19 #include "error.h" 20 #include "io_read.h" 21 #include "keylist.h" 22 #include "recovery.h" 23 #include "replicas.h" 24 #include "super-io.h" 25 #include "util.h" 26 27 #include <linux/sort.h> 28 29 #ifdef __KERNEL__ 30 31 #include <linux/raid/pq.h> 32 #include <linux/raid/xor.h> 33 34 static void raid5_recov(unsigned disks, unsigned failed_idx, 35 size_t size, void **data) 36 { 37 unsigned i = 2, nr; 38 39 BUG_ON(failed_idx >= disks); 40 41 swap(data[0], data[failed_idx]); 42 memcpy(data[0], data[1], size); 43 44 while (i < disks) { 45 nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); 46 xor_blocks(nr, size, data[0], data + i); 47 i += nr; 48 } 49 50 swap(data[0], data[failed_idx]); 51 } 52 53 static void raid_gen(int nd, int np, size_t size, void **v) 54 { 55 if (np >= 1) 56 raid5_recov(nd + np, nd, size, v); 57 if (np >= 2) 58 raid6_call.gen_syndrome(nd + np, size, v); 59 BUG_ON(np > 2); 60 } 61 62 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) 63 { 64 switch (nr) { 65 case 0: 66 break; 67 case 1: 68 if (ir[0] < nd + 1) 69 raid5_recov(nd + 1, ir[0], size, v); 70 else 71 raid6_call.gen_syndrome(nd + np, size, v); 72 break; 73 case 2: 74 if (ir[1] < nd) { 75 /* data+data failure. */ 76 raid6_2data_recov(nd + np, size, ir[0], ir[1], v); 77 } else if (ir[0] < nd) { 78 /* data + p/q failure */ 79 80 if (ir[1] == nd) /* data + p failure */ 81 raid6_datap_recov(nd + np, size, ir[0], v); 82 else { /* data + q failure */ 83 raid5_recov(nd + 1, ir[0], size, v); 84 raid6_call.gen_syndrome(nd + np, size, v); 85 } 86 } else { 87 raid_gen(nd, np, size, v); 88 } 89 break; 90 default: 91 BUG(); 92 } 93 } 94 95 #else 96 97 #include <raid/raid.h> 98 99 #endif 100 101 struct ec_bio { 102 struct bch_dev *ca; 103 struct ec_stripe_buf *buf; 104 size_t idx; 105 struct bio bio; 106 }; 107 108 /* Stripes btree keys: */ 109 110 int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, 111 enum bch_validate_flags flags) 112 { 113 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 114 int ret = 0; 115 116 bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || 117 bpos_gt(k.k->p, POS(0, U32_MAX)), 118 c, stripe_pos_bad, 119 "stripe at bad pos"); 120 121 bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), 122 c, stripe_val_size_bad, 123 "incorrect value size (%zu < %u)", 124 bkey_val_u64s(k.k), stripe_val_u64s(s)); 125 126 ret = bch2_bkey_ptrs_validate(c, k, flags); 127 fsck_err: 128 return ret; 129 } 130 131 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, 132 struct bkey_s_c k) 133 { 134 const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; 135 struct bch_stripe s = {}; 136 137 memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); 138 139 unsigned nr_data = s.nr_blocks - s.nr_redundant; 140 141 prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", 142 s.algorithm, 143 le16_to_cpu(s.sectors), 144 nr_data, 145 s.nr_redundant); 146 bch2_prt_csum_type(out, s.csum_type); 147 prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); 148 149 for (unsigned i = 0; i < s.nr_blocks; i++) { 150 const struct bch_extent_ptr *ptr = 
sp->ptrs + i; 151 152 if ((void *) ptr >= bkey_val_end(k)) 153 break; 154 155 bch2_extent_ptr_to_text(out, c, ptr); 156 157 if (s.csum_type < BCH_CSUM_NR && 158 i < nr_data && 159 stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) 160 prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); 161 } 162 } 163 164 /* Triggers: */ 165 166 static int __mark_stripe_bucket(struct btree_trans *trans, 167 struct bch_dev *ca, 168 struct bkey_s_c_stripe s, 169 unsigned ptr_idx, bool deleting, 170 struct bpos bucket, 171 struct bch_alloc_v4 *a, 172 enum btree_iter_update_trigger_flags flags) 173 { 174 const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; 175 unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; 176 bool parity = ptr_idx >= nr_data; 177 enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; 178 s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0; 179 struct printbuf buf = PRINTBUF; 180 int ret = 0; 181 182 struct bch_fs *c = trans->c; 183 if (deleting) 184 sectors = -sectors; 185 186 if (!deleting) { 187 if (bch2_trans_inconsistent_on(a->stripe || 188 a->stripe_redundancy, trans, 189 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", 190 bucket.inode, bucket.offset, a->gen, 191 bch2_data_type_str(a->data_type), 192 a->dirty_sectors, 193 a->stripe, s.k->p.offset, 194 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 195 ret = -EIO; 196 goto err; 197 } 198 199 if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, 200 "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", 201 bucket.inode, bucket.offset, a->gen, 202 bch2_data_type_str(a->data_type), 203 a->dirty_sectors, 204 a->cached_sectors, 205 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 206 ret = -EIO; 207 goto err; 208 } 209 } else { 210 if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || 211 a->stripe_redundancy != s.v->nr_redundant, trans, 212 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", 213 bucket.inode, bucket.offset, a->gen, 214 a->stripe, 215 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 216 ret = -EIO; 217 goto err; 218 } 219 220 if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, 221 "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", 222 bucket.inode, bucket.offset, a->gen, 223 bch2_data_type_str(a->data_type), 224 bch2_data_type_str(data_type), 225 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 226 ret = -EIO; 227 goto err; 228 } 229 230 if (bch2_trans_inconsistent_on(parity && 231 (a->dirty_sectors != -sectors || 232 a->cached_sectors), trans, 233 "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", 234 bucket.inode, bucket.offset, a->gen, 235 a->dirty_sectors, 236 a->cached_sectors, 237 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 238 ret = -EIO; 239 goto err; 240 } 241 } 242 243 if (sectors) { 244 ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, 245 a->gen, a->data_type, &a->dirty_sectors); 246 if (ret) 247 goto err; 248 } 249 250 if (!deleting) { 251 a->stripe = s.k->p.offset; 252 a->stripe_redundancy = s.v->nr_redundant; 253 } else { 254 a->stripe = 0; 255 a->stripe_redundancy = 0; 256 } 257 258 alloc_data_type_set(a, data_type); 259 err: 260 printbuf_exit(&buf); 261 return ret; 262 } 263 264 static int mark_stripe_bucket(struct btree_trans *trans, 
265 struct bkey_s_c_stripe s, 266 unsigned ptr_idx, bool deleting, 267 enum btree_iter_update_trigger_flags flags) 268 { 269 struct bch_fs *c = trans->c; 270 const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; 271 struct printbuf buf = PRINTBUF; 272 int ret = 0; 273 274 struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); 275 if (unlikely(!ca)) { 276 if (!(flags & BTREE_TRIGGER_overwrite)) 277 ret = -EIO; 278 goto err; 279 } 280 281 struct bpos bucket = PTR_BUCKET_POS(ca, ptr); 282 283 if (flags & BTREE_TRIGGER_transactional) { 284 struct bkey_i_alloc_v4 *a = 285 bch2_trans_start_alloc_update(trans, bucket, 0); 286 ret = PTR_ERR_OR_ZERO(a) ?: 287 __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); 288 } 289 290 if (flags & BTREE_TRIGGER_gc) { 291 percpu_down_read(&c->mark_lock); 292 struct bucket *g = gc_bucket(ca, bucket.offset); 293 if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", 294 ptr->dev, 295 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 296 ret = -EIO; 297 goto err_unlock; 298 } 299 300 bucket_lock(g); 301 struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; 302 ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); 303 alloc_to_bucket(g, new); 304 bucket_unlock(g); 305 err_unlock: 306 percpu_up_read(&c->mark_lock); 307 if (!ret) 308 ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); 309 } 310 err: 311 bch2_dev_put(ca); 312 printbuf_exit(&buf); 313 return ret; 314 } 315 316 static int mark_stripe_buckets(struct btree_trans *trans, 317 struct bkey_s_c old, struct bkey_s_c new, 318 enum btree_iter_update_trigger_flags flags) 319 { 320 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe 321 ? bkey_s_c_to_stripe(old).v : NULL; 322 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe 323 ? bkey_s_c_to_stripe(new).v : NULL; 324 325 BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); 326 327 unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; 328 329 for (unsigned i = 0; i < nr_blocks; i++) { 330 if (new_s && old_s && 331 !memcmp(&new_s->ptrs[i], 332 &old_s->ptrs[i], 333 sizeof(new_s->ptrs[i]))) 334 continue; 335 336 if (new_s) { 337 int ret = mark_stripe_bucket(trans, 338 bkey_s_c_to_stripe(new), i, false, flags); 339 if (ret) 340 return ret; 341 } 342 343 if (old_s) { 344 int ret = mark_stripe_bucket(trans, 345 bkey_s_c_to_stripe(old), i, true, flags); 346 if (ret) 347 return ret; 348 } 349 } 350 351 return 0; 352 } 353 354 int bch2_trigger_stripe(struct btree_trans *trans, 355 enum btree_id btree, unsigned level, 356 struct bkey_s_c old, struct bkey_s _new, 357 enum btree_iter_update_trigger_flags flags) 358 { 359 struct bkey_s_c new = _new.s_c; 360 struct bch_fs *c = trans->c; 361 u64 idx = new.k->p.offset; 362 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe 363 ? bkey_s_c_to_stripe(old).v : NULL; 364 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe 365 ? 
                                        bkey_s_c_to_stripe(new).v : NULL;

        if (unlikely(flags & BTREE_TRIGGER_check_repair))
                return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);

        BUG_ON(new_s && old_s &&
               (new_s->nr_blocks != old_s->nr_blocks ||
                new_s->nr_redundant != old_s->nr_redundant));

        if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
                /*
                 * If the pointers aren't changing, we don't need to do anything:
                 */
                if (new_s && old_s &&
                    new_s->nr_blocks == old_s->nr_blocks &&
                    new_s->nr_redundant == old_s->nr_redundant &&
                    !memcmp(old_s->ptrs, new_s->ptrs,
                            new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
                        return 0;

                struct gc_stripe *gc = NULL;
                if (flags & BTREE_TRIGGER_gc) {
                        gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
                        if (!gc) {
                                bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
                                return -BCH_ERR_ENOMEM_mark_stripe;
                        }

                        /*
                         * This will be wrong when we bring back runtime gc: we should
                         * be unmarking the old key and then marking the new key
                         *
                         * Also: when we bring back runtime gc, locking
                         */
                        gc->alive        = true;
                        gc->sectors      = le16_to_cpu(new_s->sectors);
                        gc->nr_blocks    = new_s->nr_blocks;
                        gc->nr_redundant = new_s->nr_redundant;

                        for (unsigned i = 0; i < new_s->nr_blocks; i++)
                                gc->ptrs[i] = new_s->ptrs[i];

                        /*
                         * gc recalculates this field from stripe ptr
                         * references:
                         */
                        memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
                }

                if (new_s) {
                        s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;

                        struct disk_accounting_pos acc = {
                                .type = BCH_DISK_ACCOUNTING_replicas,
                        };
                        bch2_bkey_to_replicas(&acc.replicas, new);
                        int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
                        if (ret)
                                return ret;

                        if (gc)
                                memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
                }

                if (old_s) {
                        s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;

                        struct disk_accounting_pos acc = {
                                .type = BCH_DISK_ACCOUNTING_replicas,
                        };
                        bch2_bkey_to_replicas(&acc.replicas, old);
                        int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
                        if (ret)
                                return ret;
                }

                int ret = mark_stripe_buckets(trans, old, new, flags);
                if (ret)
                        return ret;
        }

        if (flags & BTREE_TRIGGER_atomic) {
                struct stripe *m = genradix_ptr(&c->stripes, idx);

                if (!m) {
                        struct printbuf buf1 = PRINTBUF;
                        struct printbuf buf2 = PRINTBUF;

                        bch2_bkey_val_to_text(&buf1, c, old);
                        bch2_bkey_val_to_text(&buf2, c, new);
                        bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
                                            "old %s\n"
                                            "new %s", idx, buf1.buf, buf2.buf);
                        printbuf_exit(&buf2);
                        printbuf_exit(&buf1);
                        bch2_inconsistent_error(c);
                        return -1;
                }

                if (!new_s) {
                        bch2_stripes_heap_del(c, m, idx);

                        memset(m, 0, sizeof(*m));
                } else {
                        m->sectors       = le16_to_cpu(new_s->sectors);
                        m->algorithm     = new_s->algorithm;
                        m->nr_blocks     = new_s->nr_blocks;
                        m->nr_redundant  = new_s->nr_redundant;
                        m->blocks_nonempty = 0;

                        for (unsigned i = 0; i < new_s->nr_blocks; i++)
                                m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);

                        if (!old_s)
                                bch2_stripes_heap_insert(c, m, idx);
                        else
                                bch2_stripes_heap_update(c, m, idx);
                }
        }

        return 0;
}

/* returns blocknr in stripe that we matched: */
static
const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, 491 struct bkey_s_c k, unsigned *block) 492 { 493 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 494 unsigned i, nr_data = s->nr_blocks - s->nr_redundant; 495 496 bkey_for_each_ptr(ptrs, ptr) 497 for (i = 0; i < nr_data; i++) 498 if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, 499 le16_to_cpu(s->sectors))) { 500 *block = i; 501 return ptr; 502 } 503 504 return NULL; 505 } 506 507 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) 508 { 509 switch (k.k->type) { 510 case KEY_TYPE_extent: { 511 struct bkey_s_c_extent e = bkey_s_c_to_extent(k); 512 const union bch_extent_entry *entry; 513 514 extent_for_each_entry(e, entry) 515 if (extent_entry_type(entry) == 516 BCH_EXTENT_ENTRY_stripe_ptr && 517 entry->stripe_ptr.idx == idx) 518 return true; 519 520 break; 521 } 522 } 523 524 return false; 525 } 526 527 /* Stripe bufs: */ 528 529 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) 530 { 531 if (buf->key.k.type == KEY_TYPE_stripe) { 532 struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); 533 unsigned i; 534 535 for (i = 0; i < s->v.nr_blocks; i++) { 536 kvfree(buf->data[i]); 537 buf->data[i] = NULL; 538 } 539 } 540 } 541 542 /* XXX: this is a non-mempoolified memory allocation: */ 543 static int ec_stripe_buf_init(struct ec_stripe_buf *buf, 544 unsigned offset, unsigned size) 545 { 546 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 547 unsigned csum_granularity = 1U << v->csum_granularity_bits; 548 unsigned end = offset + size; 549 unsigned i; 550 551 BUG_ON(end > le16_to_cpu(v->sectors)); 552 553 offset = round_down(offset, csum_granularity); 554 end = min_t(unsigned, le16_to_cpu(v->sectors), 555 round_up(end, csum_granularity)); 556 557 buf->offset = offset; 558 buf->size = end - offset; 559 560 memset(buf->valid, 0xFF, sizeof(buf->valid)); 561 562 for (i = 0; i < v->nr_blocks; i++) { 563 buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); 564 if (!buf->data[i]) 565 goto err; 566 } 567 568 return 0; 569 err: 570 ec_stripe_buf_exit(buf); 571 return -BCH_ERR_ENOMEM_stripe_buf; 572 } 573 574 /* Checksumming: */ 575 576 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, 577 unsigned block, unsigned offset) 578 { 579 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 580 unsigned csum_granularity = 1 << v->csum_granularity_bits; 581 unsigned end = buf->offset + buf->size; 582 unsigned len = min(csum_granularity, end - offset); 583 584 BUG_ON(offset >= end); 585 BUG_ON(offset < buf->offset); 586 BUG_ON(offset & (csum_granularity - 1)); 587 BUG_ON(offset + len != le16_to_cpu(v->sectors) && 588 (len & (csum_granularity - 1))); 589 590 return bch2_checksum(NULL, v->csum_type, 591 null_nonce(), 592 buf->data[block] + ((offset - buf->offset) << 9), 593 len << 9); 594 } 595 596 static void ec_generate_checksums(struct ec_stripe_buf *buf) 597 { 598 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 599 unsigned i, j, csums_per_device = stripe_csums_per_device(v); 600 601 if (!v->csum_type) 602 return; 603 604 BUG_ON(buf->offset); 605 BUG_ON(buf->size != le16_to_cpu(v->sectors)); 606 607 for (i = 0; i < v->nr_blocks; i++) 608 for (j = 0; j < csums_per_device; j++) 609 stripe_csum_set(v, i, j, 610 ec_block_checksum(buf, i, j << v->csum_granularity_bits)); 611 } 612 613 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) 614 { 615 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 616 unsigned csum_granularity = 1 << v->csum_granularity_bits; 617 
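        /*
         * For each block still marked valid, walk the buffer in
         * csum_granularity-sized chunks, recompute each chunk's checksum and
         * compare it against the one stored in the stripe key; on a mismatch,
         * report a checksum error and clear the block's valid bit so the data
         * can be reconstructed from parity in ec_do_recov().
         */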
unsigned i; 618 619 if (!v->csum_type) 620 return; 621 622 for (i = 0; i < v->nr_blocks; i++) { 623 unsigned offset = buf->offset; 624 unsigned end = buf->offset + buf->size; 625 626 if (!test_bit(i, buf->valid)) 627 continue; 628 629 while (offset < end) { 630 unsigned j = offset >> v->csum_granularity_bits; 631 unsigned len = min(csum_granularity, end - offset); 632 struct bch_csum want = stripe_csum_get(v, i, j); 633 struct bch_csum got = ec_block_checksum(buf, i, offset); 634 635 if (bch2_crc_cmp(want, got)) { 636 struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); 637 if (ca) { 638 struct printbuf err = PRINTBUF; 639 640 prt_str(&err, "stripe "); 641 bch2_csum_err_msg(&err, v->csum_type, want, got); 642 prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); 643 bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); 644 bch_err_ratelimited(ca, "%s", err.buf); 645 printbuf_exit(&err); 646 647 bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); 648 } 649 650 clear_bit(i, buf->valid); 651 break; 652 } 653 654 offset += len; 655 } 656 } 657 } 658 659 /* Erasure coding: */ 660 661 static void ec_generate_ec(struct ec_stripe_buf *buf) 662 { 663 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 664 unsigned nr_data = v->nr_blocks - v->nr_redundant; 665 unsigned bytes = le16_to_cpu(v->sectors) << 9; 666 667 raid_gen(nr_data, v->nr_redundant, bytes, buf->data); 668 } 669 670 static unsigned ec_nr_failed(struct ec_stripe_buf *buf) 671 { 672 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 673 674 return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); 675 } 676 677 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) 678 { 679 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 680 unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; 681 unsigned nr_data = v->nr_blocks - v->nr_redundant; 682 unsigned bytes = buf->size << 9; 683 684 if (ec_nr_failed(buf) > v->nr_redundant) { 685 bch_err_ratelimited(c, 686 "error doing reconstruct read: unable to read enough blocks"); 687 return -1; 688 } 689 690 for (i = 0; i < nr_data; i++) 691 if (!test_bit(i, buf->valid)) 692 failed[nr_failed++] = i; 693 694 raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); 695 return 0; 696 } 697 698 /* IO: */ 699 700 static void ec_block_endio(struct bio *bio) 701 { 702 struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); 703 struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; 704 struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; 705 struct bch_dev *ca = ec_bio->ca; 706 struct closure *cl = bio->bi_private; 707 708 if (bch2_dev_io_err_on(bio->bi_status, ca, 709 bio_data_dir(bio) 710 ? BCH_MEMBER_ERROR_write 711 : BCH_MEMBER_ERROR_read, 712 "erasure coding %s error: %s", 713 bio_data_dir(bio) ? "write" : "read", 714 bch2_blk_status_to_str(bio->bi_status))) 715 clear_bit(ec_bio->idx, ec_bio->buf->valid); 716 717 int stale = dev_ptr_stale(ca, ptr); 718 if (stale) { 719 bch_err_ratelimited(ca->fs, 720 "error %s stripe: stale/invalid pointer (%i) after io", 721 bio_data_dir(bio) == READ ? 
"reading from" : "writing to", 722 stale); 723 clear_bit(ec_bio->idx, ec_bio->buf->valid); 724 } 725 726 bio_put(&ec_bio->bio); 727 percpu_ref_put(&ca->io_ref); 728 closure_put(cl); 729 } 730 731 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, 732 blk_opf_t opf, unsigned idx, struct closure *cl) 733 { 734 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 735 unsigned offset = 0, bytes = buf->size << 9; 736 struct bch_extent_ptr *ptr = &v->ptrs[idx]; 737 enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant 738 ? BCH_DATA_user 739 : BCH_DATA_parity; 740 int rw = op_is_write(opf); 741 742 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); 743 if (!ca) { 744 clear_bit(idx, buf->valid); 745 return; 746 } 747 748 int stale = dev_ptr_stale(ca, ptr); 749 if (stale) { 750 bch_err_ratelimited(c, 751 "error %s stripe: stale pointer (%i)", 752 rw == READ ? "reading from" : "writing to", 753 stale); 754 clear_bit(idx, buf->valid); 755 return; 756 } 757 758 759 this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); 760 761 while (offset < bytes) { 762 unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, 763 DIV_ROUND_UP(bytes, PAGE_SIZE)); 764 unsigned b = min_t(size_t, bytes - offset, 765 nr_iovecs << PAGE_SHIFT); 766 struct ec_bio *ec_bio; 767 768 ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 769 nr_iovecs, 770 opf, 771 GFP_KERNEL, 772 &c->ec_bioset), 773 struct ec_bio, bio); 774 775 ec_bio->ca = ca; 776 ec_bio->buf = buf; 777 ec_bio->idx = idx; 778 779 ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); 780 ec_bio->bio.bi_end_io = ec_block_endio; 781 ec_bio->bio.bi_private = cl; 782 783 bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); 784 785 closure_get(cl); 786 percpu_ref_get(&ca->io_ref); 787 788 submit_bio(&ec_bio->bio); 789 790 offset += b; 791 } 792 793 percpu_ref_put(&ca->io_ref); 794 } 795 796 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, 797 struct ec_stripe_buf *stripe) 798 { 799 struct btree_iter iter; 800 struct bkey_s_c k; 801 int ret; 802 803 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 804 POS(0, idx), BTREE_ITER_slots); 805 ret = bkey_err(k); 806 if (ret) 807 goto err; 808 if (k.k->type != KEY_TYPE_stripe) { 809 ret = -ENOENT; 810 goto err; 811 } 812 bkey_reassemble(&stripe->key, k); 813 err: 814 bch2_trans_iter_exit(trans, &iter); 815 return ret; 816 } 817 818 /* recovery read path: */ 819 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) 820 { 821 struct bch_fs *c = trans->c; 822 struct ec_stripe_buf *buf; 823 struct closure cl; 824 struct bch_stripe *v; 825 unsigned i, offset; 826 int ret = 0; 827 828 closure_init_stack(&cl); 829 830 BUG_ON(!rbio->pick.has_ec); 831 832 buf = kzalloc(sizeof(*buf), GFP_NOFS); 833 if (!buf) 834 return -BCH_ERR_ENOMEM_ec_read_extent; 835 836 ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); 837 if (ret) { 838 bch_err_ratelimited(c, 839 "error doing reconstruct read: error %i looking up stripe", ret); 840 kfree(buf); 841 return -EIO; 842 } 843 844 v = &bkey_i_to_stripe(&buf->key)->v; 845 846 if (!bch2_ptr_matches_stripe(v, rbio->pick)) { 847 bch_err_ratelimited(c, 848 "error doing reconstruct read: pointer doesn't match stripe"); 849 ret = -EIO; 850 goto err; 851 } 852 853 offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; 854 if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { 855 bch_err_ratelimited(c, 856 "error doing reconstruct 
read: read is bigger than stripe"); 857 ret = -EIO; 858 goto err; 859 } 860 861 ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); 862 if (ret) 863 goto err; 864 865 for (i = 0; i < v->nr_blocks; i++) 866 ec_block_io(c, buf, REQ_OP_READ, i, &cl); 867 868 closure_sync(&cl); 869 870 if (ec_nr_failed(buf) > v->nr_redundant) { 871 bch_err_ratelimited(c, 872 "error doing reconstruct read: unable to read enough blocks"); 873 ret = -EIO; 874 goto err; 875 } 876 877 ec_validate_checksums(c, buf); 878 879 ret = ec_do_recov(c, buf); 880 if (ret) 881 goto err; 882 883 memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, 884 buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); 885 err: 886 ec_stripe_buf_exit(buf); 887 kfree(buf); 888 return ret; 889 } 890 891 /* stripe bucket accounting: */ 892 893 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) 894 { 895 ec_stripes_heap n, *h = &c->ec_stripes_heap; 896 897 if (idx >= h->size) { 898 if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) 899 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 900 901 mutex_lock(&c->ec_stripes_heap_lock); 902 if (n.size > h->size) { 903 memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); 904 n.nr = h->nr; 905 swap(*h, n); 906 } 907 mutex_unlock(&c->ec_stripes_heap_lock); 908 909 free_heap(&n); 910 } 911 912 if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) 913 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 914 915 if (c->gc_pos.phase != GC_PHASE_not_running && 916 !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) 917 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 918 919 return 0; 920 } 921 922 static int ec_stripe_mem_alloc(struct btree_trans *trans, 923 struct btree_iter *iter) 924 { 925 return allocate_dropping_locks_errcode(trans, 926 __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); 927 } 928 929 /* 930 * Hash table of open stripes: 931 * Stripes that are being created or modified are kept in a hash table, so that 932 * stripe deletion can skip them. 
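 * Lookup hashes the stripe index into c->ec_stripes_new and walks the
 * hlist bucket (see __bch2_stripe_is_open() below); for example,
 * stripe_idx_to_delete() only returns a stripe once bch2_stripe_is_open()
 * reports that no ec_stripe_new currently has it open.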
933 */ 934 935 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) 936 { 937 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); 938 struct ec_stripe_new *s; 939 940 hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) 941 if (s->idx == idx) 942 return true; 943 return false; 944 } 945 946 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) 947 { 948 bool ret = false; 949 950 spin_lock(&c->ec_stripes_new_lock); 951 ret = __bch2_stripe_is_open(c, idx); 952 spin_unlock(&c->ec_stripes_new_lock); 953 954 return ret; 955 } 956 957 static bool bch2_try_open_stripe(struct bch_fs *c, 958 struct ec_stripe_new *s, 959 u64 idx) 960 { 961 bool ret; 962 963 spin_lock(&c->ec_stripes_new_lock); 964 ret = !__bch2_stripe_is_open(c, idx); 965 if (ret) { 966 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); 967 968 s->idx = idx; 969 hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); 970 } 971 spin_unlock(&c->ec_stripes_new_lock); 972 973 return ret; 974 } 975 976 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) 977 { 978 BUG_ON(!s->idx); 979 980 spin_lock(&c->ec_stripes_new_lock); 981 hlist_del_init(&s->hash); 982 spin_unlock(&c->ec_stripes_new_lock); 983 984 s->idx = 0; 985 } 986 987 /* Heap of all existing stripes, ordered by blocks_nonempty */ 988 989 static u64 stripe_idx_to_delete(struct bch_fs *c) 990 { 991 ec_stripes_heap *h = &c->ec_stripes_heap; 992 993 lockdep_assert_held(&c->ec_stripes_heap_lock); 994 995 if (h->nr && 996 h->data[0].blocks_nonempty == 0 && 997 !bch2_stripe_is_open(c, h->data[0].idx)) 998 return h->data[0].idx; 999 1000 return 0; 1001 } 1002 1003 static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, 1004 size_t i) 1005 { 1006 struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); 1007 1008 genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; 1009 } 1010 1011 static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) 1012 { 1013 struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; 1014 struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; 1015 1016 return ((_l->blocks_nonempty > _r->blocks_nonempty) < 1017 (_l->blocks_nonempty < _r->blocks_nonempty)); 1018 } 1019 1020 static inline void ec_stripes_heap_swap(void *l, void *r, void *h) 1021 { 1022 struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; 1023 struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; 1024 ec_stripes_heap *_h = (ec_stripes_heap *)h; 1025 size_t i = _l - _h->data; 1026 size_t j = _r - _h->data; 1027 1028 swap(*_l, *_r); 1029 1030 ec_stripes_heap_set_backpointer(_h, i); 1031 ec_stripes_heap_set_backpointer(_h, j); 1032 } 1033 1034 static void heap_verify_backpointer(struct bch_fs *c, size_t idx) 1035 { 1036 ec_stripes_heap *h = &c->ec_stripes_heap; 1037 struct stripe *m = genradix_ptr(&c->stripes, idx); 1038 1039 BUG_ON(m->heap_idx >= h->nr); 1040 BUG_ON(h->data[m->heap_idx].idx != idx); 1041 } 1042 1043 void bch2_stripes_heap_del(struct bch_fs *c, 1044 struct stripe *m, size_t idx) 1045 { 1046 const struct min_heap_callbacks callbacks = { 1047 .less = ec_stripes_heap_cmp, 1048 .swp = ec_stripes_heap_swap, 1049 }; 1050 1051 mutex_lock(&c->ec_stripes_heap_lock); 1052 heap_verify_backpointer(c, idx); 1053 1054 min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); 1055 mutex_unlock(&c->ec_stripes_heap_lock); 1056 } 1057 1058 void bch2_stripes_heap_insert(struct bch_fs *c, 1059 struct 
stripe *m, size_t idx) 1060 { 1061 const struct min_heap_callbacks callbacks = { 1062 .less = ec_stripes_heap_cmp, 1063 .swp = ec_stripes_heap_swap, 1064 }; 1065 1066 mutex_lock(&c->ec_stripes_heap_lock); 1067 BUG_ON(min_heap_full(&c->ec_stripes_heap)); 1068 1069 genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; 1070 min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { 1071 .idx = idx, 1072 .blocks_nonempty = m->blocks_nonempty, 1073 }), 1074 &callbacks, 1075 &c->ec_stripes_heap); 1076 1077 heap_verify_backpointer(c, idx); 1078 mutex_unlock(&c->ec_stripes_heap_lock); 1079 } 1080 1081 void bch2_stripes_heap_update(struct bch_fs *c, 1082 struct stripe *m, size_t idx) 1083 { 1084 const struct min_heap_callbacks callbacks = { 1085 .less = ec_stripes_heap_cmp, 1086 .swp = ec_stripes_heap_swap, 1087 }; 1088 ec_stripes_heap *h = &c->ec_stripes_heap; 1089 bool do_deletes; 1090 size_t i; 1091 1092 mutex_lock(&c->ec_stripes_heap_lock); 1093 heap_verify_backpointer(c, idx); 1094 1095 h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; 1096 1097 i = m->heap_idx; 1098 min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); 1099 min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); 1100 1101 heap_verify_backpointer(c, idx); 1102 1103 do_deletes = stripe_idx_to_delete(c) != 0; 1104 mutex_unlock(&c->ec_stripes_heap_lock); 1105 1106 if (do_deletes) 1107 bch2_do_stripe_deletes(c); 1108 } 1109 1110 /* stripe deletion */ 1111 1112 static int ec_stripe_delete(struct btree_trans *trans, u64 idx) 1113 { 1114 struct bch_fs *c = trans->c; 1115 struct btree_iter iter; 1116 struct bkey_s_c k; 1117 struct bkey_s_c_stripe s; 1118 int ret; 1119 1120 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), 1121 BTREE_ITER_intent); 1122 ret = bkey_err(k); 1123 if (ret) 1124 goto err; 1125 1126 if (k.k->type != KEY_TYPE_stripe) { 1127 bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); 1128 ret = -EINVAL; 1129 goto err; 1130 } 1131 1132 s = bkey_s_c_to_stripe(k); 1133 for (unsigned i = 0; i < s.v->nr_blocks; i++) 1134 if (stripe_blockcount_get(s.v, i)) { 1135 struct printbuf buf = PRINTBUF; 1136 1137 bch2_bkey_val_to_text(&buf, c, k); 1138 bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); 1139 printbuf_exit(&buf); 1140 ret = -EINVAL; 1141 goto err; 1142 } 1143 1144 ret = bch2_btree_delete_at(trans, &iter, 0); 1145 err: 1146 bch2_trans_iter_exit(trans, &iter); 1147 return ret; 1148 } 1149 1150 static void ec_stripe_delete_work(struct work_struct *work) 1151 { 1152 struct bch_fs *c = 1153 container_of(work, struct bch_fs, ec_stripe_delete_work); 1154 1155 while (1) { 1156 mutex_lock(&c->ec_stripes_heap_lock); 1157 u64 idx = stripe_idx_to_delete(c); 1158 mutex_unlock(&c->ec_stripes_heap_lock); 1159 1160 if (!idx) 1161 break; 1162 1163 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1164 ec_stripe_delete(trans, idx)); 1165 bch_err_fn(c, ret); 1166 if (ret) 1167 break; 1168 } 1169 1170 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); 1171 } 1172 1173 void bch2_do_stripe_deletes(struct bch_fs *c) 1174 { 1175 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && 1176 !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) 1177 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); 1178 } 1179 1180 /* stripe creation: */ 1181 1182 static int ec_stripe_key_update(struct btree_trans *trans, 1183 struct bkey_i_stripe *new, 1184 bool create) 1185 { 1186 struct bch_fs *c = trans->c; 1187 struct 
btree_iter iter; 1188 struct bkey_s_c k; 1189 int ret; 1190 1191 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 1192 new->k.p, BTREE_ITER_intent); 1193 ret = bkey_err(k); 1194 if (ret) 1195 goto err; 1196 1197 if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { 1198 bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", 1199 create ? "creating" : "updating", 1200 bch2_bkey_types[k.k->type]); 1201 ret = -EINVAL; 1202 goto err; 1203 } 1204 1205 if (k.k->type == KEY_TYPE_stripe) { 1206 const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; 1207 unsigned i; 1208 1209 if (old->nr_blocks != new->v.nr_blocks) { 1210 bch_err(c, "error updating stripe: nr_blocks does not match"); 1211 ret = -EINVAL; 1212 goto err; 1213 } 1214 1215 for (i = 0; i < new->v.nr_blocks; i++) { 1216 unsigned v = stripe_blockcount_get(old, i); 1217 1218 BUG_ON(v && 1219 (old->ptrs[i].dev != new->v.ptrs[i].dev || 1220 old->ptrs[i].gen != new->v.ptrs[i].gen || 1221 old->ptrs[i].offset != new->v.ptrs[i].offset)); 1222 1223 stripe_blockcount_set(&new->v, i, v); 1224 } 1225 } 1226 1227 ret = bch2_trans_update(trans, &iter, &new->k_i, 0); 1228 err: 1229 bch2_trans_iter_exit(trans, &iter); 1230 return ret; 1231 } 1232 1233 static int ec_stripe_update_extent(struct btree_trans *trans, 1234 struct bch_dev *ca, 1235 struct bpos bucket, u8 gen, 1236 struct ec_stripe_buf *s, 1237 struct bpos *bp_pos) 1238 { 1239 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1240 struct bch_fs *c = trans->c; 1241 struct bch_backpointer bp; 1242 struct btree_iter iter; 1243 struct bkey_s_c k; 1244 const struct bch_extent_ptr *ptr_c; 1245 struct bch_extent_ptr *ec_ptr = NULL; 1246 struct bch_extent_stripe_ptr stripe_ptr; 1247 struct bkey_i *n; 1248 int ret, dev, block; 1249 1250 ret = bch2_get_next_backpointer(trans, ca, bucket, gen, 1251 bp_pos, &bp, BTREE_ITER_cached); 1252 if (ret) 1253 return ret; 1254 if (bpos_eq(*bp_pos, SPOS_MAX)) 1255 return 0; 1256 1257 if (bp.level) { 1258 struct printbuf buf = PRINTBUF; 1259 struct btree_iter node_iter; 1260 struct btree *b; 1261 1262 b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); 1263 bch2_trans_iter_exit(trans, &node_iter); 1264 1265 if (!b) 1266 return 0; 1267 1268 prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); 1269 bch2_backpointer_to_text(&buf, &bp); 1270 1271 bch2_fs_inconsistent(c, "%s", buf.buf); 1272 printbuf_exit(&buf); 1273 return -EIO; 1274 } 1275 1276 k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); 1277 ret = bkey_err(k); 1278 if (ret) 1279 return ret; 1280 if (!k.k) { 1281 /* 1282 * extent no longer exists - we could flush the btree 1283 * write buffer and retry to verify, but no need: 1284 */ 1285 return 0; 1286 } 1287 1288 if (extent_has_stripe_ptr(k, s->key.k.p.offset)) 1289 goto out; 1290 1291 ptr_c = bkey_matches_stripe(v, k, &block); 1292 /* 1293 * It doesn't generally make sense to erasure code cached ptrs: 1294 * XXX: should we be incrementing a counter? 
1295 */ 1296 if (!ptr_c || ptr_c->cached) 1297 goto out; 1298 1299 dev = v->ptrs[block].dev; 1300 1301 n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); 1302 ret = PTR_ERR_OR_ZERO(n); 1303 if (ret) 1304 goto out; 1305 1306 bkey_reassemble(n, k); 1307 1308 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); 1309 ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); 1310 BUG_ON(!ec_ptr); 1311 1312 stripe_ptr = (struct bch_extent_stripe_ptr) { 1313 .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, 1314 .block = block, 1315 .redundancy = v->nr_redundant, 1316 .idx = s->key.k.p.offset, 1317 }; 1318 1319 __extent_entry_insert(n, 1320 (union bch_extent_entry *) ec_ptr, 1321 (union bch_extent_entry *) &stripe_ptr); 1322 1323 ret = bch2_trans_update(trans, &iter, n, 0); 1324 out: 1325 bch2_trans_iter_exit(trans, &iter); 1326 return ret; 1327 } 1328 1329 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, 1330 unsigned block) 1331 { 1332 struct bch_fs *c = trans->c; 1333 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1334 struct bch_extent_ptr ptr = v->ptrs[block]; 1335 struct bpos bp_pos = POS_MIN; 1336 int ret = 0; 1337 1338 struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); 1339 if (!ca) 1340 return -EIO; 1341 1342 struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); 1343 1344 while (1) { 1345 ret = commit_do(trans, NULL, NULL, 1346 BCH_TRANS_COMMIT_no_check_rw| 1347 BCH_TRANS_COMMIT_no_enospc, 1348 ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); 1349 if (ret) 1350 break; 1351 if (bkey_eq(bp_pos, POS_MAX)) 1352 break; 1353 1354 bp_pos = bpos_nosnap_successor(bp_pos); 1355 } 1356 1357 bch2_dev_put(ca); 1358 return ret; 1359 } 1360 1361 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) 1362 { 1363 struct btree_trans *trans = bch2_trans_get(c); 1364 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1365 unsigned i, nr_data = v->nr_blocks - v->nr_redundant; 1366 int ret = 0; 1367 1368 ret = bch2_btree_write_buffer_flush_sync(trans); 1369 if (ret) 1370 goto err; 1371 1372 for (i = 0; i < nr_data; i++) { 1373 ret = ec_stripe_update_bucket(trans, s, i); 1374 if (ret) 1375 break; 1376 } 1377 err: 1378 bch2_trans_put(trans); 1379 1380 return ret; 1381 } 1382 1383 static void zero_out_rest_of_ec_bucket(struct bch_fs *c, 1384 struct ec_stripe_new *s, 1385 unsigned block, 1386 struct open_bucket *ob) 1387 { 1388 struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); 1389 if (!ca) { 1390 s->err = -BCH_ERR_erofs_no_writes; 1391 return; 1392 } 1393 1394 unsigned offset = ca->mi.bucket_size - ob->sectors_free; 1395 memset(s->new_stripe.data[block] + (offset << 9), 1396 0, 1397 ob->sectors_free << 9); 1398 1399 int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, 1400 ob->bucket * ca->mi.bucket_size + offset, 1401 ob->sectors_free, 1402 GFP_KERNEL, 0); 1403 1404 percpu_ref_put(&ca->io_ref); 1405 1406 if (ret) 1407 s->err = ret; 1408 } 1409 1410 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) 1411 { 1412 if (s->idx) 1413 bch2_stripe_close(c, s); 1414 kfree(s); 1415 } 1416 1417 /* 1418 * data buckets of new stripe all written: create the stripe 1419 */ 1420 static void ec_stripe_create(struct ec_stripe_new *s) 1421 { 1422 struct bch_fs *c = s->c; 1423 struct open_bucket *ob; 1424 struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; 1425 unsigned i, nr_data = v->nr_blocks - v->nr_redundant; 1426 int ret; 1427 1428 BUG_ON(s->h->s == s); 1429 1430 closure_sync(&s->iodone); 
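        /*
         * At this point all data bucket writes have either completed or
         * failed; below we zero out any unwritten tail of each open bucket,
         * then compute parity and checksums and write out the p/q blocks.
         */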
1431 1432 if (!s->err) { 1433 for (i = 0; i < nr_data; i++) 1434 if (s->blocks[i]) { 1435 ob = c->open_buckets + s->blocks[i]; 1436 1437 if (ob->sectors_free) 1438 zero_out_rest_of_ec_bucket(c, s, i, ob); 1439 } 1440 } 1441 1442 if (s->err) { 1443 if (!bch2_err_matches(s->err, EROFS)) 1444 bch_err(c, "error creating stripe: error writing data buckets"); 1445 goto err; 1446 } 1447 1448 if (s->have_existing_stripe) { 1449 ec_validate_checksums(c, &s->existing_stripe); 1450 1451 if (ec_do_recov(c, &s->existing_stripe)) { 1452 bch_err(c, "error creating stripe: error reading existing stripe"); 1453 goto err; 1454 } 1455 1456 for (i = 0; i < nr_data; i++) 1457 if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) 1458 swap(s->new_stripe.data[i], 1459 s->existing_stripe.data[i]); 1460 1461 ec_stripe_buf_exit(&s->existing_stripe); 1462 } 1463 1464 BUG_ON(!s->allocated); 1465 BUG_ON(!s->idx); 1466 1467 ec_generate_ec(&s->new_stripe); 1468 1469 ec_generate_checksums(&s->new_stripe); 1470 1471 /* write p/q: */ 1472 for (i = nr_data; i < v->nr_blocks; i++) 1473 ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); 1474 closure_sync(&s->iodone); 1475 1476 if (ec_nr_failed(&s->new_stripe)) { 1477 bch_err(c, "error creating stripe: error writing redundancy buckets"); 1478 goto err; 1479 } 1480 1481 ret = bch2_trans_do(c, &s->res, NULL, 1482 BCH_TRANS_COMMIT_no_check_rw| 1483 BCH_TRANS_COMMIT_no_enospc, 1484 ec_stripe_key_update(trans, 1485 bkey_i_to_stripe(&s->new_stripe.key), 1486 !s->have_existing_stripe)); 1487 bch_err_msg(c, ret, "creating stripe key"); 1488 if (ret) { 1489 goto err; 1490 } 1491 1492 ret = ec_stripe_update_extents(c, &s->new_stripe); 1493 bch_err_msg(c, ret, "error updating extents"); 1494 if (ret) 1495 goto err; 1496 err: 1497 bch2_disk_reservation_put(c, &s->res); 1498 1499 for (i = 0; i < v->nr_blocks; i++) 1500 if (s->blocks[i]) { 1501 ob = c->open_buckets + s->blocks[i]; 1502 1503 if (i < nr_data) { 1504 ob->ec = NULL; 1505 __bch2_open_bucket_put(c, ob); 1506 } else { 1507 bch2_open_bucket_put(c, ob); 1508 } 1509 } 1510 1511 mutex_lock(&c->ec_stripe_new_lock); 1512 list_del(&s->list); 1513 mutex_unlock(&c->ec_stripe_new_lock); 1514 wake_up(&c->ec_stripe_new_wait); 1515 1516 ec_stripe_buf_exit(&s->existing_stripe); 1517 ec_stripe_buf_exit(&s->new_stripe); 1518 closure_debug_destroy(&s->iodone); 1519 1520 ec_stripe_new_put(c, s, STRIPE_REF_stripe); 1521 } 1522 1523 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) 1524 { 1525 struct ec_stripe_new *s; 1526 1527 mutex_lock(&c->ec_stripe_new_lock); 1528 list_for_each_entry(s, &c->ec_stripe_new_list, list) 1529 if (!atomic_read(&s->ref[STRIPE_REF_io])) 1530 goto out; 1531 s = NULL; 1532 out: 1533 mutex_unlock(&c->ec_stripe_new_lock); 1534 1535 return s; 1536 } 1537 1538 static void ec_stripe_create_work(struct work_struct *work) 1539 { 1540 struct bch_fs *c = container_of(work, 1541 struct bch_fs, ec_stripe_create_work); 1542 struct ec_stripe_new *s; 1543 1544 while ((s = get_pending_stripe(c))) 1545 ec_stripe_create(s); 1546 1547 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); 1548 } 1549 1550 void bch2_ec_do_stripe_creates(struct bch_fs *c) 1551 { 1552 bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); 1553 1554 if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) 1555 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); 1556 } 1557 1558 static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) 1559 { 1560 struct ec_stripe_new *s = h->s; 1561 1562 
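        /*
         * Detach the stripe from its head and queue it for creation: once it
         * is on ec_stripe_new_list, dropping the io ref allows
         * ec_stripe_create_work() to pick it up.
         */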
BUG_ON(!s->allocated && !s->err); 1563 1564 h->s = NULL; 1565 s->pending = true; 1566 1567 mutex_lock(&c->ec_stripe_new_lock); 1568 list_add(&s->list, &c->ec_stripe_new_list); 1569 mutex_unlock(&c->ec_stripe_new_lock); 1570 1571 ec_stripe_new_put(c, s, STRIPE_REF_io); 1572 } 1573 1574 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) 1575 { 1576 struct ec_stripe_new *s = ob->ec; 1577 1578 s->err = -EIO; 1579 } 1580 1581 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) 1582 { 1583 struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); 1584 if (!ob) 1585 return NULL; 1586 1587 BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); 1588 1589 struct bch_dev *ca = ob_dev(c, ob); 1590 unsigned offset = ca->mi.bucket_size - ob->sectors_free; 1591 1592 return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); 1593 } 1594 1595 static int unsigned_cmp(const void *_l, const void *_r) 1596 { 1597 unsigned l = *((const unsigned *) _l); 1598 unsigned r = *((const unsigned *) _r); 1599 1600 return cmp_int(l, r); 1601 } 1602 1603 /* pick most common bucket size: */ 1604 static unsigned pick_blocksize(struct bch_fs *c, 1605 struct bch_devs_mask *devs) 1606 { 1607 unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; 1608 struct { 1609 unsigned nr, size; 1610 } cur = { 0, 0 }, best = { 0, 0 }; 1611 1612 for_each_member_device_rcu(c, ca, devs) 1613 sizes[nr++] = ca->mi.bucket_size; 1614 1615 sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); 1616 1617 for (unsigned i = 0; i < nr; i++) { 1618 if (sizes[i] != cur.size) { 1619 if (cur.nr > best.nr) 1620 best = cur; 1621 1622 cur.nr = 0; 1623 cur.size = sizes[i]; 1624 } 1625 1626 cur.nr++; 1627 } 1628 1629 if (cur.nr > best.nr) 1630 best = cur; 1631 1632 return best.size; 1633 } 1634 1635 static bool may_create_new_stripe(struct bch_fs *c) 1636 { 1637 return false; 1638 } 1639 1640 static void ec_stripe_key_init(struct bch_fs *c, 1641 struct bkey_i *k, 1642 unsigned nr_data, 1643 unsigned nr_parity, 1644 unsigned stripe_size) 1645 { 1646 struct bkey_i_stripe *s = bkey_stripe_init(k); 1647 unsigned u64s; 1648 1649 s->v.sectors = cpu_to_le16(stripe_size); 1650 s->v.algorithm = 0; 1651 s->v.nr_blocks = nr_data + nr_parity; 1652 s->v.nr_redundant = nr_parity; 1653 s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); 1654 s->v.csum_type = BCH_CSUM_crc32c; 1655 s->v.pad = 0; 1656 1657 while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { 1658 BUG_ON(1 << s->v.csum_granularity_bits >= 1659 le16_to_cpu(s->v.sectors) || 1660 s->v.csum_granularity_bits == U8_MAX); 1661 s->v.csum_granularity_bits++; 1662 } 1663 1664 set_bkey_val_u64s(&s->k, u64s); 1665 } 1666 1667 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) 1668 { 1669 struct ec_stripe_new *s; 1670 1671 lockdep_assert_held(&h->lock); 1672 1673 s = kzalloc(sizeof(*s), GFP_KERNEL); 1674 if (!s) 1675 return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; 1676 1677 mutex_init(&s->lock); 1678 closure_init(&s->iodone, NULL); 1679 atomic_set(&s->ref[STRIPE_REF_stripe], 1); 1680 atomic_set(&s->ref[STRIPE_REF_io], 1); 1681 s->c = c; 1682 s->h = h; 1683 s->nr_data = min_t(unsigned, h->nr_active_devs, 1684 BCH_BKEY_PTRS_MAX) - h->redundancy; 1685 s->nr_parity = h->redundancy; 1686 1687 ec_stripe_key_init(c, &s->new_stripe.key, 1688 s->nr_data, s->nr_parity, h->blocksize); 1689 1690 h->s = s; 1691 return 0; 1692 } 1693 1694 static struct ec_stripe_head * 1695 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, 1696 unsigned algo, unsigned redundancy, 1697 
enum bch_watermark watermark) 1698 { 1699 struct ec_stripe_head *h; 1700 1701 h = kzalloc(sizeof(*h), GFP_KERNEL); 1702 if (!h) 1703 return NULL; 1704 1705 mutex_init(&h->lock); 1706 BUG_ON(!mutex_trylock(&h->lock)); 1707 1708 h->target = target; 1709 h->algo = algo; 1710 h->redundancy = redundancy; 1711 h->watermark = watermark; 1712 1713 rcu_read_lock(); 1714 h->devs = target_rw_devs(c, BCH_DATA_user, target); 1715 1716 for_each_member_device_rcu(c, ca, &h->devs) 1717 if (!ca->mi.durability) 1718 __clear_bit(ca->dev_idx, h->devs.d); 1719 1720 h->blocksize = pick_blocksize(c, &h->devs); 1721 1722 for_each_member_device_rcu(c, ca, &h->devs) 1723 if (ca->mi.bucket_size == h->blocksize) 1724 h->nr_active_devs++; 1725 1726 rcu_read_unlock(); 1727 1728 /* 1729 * If we only have redundancy + 1 devices, we're better off with just 1730 * replication: 1731 */ 1732 if (h->nr_active_devs < h->redundancy + 2) 1733 bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", 1734 h->nr_active_devs, h->redundancy + 2); 1735 1736 list_add(&h->list, &c->ec_stripe_head_list); 1737 return h; 1738 } 1739 1740 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) 1741 { 1742 if (h->s && 1743 h->s->allocated && 1744 bitmap_weight(h->s->blocks_allocated, 1745 h->s->nr_data) == h->s->nr_data) 1746 ec_stripe_set_pending(c, h); 1747 1748 mutex_unlock(&h->lock); 1749 } 1750 1751 static struct ec_stripe_head * 1752 __bch2_ec_stripe_head_get(struct btree_trans *trans, 1753 unsigned target, 1754 unsigned algo, 1755 unsigned redundancy, 1756 enum bch_watermark watermark) 1757 { 1758 struct bch_fs *c = trans->c; 1759 struct ec_stripe_head *h; 1760 int ret; 1761 1762 if (!redundancy) 1763 return NULL; 1764 1765 ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); 1766 if (ret) 1767 return ERR_PTR(ret); 1768 1769 if (test_bit(BCH_FS_going_ro, &c->flags)) { 1770 h = ERR_PTR(-BCH_ERR_erofs_no_writes); 1771 goto found; 1772 } 1773 1774 list_for_each_entry(h, &c->ec_stripe_head_list, list) 1775 if (h->target == target && 1776 h->algo == algo && 1777 h->redundancy == redundancy && 1778 h->watermark == watermark) { 1779 ret = bch2_trans_mutex_lock(trans, &h->lock); 1780 if (ret) 1781 h = ERR_PTR(ret); 1782 goto found; 1783 } 1784 1785 h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); 1786 found: 1787 if (!IS_ERR_OR_NULL(h) && 1788 h->nr_active_devs < h->redundancy + 2) { 1789 mutex_unlock(&h->lock); 1790 h = NULL; 1791 } 1792 mutex_unlock(&c->ec_stripe_head_lock); 1793 return h; 1794 } 1795 1796 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, 1797 enum bch_watermark watermark, struct closure *cl) 1798 { 1799 struct bch_fs *c = trans->c; 1800 struct bch_devs_mask devs = h->devs; 1801 struct open_bucket *ob; 1802 struct open_buckets buckets; 1803 struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; 1804 unsigned i, j, nr_have_parity = 0, nr_have_data = 0; 1805 bool have_cache = true; 1806 int ret = 0; 1807 1808 BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); 1809 BUG_ON(v->nr_redundant != h->s->nr_parity); 1810 1811 /* * We bypass the sector allocator which normally does this: */ 1812 bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); 1813 1814 for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { 1815 __clear_bit(v->ptrs[i].dev, devs.d); 1816 if (i < h->s->nr_data) 1817 nr_have_data++; 1818 else 1819 nr_have_parity++; 1820 } 1821 1822 
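        /*
         * Blocks we already have (e.g. from a reused existing stripe) were
         * counted above and their devices excluded from @devs; the calls below
         * then allocate any missing parity buckets first, then data buckets.
         */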
BUG_ON(nr_have_data > h->s->nr_data); 1823 BUG_ON(nr_have_parity > h->s->nr_parity); 1824 1825 buckets.nr = 0; 1826 if (nr_have_parity < h->s->nr_parity) { 1827 ret = bch2_bucket_alloc_set_trans(trans, &buckets, 1828 &h->parity_stripe, 1829 &devs, 1830 h->s->nr_parity, 1831 &nr_have_parity, 1832 &have_cache, 0, 1833 BCH_DATA_parity, 1834 watermark, 1835 cl); 1836 1837 open_bucket_for_each(c, &buckets, ob, i) { 1838 j = find_next_zero_bit(h->s->blocks_gotten, 1839 h->s->nr_data + h->s->nr_parity, 1840 h->s->nr_data); 1841 BUG_ON(j >= h->s->nr_data + h->s->nr_parity); 1842 1843 h->s->blocks[j] = buckets.v[i]; 1844 v->ptrs[j] = bch2_ob_ptr(c, ob); 1845 __set_bit(j, h->s->blocks_gotten); 1846 } 1847 1848 if (ret) 1849 return ret; 1850 } 1851 1852 buckets.nr = 0; 1853 if (nr_have_data < h->s->nr_data) { 1854 ret = bch2_bucket_alloc_set_trans(trans, &buckets, 1855 &h->block_stripe, 1856 &devs, 1857 h->s->nr_data, 1858 &nr_have_data, 1859 &have_cache, 0, 1860 BCH_DATA_user, 1861 watermark, 1862 cl); 1863 1864 open_bucket_for_each(c, &buckets, ob, i) { 1865 j = find_next_zero_bit(h->s->blocks_gotten, 1866 h->s->nr_data, 0); 1867 BUG_ON(j >= h->s->nr_data); 1868 1869 h->s->blocks[j] = buckets.v[i]; 1870 v->ptrs[j] = bch2_ob_ptr(c, ob); 1871 __set_bit(j, h->s->blocks_gotten); 1872 } 1873 1874 if (ret) 1875 return ret; 1876 } 1877 1878 return 0; 1879 } 1880 1881 /* XXX: doesn't obey target: */ 1882 static s64 get_existing_stripe(struct bch_fs *c, 1883 struct ec_stripe_head *head) 1884 { 1885 ec_stripes_heap *h = &c->ec_stripes_heap; 1886 struct stripe *m; 1887 size_t heap_idx; 1888 u64 stripe_idx; 1889 s64 ret = -1; 1890 1891 if (may_create_new_stripe(c)) 1892 return -1; 1893 1894 mutex_lock(&c->ec_stripes_heap_lock); 1895 for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { 1896 /* No blocks worth reusing, stripe will just be deleted: */ 1897 if (!h->data[heap_idx].blocks_nonempty) 1898 continue; 1899 1900 stripe_idx = h->data[heap_idx].idx; 1901 1902 m = genradix_ptr(&c->stripes, stripe_idx); 1903 1904 if (m->algorithm == head->algo && 1905 m->nr_redundant == head->redundancy && 1906 m->sectors == head->blocksize && 1907 m->blocks_nonempty < m->nr_blocks - m->nr_redundant && 1908 bch2_try_open_stripe(c, head->s, stripe_idx)) { 1909 ret = stripe_idx; 1910 break; 1911 } 1912 } 1913 mutex_unlock(&c->ec_stripes_heap_lock); 1914 return ret; 1915 } 1916 1917 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) 1918 { 1919 struct bch_fs *c = trans->c; 1920 struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; 1921 struct bch_stripe *existing_v; 1922 unsigned i; 1923 s64 idx; 1924 int ret; 1925 1926 /* 1927 * If we can't allocate a new stripe, and there's no stripes with empty 1928 * blocks for us to reuse, that means we have to wait on copygc: 1929 */ 1930 idx = get_existing_stripe(c, h); 1931 if (idx < 0) 1932 return -BCH_ERR_stripe_alloc_blocked; 1933 1934 ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); 1935 bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, 1936 "reading stripe key: %s", bch2_err_str(ret)); 1937 if (ret) { 1938 bch2_stripe_close(c, h->s); 1939 return ret; 1940 } 1941 1942 existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; 1943 1944 BUG_ON(existing_v->nr_redundant != h->s->nr_parity); 1945 h->s->nr_data = existing_v->nr_blocks - 1946 existing_v->nr_redundant; 1947 1948 ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); 1949 if (ret) { 1950 
bch2_stripe_close(c, h->s); 1951 return ret; 1952 } 1953 1954 BUG_ON(h->s->existing_stripe.size != h->blocksize); 1955 BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); 1956 1957 /* 1958 * Free buckets we initially allocated - they might conflict with 1959 * blocks from the stripe we're reusing: 1960 */ 1961 for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { 1962 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); 1963 h->s->blocks[i] = 0; 1964 } 1965 memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); 1966 memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); 1967 1968 for (i = 0; i < existing_v->nr_blocks; i++) { 1969 if (stripe_blockcount_get(existing_v, i)) { 1970 __set_bit(i, h->s->blocks_gotten); 1971 __set_bit(i, h->s->blocks_allocated); 1972 } 1973 1974 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); 1975 } 1976 1977 bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); 1978 h->s->have_existing_stripe = true; 1979 1980 return 0; 1981 } 1982 1983 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) 1984 { 1985 struct bch_fs *c = trans->c; 1986 struct btree_iter iter; 1987 struct bkey_s_c k; 1988 struct bpos min_pos = POS(0, 1); 1989 struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); 1990 int ret; 1991 1992 if (!h->s->res.sectors) { 1993 ret = bch2_disk_reservation_get(c, &h->s->res, 1994 h->blocksize, 1995 h->s->nr_parity, 1996 BCH_DISK_RESERVATION_NOFAIL); 1997 if (ret) 1998 return ret; 1999 } 2000 2001 for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, 2002 BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { 2003 if (bkey_gt(k.k->p, POS(0, U32_MAX))) { 2004 if (start_pos.offset) { 2005 start_pos = min_pos; 2006 bch2_btree_iter_set_pos(&iter, start_pos); 2007 continue; 2008 } 2009 2010 ret = -BCH_ERR_ENOSPC_stripe_create; 2011 break; 2012 } 2013 2014 if (bkey_deleted(k.k) && 2015 bch2_try_open_stripe(c, h->s, k.k->p.offset)) 2016 break; 2017 } 2018 2019 c->ec_stripe_hint = iter.pos.offset; 2020 2021 if (ret) 2022 goto err; 2023 2024 ret = ec_stripe_mem_alloc(trans, &iter); 2025 if (ret) { 2026 bch2_stripe_close(c, h->s); 2027 goto err; 2028 } 2029 2030 h->s->new_stripe.key.k.p = iter.pos; 2031 out: 2032 bch2_trans_iter_exit(trans, &iter); 2033 return ret; 2034 err: 2035 bch2_disk_reservation_put(c, &h->s->res); 2036 goto out; 2037 } 2038 2039 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, 2040 unsigned target, 2041 unsigned algo, 2042 unsigned redundancy, 2043 enum bch_watermark watermark, 2044 struct closure *cl) 2045 { 2046 struct bch_fs *c = trans->c; 2047 struct ec_stripe_head *h; 2048 bool waiting = false; 2049 int ret; 2050 2051 h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); 2052 if (IS_ERR_OR_NULL(h)) 2053 return h; 2054 2055 if (!h->s) { 2056 ret = ec_new_stripe_alloc(c, h); 2057 if (ret) { 2058 bch_err(c, "failed to allocate new stripe"); 2059 goto err; 2060 } 2061 } 2062 2063 if (h->s->allocated) 2064 goto allocated; 2065 2066 if (h->s->have_existing_stripe) 2067 goto alloc_existing; 2068 2069 /* First, try to allocate a full stripe: */ 2070 ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: 2071 __bch2_ec_stripe_head_reserve(trans, h); 2072 if (!ret) 2073 goto allocate_buf; 2074 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || 2075 bch2_err_matches(ret, ENOMEM)) 2076 goto err; 2077 2078 /* 2079 * Not enough buckets available for a 
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy,
					       enum bch_watermark watermark,
					       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	bool waiting = false;
	int ret;

	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
	if (IS_ERR_OR_NULL(h))
		return h;

	if (!h->s) {
		ret = ec_new_stripe_alloc(c, h);
		if (ret) {
			bch_err(c, "failed to allocate new stripe");
			goto err;
		}
	}

	if (h->s->allocated)
		goto allocated;

	if (h->s->have_existing_stripe)
		goto alloc_existing;

	/* First, try to allocate a full stripe: */
	ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
		__bch2_ec_stripe_head_reserve(trans, h);
	if (!ret)
		goto allocate_buf;
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, ENOMEM))
		goto err;

	/*
	 * Not enough buckets available for a full stripe: we must reuse an
	 * existing stripe:
	 */
	while (1) {
		ret = __bch2_ec_stripe_head_reuse(trans, h);
		if (!ret)
			break;
		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
			goto err;

		if (watermark == BCH_WATERMARK_copygc) {
			ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
				__bch2_ec_stripe_head_reserve(trans, h);
			if (ret)
				goto err;
			goto allocate_buf;
		}

		/* XXX freelist_wait? */
		closure_wait(&c->freelist_wait, cl);
		waiting = true;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc_existing:
	/*
	 * Retry allocating buckets, with the watermark for this
	 * particular write:
	 */
	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
	if (ret)
		goto err;

allocate_buf:
	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
	if (ret)
		goto err;

	h->s->allocated = true;
allocated:
	BUG_ON(!h->s->idx);
	BUG_ON(!h->s->new_stripe.data[0]);
	BUG_ON(trans->restarted);
	return h;
err:
	bch2_ec_stripe_head_put(c, h);
	return ERR_PTR(ret);
}

static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		mutex_lock(&h->lock);
		if (!h->s)
			goto unlock;

		if (!ca)
			goto found;

		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
			if (!h->s->blocks[i])
				continue;

			ob = c->open_buckets + h->s->blocks[i];
			if (ob->dev == ca->dev_idx)
				goto found;
		}
		goto unlock;
found:
		h->s->err = -BCH_ERR_erofs_no_writes;
		ec_stripe_set_pending(c, h);
unlock:
		mutex_unlock(&h->lock);
	}
	mutex_unlock(&c->ec_stripe_head_lock);
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	__bch2_ec_stop(c, ca);
}

void bch2_fs_ec_stop(struct bch_fs *c)
{
	__bch2_ec_stop(c, NULL);
}

static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->ec_stripe_new_lock);
	ret = list_empty(&c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	return ret;
}

void bch2_fs_ec_flush(struct bch_fs *c)
{
	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}

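/*
 * Walk the stripes btree at startup and build the in-memory state: allocate
 * the stripes radix tree entries and insert each stripe into the stripes heap
 * with its count of non-empty blocks.
 */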
int bch2_stripes_read(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			if (k.k->type != KEY_TYPE_stripe)
				continue;

			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
			if (ret)
				break;

			const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
			m->sectors	= le16_to_cpu(s->sectors);
			m->algorithm	= s->algorithm;
			m->nr_blocks	= s->nr_blocks;
			m->nr_redundant	= s->nr_redundant;
			m->blocks_nonempty = 0;

			for (unsigned i = 0; i < s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(s, i);

			bch2_stripes_heap_insert(c, m, k.k->p.offset);
			0;
		})));
	bch_err_fn(c, ret);
	return ret;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
		m = genradix_ptr(&c->stripes, h->data[i].idx);

		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
			   h->data[i].blocks_nonempty,
			   m->nr_blocks - m->nr_redundant,
			   m->nr_redundant);
		if (bch2_stripe_is_open(c, h->data[i].idx))
			prt_str(out, " open");
		prt_newline(out);
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
}

static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
				    struct ec_stripe_new *s)
{
	prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
		   s->idx, s->nr_data, s->nr_parity,
		   bitmap_weight(s->blocks_allocated, s->nr_data),
		   atomic_read(&s->ref[STRIPE_REF_io]),
		   atomic_read(&s->ref[STRIPE_REF_stripe]),
		   bch2_watermarks[s->h->watermark]);

	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i;
	for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
		prt_printf(out, " %u", s->blocks[i]);
	prt_newline(out);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct ec_stripe_head *h;
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
			   h->target, h->algo, h->redundancy,
			   bch2_watermarks[h->watermark]);

		if (h->s)
			bch2_new_stripe_to_text(out, c, h->s);
	}
	mutex_unlock(&c->ec_stripe_head_lock);

	prt_printf(out, "in flight:\n");

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		bch2_new_stripe_to_text(out, c, s);
	mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;
	unsigned i;

	while (1) {
		mutex_lock(&c->ec_stripe_head_lock);
		h = list_first_entry_or_null(&c->ec_stripe_head_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_stripe_head_lock);
		if (!h)
			break;

		if (h->s) {
			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
				BUG_ON(h->s->blocks[i]);

			kfree(h->s);
		}
		kfree(h);
	}

	BUG_ON(!list_empty(&c->ec_stripe_new_list));

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes);
	bioset_exit(&c->ec_bioset);
}

void bch2_fs_ec_init_early(struct bch_fs *c)
{
	spin_lock_init(&c->ec_stripes_new_lock);
	mutex_init(&c->ec_stripes_heap_lock);

	INIT_LIST_HEAD(&c->ec_stripe_head_list);
	mutex_init(&c->ec_stripe_head_lock);

	INIT_LIST_HEAD(&c->ec_stripe_new_list);
	mutex_init(&c->ec_stripe_new_lock);
	init_waitqueue_head(&c->ec_stripe_new_wait);

	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}