1 // SPDX-License-Identifier: GPL-2.0 2 3 /* erasure coding */ 4 5 #include "bcachefs.h" 6 #include "alloc_background.h" 7 #include "alloc_foreground.h" 8 #include "backpointers.h" 9 #include "bkey_buf.h" 10 #include "bset.h" 11 #include "btree_gc.h" 12 #include "btree_update.h" 13 #include "btree_write_buffer.h" 14 #include "buckets.h" 15 #include "checksum.h" 16 #include "disk_groups.h" 17 #include "ec.h" 18 #include "error.h" 19 #include "io_read.h" 20 #include "keylist.h" 21 #include "recovery.h" 22 #include "replicas.h" 23 #include "super-io.h" 24 #include "util.h" 25 26 #include <linux/sort.h> 27 28 #ifdef __KERNEL__ 29 30 #include <linux/raid/pq.h> 31 #include <linux/raid/xor.h> 32 33 static void raid5_recov(unsigned disks, unsigned failed_idx, 34 size_t size, void **data) 35 { 36 unsigned i = 2, nr; 37 38 BUG_ON(failed_idx >= disks); 39 40 swap(data[0], data[failed_idx]); 41 memcpy(data[0], data[1], size); 42 43 while (i < disks) { 44 nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); 45 xor_blocks(nr, size, data[0], data + i); 46 i += nr; 47 } 48 49 swap(data[0], data[failed_idx]); 50 } 51 52 static void raid_gen(int nd, int np, size_t size, void **v) 53 { 54 if (np >= 1) 55 raid5_recov(nd + np, nd, size, v); 56 if (np >= 2) 57 raid6_call.gen_syndrome(nd + np, size, v); 58 BUG_ON(np > 2); 59 } 60 61 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) 62 { 63 switch (nr) { 64 case 0: 65 break; 66 case 1: 67 if (ir[0] < nd + 1) 68 raid5_recov(nd + 1, ir[0], size, v); 69 else 70 raid6_call.gen_syndrome(nd + np, size, v); 71 break; 72 case 2: 73 if (ir[1] < nd) { 74 /* data+data failure. */ 75 raid6_2data_recov(nd + np, size, ir[0], ir[1], v); 76 } else if (ir[0] < nd) { 77 /* data + p/q failure */ 78 79 if (ir[1] == nd) /* data + p failure */ 80 raid6_datap_recov(nd + np, size, ir[0], v); 81 else { /* data + q failure */ 82 raid5_recov(nd + 1, ir[0], size, v); 83 raid6_call.gen_syndrome(nd + np, size, v); 84 } 85 } else { 86 raid_gen(nd, np, size, v); 87 } 88 break; 89 default: 90 BUG(); 91 } 92 } 93 94 #else 95 96 #include <raid/raid.h> 97 98 #endif 99 100 struct ec_bio { 101 struct bch_dev *ca; 102 struct ec_stripe_buf *buf; 103 size_t idx; 104 struct bio bio; 105 }; 106 107 /* Stripes btree keys: */ 108 109 int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, 110 enum bkey_invalid_flags flags, 111 struct printbuf *err) 112 { 113 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 114 int ret = 0; 115 116 bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || 117 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, 118 stripe_pos_bad, 119 "stripe at bad pos"); 120 121 bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, 122 stripe_val_size_bad, 123 "incorrect value size (%zu < %u)", 124 bkey_val_u64s(k.k), stripe_val_u64s(s)); 125 126 ret = bch2_bkey_ptrs_invalid(c, k, flags, err); 127 fsck_err: 128 return ret; 129 } 130 131 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, 132 struct bkey_s_c k) 133 { 134 const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; 135 struct bch_stripe s = {}; 136 137 memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); 138 139 unsigned nr_data = s.nr_blocks - s.nr_redundant; 140 141 prt_printf(out, "algo %u sectors %u blocks %u:%u csum ", 142 s.algorithm, 143 le16_to_cpu(s.sectors), 144 nr_data, 145 s.nr_redundant); 146 bch2_prt_csum_type(out, s.csum_type); 147 prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); 148 149 for (unsigned i = 0; i < s.nr_blocks; i++) { 150 const struct bch_extent_ptr *ptr = sp->ptrs + i; 151 152 if ((void *) ptr >= bkey_val_end(k)) 153 break; 154 155 bch2_extent_ptr_to_text(out, c, ptr); 156 157 if (s.csum_type < BCH_CSUM_NR && 158 i < nr_data && 159 stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k)) 160 prt_printf(out, "#%u", stripe_blockcount_get(sp, i)); 161 } 162 } 163 164 /* Triggers: */ 165 166 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, 167 struct bkey_s_c_stripe s, 168 unsigned idx, bool deleting) 169 { 170 struct bch_fs *c = trans->c; 171 const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; 172 struct btree_iter iter; 173 struct bkey_i_alloc_v4 *a; 174 enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant 175 ? BCH_DATA_parity : 0; 176 s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; 177 int ret = 0; 178 179 if (deleting) 180 sectors = -sectors; 181 182 a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); 183 if (IS_ERR(a)) 184 return PTR_ERR(a); 185 186 ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, 187 a->v.gen, a->v.data_type, 188 a->v.dirty_sectors); 189 if (ret) 190 goto err; 191 192 if (!deleting) { 193 if (bch2_trans_inconsistent_on(a->v.stripe || 194 a->v.stripe_redundancy, trans, 195 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", 196 iter.pos.inode, iter.pos.offset, a->v.gen, 197 bch2_data_type_str(a->v.data_type), 198 a->v.dirty_sectors, 199 a->v.stripe, s.k->p.offset)) { 200 ret = -EIO; 201 goto err; 202 } 203 204 if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, 205 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", 206 iter.pos.inode, iter.pos.offset, a->v.gen, 207 bch2_data_type_str(a->v.data_type), 208 a->v.dirty_sectors, 209 s.k->p.offset)) { 210 ret = -EIO; 211 goto err; 212 } 213 214 a->v.stripe = s.k->p.offset; 215 a->v.stripe_redundancy = s.v->nr_redundant; 216 a->v.data_type = BCH_DATA_stripe; 217 } else { 218 if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || 219 a->v.stripe_redundancy != s.v->nr_redundant, trans, 220 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", 221 iter.pos.inode, iter.pos.offset, a->v.gen, 222 s.k->p.offset, a->v.stripe)) { 223 ret = -EIO; 224 goto err; 225 } 226 227 a->v.stripe = 0; 228 a->v.stripe_redundancy = 0; 229 a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); 230 } 231 232 a->v.dirty_sectors += sectors; 233 if (data_type) 234 a->v.data_type = !deleting ? data_type : 0; 235 236 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 237 if (ret) 238 goto err; 239 err: 240 bch2_trans_iter_exit(trans, &iter); 241 return ret; 242 } 243 244 static int mark_stripe_bucket(struct btree_trans *trans, 245 struct bkey_s_c k, 246 unsigned ptr_idx, 247 unsigned flags) 248 { 249 struct bch_fs *c = trans->c; 250 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 251 unsigned nr_data = s->nr_blocks - s->nr_redundant; 252 bool parity = ptr_idx >= nr_data; 253 enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; 254 s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; 255 const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; 256 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 257 struct bucket old, new, *g; 258 struct printbuf buf = PRINTBUF; 259 int ret = 0; 260 261 BUG_ON(!(flags & BTREE_TRIGGER_GC)); 262 263 /* * XXX doesn't handle deletion */ 264 265 percpu_down_read(&c->mark_lock); 266 g = PTR_GC_BUCKET(ca, ptr); 267 268 if (g->dirty_sectors || 269 (g->stripe && g->stripe != k.k->p.offset)) { 270 bch2_fs_inconsistent(c, 271 "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", 272 ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, 273 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 274 ret = -EINVAL; 275 goto err; 276 } 277 278 bucket_lock(g); 279 old = *g; 280 281 ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, 282 g->gen, g->data_type, 283 g->dirty_sectors); 284 if (ret) 285 goto err; 286 287 g->data_type = data_type; 288 g->dirty_sectors += sectors; 289 290 g->stripe = k.k->p.offset; 291 g->stripe_redundancy = s->nr_redundant; 292 new = *g; 293 err: 294 bucket_unlock(g); 295 if (!ret) 296 bch2_dev_usage_update_m(c, ca, &old, &new); 297 percpu_up_read(&c->mark_lock); 298 printbuf_exit(&buf); 299 return ret; 300 } 301 302 int bch2_trigger_stripe(struct btree_trans *trans, 303 enum btree_id btree_id, unsigned level, 304 struct bkey_s_c old, struct bkey_s _new, 305 unsigned flags) 306 { 307 struct bkey_s_c new = _new.s_c; 308 struct bch_fs *c = trans->c; 309 u64 idx = new.k->p.offset; 310 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe 311 ? bkey_s_c_to_stripe(old).v : NULL; 312 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe 313 ? bkey_s_c_to_stripe(new).v : NULL; 314 315 if (flags & BTREE_TRIGGER_TRANSACTIONAL) { 316 /* 317 * If the pointers aren't changing, we don't need to do anything: 318 */ 319 if (new_s && old_s && 320 new_s->nr_blocks == old_s->nr_blocks && 321 new_s->nr_redundant == old_s->nr_redundant && 322 !memcmp(old_s->ptrs, new_s->ptrs, 323 new_s->nr_blocks * sizeof(struct bch_extent_ptr))) 324 return 0; 325 326 BUG_ON(new_s && old_s && 327 (new_s->nr_blocks != old_s->nr_blocks || 328 new_s->nr_redundant != old_s->nr_redundant)); 329 330 if (new_s) { 331 s64 sectors = le16_to_cpu(new_s->sectors); 332 333 struct bch_replicas_padded r; 334 bch2_bkey_to_replicas(&r.e, new); 335 int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); 336 if (ret) 337 return ret; 338 } 339 340 if (old_s) { 341 s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); 342 343 struct bch_replicas_padded r; 344 bch2_bkey_to_replicas(&r.e, old); 345 int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); 346 if (ret) 347 return ret; 348 } 349 350 unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; 351 for (unsigned i = 0; i < nr_blocks; i++) { 352 if (new_s && old_s && 353 !memcmp(&new_s->ptrs[i], 354 &old_s->ptrs[i], 355 sizeof(new_s->ptrs[i]))) 356 continue; 357 358 if (new_s) { 359 int ret = bch2_trans_mark_stripe_bucket(trans, 360 bkey_s_c_to_stripe(new), i, false); 361 if (ret) 362 return ret; 363 } 364 365 if (old_s) { 366 int ret = bch2_trans_mark_stripe_bucket(trans, 367 bkey_s_c_to_stripe(old), i, true); 368 if (ret) 369 return ret; 370 } 371 } 372 } 373 374 if (flags & BTREE_TRIGGER_ATOMIC) { 375 struct stripe *m = genradix_ptr(&c->stripes, idx); 376 377 if (!m) { 378 struct printbuf buf1 = PRINTBUF; 379 struct printbuf buf2 = PRINTBUF; 380 381 bch2_bkey_val_to_text(&buf1, c, old); 382 bch2_bkey_val_to_text(&buf2, c, new); 383 bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" 384 "old %s\n" 385 "new %s", idx, buf1.buf, buf2.buf); 386 printbuf_exit(&buf2); 387 printbuf_exit(&buf1); 388 bch2_inconsistent_error(c); 389 return -1; 390 } 391 392 if (!new_s) { 393 bch2_stripes_heap_del(c, m, idx); 394 395 memset(m, 0, sizeof(*m)); 396 } else { 397 m->sectors = le16_to_cpu(new_s->sectors); 398 m->algorithm = new_s->algorithm; 399 m->nr_blocks = new_s->nr_blocks; 400 m->nr_redundant = new_s->nr_redundant; 401 m->blocks_nonempty = 0; 402 403 for (unsigned i = 0; i < new_s->nr_blocks; i++) 404 m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); 405 406 if (!old_s) 407 bch2_stripes_heap_insert(c, m, idx); 408 else 409 bch2_stripes_heap_update(c, m, idx); 410 } 411 } 412 413 if (flags & BTREE_TRIGGER_GC) { 414 struct gc_stripe *m = 415 genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); 416 417 if (!m) { 418 bch_err(c, "error allocating memory for gc_stripes, idx %llu", 419 idx); 420 return -BCH_ERR_ENOMEM_mark_stripe; 421 } 422 /* 423 * This will be wrong when we bring back runtime gc: we should 424 * be unmarking the old key and then marking the new key 425 */ 426 m->alive = true; 427 m->sectors = le16_to_cpu(new_s->sectors); 428 m->nr_blocks = new_s->nr_blocks; 429 m->nr_redundant = new_s->nr_redundant; 430 431 for (unsigned i = 0; i < new_s->nr_blocks; i++) 432 m->ptrs[i] = new_s->ptrs[i]; 433 434 bch2_bkey_to_replicas(&m->r.e, new); 435 436 /* 437 * gc recalculates this field from stripe ptr 438 * references: 439 */ 440 memset(m->block_sectors, 0, sizeof(m->block_sectors)); 441 442 for (unsigned i = 0; i < new_s->nr_blocks; i++) { 443 int ret = mark_stripe_bucket(trans, new, i, flags); 444 if (ret) 445 return ret; 446 } 447 448 int ret = bch2_update_replicas(c, new, &m->r.e, 449 ((s64) m->sectors * m->nr_redundant), 450 0, true); 451 if (ret) { 452 struct printbuf buf = PRINTBUF; 453 454 bch2_bkey_val_to_text(&buf, c, new); 455 bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf); 456 printbuf_exit(&buf); 457 return ret; 458 } 459 } 460 461 return 0; 462 } 463 464 /* returns blocknr in stripe that we matched: */ 465 static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, 466 struct bkey_s_c k, unsigned *block) 467 { 468 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 469 unsigned i, nr_data = s->nr_blocks - s->nr_redundant; 470 471 bkey_for_each_ptr(ptrs, ptr) 472 for (i = 0; i < nr_data; i++) 473 if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, 474 le16_to_cpu(s->sectors))) { 475 *block = i; 476 return ptr; 477 } 478 479 return NULL; 480 } 481 482 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) 483 { 484 switch (k.k->type) { 485 case KEY_TYPE_extent: { 486 struct bkey_s_c_extent e = bkey_s_c_to_extent(k); 487 const union bch_extent_entry *entry; 488 489 extent_for_each_entry(e, entry) 490 if (extent_entry_type(entry) == 491 BCH_EXTENT_ENTRY_stripe_ptr && 492 entry->stripe_ptr.idx == idx) 493 return true; 494 495 break; 496 } 497 } 498 499 return false; 500 } 501 502 /* Stripe bufs: */ 503 504 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) 505 { 506 if (buf->key.k.type == KEY_TYPE_stripe) { 507 struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); 508 unsigned i; 509 510 for (i = 0; i < s->v.nr_blocks; i++) { 511 kvfree(buf->data[i]); 512 buf->data[i] = NULL; 513 } 514 } 515 } 516 517 /* XXX: this is a non-mempoolified memory allocation: */ 518 static int ec_stripe_buf_init(struct ec_stripe_buf *buf, 519 unsigned offset, unsigned size) 520 { 521 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 522 unsigned csum_granularity = 1U << v->csum_granularity_bits; 523 unsigned end = offset + size; 524 unsigned i; 525 526 BUG_ON(end > le16_to_cpu(v->sectors)); 527 528 offset = round_down(offset, csum_granularity); 529 end = min_t(unsigned, le16_to_cpu(v->sectors), 530 round_up(end, csum_granularity)); 531 532 buf->offset = offset; 533 buf->size = end - offset; 534 535 memset(buf->valid, 0xFF, sizeof(buf->valid)); 536 537 for (i = 0; i < v->nr_blocks; i++) { 538 buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); 539 if (!buf->data[i]) 540 goto err; 541 } 542 543 return 0; 544 err: 545 ec_stripe_buf_exit(buf); 546 return -BCH_ERR_ENOMEM_stripe_buf; 547 } 548 549 /* Checksumming: */ 550 551 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, 552 unsigned block, unsigned offset) 553 { 554 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 555 unsigned csum_granularity = 1 << v->csum_granularity_bits; 556 unsigned end = buf->offset + buf->size; 557 unsigned len = min(csum_granularity, end - offset); 558 559 BUG_ON(offset >= end); 560 BUG_ON(offset < buf->offset); 561 BUG_ON(offset & (csum_granularity - 1)); 562 BUG_ON(offset + len != le16_to_cpu(v->sectors) && 563 (len & (csum_granularity - 1))); 564 565 return bch2_checksum(NULL, v->csum_type, 566 null_nonce(), 567 buf->data[block] + ((offset - buf->offset) << 9), 568 len << 9); 569 } 570 571 static void ec_generate_checksums(struct ec_stripe_buf *buf) 572 { 573 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 574 unsigned i, j, csums_per_device = stripe_csums_per_device(v); 575 576 if (!v->csum_type) 577 return; 578 579 BUG_ON(buf->offset); 580 BUG_ON(buf->size != le16_to_cpu(v->sectors)); 581 582 for (i = 0; i < v->nr_blocks; i++) 583 for (j = 0; j < csums_per_device; j++) 584 stripe_csum_set(v, i, j, 585 ec_block_checksum(buf, i, j << v->csum_granularity_bits)); 586 } 587 588 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) 589 { 590 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 591 unsigned csum_granularity = 1 << v->csum_granularity_bits; 592 unsigned i; 593 594 if (!v->csum_type) 595 return; 596 597 for (i = 0; i < v->nr_blocks; i++) { 598 unsigned offset = buf->offset; 599 unsigned end = buf->offset + buf->size; 600 601 if (!test_bit(i, buf->valid)) 602 continue; 603 604 while (offset < end) { 605 unsigned j = offset >> v->csum_granularity_bits; 606 unsigned len = min(csum_granularity, end - offset); 607 struct bch_csum want = stripe_csum_get(v, i, j); 608 struct bch_csum got = ec_block_checksum(buf, i, offset); 609 610 if (bch2_crc_cmp(want, got)) { 611 struct printbuf err = PRINTBUF; 612 struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); 613 614 prt_str(&err, "stripe "); 615 bch2_csum_err_msg(&err, v->csum_type, want, got); 616 prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); 617 bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); 618 bch_err_ratelimited(ca, "%s", err.buf); 619 printbuf_exit(&err); 620 621 clear_bit(i, buf->valid); 622 623 bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); 624 break; 625 } 626 627 offset += len; 628 } 629 } 630 } 631 632 /* Erasure coding: */ 633 634 static void ec_generate_ec(struct ec_stripe_buf *buf) 635 { 636 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 637 unsigned nr_data = v->nr_blocks - v->nr_redundant; 638 unsigned bytes = le16_to_cpu(v->sectors) << 9; 639 640 raid_gen(nr_data, v->nr_redundant, bytes, buf->data); 641 } 642 643 static unsigned ec_nr_failed(struct ec_stripe_buf *buf) 644 { 645 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 646 647 return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); 648 } 649 650 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) 651 { 652 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 653 unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; 654 unsigned nr_data = v->nr_blocks - v->nr_redundant; 655 unsigned bytes = buf->size << 9; 656 657 if (ec_nr_failed(buf) > v->nr_redundant) { 658 bch_err_ratelimited(c, 659 "error doing reconstruct read: unable to read enough blocks"); 660 return -1; 661 } 662 663 for (i = 0; i < nr_data; i++) 664 if (!test_bit(i, buf->valid)) 665 failed[nr_failed++] = i; 666 667 raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); 668 return 0; 669 } 670 671 /* IO: */ 672 673 static void ec_block_endio(struct bio *bio) 674 { 675 struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); 676 struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; 677 struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; 678 struct bch_dev *ca = ec_bio->ca; 679 struct closure *cl = bio->bi_private; 680 681 if (bch2_dev_io_err_on(bio->bi_status, ca, 682 bio_data_dir(bio) 683 ? BCH_MEMBER_ERROR_write 684 : BCH_MEMBER_ERROR_read, 685 "erasure coding %s error: %s", 686 bio_data_dir(bio) ? "write" : "read", 687 bch2_blk_status_to_str(bio->bi_status))) 688 clear_bit(ec_bio->idx, ec_bio->buf->valid); 689 690 if (ptr_stale(ca, ptr)) { 691 bch_err_ratelimited(ca->fs, 692 "error %s stripe: stale pointer after io", 693 bio_data_dir(bio) == READ ? "reading from" : "writing to"); 694 clear_bit(ec_bio->idx, ec_bio->buf->valid); 695 } 696 697 bio_put(&ec_bio->bio); 698 percpu_ref_put(&ca->io_ref); 699 closure_put(cl); 700 } 701 702 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, 703 blk_opf_t opf, unsigned idx, struct closure *cl) 704 { 705 struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; 706 unsigned offset = 0, bytes = buf->size << 9; 707 struct bch_extent_ptr *ptr = &v->ptrs[idx]; 708 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); 709 enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant 710 ? BCH_DATA_user 711 : BCH_DATA_parity; 712 int rw = op_is_write(opf); 713 714 if (ptr_stale(ca, ptr)) { 715 bch_err_ratelimited(c, 716 "error %s stripe: stale pointer", 717 rw == READ ? "reading from" : "writing to"); 718 clear_bit(idx, buf->valid); 719 return; 720 } 721 722 if (!bch2_dev_get_ioref(ca, rw)) { 723 clear_bit(idx, buf->valid); 724 return; 725 } 726 727 this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); 728 729 while (offset < bytes) { 730 unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, 731 DIV_ROUND_UP(bytes, PAGE_SIZE)); 732 unsigned b = min_t(size_t, bytes - offset, 733 nr_iovecs << PAGE_SHIFT); 734 struct ec_bio *ec_bio; 735 736 ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 737 nr_iovecs, 738 opf, 739 GFP_KERNEL, 740 &c->ec_bioset), 741 struct ec_bio, bio); 742 743 ec_bio->ca = ca; 744 ec_bio->buf = buf; 745 ec_bio->idx = idx; 746 747 ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); 748 ec_bio->bio.bi_end_io = ec_block_endio; 749 ec_bio->bio.bi_private = cl; 750 751 bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); 752 753 closure_get(cl); 754 percpu_ref_get(&ca->io_ref); 755 756 submit_bio(&ec_bio->bio); 757 758 offset += b; 759 } 760 761 percpu_ref_put(&ca->io_ref); 762 } 763 764 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, 765 struct ec_stripe_buf *stripe) 766 { 767 struct btree_iter iter; 768 struct bkey_s_c k; 769 int ret; 770 771 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 772 POS(0, idx), BTREE_ITER_SLOTS); 773 ret = bkey_err(k); 774 if (ret) 775 goto err; 776 if (k.k->type != KEY_TYPE_stripe) { 777 ret = -ENOENT; 778 goto err; 779 } 780 bkey_reassemble(&stripe->key, k); 781 err: 782 bch2_trans_iter_exit(trans, &iter); 783 return ret; 784 } 785 786 /* recovery read path: */ 787 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) 788 { 789 struct bch_fs *c = trans->c; 790 struct ec_stripe_buf *buf; 791 struct closure cl; 792 struct bch_stripe *v; 793 unsigned i, offset; 794 int ret = 0; 795 796 closure_init_stack(&cl); 797 798 BUG_ON(!rbio->pick.has_ec); 799 800 buf = kzalloc(sizeof(*buf), GFP_NOFS); 801 if (!buf) 802 return -BCH_ERR_ENOMEM_ec_read_extent; 803 804 ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); 805 if (ret) { 806 bch_err_ratelimited(c, 807 "error doing reconstruct read: error %i looking up stripe", ret); 808 kfree(buf); 809 return -EIO; 810 } 811 812 v = &bkey_i_to_stripe(&buf->key)->v; 813 814 if (!bch2_ptr_matches_stripe(v, rbio->pick)) { 815 bch_err_ratelimited(c, 816 "error doing reconstruct read: pointer doesn't match stripe"); 817 ret = -EIO; 818 goto err; 819 } 820 821 offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; 822 if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { 823 bch_err_ratelimited(c, 824 "error doing reconstruct read: read is bigger than stripe"); 825 ret = -EIO; 826 goto err; 827 } 828 829 ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); 830 if (ret) 831 goto err; 832 833 for (i = 0; i < v->nr_blocks; i++) 834 ec_block_io(c, buf, REQ_OP_READ, i, &cl); 835 836 closure_sync(&cl); 837 838 if (ec_nr_failed(buf) > v->nr_redundant) { 839 bch_err_ratelimited(c, 840 "error doing reconstruct read: unable to read enough blocks"); 841 ret = -EIO; 842 goto err; 843 } 844 845 ec_validate_checksums(c, buf); 846 847 ret = ec_do_recov(c, buf); 848 if (ret) 849 goto err; 850 851 memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, 852 buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); 853 err: 854 ec_stripe_buf_exit(buf); 855 kfree(buf); 856 return ret; 857 } 858 859 /* stripe bucket accounting: */ 860 861 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) 862 { 863 ec_stripes_heap n, *h = &c->ec_stripes_heap; 864 865 if (idx >= h->size) { 866 if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) 867 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 868 869 mutex_lock(&c->ec_stripes_heap_lock); 870 if (n.size > h->size) { 871 memcpy(n.data, h->data, h->used * sizeof(h->data[0])); 872 n.used = h->used; 873 swap(*h, n); 874 } 875 mutex_unlock(&c->ec_stripes_heap_lock); 876 877 free_heap(&n); 878 } 879 880 if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) 881 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 882 883 if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && 884 !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) 885 return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; 886 887 return 0; 888 } 889 890 static int ec_stripe_mem_alloc(struct btree_trans *trans, 891 struct btree_iter *iter) 892 { 893 return allocate_dropping_locks_errcode(trans, 894 __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); 895 } 896 897 /* 898 * Hash table of open stripes: 899 * Stripes that are being created or modified are kept in a hash table, so that 900 * stripe deletion can skip them. 901 */ 902 903 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) 904 { 905 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); 906 struct ec_stripe_new *s; 907 908 hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) 909 if (s->idx == idx) 910 return true; 911 return false; 912 } 913 914 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) 915 { 916 bool ret = false; 917 918 spin_lock(&c->ec_stripes_new_lock); 919 ret = __bch2_stripe_is_open(c, idx); 920 spin_unlock(&c->ec_stripes_new_lock); 921 922 return ret; 923 } 924 925 static bool bch2_try_open_stripe(struct bch_fs *c, 926 struct ec_stripe_new *s, 927 u64 idx) 928 { 929 bool ret; 930 931 spin_lock(&c->ec_stripes_new_lock); 932 ret = !__bch2_stripe_is_open(c, idx); 933 if (ret) { 934 unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); 935 936 s->idx = idx; 937 hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); 938 } 939 spin_unlock(&c->ec_stripes_new_lock); 940 941 return ret; 942 } 943 944 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) 945 { 946 BUG_ON(!s->idx); 947 948 spin_lock(&c->ec_stripes_new_lock); 949 hlist_del_init(&s->hash); 950 spin_unlock(&c->ec_stripes_new_lock); 951 952 s->idx = 0; 953 } 954 955 /* Heap of all existing stripes, ordered by blocks_nonempty */ 956 957 static u64 stripe_idx_to_delete(struct bch_fs *c) 958 { 959 ec_stripes_heap *h = &c->ec_stripes_heap; 960 961 lockdep_assert_held(&c->ec_stripes_heap_lock); 962 963 if (h->used && 964 h->data[0].blocks_nonempty == 0 && 965 !bch2_stripe_is_open(c, h->data[0].idx)) 966 return h->data[0].idx; 967 968 return 0; 969 } 970 971 static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, 972 struct ec_stripe_heap_entry l, 973 struct ec_stripe_heap_entry r) 974 { 975 return ((l.blocks_nonempty > r.blocks_nonempty) - 976 (l.blocks_nonempty < r.blocks_nonempty)); 977 } 978 979 static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, 980 size_t i) 981 { 982 struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); 983 984 genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; 985 } 986 987 static void heap_verify_backpointer(struct bch_fs *c, size_t idx) 988 { 989 ec_stripes_heap *h = &c->ec_stripes_heap; 990 struct stripe *m = genradix_ptr(&c->stripes, idx); 991 992 BUG_ON(m->heap_idx >= h->used); 993 BUG_ON(h->data[m->heap_idx].idx != idx); 994 } 995 996 void bch2_stripes_heap_del(struct bch_fs *c, 997 struct stripe *m, size_t idx) 998 { 999 mutex_lock(&c->ec_stripes_heap_lock); 1000 heap_verify_backpointer(c, idx); 1001 1002 heap_del(&c->ec_stripes_heap, m->heap_idx, 1003 ec_stripes_heap_cmp, 1004 ec_stripes_heap_set_backpointer); 1005 mutex_unlock(&c->ec_stripes_heap_lock); 1006 } 1007 1008 void bch2_stripes_heap_insert(struct bch_fs *c, 1009 struct stripe *m, size_t idx) 1010 { 1011 mutex_lock(&c->ec_stripes_heap_lock); 1012 BUG_ON(heap_full(&c->ec_stripes_heap)); 1013 1014 heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { 1015 .idx = idx, 1016 .blocks_nonempty = m->blocks_nonempty, 1017 }), 1018 ec_stripes_heap_cmp, 1019 ec_stripes_heap_set_backpointer); 1020 1021 heap_verify_backpointer(c, idx); 1022 mutex_unlock(&c->ec_stripes_heap_lock); 1023 } 1024 1025 void bch2_stripes_heap_update(struct bch_fs *c, 1026 struct stripe *m, size_t idx) 1027 { 1028 ec_stripes_heap *h = &c->ec_stripes_heap; 1029 bool do_deletes; 1030 size_t i; 1031 1032 mutex_lock(&c->ec_stripes_heap_lock); 1033 heap_verify_backpointer(c, idx); 1034 1035 h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; 1036 1037 i = m->heap_idx; 1038 heap_sift_up(h, i, ec_stripes_heap_cmp, 1039 ec_stripes_heap_set_backpointer); 1040 heap_sift_down(h, i, ec_stripes_heap_cmp, 1041 ec_stripes_heap_set_backpointer); 1042 1043 heap_verify_backpointer(c, idx); 1044 1045 do_deletes = stripe_idx_to_delete(c) != 0; 1046 mutex_unlock(&c->ec_stripes_heap_lock); 1047 1048 if (do_deletes) 1049 bch2_do_stripe_deletes(c); 1050 } 1051 1052 /* stripe deletion */ 1053 1054 static int ec_stripe_delete(struct btree_trans *trans, u64 idx) 1055 { 1056 struct bch_fs *c = trans->c; 1057 struct btree_iter iter; 1058 struct bkey_s_c k; 1059 struct bkey_s_c_stripe s; 1060 int ret; 1061 1062 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), 1063 BTREE_ITER_INTENT); 1064 ret = bkey_err(k); 1065 if (ret) 1066 goto err; 1067 1068 if (k.k->type != KEY_TYPE_stripe) { 1069 bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); 1070 ret = -EINVAL; 1071 goto err; 1072 } 1073 1074 s = bkey_s_c_to_stripe(k); 1075 for (unsigned i = 0; i < s.v->nr_blocks; i++) 1076 if (stripe_blockcount_get(s.v, i)) { 1077 struct printbuf buf = PRINTBUF; 1078 1079 bch2_bkey_val_to_text(&buf, c, k); 1080 bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); 1081 printbuf_exit(&buf); 1082 ret = -EINVAL; 1083 goto err; 1084 } 1085 1086 ret = bch2_btree_delete_at(trans, &iter, 0); 1087 err: 1088 bch2_trans_iter_exit(trans, &iter); 1089 return ret; 1090 } 1091 1092 static void ec_stripe_delete_work(struct work_struct *work) 1093 { 1094 struct bch_fs *c = 1095 container_of(work, struct bch_fs, ec_stripe_delete_work); 1096 1097 while (1) { 1098 mutex_lock(&c->ec_stripes_heap_lock); 1099 u64 idx = stripe_idx_to_delete(c); 1100 mutex_unlock(&c->ec_stripes_heap_lock); 1101 1102 if (!idx) 1103 break; 1104 1105 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1106 ec_stripe_delete(trans, idx)); 1107 bch_err_fn(c, ret); 1108 if (ret) 1109 break; 1110 } 1111 1112 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); 1113 } 1114 1115 void bch2_do_stripe_deletes(struct bch_fs *c) 1116 { 1117 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && 1118 !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) 1119 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); 1120 } 1121 1122 /* stripe creation: */ 1123 1124 static int ec_stripe_key_update(struct btree_trans *trans, 1125 struct bkey_i_stripe *new, 1126 bool create) 1127 { 1128 struct bch_fs *c = trans->c; 1129 struct btree_iter iter; 1130 struct bkey_s_c k; 1131 int ret; 1132 1133 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, 1134 new->k.p, BTREE_ITER_INTENT); 1135 ret = bkey_err(k); 1136 if (ret) 1137 goto err; 1138 1139 if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { 1140 bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", 1141 create ? "creating" : "updating", 1142 bch2_bkey_types[k.k->type]); 1143 ret = -EINVAL; 1144 goto err; 1145 } 1146 1147 if (k.k->type == KEY_TYPE_stripe) { 1148 const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; 1149 unsigned i; 1150 1151 if (old->nr_blocks != new->v.nr_blocks) { 1152 bch_err(c, "error updating stripe: nr_blocks does not match"); 1153 ret = -EINVAL; 1154 goto err; 1155 } 1156 1157 for (i = 0; i < new->v.nr_blocks; i++) { 1158 unsigned v = stripe_blockcount_get(old, i); 1159 1160 BUG_ON(v && 1161 (old->ptrs[i].dev != new->v.ptrs[i].dev || 1162 old->ptrs[i].gen != new->v.ptrs[i].gen || 1163 old->ptrs[i].offset != new->v.ptrs[i].offset)); 1164 1165 stripe_blockcount_set(&new->v, i, v); 1166 } 1167 } 1168 1169 ret = bch2_trans_update(trans, &iter, &new->k_i, 0); 1170 err: 1171 bch2_trans_iter_exit(trans, &iter); 1172 return ret; 1173 } 1174 1175 static int ec_stripe_update_extent(struct btree_trans *trans, 1176 struct bpos bucket, u8 gen, 1177 struct ec_stripe_buf *s, 1178 struct bpos *bp_pos) 1179 { 1180 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1181 struct bch_fs *c = trans->c; 1182 struct bch_backpointer bp; 1183 struct btree_iter iter; 1184 struct bkey_s_c k; 1185 const struct bch_extent_ptr *ptr_c; 1186 struct bch_extent_ptr *ptr, *ec_ptr = NULL; 1187 struct bch_extent_stripe_ptr stripe_ptr; 1188 struct bkey_i *n; 1189 int ret, dev, block; 1190 1191 ret = bch2_get_next_backpointer(trans, bucket, gen, 1192 bp_pos, &bp, BTREE_ITER_CACHED); 1193 if (ret) 1194 return ret; 1195 if (bpos_eq(*bp_pos, SPOS_MAX)) 1196 return 0; 1197 1198 if (bp.level) { 1199 struct printbuf buf = PRINTBUF; 1200 struct btree_iter node_iter; 1201 struct btree *b; 1202 1203 b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); 1204 bch2_trans_iter_exit(trans, &node_iter); 1205 1206 if (!b) 1207 return 0; 1208 1209 prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); 1210 bch2_backpointer_to_text(&buf, &bp); 1211 1212 bch2_fs_inconsistent(c, "%s", buf.buf); 1213 printbuf_exit(&buf); 1214 return -EIO; 1215 } 1216 1217 k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); 1218 ret = bkey_err(k); 1219 if (ret) 1220 return ret; 1221 if (!k.k) { 1222 /* 1223 * extent no longer exists - we could flush the btree 1224 * write buffer and retry to verify, but no need: 1225 */ 1226 return 0; 1227 } 1228 1229 if (extent_has_stripe_ptr(k, s->key.k.p.offset)) 1230 goto out; 1231 1232 ptr_c = bkey_matches_stripe(v, k, &block); 1233 /* 1234 * It doesn't generally make sense to erasure code cached ptrs: 1235 * XXX: should we be incrementing a counter? 1236 */ 1237 if (!ptr_c || ptr_c->cached) 1238 goto out; 1239 1240 dev = v->ptrs[block].dev; 1241 1242 n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); 1243 ret = PTR_ERR_OR_ZERO(n); 1244 if (ret) 1245 goto out; 1246 1247 bkey_reassemble(n, k); 1248 1249 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); 1250 ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); 1251 BUG_ON(!ec_ptr); 1252 1253 stripe_ptr = (struct bch_extent_stripe_ptr) { 1254 .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, 1255 .block = block, 1256 .redundancy = v->nr_redundant, 1257 .idx = s->key.k.p.offset, 1258 }; 1259 1260 __extent_entry_insert(n, 1261 (union bch_extent_entry *) ec_ptr, 1262 (union bch_extent_entry *) &stripe_ptr); 1263 1264 ret = bch2_trans_update(trans, &iter, n, 0); 1265 out: 1266 bch2_trans_iter_exit(trans, &iter); 1267 return ret; 1268 } 1269 1270 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, 1271 unsigned block) 1272 { 1273 struct bch_fs *c = trans->c; 1274 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1275 struct bch_extent_ptr bucket = v->ptrs[block]; 1276 struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); 1277 struct bpos bp_pos = POS_MIN; 1278 int ret = 0; 1279 1280 while (1) { 1281 ret = commit_do(trans, NULL, NULL, 1282 BCH_TRANS_COMMIT_no_check_rw| 1283 BCH_TRANS_COMMIT_no_enospc, 1284 ec_stripe_update_extent(trans, bucket_pos, bucket.gen, 1285 s, &bp_pos)); 1286 if (ret) 1287 break; 1288 if (bkey_eq(bp_pos, POS_MAX)) 1289 break; 1290 1291 bp_pos = bpos_nosnap_successor(bp_pos); 1292 } 1293 1294 return ret; 1295 } 1296 1297 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) 1298 { 1299 struct btree_trans *trans = bch2_trans_get(c); 1300 struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; 1301 unsigned i, nr_data = v->nr_blocks - v->nr_redundant; 1302 int ret = 0; 1303 1304 ret = bch2_btree_write_buffer_flush_sync(trans); 1305 if (ret) 1306 goto err; 1307 1308 for (i = 0; i < nr_data; i++) { 1309 ret = ec_stripe_update_bucket(trans, s, i); 1310 if (ret) 1311 break; 1312 } 1313 err: 1314 bch2_trans_put(trans); 1315 1316 return ret; 1317 } 1318 1319 static void zero_out_rest_of_ec_bucket(struct bch_fs *c, 1320 struct ec_stripe_new *s, 1321 unsigned block, 1322 struct open_bucket *ob) 1323 { 1324 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); 1325 unsigned offset = ca->mi.bucket_size - ob->sectors_free; 1326 int ret; 1327 1328 if (!bch2_dev_get_ioref(ca, WRITE)) { 1329 s->err = -BCH_ERR_erofs_no_writes; 1330 return; 1331 } 1332 1333 memset(s->new_stripe.data[block] + (offset << 9), 1334 0, 1335 ob->sectors_free << 9); 1336 1337 ret = blkdev_issue_zeroout(ca->disk_sb.bdev, 1338 ob->bucket * ca->mi.bucket_size + offset, 1339 ob->sectors_free, 1340 GFP_KERNEL, 0); 1341 1342 percpu_ref_put(&ca->io_ref); 1343 1344 if (ret) 1345 s->err = ret; 1346 } 1347 1348 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) 1349 { 1350 if (s->idx) 1351 bch2_stripe_close(c, s); 1352 kfree(s); 1353 } 1354 1355 /* 1356 * data buckets of new stripe all written: create the stripe 1357 */ 1358 static void ec_stripe_create(struct ec_stripe_new *s) 1359 { 1360 struct bch_fs *c = s->c; 1361 struct open_bucket *ob; 1362 struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; 1363 unsigned i, nr_data = v->nr_blocks - v->nr_redundant; 1364 int ret; 1365 1366 BUG_ON(s->h->s == s); 1367 1368 closure_sync(&s->iodone); 1369 1370 if (!s->err) { 1371 for (i = 0; i < nr_data; i++) 1372 if (s->blocks[i]) { 1373 ob = c->open_buckets + s->blocks[i]; 1374 1375 if (ob->sectors_free) 1376 zero_out_rest_of_ec_bucket(c, s, i, ob); 1377 } 1378 } 1379 1380 if (s->err) { 1381 if (!bch2_err_matches(s->err, EROFS)) 1382 bch_err(c, "error creating stripe: error writing data buckets"); 1383 goto err; 1384 } 1385 1386 if (s->have_existing_stripe) { 1387 ec_validate_checksums(c, &s->existing_stripe); 1388 1389 if (ec_do_recov(c, &s->existing_stripe)) { 1390 bch_err(c, "error creating stripe: error reading existing stripe"); 1391 goto err; 1392 } 1393 1394 for (i = 0; i < nr_data; i++) 1395 if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) 1396 swap(s->new_stripe.data[i], 1397 s->existing_stripe.data[i]); 1398 1399 ec_stripe_buf_exit(&s->existing_stripe); 1400 } 1401 1402 BUG_ON(!s->allocated); 1403 BUG_ON(!s->idx); 1404 1405 ec_generate_ec(&s->new_stripe); 1406 1407 ec_generate_checksums(&s->new_stripe); 1408 1409 /* write p/q: */ 1410 for (i = nr_data; i < v->nr_blocks; i++) 1411 ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); 1412 closure_sync(&s->iodone); 1413 1414 if (ec_nr_failed(&s->new_stripe)) { 1415 bch_err(c, "error creating stripe: error writing redundancy buckets"); 1416 goto err; 1417 } 1418 1419 ret = bch2_trans_do(c, &s->res, NULL, 1420 BCH_TRANS_COMMIT_no_check_rw| 1421 BCH_TRANS_COMMIT_no_enospc, 1422 ec_stripe_key_update(trans, 1423 bkey_i_to_stripe(&s->new_stripe.key), 1424 !s->have_existing_stripe)); 1425 bch_err_msg(c, ret, "creating stripe key"); 1426 if (ret) { 1427 goto err; 1428 } 1429 1430 ret = ec_stripe_update_extents(c, &s->new_stripe); 1431 bch_err_msg(c, ret, "error updating extents"); 1432 if (ret) 1433 goto err; 1434 err: 1435 bch2_disk_reservation_put(c, &s->res); 1436 1437 for (i = 0; i < v->nr_blocks; i++) 1438 if (s->blocks[i]) { 1439 ob = c->open_buckets + s->blocks[i]; 1440 1441 if (i < nr_data) { 1442 ob->ec = NULL; 1443 __bch2_open_bucket_put(c, ob); 1444 } else { 1445 bch2_open_bucket_put(c, ob); 1446 } 1447 } 1448 1449 mutex_lock(&c->ec_stripe_new_lock); 1450 list_del(&s->list); 1451 mutex_unlock(&c->ec_stripe_new_lock); 1452 wake_up(&c->ec_stripe_new_wait); 1453 1454 ec_stripe_buf_exit(&s->existing_stripe); 1455 ec_stripe_buf_exit(&s->new_stripe); 1456 closure_debug_destroy(&s->iodone); 1457 1458 ec_stripe_new_put(c, s, STRIPE_REF_stripe); 1459 } 1460 1461 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) 1462 { 1463 struct ec_stripe_new *s; 1464 1465 mutex_lock(&c->ec_stripe_new_lock); 1466 list_for_each_entry(s, &c->ec_stripe_new_list, list) 1467 if (!atomic_read(&s->ref[STRIPE_REF_io])) 1468 goto out; 1469 s = NULL; 1470 out: 1471 mutex_unlock(&c->ec_stripe_new_lock); 1472 1473 return s; 1474 } 1475 1476 static void ec_stripe_create_work(struct work_struct *work) 1477 { 1478 struct bch_fs *c = container_of(work, 1479 struct bch_fs, ec_stripe_create_work); 1480 struct ec_stripe_new *s; 1481 1482 while ((s = get_pending_stripe(c))) 1483 ec_stripe_create(s); 1484 1485 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); 1486 } 1487 1488 void bch2_ec_do_stripe_creates(struct bch_fs *c) 1489 { 1490 bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); 1491 1492 if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) 1493 bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); 1494 } 1495 1496 static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) 1497 { 1498 struct ec_stripe_new *s = h->s; 1499 1500 BUG_ON(!s->allocated && !s->err); 1501 1502 h->s = NULL; 1503 s->pending = true; 1504 1505 mutex_lock(&c->ec_stripe_new_lock); 1506 list_add(&s->list, &c->ec_stripe_new_list); 1507 mutex_unlock(&c->ec_stripe_new_lock); 1508 1509 ec_stripe_new_put(c, s, STRIPE_REF_io); 1510 } 1511 1512 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) 1513 { 1514 struct ec_stripe_new *s = ob->ec; 1515 1516 s->err = -EIO; 1517 } 1518 1519 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) 1520 { 1521 struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); 1522 struct bch_dev *ca; 1523 unsigned offset; 1524 1525 if (!ob) 1526 return NULL; 1527 1528 BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); 1529 1530 ca = bch_dev_bkey_exists(c, ob->dev); 1531 offset = ca->mi.bucket_size - ob->sectors_free; 1532 1533 return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); 1534 } 1535 1536 static int unsigned_cmp(const void *_l, const void *_r) 1537 { 1538 unsigned l = *((const unsigned *) _l); 1539 unsigned r = *((const unsigned *) _r); 1540 1541 return cmp_int(l, r); 1542 } 1543 1544 /* pick most common bucket size: */ 1545 static unsigned pick_blocksize(struct bch_fs *c, 1546 struct bch_devs_mask *devs) 1547 { 1548 unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; 1549 struct { 1550 unsigned nr, size; 1551 } cur = { 0, 0 }, best = { 0, 0 }; 1552 1553 for_each_member_device_rcu(c, ca, devs) 1554 sizes[nr++] = ca->mi.bucket_size; 1555 1556 sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); 1557 1558 for (unsigned i = 0; i < nr; i++) { 1559 if (sizes[i] != cur.size) { 1560 if (cur.nr > best.nr) 1561 best = cur; 1562 1563 cur.nr = 0; 1564 cur.size = sizes[i]; 1565 } 1566 1567 cur.nr++; 1568 } 1569 1570 if (cur.nr > best.nr) 1571 best = cur; 1572 1573 return best.size; 1574 } 1575 1576 static bool may_create_new_stripe(struct bch_fs *c) 1577 { 1578 return false; 1579 } 1580 1581 static void ec_stripe_key_init(struct bch_fs *c, 1582 struct bkey_i *k, 1583 unsigned nr_data, 1584 unsigned nr_parity, 1585 unsigned stripe_size) 1586 { 1587 struct bkey_i_stripe *s = bkey_stripe_init(k); 1588 unsigned u64s; 1589 1590 s->v.sectors = cpu_to_le16(stripe_size); 1591 s->v.algorithm = 0; 1592 s->v.nr_blocks = nr_data + nr_parity; 1593 s->v.nr_redundant = nr_parity; 1594 s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); 1595 s->v.csum_type = BCH_CSUM_crc32c; 1596 s->v.pad = 0; 1597 1598 while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { 1599 BUG_ON(1 << s->v.csum_granularity_bits >= 1600 le16_to_cpu(s->v.sectors) || 1601 s->v.csum_granularity_bits == U8_MAX); 1602 s->v.csum_granularity_bits++; 1603 } 1604 1605 set_bkey_val_u64s(&s->k, u64s); 1606 } 1607 1608 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) 1609 { 1610 struct ec_stripe_new *s; 1611 1612 lockdep_assert_held(&h->lock); 1613 1614 s = kzalloc(sizeof(*s), GFP_KERNEL); 1615 if (!s) 1616 return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; 1617 1618 mutex_init(&s->lock); 1619 closure_init(&s->iodone, NULL); 1620 atomic_set(&s->ref[STRIPE_REF_stripe], 1); 1621 atomic_set(&s->ref[STRIPE_REF_io], 1); 1622 s->c = c; 1623 s->h = h; 1624 s->nr_data = min_t(unsigned, h->nr_active_devs, 1625 BCH_BKEY_PTRS_MAX) - h->redundancy; 1626 s->nr_parity = h->redundancy; 1627 1628 ec_stripe_key_init(c, &s->new_stripe.key, 1629 s->nr_data, s->nr_parity, h->blocksize); 1630 1631 h->s = s; 1632 return 0; 1633 } 1634 1635 static struct ec_stripe_head * 1636 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, 1637 unsigned algo, unsigned redundancy, 1638 enum bch_watermark watermark) 1639 { 1640 struct ec_stripe_head *h; 1641 1642 h = kzalloc(sizeof(*h), GFP_KERNEL); 1643 if (!h) 1644 return NULL; 1645 1646 mutex_init(&h->lock); 1647 BUG_ON(!mutex_trylock(&h->lock)); 1648 1649 h->target = target; 1650 h->algo = algo; 1651 h->redundancy = redundancy; 1652 h->watermark = watermark; 1653 1654 rcu_read_lock(); 1655 h->devs = target_rw_devs(c, BCH_DATA_user, target); 1656 1657 for_each_member_device_rcu(c, ca, &h->devs) 1658 if (!ca->mi.durability) 1659 __clear_bit(ca->dev_idx, h->devs.d); 1660 1661 h->blocksize = pick_blocksize(c, &h->devs); 1662 1663 for_each_member_device_rcu(c, ca, &h->devs) 1664 if (ca->mi.bucket_size == h->blocksize) 1665 h->nr_active_devs++; 1666 1667 rcu_read_unlock(); 1668 1669 /* 1670 * If we only have redundancy + 1 devices, we're better off with just 1671 * replication: 1672 */ 1673 if (h->nr_active_devs < h->redundancy + 2) 1674 bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", 1675 h->nr_active_devs, h->redundancy + 2); 1676 1677 list_add(&h->list, &c->ec_stripe_head_list); 1678 return h; 1679 } 1680 1681 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) 1682 { 1683 if (h->s && 1684 h->s->allocated && 1685 bitmap_weight(h->s->blocks_allocated, 1686 h->s->nr_data) == h->s->nr_data) 1687 ec_stripe_set_pending(c, h); 1688 1689 mutex_unlock(&h->lock); 1690 } 1691 1692 static struct ec_stripe_head * 1693 __bch2_ec_stripe_head_get(struct btree_trans *trans, 1694 unsigned target, 1695 unsigned algo, 1696 unsigned redundancy, 1697 enum bch_watermark watermark) 1698 { 1699 struct bch_fs *c = trans->c; 1700 struct ec_stripe_head *h; 1701 int ret; 1702 1703 if (!redundancy) 1704 return NULL; 1705 1706 ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); 1707 if (ret) 1708 return ERR_PTR(ret); 1709 1710 if (test_bit(BCH_FS_going_ro, &c->flags)) { 1711 h = ERR_PTR(-BCH_ERR_erofs_no_writes); 1712 goto found; 1713 } 1714 1715 list_for_each_entry(h, &c->ec_stripe_head_list, list) 1716 if (h->target == target && 1717 h->algo == algo && 1718 h->redundancy == redundancy && 1719 h->watermark == watermark) { 1720 ret = bch2_trans_mutex_lock(trans, &h->lock); 1721 if (ret) 1722 h = ERR_PTR(ret); 1723 goto found; 1724 } 1725 1726 h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); 1727 found: 1728 if (!IS_ERR_OR_NULL(h) && 1729 h->nr_active_devs < h->redundancy + 2) { 1730 mutex_unlock(&h->lock); 1731 h = NULL; 1732 } 1733 mutex_unlock(&c->ec_stripe_head_lock); 1734 return h; 1735 } 1736 1737 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, 1738 enum bch_watermark watermark, struct closure *cl) 1739 { 1740 struct bch_fs *c = trans->c; 1741 struct bch_devs_mask devs = h->devs; 1742 struct open_bucket *ob; 1743 struct open_buckets buckets; 1744 struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; 1745 unsigned i, j, nr_have_parity = 0, nr_have_data = 0; 1746 bool have_cache = true; 1747 int ret = 0; 1748 1749 BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); 1750 BUG_ON(v->nr_redundant != h->s->nr_parity); 1751 1752 for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { 1753 __clear_bit(v->ptrs[i].dev, devs.d); 1754 if (i < h->s->nr_data) 1755 nr_have_data++; 1756 else 1757 nr_have_parity++; 1758 } 1759 1760 BUG_ON(nr_have_data > h->s->nr_data); 1761 BUG_ON(nr_have_parity > h->s->nr_parity); 1762 1763 buckets.nr = 0; 1764 if (nr_have_parity < h->s->nr_parity) { 1765 ret = bch2_bucket_alloc_set_trans(trans, &buckets, 1766 &h->parity_stripe, 1767 &devs, 1768 h->s->nr_parity, 1769 &nr_have_parity, 1770 &have_cache, 0, 1771 BCH_DATA_parity, 1772 watermark, 1773 cl); 1774 1775 open_bucket_for_each(c, &buckets, ob, i) { 1776 j = find_next_zero_bit(h->s->blocks_gotten, 1777 h->s->nr_data + h->s->nr_parity, 1778 h->s->nr_data); 1779 BUG_ON(j >= h->s->nr_data + h->s->nr_parity); 1780 1781 h->s->blocks[j] = buckets.v[i]; 1782 v->ptrs[j] = bch2_ob_ptr(c, ob); 1783 __set_bit(j, h->s->blocks_gotten); 1784 } 1785 1786 if (ret) 1787 return ret; 1788 } 1789 1790 buckets.nr = 0; 1791 if (nr_have_data < h->s->nr_data) { 1792 ret = bch2_bucket_alloc_set_trans(trans, &buckets, 1793 &h->block_stripe, 1794 &devs, 1795 h->s->nr_data, 1796 &nr_have_data, 1797 &have_cache, 0, 1798 BCH_DATA_user, 1799 watermark, 1800 cl); 1801 1802 open_bucket_for_each(c, &buckets, ob, i) { 1803 j = find_next_zero_bit(h->s->blocks_gotten, 1804 h->s->nr_data, 0); 1805 BUG_ON(j >= h->s->nr_data); 1806 1807 h->s->blocks[j] = buckets.v[i]; 1808 v->ptrs[j] = bch2_ob_ptr(c, ob); 1809 __set_bit(j, h->s->blocks_gotten); 1810 } 1811 1812 if (ret) 1813 return ret; 1814 } 1815 1816 return 0; 1817 } 1818 1819 /* XXX: doesn't obey target: */ 1820 static s64 get_existing_stripe(struct bch_fs *c, 1821 struct ec_stripe_head *head) 1822 { 1823 ec_stripes_heap *h = &c->ec_stripes_heap; 1824 struct stripe *m; 1825 size_t heap_idx; 1826 u64 stripe_idx; 1827 s64 ret = -1; 1828 1829 if (may_create_new_stripe(c)) 1830 return -1; 1831 1832 mutex_lock(&c->ec_stripes_heap_lock); 1833 for (heap_idx = 0; heap_idx < h->used; heap_idx++) { 1834 /* No blocks worth reusing, stripe will just be deleted: */ 1835 if (!h->data[heap_idx].blocks_nonempty) 1836 continue; 1837 1838 stripe_idx = h->data[heap_idx].idx; 1839 1840 m = genradix_ptr(&c->stripes, stripe_idx); 1841 1842 if (m->algorithm == head->algo && 1843 m->nr_redundant == head->redundancy && 1844 m->sectors == head->blocksize && 1845 m->blocks_nonempty < m->nr_blocks - m->nr_redundant && 1846 bch2_try_open_stripe(c, head->s, stripe_idx)) { 1847 ret = stripe_idx; 1848 break; 1849 } 1850 } 1851 mutex_unlock(&c->ec_stripes_heap_lock); 1852 return ret; 1853 } 1854 1855 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) 1856 { 1857 struct bch_fs *c = trans->c; 1858 struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; 1859 struct bch_stripe *existing_v; 1860 unsigned i; 1861 s64 idx; 1862 int ret; 1863 1864 /* 1865 * If we can't allocate a new stripe, and there's no stripes with empty 1866 * blocks for us to reuse, that means we have to wait on copygc: 1867 */ 1868 idx = get_existing_stripe(c, h); 1869 if (idx < 0) 1870 return -BCH_ERR_stripe_alloc_blocked; 1871 1872 ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); 1873 bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, 1874 "reading stripe key: %s", bch2_err_str(ret)); 1875 if (ret) { 1876 bch2_stripe_close(c, h->s); 1877 return ret; 1878 } 1879 1880 existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; 1881 1882 BUG_ON(existing_v->nr_redundant != h->s->nr_parity); 1883 h->s->nr_data = existing_v->nr_blocks - 1884 existing_v->nr_redundant; 1885 1886 ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); 1887 if (ret) { 1888 bch2_stripe_close(c, h->s); 1889 return ret; 1890 } 1891 1892 BUG_ON(h->s->existing_stripe.size != h->blocksize); 1893 BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); 1894 1895 /* 1896 * Free buckets we initially allocated - they might conflict with 1897 * blocks from the stripe we're reusing: 1898 */ 1899 for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { 1900 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); 1901 h->s->blocks[i] = 0; 1902 } 1903 memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); 1904 memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); 1905 1906 for (i = 0; i < existing_v->nr_blocks; i++) { 1907 if (stripe_blockcount_get(existing_v, i)) { 1908 __set_bit(i, h->s->blocks_gotten); 1909 __set_bit(i, h->s->blocks_allocated); 1910 } 1911 1912 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); 1913 } 1914 1915 bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); 1916 h->s->have_existing_stripe = true; 1917 1918 return 0; 1919 } 1920 1921 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) 1922 { 1923 struct bch_fs *c = trans->c; 1924 struct btree_iter iter; 1925 struct bkey_s_c k; 1926 struct bpos min_pos = POS(0, 1); 1927 struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); 1928 int ret; 1929 1930 if (!h->s->res.sectors) { 1931 ret = bch2_disk_reservation_get(c, &h->s->res, 1932 h->blocksize, 1933 h->s->nr_parity, 1934 BCH_DISK_RESERVATION_NOFAIL); 1935 if (ret) 1936 return ret; 1937 } 1938 1939 for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, 1940 BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { 1941 if (bkey_gt(k.k->p, POS(0, U32_MAX))) { 1942 if (start_pos.offset) { 1943 start_pos = min_pos; 1944 bch2_btree_iter_set_pos(&iter, start_pos); 1945 continue; 1946 } 1947 1948 ret = -BCH_ERR_ENOSPC_stripe_create; 1949 break; 1950 } 1951 1952 if (bkey_deleted(k.k) && 1953 bch2_try_open_stripe(c, h->s, k.k->p.offset)) 1954 break; 1955 } 1956 1957 c->ec_stripe_hint = iter.pos.offset; 1958 1959 if (ret) 1960 goto err; 1961 1962 ret = ec_stripe_mem_alloc(trans, &iter); 1963 if (ret) { 1964 bch2_stripe_close(c, h->s); 1965 goto err; 1966 } 1967 1968 h->s->new_stripe.key.k.p = iter.pos; 1969 out: 1970 bch2_trans_iter_exit(trans, &iter); 1971 return ret; 1972 err: 1973 bch2_disk_reservation_put(c, &h->s->res); 1974 goto out; 1975 } 1976 1977 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, 1978 unsigned target, 1979 unsigned algo, 1980 unsigned redundancy, 1981 enum bch_watermark watermark, 1982 struct closure *cl) 1983 { 1984 struct bch_fs *c = trans->c; 1985 struct ec_stripe_head *h; 1986 bool waiting = false; 1987 int ret; 1988 1989 h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); 1990 if (IS_ERR_OR_NULL(h)) 1991 return h; 1992 1993 if (!h->s) { 1994 ret = ec_new_stripe_alloc(c, h); 1995 if (ret) { 1996 bch_err(c, "failed to allocate new stripe"); 1997 goto err; 1998 } 1999 } 2000 2001 if (h->s->allocated) 2002 goto allocated; 2003 2004 if (h->s->have_existing_stripe) 2005 goto alloc_existing; 2006 2007 /* First, try to allocate a full stripe: */ 2008 ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: 2009 __bch2_ec_stripe_head_reserve(trans, h); 2010 if (!ret) 2011 goto allocate_buf; 2012 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || 2013 bch2_err_matches(ret, ENOMEM)) 2014 goto err; 2015 2016 /* 2017 * Not enough buckets available for a full stripe: we must reuse an 2018 * existing stripe: 2019 */ 2020 while (1) { 2021 ret = __bch2_ec_stripe_head_reuse(trans, h); 2022 if (!ret) 2023 break; 2024 if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) 2025 goto err; 2026 2027 if (watermark == BCH_WATERMARK_copygc) { 2028 ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: 2029 __bch2_ec_stripe_head_reserve(trans, h); 2030 if (ret) 2031 goto err; 2032 goto allocate_buf; 2033 } 2034 2035 /* XXX freelist_wait? */ 2036 closure_wait(&c->freelist_wait, cl); 2037 waiting = true; 2038 } 2039 2040 if (waiting) 2041 closure_wake_up(&c->freelist_wait); 2042 alloc_existing: 2043 /* 2044 * Retry allocating buckets, with the watermark for this 2045 * particular write: 2046 */ 2047 ret = new_stripe_alloc_buckets(trans, h, watermark, cl); 2048 if (ret) 2049 goto err; 2050 2051 allocate_buf: 2052 ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); 2053 if (ret) 2054 goto err; 2055 2056 h->s->allocated = true; 2057 allocated: 2058 BUG_ON(!h->s->idx); 2059 BUG_ON(!h->s->new_stripe.data[0]); 2060 BUG_ON(trans->restarted); 2061 return h; 2062 err: 2063 bch2_ec_stripe_head_put(c, h); 2064 return ERR_PTR(ret); 2065 } 2066 2067 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) 2068 { 2069 struct ec_stripe_head *h; 2070 struct open_bucket *ob; 2071 unsigned i; 2072 2073 mutex_lock(&c->ec_stripe_head_lock); 2074 list_for_each_entry(h, &c->ec_stripe_head_list, list) { 2075 mutex_lock(&h->lock); 2076 if (!h->s) 2077 goto unlock; 2078 2079 if (!ca) 2080 goto found; 2081 2082 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { 2083 if (!h->s->blocks[i]) 2084 continue; 2085 2086 ob = c->open_buckets + h->s->blocks[i]; 2087 if (ob->dev == ca->dev_idx) 2088 goto found; 2089 } 2090 goto unlock; 2091 found: 2092 h->s->err = -BCH_ERR_erofs_no_writes; 2093 ec_stripe_set_pending(c, h); 2094 unlock: 2095 mutex_unlock(&h->lock); 2096 } 2097 mutex_unlock(&c->ec_stripe_head_lock); 2098 } 2099 2100 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) 2101 { 2102 __bch2_ec_stop(c, ca); 2103 } 2104 2105 void bch2_fs_ec_stop(struct bch_fs *c) 2106 { 2107 __bch2_ec_stop(c, NULL); 2108 } 2109 2110 static bool bch2_fs_ec_flush_done(struct bch_fs *c) 2111 { 2112 bool ret; 2113 2114 mutex_lock(&c->ec_stripe_new_lock); 2115 ret = list_empty(&c->ec_stripe_new_list); 2116 mutex_unlock(&c->ec_stripe_new_lock); 2117 2118 return ret; 2119 } 2120 2121 void bch2_fs_ec_flush(struct bch_fs *c) 2122 { 2123 wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); 2124 } 2125 2126 int bch2_stripes_read(struct bch_fs *c) 2127 { 2128 int ret = bch2_trans_run(c, 2129 for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, 2130 BTREE_ITER_PREFETCH, k, ({ 2131 if (k.k->type != KEY_TYPE_stripe) 2132 continue; 2133 2134 ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); 2135 if (ret) 2136 break; 2137 2138 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; 2139 2140 struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); 2141 m->sectors = le16_to_cpu(s->sectors); 2142 m->algorithm = s->algorithm; 2143 m->nr_blocks = s->nr_blocks; 2144 m->nr_redundant = s->nr_redundant; 2145 m->blocks_nonempty = 0; 2146 2147 for (unsigned i = 0; i < s->nr_blocks; i++) 2148 m->blocks_nonempty += !!stripe_blockcount_get(s, i); 2149 2150 bch2_stripes_heap_insert(c, m, k.k->p.offset); 2151 0; 2152 }))); 2153 bch_err_fn(c, ret); 2154 return ret; 2155 } 2156 2157 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) 2158 { 2159 ec_stripes_heap *h = &c->ec_stripes_heap; 2160 struct stripe *m; 2161 size_t i; 2162 2163 mutex_lock(&c->ec_stripes_heap_lock); 2164 for (i = 0; i < min_t(size_t, h->used, 50); i++) { 2165 m = genradix_ptr(&c->stripes, h->data[i].idx); 2166 2167 prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, 2168 h->data[i].blocks_nonempty, 2169 m->nr_blocks - m->nr_redundant, 2170 m->nr_redundant); 2171 if (bch2_stripe_is_open(c, h->data[i].idx)) 2172 prt_str(out, " open"); 2173 prt_newline(out); 2174 } 2175 mutex_unlock(&c->ec_stripes_heap_lock); 2176 } 2177 2178 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) 2179 { 2180 struct ec_stripe_head *h; 2181 struct ec_stripe_new *s; 2182 2183 mutex_lock(&c->ec_stripe_head_lock); 2184 list_for_each_entry(h, &c->ec_stripe_head_list, list) { 2185 prt_printf(out, "target %u algo %u redundancy %u %s:\n", 2186 h->target, h->algo, h->redundancy, 2187 bch2_watermarks[h->watermark]); 2188 2189 if (h->s) 2190 prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", 2191 h->s->idx, h->s->nr_data, h->s->nr_parity, 2192 bitmap_weight(h->s->blocks_allocated, 2193 h->s->nr_data)); 2194 } 2195 mutex_unlock(&c->ec_stripe_head_lock); 2196 2197 prt_printf(out, "in flight:\n"); 2198 2199 mutex_lock(&c->ec_stripe_new_lock); 2200 list_for_each_entry(s, &c->ec_stripe_new_list, list) { 2201 prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", 2202 s->idx, s->nr_data, s->nr_parity, 2203 atomic_read(&s->ref[STRIPE_REF_io]), 2204 atomic_read(&s->ref[STRIPE_REF_stripe]), 2205 bch2_watermarks[s->h->watermark]); 2206 } 2207 mutex_unlock(&c->ec_stripe_new_lock); 2208 } 2209 2210 void bch2_fs_ec_exit(struct bch_fs *c) 2211 { 2212 struct ec_stripe_head *h; 2213 unsigned i; 2214 2215 while (1) { 2216 mutex_lock(&c->ec_stripe_head_lock); 2217 h = list_first_entry_or_null(&c->ec_stripe_head_list, 2218 struct ec_stripe_head, list); 2219 if (h) 2220 list_del(&h->list); 2221 mutex_unlock(&c->ec_stripe_head_lock); 2222 if (!h) 2223 break; 2224 2225 if (h->s) { 2226 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) 2227 BUG_ON(h->s->blocks[i]); 2228 2229 kfree(h->s); 2230 } 2231 kfree(h); 2232 } 2233 2234 BUG_ON(!list_empty(&c->ec_stripe_new_list)); 2235 2236 free_heap(&c->ec_stripes_heap); 2237 genradix_free(&c->stripes); 2238 bioset_exit(&c->ec_bioset); 2239 } 2240 2241 void bch2_fs_ec_init_early(struct bch_fs *c) 2242 { 2243 spin_lock_init(&c->ec_stripes_new_lock); 2244 mutex_init(&c->ec_stripes_heap_lock); 2245 2246 INIT_LIST_HEAD(&c->ec_stripe_head_list); 2247 mutex_init(&c->ec_stripe_head_lock); 2248 2249 INIT_LIST_HEAD(&c->ec_stripe_new_list); 2250 mutex_init(&c->ec_stripe_new_lock); 2251 init_waitqueue_head(&c->ec_stripe_new_wait); 2252 2253 INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); 2254 INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); 2255 } 2256 2257 int bch2_fs_ec_init(struct bch_fs *c) 2258 { 2259 return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), 2260 BIOSET_NEED_BVECS); 2261 } 2262