// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
				       u64 now, int rw)
{
	u64 latency_capable =
		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
	/* ideally we'd be taking into account the device's variance here: */
	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
	s64 latency_over = io_latency - latency_threshold;

	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so don't
		 * bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);

		ca->congested_last = now;
	} else if (atomic_read(&ca->congested) > 0) {
		atomic_dec(&ca->congested);
	}
}

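/*
 * Maintain a per-device exponentially weighted moving average of IO latency,
 * which also feeds the congestion accounting above:
 */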
void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	atomic64_t *latency = &ca->cur_latency[rw];
	u64 now = local_clock();
	u64 io_latency = time_after64(now, submit_time)
		? now - submit_time
		: 0;
	u64 old, new, v = atomic64_read(latency);

	do {
		old = v;

		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most of
		 * the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);

	bch2_congested_acct(ca, io_latency, now, rw);

	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}

#endif

/* Allocate, free from mempool: */

void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, iter)
		if (bv->bv_page != ZERO_PAGE(0))
			mempool_free(bv->bv_page, &c->bio_bounce_pages);
	bio->bi_vcnt = 0;
}

static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
	struct page *page;

	if (likely(!*using_mempool)) {
		page = alloc_page(GFP_NOFS);
		if (unlikely(!page)) {
			mutex_lock(&c->bio_bounce_pages_lock);
			*using_mempool = true;
			goto pool_alloc;
		}
	} else {
pool_alloc:
		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
	}

	return page;
}

void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
			       size_t size)
{
	bool using_mempool = false;

	while (size) {
		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
		unsigned len = min_t(size_t, PAGE_SIZE, size);

		BUG_ON(!bio_add_page(bio, page, len, 0));
		size -= len;
	}

	if (using_mempool)
		mutex_unlock(&c->bio_bounce_pages_lock);
}

/* Extent update path: */

int bch2_sum_sector_overwrites(struct btree_trans *trans,
			       struct btree_iter *extent_iter,
			       struct bkey_i *new,
			       bool *usage_increasing,
			       s64 *i_sectors_delta,
			       s64 *disk_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
	int ret = 0;

	*usage_increasing = false;
	*i_sectors_delta = 0;
	*disk_sectors_delta = 0;

	bch2_trans_copy_iter(&iter, extent_iter);

	for_each_btree_key_upto_continue_norestart(iter,
				new->k.p, BTREE_ITER_SLOTS, old, ret) {
		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
			max(bkey_start_offset(&new->k),
			    bkey_start_offset(old.k));

		*i_sectors_delta += sectors *
			(bkey_extent_is_allocation(&new->k) -
			 bkey_extent_is_allocation(old.k));

		*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
		*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
			? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
			: 0;

		if (!*usage_increasing &&
		    (new->k.p.snapshot != old.k->p.snapshot ||
		     new_replicas > bch2_bkey_replicas(c, old) ||
		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
			*usage_increasing = true;

		if (bkey_ge(old.k->p, new->k.p))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

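/*
 * Keep the inode's i_size and i_sectors in sync with an extent update; the
 * inode update only hits the journal (BTREE_UPDATE_NOJOURNAL is cleared) when
 * something actually changed:
 */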
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	struct btree_iter iter;
	struct bkey_i *k;
	struct bkey_i_inode_v3 *inode;
	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
	int ret;

	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
			      SPOS(0,
				   extent_iter->pos.inode,
				   extent_iter->snapshot),
			      BTREE_ITER_CACHED);
	ret = PTR_ERR_OR_ZERO(k);
	if (unlikely(ret))
		return ret;

	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
		k = bch2_inode_to_v3(trans, k);
		ret = PTR_ERR_OR_ZERO(k);
		if (unlikely(ret))
			goto err;
	}

	inode = bkey_i_to_inode_v3(k);

	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
		inode->v.bi_size = cpu_to_le64(new_i_size);
		inode_update_flags = 0;
	}

	if (i_sectors_delta) {
		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
		inode_update_flags = 0;
	}

	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}

	ret = bch2_trans_update(trans, &iter, &inode->k_i,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				inode_update_flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{
	struct bpos next_pos;
	bool usage_increasing;
	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
	int ret;

	/*
	 * This traverses the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
			&usage_increasing,
			&i_sectors_delta,
			&disk_sectors_delta);
	if (ret)
		return ret;

	if (disk_res &&
	    disk_sectors_delta > (s64) disk_res->sectors) {
		ret = bch2_disk_reservation_add(trans->c, disk_res,
					disk_sectors_delta - disk_res->sectors,
					!check_enospc || !usage_increasing
					? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			return ret;
	}

	/*
	 * Note:
	 * We always have to do an inode update - even when i_size/i_sectors
	 * aren't changing - for fsync to work properly; fsync relies on
	 * inode->bi_journal_seq which is updated by the trigger code:
	 */
	ret =   bch2_extent_update_i_size_sectors(trans, iter,
						  min(k->k.p.offset << 9, new_i_size),
						  i_sectors_delta) ?:
		bch2_trans_update(trans, iter, k, 0) ?:
		bch2_trans_commit(trans, disk_res, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL);
	if (unlikely(ret))
		return ret;

	if (i_sectors_delta_total)
		*i_sectors_delta_total += i_sectors_delta;
	bch2_btree_iter_set_pos(iter, next_pos);
	return 0;
}

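/*
 * Default index update path for writes: insert the keys in op->insert_keys
 * into the extents btree one at a time, retrying on transaction restart and
 * trimming any key that was only partially inserted:
 */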
static int bch2_write_index_default(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct bkey_buf sk;
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	subvol_inum inum = {
		.subvol = op->subvol,
		.inum = k->k.p.inode,
	};
	int ret;

	BUG_ON(!inum.subvol);

	bch2_bkey_buf_init(&sk);

	do {
		bch2_trans_begin(trans);

		k = bch2_keylist_front(keys);
		bch2_bkey_buf_copy(&sk, c, k);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
						  &sk.k->k.p.snapshot);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     bkey_start_pos(&sk.k->k),
				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

		ret = bch2_extent_update(trans, inum, &iter, sk.k,
					 &op->res,
					 op->new_i_size, &op->i_sectors_delta,
					 op->flags & BCH_WRITE_CHECK_ENOSPC);
		bch2_trans_iter_exit(trans, &iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(iter.pos, k->k.p))
			bch2_keylist_pop_front(&op->insert_keys);
		else
			bch2_cut_front(iter.pos, k);
	} while (!bch2_keylist_empty(keys));

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

/* Writes */

void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
			       enum bch_data_type type,
			       const struct bkey_i *k,
			       bool nocow)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
	const struct bch_extent_ptr *ptr;
	struct bch_write_bio *n;
	struct bch_dev *ca;

	BUG_ON(c->opts.nochanges);

	bkey_for_each_ptr(ptrs, ptr) {
		BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
		       !c->devs[ptr->dev]);

		ca = bch_dev_bkey_exists(c, ptr->dev);

		if (to_entry(ptr + 1) < ptrs.end) {
			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
						    GFP_NOFS, &ca->replica_set));

			n->bio.bi_end_io = wbio->bio.bi_end_io;
			n->bio.bi_private = wbio->bio.bi_private;
			n->parent = wbio;
			n->split = true;
			n->bounce = false;
			n->put_bio = true;
			n->bio.bi_opf = wbio->bio.bi_opf;
			bio_inc_remaining(&wbio->bio);
		} else {
			n = wbio;
			n->split = false;
		}

		n->c = c;
		n->dev = ptr->dev;
		n->have_ioref = nocow || bch2_dev_get_ioref(ca,
					type == BCH_DATA_btree ? READ : WRITE);
		n->nocow = nocow;
		n->submit_time = local_clock();
		n->inode_offset = bkey_start_offset(&k->k);
		n->bio.bi_iter.bi_sector = ptr->offset;

		if (likely(n->have_ioref)) {
			this_cpu_add(ca->io_done->sectors[WRITE][type],
				     bio_sectors(&n->bio));

			bio_set_dev(&n->bio, ca->disk_sb.bdev);

			if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
				bio_endio(&n->bio);
				continue;
			}

			submit_bio(&n->bio);
		} else {
			n->bio.bi_status = BLK_STS_REMOVED;
			bio_endio(&n->bio);
		}
	}
}

static void __bch2_write(struct bch_write_op *);

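/*
 * Final write completion: releases the disk reservation and write ref, frees
 * the keylist, and calls the caller's end_io:
 */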
static void bch2_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_fs *c = op->c;

	EBUG_ON(op->open_buckets.nr);

	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
	bch2_disk_reservation_put(c, &op->res);

	if (!(op->flags & BCH_WRITE_MOVE))
		bch2_write_ref_put(c, BCH_WRITE_REF_write);
	bch2_keylist_free(&op->insert_keys, op->inline_keys);

	EBUG_ON(cl->parent);
	closure_debug_destroy(cl);
	if (op->end_io)
		op->end_io(op);
}

static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
	struct keylist *keys = &op->insert_keys;
	struct bch_extent_ptr *ptr;
	struct bkey_i *src, *dst = keys->keys, *n;

	for (src = keys->keys; src != keys->top; src = n) {
		n = bkey_next(src);

		if (bkey_extent_is_direct_data(&src->k)) {
			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
					    test_bit(ptr->dev, op->failed.d));

			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
				return -EIO;
		}

		if (dst != src)
			memmove_u64s_down(dst, src, src->k.u64s);
		dst = bkey_next(dst);
	}

	keys->top = dst;
	return 0;
}

/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:	bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *k;
	unsigned dev;
	int ret = 0;

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		ret = bch2_write_drop_io_error_ptrs(op);
		if (ret)
			goto err;
	}

	/*
	 * probably not the ideal place to hook this in, but I don't
	 * particularly want to plumb io_opts all the way through the btree
	 * update stack right now
	 */
	for_each_keylist_key(keys, k)
		bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);

	if (!bch2_keylist_empty(keys)) {
		u64 sectors_start = keylist_sectors(keys);

		ret = !(op->flags & BCH_WRITE_MOVE)
			? bch2_write_index_default(op)
			: bch2_data_update_index_update(op);

		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
		BUG_ON(keylist_sectors(keys) && !ret);

		op->written += sectors_start - keylist_sectors(keys);

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"write error while doing btree update: %s",
				bch2_err_str(ret));
		}

		if (ret)
			goto err;
	}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
		bch2_open_bucket_write_error(c, &op->open_buckets, dev);

	bch2_open_buckets_put(c, &op->open_buckets);
	return;
err:
	keys->top = keys->keys;
	op->error = ret;
	op->flags |= BCH_WRITE_DONE;
	goto out;
}

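/*
 * Accounting for how long a write point spends in each state (running,
 * waiting on IO, waiting on the index update worker, stopped):
 */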
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
	if (state != wp->state) {
		u64 now = ktime_get_ns();

		if (wp->last_state_change &&
		    time_after64(now, wp->last_state_change))
			wp->time[wp->state] += now - wp->last_state_change;
		wp->state = state;
		wp->last_state_change = now;
	}
}

static inline void wp_update_state(struct write_point *wp, bool running)
{
	enum write_point_state state;

	state = running ? WRITE_POINT_running :
		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
					 : WRITE_POINT_stopped;

	__wp_update_state(wp, state);
}

static void bch2_write_index(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct write_point *wp = op->wp;
	struct workqueue_struct *wq = index_update_wq(op);
	unsigned long flags;

	if ((op->flags & BCH_WRITE_DONE) &&
	    (op->flags & BCH_WRITE_MOVE))
		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

	spin_lock_irqsave(&wp->writes_lock, flags);
	if (wp->state == WRITE_POINT_waiting_io)
		__wp_update_state(wp, WRITE_POINT_waiting_work);
	list_add_tail(&op->wp_list, &wp->writes);
	spin_unlock_irqrestore(&wp->writes_lock, flags);

	queue_work(wq, &wp->index_update_work);
}

static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
	op->wp = wp;

	if (wp->state == WRITE_POINT_stopped) {
		spin_lock_irq(&wp->writes_lock);
		__wp_update_state(wp, WRITE_POINT_waiting_io);
		spin_unlock_irq(&wp->writes_lock);
	}
}

void bch2_write_point_do_index_updates(struct work_struct *work)
{
	struct write_point *wp =
		container_of(work, struct write_point, index_update_work);
	struct bch_write_op *op;

	while (1) {
		spin_lock_irq(&wp->writes_lock);
		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
		if (op)
			list_del(&op->wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_IN_WORKER;

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}

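/*
 * Per-replica write completion: account latency, record IO errors against the
 * device, and complete the parent bio or the write op's closure:
 */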
static void bch2_write_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_write_bio *wbio = to_wbio(bio);
	struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
	struct bch_fs *c = wbio->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
				    op->pos.inode,
				    wbio->inode_offset << 9,
				    "data write error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		set_bit(wbio->dev, op->failed.d);
		op->flags |= BCH_WRITE_IO_ERROR;
	}

	if (wbio->nocow)
		set_bit(wbio->dev, op->devs_need_flush->d);

	if (wbio->have_ioref) {
		bch2_latency_acct(ca, wbio->submit_time, WRITE);
		percpu_ref_put(&ca->io_ref);
	}

	if (wbio->bounce)
		bch2_bio_free_pages_pool(c, bio);

	if (wbio->put_bio)
		bio_put(bio);

	if (parent)
		bio_endio(&parent->bio);
	else
		closure_put(cl);
}

static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
			       struct bch_extent_crc_unpacked crc)
{
	struct bkey_i_extent *e;

	op->pos.offset += crc.uncompressed_size;

	e = bkey_extent_init(op->insert_keys.top);
	e->k.p = op->pos;
	e->k.size = crc.uncompressed_size;
	e->k.version = version;

	if (crc.csum_type ||
	    crc.compression_type ||
	    crc.nonce)
		bch2_extent_crc_append(&e->k_i, crc);

	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
				       op->flags & BCH_WRITE_CACHED);

	bch2_keylist_push(&op->insert_keys);
}

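/*
 * Allocate the output bio for a write, bounced if needed: sized to what the
 * write point has free, with pages from the bounce mempool and/or the page
 * allocator:
 */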
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
					struct write_point *wp,
					struct bio *src,
					bool *page_alloc_failed,
					void *buf)
{
	struct bch_write_bio *wbio;
	struct bio *bio;
	unsigned output_available =
		min(wp->sectors_free << 9, src->bi_iter.bi_size);
	unsigned pages = DIV_ROUND_UP(output_available +
				      (buf
				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
				       : 0), PAGE_SIZE);

	pages = min(pages, BIO_MAX_VECS);

	bio = bio_alloc_bioset(NULL, pages, 0,
			       GFP_NOFS, &c->bio_write);
	wbio = wbio_init(bio);
	wbio->put_bio = true;
	/* copy WRITE_SYNC flag */
	wbio->bio.bi_opf = src->bi_opf;

	if (buf) {
		bch2_bio_map(bio, buf, output_available);
		return bio;
	}

	wbio->bounce = true;

	/*
	 * We can't use mempool for more than c->sb.encoded_extent_max
	 * worth of pages, but we'd like to allocate more if we can:
	 */
	bch2_bio_alloc_pages_pool(c, bio,
				  min_t(unsigned, output_available,
					c->opts.encoded_extent_max));

	if (bio->bi_iter.bi_size < output_available)
		*page_alloc_failed =
			bch2_bio_alloc_pages(bio,
					     output_available -
					     bio->bi_iter.bi_size,
					     GFP_NOFS) != 0;

	return bio;
}

static int bch2_write_rechecksum(struct bch_fs *c,
				 struct bch_write_op *op,
				 unsigned new_csum_type)
{
	struct bio *bio = &op->wbio.bio;
	struct bch_extent_crc_unpacked new_crc;
	int ret;

	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */

	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
	    bch2_csum_type_is_encryption(new_csum_type))
		new_csum_type = op->crc.csum_type;

	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
				  NULL, &new_crc,
				  op->crc.offset, op->crc.live_size,
				  new_csum_type);
	if (ret)
		return ret;

	bio_advance(bio, op->crc.offset << 9);
	bio->bi_iter.bi_size = op->crc.live_size << 9;
	op->crc = new_crc;
	return 0;
}

static int bch2_write_decrypt(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct nonce nonce = extent_nonce(op->version, op->crc);
	struct bch_csum csum;
	int ret;

	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
		return 0;

	/*
	 * If we need to decrypt data in the write path, we'll no longer be able
	 * to verify the existing checksum (poly1305 mac, in this case) after
	 * it's decrypted - this is the last point we'll be able to reverify the
	 * checksum:
	 */
	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	if (bch2_crc_cmp(op->crc.csum, csum))
		return -EIO;

	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	op->crc.csum_type = 0;
	op->crc.csum = (struct bch_csum) { 0, 0 };
	return ret;
}

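/*
 * Decide what to do with data that is already encoded (checksummed,
 * compressed and/or encrypted), e.g. from the move path: pass it through as
 * is, re-checksum/decompress/decrypt it, or report a checksum error:
 */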
static enum prep_encoded_ret {
	PREP_ENCODED_OK,
	PREP_ENCODED_ERR,
	PREP_ENCODED_CHECKSUM_ERR,
	PREP_ENCODED_DO_WRITE,
} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
	struct bch_fs *c = op->c;
	struct bio *bio = &op->wbio.bio;

	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
		return PREP_ENCODED_OK;

	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

	/* Can we just write the entire extent as is? */
	if (op->crc.uncompressed_size == op->crc.live_size &&
	    op->crc.compressed_size <= wp->sectors_free &&
	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
	     op->incompressible)) {
		if (!crc_is_compressed(op->crc) &&
		    op->csum_type != op->crc.csum_type &&
		    bch2_write_rechecksum(c, op, op->csum_type) &&
		    !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		return PREP_ENCODED_DO_WRITE;
	}

	/*
	 * If the data is compressed and we couldn't write the entire extent as
	 * is, we have to decompress it:
	 */
	if (crc_is_compressed(op->crc)) {
		struct bch_csum csum;

		if (bch2_write_decrypt(op))
			return PREP_ENCODED_CHECKSUM_ERR;

		/* Last point we can still verify checksum: */
		csum = bch2_checksum_bio(c, op->crc.csum_type,
					 extent_nonce(op->version, op->crc),
					 bio);
		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
			return PREP_ENCODED_ERR;
	}

	/*
	 * No longer have compressed data after this point - data might be
	 * encrypted:
	 */

	/*
	 * If the data is checksummed and we're only writing a subset,
	 * rechecksum and adjust bio to point to currently live data:
	 */
	if ((op->crc.live_size != op->crc.uncompressed_size ||
	     op->crc.csum_type != op->csum_type) &&
	    bch2_write_rechecksum(c, op, op->csum_type) &&
	    !c->opts.no_data_io)
		return PREP_ENCODED_CHECKSUM_ERR;

	/*
	 * If we want to compress the data, it has to be decrypted:
	 */
	if ((op->compression_opt ||
	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
	     bch2_csum_type_is_encryption(op->csum_type)) &&
	    bch2_write_decrypt(op))
		return PREP_ENCODED_CHECKSUM_ERR;

	return PREP_ENCODED_OK;
}

static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
			     struct bio **_dst)
{
	struct bch_fs *c = op->c;
	struct bio *src = &op->wbio.bio, *dst = src;
	struct bvec_iter saved_iter;
	void *ec_buf;
	unsigned total_output = 0, total_input = 0;
	bool bounce = false;
	bool page_alloc_failed = false;
	int ret, more = 0;

	BUG_ON(!bio_sectors(src));

	ec_buf = bch2_writepoint_ec_buf(c, wp);

	switch (bch2_write_prep_encoded_data(op, wp)) {
	case PREP_ENCODED_OK:
		break;
	case PREP_ENCODED_ERR:
		ret = -EIO;
		goto err;
	case PREP_ENCODED_CHECKSUM_ERR:
		goto csum_err;
	case PREP_ENCODED_DO_WRITE:
		/* XXX look for bug here */
		if (ec_buf) {
			dst = bch2_write_bio_alloc(c, wp, src,
						   &page_alloc_failed,
						   ec_buf);
			bio_copy_data(dst, src);
			bounce = true;
		}
		init_append_extent(op, wp, op->version, op->crc);
		goto do_write;
	}

	if (ec_buf ||
	    op->compression_opt ||
	    (op->csum_type &&
	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
	    (bch2_csum_type_is_encryption(op->csum_type) &&
	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
		dst = bch2_write_bio_alloc(c, wp, src,
					   &page_alloc_failed,
					   ec_buf);
		bounce = true;
	}

	saved_iter = dst->bi_iter;

	do {
		struct bch_extent_crc_unpacked crc = { 0 };
		struct bversion version = op->version;
		size_t dst_len = 0, src_len = 0;

		if (page_alloc_failed &&
		    dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
		    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
			break;

		BUG_ON(op->compression_opt &&
		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
		       bch2_csum_type_is_encryption(op->crc.csum_type));
		BUG_ON(op->compression_opt && !bounce);

		crc.compression_type = op->incompressible
			? BCH_COMPRESSION_TYPE_incompressible
			: op->compression_opt
			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
					    op->compression_opt)
			: 0;
		if (!crc_is_compressed(crc)) {
			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);

			if (op->csum_type)
				dst_len = min_t(unsigned, dst_len,
						c->opts.encoded_extent_max);

			if (bounce) {
				swap(dst->bi_iter.bi_size, dst_len);
				bio_copy_data(dst, src);
				swap(dst->bi_iter.bi_size, dst_len);
			}

			src_len = dst_len;
		}

		BUG_ON(!src_len || !dst_len);

		if (bch2_csum_type_is_encryption(op->csum_type)) {
			if (bversion_zero(version)) {
				version.lo = atomic64_inc_return(&c->key_version);
			} else {
				crc.nonce = op->nonce;
				op->nonce += src_len >> 9;
			}
		}

		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
		    !crc_is_compressed(crc) &&
		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
		    bch2_csum_type_is_encryption(op->csum_type)) {
			u8 compression_type = crc.compression_type;
			u16 nonce = crc.nonce;
			/*
			 * Note: when we're using rechecksum(), we need to be
			 * checksumming @src because it has all the data our
			 * existing checksum covers - if we bounced (because we
			 * were trying to compress), @dst will only have the
			 * part of the data the new checksum will cover.
			 *
			 * But normally we want to be checksumming post bounce,
			 * because part of the reason for bouncing is so the
			 * data can't be modified (by userspace) while it's in
			 * flight.
			 */
			if (bch2_rechecksum_bio(c, src, version, op->crc,
					&crc, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->csum_type))
				goto csum_err;
			/*
			 * bch2_rechecksum_bio() sets compression_type on crc from
			 * op->crc, this isn't always correct as sometimes we're
			 * changing an extent from uncompressed to incompressible.
			 */
			crc.compression_type = compression_type;
			crc.nonce = nonce;
		} else {
			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
			    bch2_rechecksum_bio(c, src, version, op->crc,
					NULL, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->crc.csum_type))
				goto csum_err;

			crc.compressed_size = dst_len >> 9;
			crc.uncompressed_size = src_len >> 9;
			crc.live_size = src_len >> 9;

			swap(dst->bi_iter.bi_size, dst_len);
			ret = bch2_encrypt_bio(c, op->csum_type,
					       extent_nonce(version, crc), dst);
			if (ret)
				goto err;

			crc.csum = bch2_checksum_bio(c, op->csum_type,
					 extent_nonce(version, crc), dst);
			crc.csum_type = op->csum_type;
			swap(dst->bi_iter.bi_size, dst_len);
		}

		init_append_extent(op, wp, version, crc);

		if (dst != src)
			bio_advance(dst, dst_len);
		bio_advance(src, src_len);
		total_output += dst_len;
		total_input += src_len;
	} while (dst->bi_iter.bi_size &&
		 src->bi_iter.bi_size &&
		 wp->sectors_free &&
		 !bch2_keylist_realloc(&op->insert_keys,
				       op->inline_keys,
				       ARRAY_SIZE(op->inline_keys),
				       BKEY_EXTENT_U64s_MAX));

	more = src->bi_iter.bi_size != 0;

	dst->bi_iter = saved_iter;

	if (dst == src && more) {
		BUG_ON(total_output != total_input);

		dst = bio_split(src, total_input >> 9,
				GFP_NOFS, &c->bio_write);
		wbio_init(dst)->put_bio = true;
		/* copy WRITE_SYNC flag */
		dst->bi_opf = src->bi_opf;
	}

	dst->bi_iter.bi_size = total_output;
do_write:
	*_dst = dst;
	return more;
csum_err:
	bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
	ret = -EIO;
err:
	if (to_wbio(dst)->bounce)
		bch2_bio_free_pages_pool(c, dst);
	if (to_wbio(dst)->put_bio)
		bio_put(dst);

	return ret;
}

static bool bch2_extent_is_writeable(struct bch_write_op *op,
				     struct bkey_s_c k)
{
	struct bch_fs *c = op->c;
	struct bkey_s_c_extent e;
	struct extent_ptr_decoded p;
	const union bch_extent_entry *entry;
	unsigned replicas = 0;

	if (k.k->type != KEY_TYPE_extent)
		return false;

	e = bkey_s_c_to_extent(k);
	extent_for_each_ptr_decode(e, p, entry) {
		if (p.crc.csum_type ||
		    crc_is_compressed(p.crc) ||
		    p.has_ec)
			return false;

		replicas += bch2_extent_ptr_durability(c, &p);
	}

	return replicas >= op->opts.data_replicas;
}

static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	const struct bch_extent_ptr *ptr;
	struct bkey_i *k;

	for_each_keylist_key(&op->insert_keys, k) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));

		bkey_for_each_ptr(ptrs, ptr)
			bch2_bucket_nocow_unlock(&c->nocow_locks,
						 PTR_BUCKET_POS(c, ptr),
						 BUCKET_NOCOW_LOCK_UPDATE);
	}
}

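/*
 * After a nocow write to an unwritten (fallocated) extent, clear the
 * unwritten flag on the range that was actually written:
 */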
static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
						  struct btree_iter *iter,
						  struct bkey_i *orig,
						  struct bkey_s_c k,
						  u64 new_i_size)
{
	struct bkey_i *new;
	struct bkey_ptrs ptrs;
	struct bch_extent_ptr *ptr;
	int ret;

	if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
		/* trace this */
		return 0;
	}

	new = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		return ret;

	bch2_cut_front(bkey_start_pos(&orig->k), new);
	bch2_cut_back(orig->k.p, new);

	ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
	bkey_for_each_ptr(ptrs, ptr)
		ptr->unwritten = 0;

	/*
	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
	 * that was done when we kicked off the write, and here it's important
	 * that we update the extent that we wrote to - even if a snapshot has
	 * since been created. The write is still outstanding, so we're ok
	 * w.r.t. snapshot atomicity:
	 */
	return  bch2_extent_update_i_size_sectors(trans, iter,
					min(new->k.p.offset << 9, new_i_size), 0) ?:
		bch2_trans_update(trans, iter, new,
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}

static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_i *orig;
	struct bkey_s_c k;
	int ret;

	for_each_keylist_key(&op->insert_keys, orig) {
		ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
				     bkey_start_pos(&orig->k), orig->k.p,
				     BTREE_ITER_INTENT, k,
				     NULL, NULL, BTREE_INSERT_NOFAIL, ({
			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
		}));

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"write error while doing btree update: %s",
				bch2_err_str(ret));
		}

		if (ret) {
			op->error = ret;
			break;
		}
	}

	bch2_trans_put(trans);
}

static void __bch2_nocow_write_done(struct bch_write_op *op)
{
	bch2_nocow_write_unlock(op);

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		op->error = -EIO;
	} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
		bch2_nocow_write_convert_unwritten(op);
}

static void bch2_nocow_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);

	__bch2_nocow_write_done(op);
	bch2_write_done(cl);
}

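/*
 * The nocow write path: write in place to an extent's existing pointers
 * instead of allocating new space, taking per-bucket nocow locks to exclude
 * concurrent data moves. If the extent can't be written in place we fall back
 * to the normal COW path:
 */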
static void bch2_nocow_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_ptrs_c ptrs;
	const struct bch_extent_ptr *ptr;
	struct {
		struct bpos b;
		unsigned gen;
		struct nocow_lock_bucket *l;
	} buckets[BCH_REPLICAS_MAX];
	unsigned nr_buckets = 0;
	u32 snapshot;
	int ret, i;

	if (op->flags & BCH_WRITE_MOVE)
		return;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
	if (unlikely(ret))
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(op->pos.inode, op->pos.offset, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		struct bio *bio = &op->wbio.bio;

		nr_buckets = 0;

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		/* fall back to normal cow write path? */
		if (unlikely(k.k->p.snapshot != snapshot ||
			     !bch2_extent_is_writeable(op, k)))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 k.k->u64s))
			break;

		/* Get iorefs before dropping btree locks: */
		ptrs = bch2_bkey_ptrs_c(k);
		bkey_for_each_ptr(ptrs, ptr) {
			buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
			buckets[nr_buckets].gen = ptr->gen;
			buckets[nr_buckets].l =
				bucket_nocow_lock(&c->nocow_locks,
						  bucket_to_u64(buckets[nr_buckets].b));

			prefetch(buckets[nr_buckets].l);

			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
				goto err_get_ioref;

			nr_buckets++;

			if (ptr->unwritten)
				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
		}

		/* Unlock before taking nocow locks, doing IO: */
		bkey_reassemble(op->insert_keys.top, k);
		bch2_trans_unlock(trans);

		bch2_cut_front(op->pos, op->insert_keys.top);
		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

		for (i = 0; i < nr_buckets; i++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
			struct nocow_lock_bucket *l = buckets[i].l;
			bool stale;

			__bch2_bucket_nocow_lock(&c->nocow_locks, l,
						 bucket_to_u64(buckets[i].b),
						 BUCKET_NOCOW_LOCK_UPDATE);

			rcu_read_lock();
			stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
			rcu_read_unlock();

			if (unlikely(stale))
				goto err_bucket_stale;
		}

		bio = &op->wbio.bio;
		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
					GFP_KERNEL, &c->bio_write);
			wbio_init(bio)->put_bio = true;
			bio->bi_opf = op->wbio.bio.bi_opf;
		} else {
			op->flags |= BCH_WRITE_DONE;
		}

		op->pos.offset += bio_sectors(bio);
		op->written += bio_sectors(bio);

		bio->bi_end_io = bch2_write_endio;
		bio->bi_private = &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;
		closure_get(&op->cl);
		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  op->insert_keys.top, true);

		bch2_keylist_push(&op->insert_keys);
		if (op->flags & BCH_WRITE_DONE)
			break;
		bch2_btree_iter_advance(&iter);
	}
out:
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode,
			op->pos.offset << 9,
			"%s: btree lookup error %s",
			__func__, bch2_err_str(ret));
		op->error = ret;
		op->flags |= BCH_WRITE_DONE;
	}

	bch2_trans_put(trans);

	/* fallback to cow write path? */
	if (!(op->flags & BCH_WRITE_DONE)) {
		closure_sync(&op->cl);
		__bch2_nocow_write_done(op);
		op->insert_keys.top = op->insert_keys.keys;
	} else if (op->flags & BCH_WRITE_SYNC) {
		closure_sync(&op->cl);
		bch2_nocow_write_done(&op->cl);
	} else {
		/*
		 * XXX
		 * needs to run out of process context because ei_quota_lock is
		 * a mutex
		 */
		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
	}
	return;
err_get_ioref:
	for (i = 0; i < nr_buckets; i++)
		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);

	/* Fall back to COW path: */
	goto out;
err_bucket_stale:
	while (i >= 0) {
		bch2_bucket_nocow_unlock(&c->nocow_locks,
					 buckets[i].b,
					 BUCKET_NOCOW_LOCK_UPDATE);
		--i;
	}
	for (i = 0; i < nr_buckets; i++)
		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);

	/* We can retry this: */
	ret = -BCH_ERR_transaction_restart;
	goto out;
}

static void __bch2_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct write_point *wp = NULL;
	struct bio *bio = NULL;
	unsigned nofs_flags;
	int ret;

	nofs_flags = memalloc_nofs_save();

	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
		bch2_nocow_write(op);
		if (op->flags & BCH_WRITE_DONE)
			goto out_nofs_restore;
	}
again:
	memset(&op->failed, 0, sizeof(op->failed));

	do {
		struct bkey_i *key_to_write;
		unsigned key_to_write_offset = op->insert_keys.top_p -
			op->insert_keys.keys_p;

		/* +1 for possible cache device: */
		if (op->open_buckets.nr + op->nr_replicas + 1 >
		    ARRAY_SIZE(op->open_buckets.v))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 BKEY_EXTENT_U64s_MAX))
			break;

		/*
		 * The copygc thread is now global, which means it's no longer
		 * freeing up space on specific disks, which means that
		 * allocations for specific disks may hang arbitrarily long:
		 */
		ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_alloc_sectors_start_trans(trans,
				op->target,
				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
				op->write_point,
				&op->devs_have,
				op->nr_replicas,
				op->nr_replicas_required,
				op->watermark,
				op->flags,
				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
				? NULL : &op->cl, &wp));
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
				break;

			goto err;
		}

		EBUG_ON(!wp);

		bch2_open_bucket_get(c, wp, &op->open_buckets);
		ret = bch2_write_extent(op, wp, &bio);

		bch2_alloc_sectors_done_inlined(c, wp);
err:
		if (ret <= 0) {
			op->flags |= BCH_WRITE_DONE;

			if (ret < 0) {
				op->error = ret;
				break;
			}
		}

		bio->bi_end_io = bch2_write_endio;
		bio->bi_private = &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;

		closure_get(bio->bi_private);

		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_DONE) &&
	     !(op->flags & BCH_WRITE_IN_WORKER))) {
		closure_sync(&op->cl);
		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			goto again;
		bch2_write_done(&op->cl);
	} else {
		bch2_write_queue(op, wp);
		continue_at(&op->cl, bch2_write_index, NULL);
	}
out_nofs_restore:
	memalloc_nofs_restore(nofs_flags);
}

static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
	struct bio *bio = &op->wbio.bio;
	struct bvec_iter iter;
	struct bkey_i_inline_data *id;
	unsigned sectors;
	int ret;

	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
	op->flags |= BCH_WRITE_DONE;

	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
				   ARRAY_SIZE(op->inline_keys),
				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
	if (ret) {
		op->error = ret;
		goto err;
	}

	sectors = bio_sectors(bio);
	op->pos.offset += sectors;

	id = bkey_inline_data_init(op->insert_keys.top);
	id->k.p = op->pos;
	id->k.version = op->version;
	id->k.size = sectors;

	iter = bio->bi_iter;
	iter.bi_size = data_len;
	memcpy_from_bio(id->v.data, bio, iter);

	while (data_len & 7)
		id->v.data[data_len++] = '\0';
	set_bkey_val_bytes(&id->k, data_len);
	bch2_keylist_push(&op->insert_keys);

	__bch2_write_index(op);
err:
	bch2_write_done(&op->cl);
}

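/*
 * Rough usage sketch (illustrative only, not part of the original file):
 * callers such as the buffered and direct IO paths fill in a bch_write_op -
 * position, subvolume, number of replicas, and the data bio in op->wbio.bio -
 * and then submit it via the op's embedded closure, e.g.
 *
 *	closure_call(&op->cl, bch2_write, NULL, NULL);
 *
 * bch2_write() must not be given a parent closure (see the EBUG_ON below);
 * completion is signalled through op->end_io.
 */
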
/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl:	&bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch2_write(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bio *bio = &op->wbio.bio;
	struct bch_fs *c = op->c;
	unsigned data_len;

	EBUG_ON(op->cl.parent);
	BUG_ON(!op->nr_replicas);
	BUG_ON(!op->write_point.v);
	BUG_ON(bkey_eq(op->pos, POS_MAX));

	op->start_time = local_clock();
	bch2_keylist_init(&op->insert_keys, op->inline_keys);
	wbio_init(bio)->put_bio = false;

	if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode,
			op->pos.offset << 9,
			"misaligned write");
		op->error = -EIO;
		goto err;
	}

	if (c->opts.nochanges) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	if (!(op->flags & BCH_WRITE_MOVE) &&
	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
	bch2_increment_clock(c, bio_sectors(bio), WRITE);

	data_len = min_t(u64, bio->bi_iter.bi_size,
			 op->new_i_size - (op->pos.offset << 9));

	if (c->opts.inline_data &&
	    data_len <= min(block_bytes(c) / 2, 1024U)) {
		bch2_write_data_inline(op, data_len);
		return;
	}

	__bch2_write(op);
	return;
err:
	bch2_disk_reservation_put(c, &op->res);

	closure_debug_destroy(&op->cl);
	if (op->end_io)
		op->end_io(op);
}

static const char * const bch2_write_flags[] = {
#define x(f)	#f,
	BCH_WRITE_FLAGS()
#undef x
	NULL
};

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
	prt_str(out, "pos: ");
	bch2_bpos_to_text(out, op->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "started: ");
	bch2_pr_time_units(out, local_clock() - op->start_time);
	prt_newline(out);

	prt_str(out, "flags: ");
	prt_bitflags(out, bch2_write_flags, op->flags);
	prt_newline(out);

	prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

void bch2_fs_io_write_exit(struct bch_fs *c)
{
	mempool_exit(&c->bio_bounce_pages);
	bioset_exit(&c->bio_write);
}

int bch2_fs_io_write_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_write_init;

	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;

	return 0;
}