// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
				       u64 now, int rw)
{
	u64 latency_capable =
		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
	/* ideally we'd be taking into account the device's variance here: */
	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
	s64 latency_over = io_latency - latency_threshold;

	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so don't
		 * bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);

		ca->congested_last = now;
	} else if (atomic_read(&ca->congested) > 0) {
		atomic_dec(&ca->congested);
	}
}

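/*
 * Track per-device IO latency as an exponentially weighted moving average,
 * and feed completions that took much longer than the device's typical
 * latency into the congestion accounting above:
 */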
void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	atomic64_t *latency = &ca->cur_latency[rw];
	u64 now = local_clock();
	u64 io_latency = time_after64(now, submit_time)
		? now - submit_time
		: 0;
	u64 old, new, v = atomic64_read(latency);

	do {
		old = v;

		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most of
		 * the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);

	bch2_congested_acct(ca, io_latency, now, rw);

	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}

#endif

/* Allocate, free from mempool: */

void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, iter)
		if (bv->bv_page != ZERO_PAGE(0))
			mempool_free(bv->bv_page, &c->bio_bounce_pages);
	bio->bi_vcnt = 0;
}

static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
	struct page *page;

	if (likely(!*using_mempool)) {
		page = alloc_page(GFP_NOFS);
		if (unlikely(!page)) {
			mutex_lock(&c->bio_bounce_pages_lock);
			*using_mempool = true;
			goto pool_alloc;

		}
	} else {
pool_alloc:
		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
	}

	return page;
}

void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
			       size_t size)
{
	bool using_mempool = false;

	while (size) {
		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
		unsigned len = min_t(size_t, PAGE_SIZE, size);

		BUG_ON(!bio_add_page(bio, page, len, 0));
		size -= len;
	}

	if (using_mempool)
		mutex_unlock(&c->bio_bounce_pages_lock);
}

/* Extent update path: */

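/*
 * Compare @new against the existing extents it overwrites, to work out how
 * i_sectors and on disk usage will change, and whether the update can ever
 * need more space than the extents it is replacing:
 */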
int bch2_sum_sector_overwrites(struct btree_trans *trans,
			       struct btree_iter *extent_iter,
			       struct bkey_i *new,
			       bool *usage_increasing,
			       s64 *i_sectors_delta,
			       s64 *disk_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
	int ret = 0;

	*usage_increasing	= false;
	*i_sectors_delta	= 0;
	*disk_sectors_delta	= 0;

	bch2_trans_copy_iter(&iter, extent_iter);

	for_each_btree_key_upto_continue_norestart(iter,
				new->k.p, BTREE_ITER_SLOTS, old, ret) {
		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
			max(bkey_start_offset(&new->k),
			    bkey_start_offset(old.k));

		*i_sectors_delta += sectors *
			(bkey_extent_is_allocation(&new->k) -
			 bkey_extent_is_allocation(old.k));

		*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
		*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
			? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
			: 0;

		if (!*usage_increasing &&
		    (new->k.p.snapshot != old.k->p.snapshot ||
		     new_replicas > bch2_bkey_replicas(c, old) ||
		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
			*usage_increasing = true;

		if (bkey_ge(old.k->p, new->k.p))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	struct btree_iter iter;
	struct bkey_i *k;
	struct bkey_i_inode_v3 *inode;
	/*
	 * Crazy performance optimization:
	 * Every extent update needs to also update the inode: the inode trigger
	 * will set bi->journal_seq to the journal sequence number of this
	 * transaction - for fsync.
	 *
	 * But if that's the only reason we're updating the inode (we're not
	 * updating bi_size or bi_sectors), then we don't need the inode update
	 * to be journalled - if we crash, the bi_journal_seq update will be
	 * lost, but that's fine.
	 */
	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
	int ret;

	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
				       SPOS(0,
					    extent_iter->pos.inode,
					    extent_iter->snapshot),
				       BTREE_ITER_CACHED);
	ret = PTR_ERR_OR_ZERO(k);
	if (unlikely(ret))
		return ret;

	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
		k = bch2_inode_to_v3(trans, k);
		ret = PTR_ERR_OR_ZERO(k);
		if (unlikely(ret))
			goto err;
	}

	inode = bkey_i_to_inode_v3(k);

	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
		inode->v.bi_size = cpu_to_le64(new_i_size);
		inode_update_flags = 0;
	}

	if (i_sectors_delta) {
		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
		inode_update_flags = 0;
	}

	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}

	ret = bch2_trans_update(trans, &iter, &inode->k_i,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				inode_update_flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

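/*
 * Insert a single extent: trim it so the update stays atomic, take any
 * additional disk reservation needed, update the inode's i_size/i_sectors in
 * the same transaction, and commit:
 */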
int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{
	struct bpos next_pos;
	bool usage_increasing;
	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
	int ret;

	/*
	 * This traverses the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
					 &usage_increasing,
					 &i_sectors_delta,
					 &disk_sectors_delta);
	if (ret)
		return ret;

	if (disk_res &&
	    disk_sectors_delta > (s64) disk_res->sectors) {
		ret = bch2_disk_reservation_add(trans->c, disk_res,
					disk_sectors_delta - disk_res->sectors,
					!check_enospc || !usage_increasing
					? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			return ret;
	}

	/*
	 * Note:
	 * We always have to do an inode update - even when i_size/i_sectors
	 * aren't changing - for fsync to work properly; fsync relies on
	 * inode->bi_journal_seq which is updated by the trigger code:
	 */
	ret =   bch2_extent_update_i_size_sectors(trans, iter,
						  min(k->k.p.offset << 9, new_i_size),
						  i_sectors_delta) ?:
		bch2_trans_update(trans, iter, k, 0) ?:
		bch2_trans_commit(trans, disk_res, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL);
	if (unlikely(ret))
		return ret;

	if (i_sectors_delta_total)
		*i_sectors_delta_total += i_sectors_delta;
	bch2_btree_iter_set_pos(iter, next_pos);
	return 0;
}

static int bch2_write_index_default(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct bkey_buf sk;
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	subvol_inum inum = {
		.subvol = op->subvol,
		.inum	= k->k.p.inode,
	};
	int ret;

	BUG_ON(!inum.subvol);

	bch2_bkey_buf_init(&sk);

	do {
		bch2_trans_begin(trans);

		k = bch2_keylist_front(keys);
		bch2_bkey_buf_copy(&sk, c, k);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
						  &sk.k->k.p.snapshot);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     bkey_start_pos(&sk.k->k),
				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

		ret =   bch2_bkey_set_needs_rebalance(c, sk.k,
					op->opts.background_target,
					op->opts.background_compression) ?:
			bch2_extent_update(trans, inum, &iter, sk.k,
					   &op->res,
					   op->new_i_size, &op->i_sectors_delta,
					   op->flags & BCH_WRITE_CHECK_ENOSPC);
		bch2_trans_iter_exit(trans, &iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(iter.pos, k->k.p))
			bch2_keylist_pop_front(&op->insert_keys);
		else
			bch2_cut_front(iter.pos, k);
	} while (!bch2_keylist_empty(keys));

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

/* Writes */

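/*
 * Submit one clone of @wbio per pointer in @k, so the data is written to
 * every replica; the last pointer reuses @wbio itself, earlier pointers get
 * clones that complete into it:
 */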
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
			       enum bch_data_type type,
			       const struct bkey_i *k,
			       bool nocow)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
	const struct bch_extent_ptr *ptr;
	struct bch_write_bio *n;
	struct bch_dev *ca;

	BUG_ON(c->opts.nochanges);

	bkey_for_each_ptr(ptrs, ptr) {
		BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
		       !c->devs[ptr->dev]);

		ca = bch_dev_bkey_exists(c, ptr->dev);

		if (to_entry(ptr + 1) < ptrs.end) {
			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
						    GFP_NOFS, &ca->replica_set));

			n->bio.bi_end_io	= wbio->bio.bi_end_io;
			n->bio.bi_private	= wbio->bio.bi_private;
			n->parent		= wbio;
			n->split		= true;
			n->bounce		= false;
			n->put_bio		= true;
			n->bio.bi_opf		= wbio->bio.bi_opf;
			bio_inc_remaining(&wbio->bio);
		} else {
			n = wbio;
			n->split		= false;
		}

		n->c			= c;
		n->dev			= ptr->dev;
		n->have_ioref		= nocow || bch2_dev_get_ioref(ca,
					type == BCH_DATA_btree ? READ : WRITE);
		n->nocow		= nocow;
		n->submit_time		= local_clock();
		n->inode_offset		= bkey_start_offset(&k->k);
		n->bio.bi_iter.bi_sector = ptr->offset;

		if (likely(n->have_ioref)) {
			this_cpu_add(ca->io_done->sectors[WRITE][type],
				     bio_sectors(&n->bio));

			bio_set_dev(&n->bio, ca->disk_sb.bdev);

			if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
				bio_endio(&n->bio);
				continue;
			}

			submit_bio(&n->bio);
		} else {
			n->bio.bi_status	= BLK_STS_REMOVED;
			bio_endio(&n->bio);
		}
	}
}

static void __bch2_write(struct bch_write_op *);

static void bch2_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_fs *c = op->c;

	EBUG_ON(op->open_buckets.nr);

	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
	bch2_disk_reservation_put(c, &op->res);

	if (!(op->flags & BCH_WRITE_MOVE))
		bch2_write_ref_put(c, BCH_WRITE_REF_write);
	bch2_keylist_free(&op->insert_keys, op->inline_keys);

	EBUG_ON(cl->parent);
	closure_debug_destroy(cl);
	if (op->end_io)
		op->end_io(op);
}

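/*
 * A write to one or more devices failed: drop the pointers to those devices
 * from the keys we're about to insert; if that leaves a key with no pointers,
 * the data was lost entirely and the write has to fail:
 */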
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
	struct keylist *keys = &op->insert_keys;
	struct bch_extent_ptr *ptr;
	struct bkey_i *src, *dst = keys->keys, *n;

	for (src = keys->keys; src != keys->top; src = n) {
		n = bkey_next(src);

		if (bkey_extent_is_direct_data(&src->k)) {
			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
					    test_bit(ptr->dev, op->failed.d));

			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
				return -EIO;
		}

		if (dst != src)
			memmove_u64s_down(dst, src, src->k.u64s);
		dst = bkey_next(dst);
	}

	keys->top = dst;
	return 0;
}

/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:		bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct keylist *keys = &op->insert_keys;
	unsigned dev;
	int ret = 0;

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		ret = bch2_write_drop_io_error_ptrs(op);
		if (ret)
			goto err;
	}

	if (!bch2_keylist_empty(keys)) {
		u64 sectors_start = keylist_sectors(keys);

		ret = !(op->flags & BCH_WRITE_MOVE)
			? bch2_write_index_default(op)
			: bch2_data_update_index_update(op);

		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
		BUG_ON(keylist_sectors(keys) && !ret);

		op->written += sectors_start - keylist_sectors(keys);

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"write error while doing btree update: %s",
				bch2_err_str(ret));
		}

		if (ret)
			goto err;
	}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
		bch2_open_bucket_write_error(c, &op->open_buckets, dev);

	bch2_open_buckets_put(c, &op->open_buckets);
	return;
err:
	keys->top = keys->keys;
	op->error = ret;
	op->flags |= BCH_WRITE_DONE;
	goto out;
}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
	if (state != wp->state) {
		u64 now = ktime_get_ns();

		if (wp->last_state_change &&
		    time_after64(now, wp->last_state_change))
			wp->time[wp->state] += now - wp->last_state_change;
		wp->state = state;
		wp->last_state_change = now;
	}
}

static inline void wp_update_state(struct write_point *wp, bool running)
{
	enum write_point_state state;

	state = running			 ? WRITE_POINT_running :
		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
					 : WRITE_POINT_stopped;

	__wp_update_state(wp, state);
}

static void bch2_write_index(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct write_point *wp = op->wp;
	struct workqueue_struct *wq = index_update_wq(op);
	unsigned long flags;

	if ((op->flags & BCH_WRITE_DONE) &&
	    (op->flags & BCH_WRITE_MOVE))
		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

	spin_lock_irqsave(&wp->writes_lock, flags);
	if (wp->state == WRITE_POINT_waiting_io)
		__wp_update_state(wp, WRITE_POINT_waiting_work);
	list_add_tail(&op->wp_list, &wp->writes);
	spin_unlock_irqrestore(&wp->writes_lock, flags);

	queue_work(wq, &wp->index_update_work);
}

static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
	op->wp = wp;

	if (wp->state == WRITE_POINT_stopped) {
		spin_lock_irq(&wp->writes_lock);
		__wp_update_state(wp, WRITE_POINT_waiting_io);
		spin_unlock_irq(&wp->writes_lock);
	}
}

void bch2_write_point_do_index_updates(struct work_struct *work)
{
	struct write_point *wp =
		container_of(work, struct write_point, index_update_work);
	struct bch_write_op *op;

	while (1) {
		spin_lock_irq(&wp->writes_lock);
		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
		if (op)
			list_del(&op->wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_IN_WORKER;

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}

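/*
 * Completion for one replica of a data write: account latency and errors,
 * release resources owned by this bio, then complete the parent bio or the
 * write op's closure:
 */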
static void bch2_write_endio(struct bio *bio)
{
	struct closure *cl		= bio->bi_private;
	struct bch_write_op *op		= container_of(cl, struct bch_write_op, cl);
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_fs *c		= wbio->c;
	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
				    op->pos.inode,
				    wbio->inode_offset << 9,
				    "data write error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		set_bit(wbio->dev, op->failed.d);
		op->flags |= BCH_WRITE_IO_ERROR;
	}

	if (wbio->nocow)
		set_bit(wbio->dev, op->devs_need_flush->d);

	if (wbio->have_ioref) {
		bch2_latency_acct(ca, wbio->submit_time, WRITE);
		percpu_ref_put(&ca->io_ref);
	}

	if (wbio->bounce)
		bch2_bio_free_pages_pool(c, bio);

	if (wbio->put_bio)
		bio_put(bio);

	if (parent)
		bio_endio(&parent->bio);
	else
		closure_put(cl);
}

static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
			       struct bch_extent_crc_unpacked crc)
{
	struct bkey_i_extent *e;

	op->pos.offset += crc.uncompressed_size;

	e = bkey_extent_init(op->insert_keys.top);
	e->k.p		= op->pos;
	e->k.size	= crc.uncompressed_size;
	e->k.version	= version;

	if (crc.csum_type ||
	    crc.compression_type ||
	    crc.nonce)
		bch2_extent_crc_append(&e->k_i, crc);

	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
					       op->flags & BCH_WRITE_CACHED);

	bch2_keylist_push(&op->insert_keys);
}

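/*
 * Allocate a bounce bio for the output of compression/checksumming, sized to
 * what the write point has room for - mapped to @buf if one was supplied,
 * otherwise backed by pages from the bounce mempool:
 */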
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
					struct write_point *wp,
					struct bio *src,
					bool *page_alloc_failed,
					void *buf)
{
	struct bch_write_bio *wbio;
	struct bio *bio;
	unsigned output_available =
		min(wp->sectors_free << 9, src->bi_iter.bi_size);
	unsigned pages = DIV_ROUND_UP(output_available +
				      (buf
				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
				       : 0), PAGE_SIZE);

	pages = min(pages, BIO_MAX_VECS);

	bio = bio_alloc_bioset(NULL, pages, 0,
			       GFP_NOFS, &c->bio_write);
	wbio			= wbio_init(bio);
	wbio->put_bio		= true;
	/* copy WRITE_SYNC flag */
	wbio->bio.bi_opf	= src->bi_opf;

	if (buf) {
		bch2_bio_map(bio, buf, output_available);
		return bio;
	}

	wbio->bounce		= true;

	/*
	 * We can't use mempool for more than c->sb.encoded_extent_max
	 * worth of pages, but we'd like to allocate more if we can:
	 */
	bch2_bio_alloc_pages_pool(c, bio,
				  min_t(unsigned, output_available,
					c->opts.encoded_extent_max));

	if (bio->bi_iter.bi_size < output_available)
		*page_alloc_failed =
			bch2_bio_alloc_pages(bio,
					     output_available -
					     bio->bi_iter.bi_size,
					     GFP_NOFS) != 0;

	return bio;
}

static int bch2_write_rechecksum(struct bch_fs *c,
				 struct bch_write_op *op,
				 unsigned new_csum_type)
{
	struct bio *bio = &op->wbio.bio;
	struct bch_extent_crc_unpacked new_crc;
	int ret;

	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */

	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
	    bch2_csum_type_is_encryption(new_csum_type))
		new_csum_type = op->crc.csum_type;

	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
				  NULL, &new_crc,
				  op->crc.offset, op->crc.live_size,
				  new_csum_type);
	if (ret)
		return ret;

	bio_advance(bio, op->crc.offset << 9);
	bio->bi_iter.bi_size = op->crc.live_size << 9;
	op->crc = new_crc;
	return 0;
}

static int bch2_write_decrypt(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct nonce nonce = extent_nonce(op->version, op->crc);
	struct bch_csum csum;
	int ret;

	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
		return 0;

	/*
	 * If we need to decrypt data in the write path, we'll no longer be able
	 * to verify the existing checksum (poly1305 mac, in this case) after
	 * it's decrypted - this is the last point we'll be able to reverify the
	 * checksum:
	 */
	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	if (bch2_crc_cmp(op->crc.csum, csum))
		return -EIO;

	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	op->crc.csum_type = 0;
	op->crc.csum = (struct bch_csum) { 0, 0 };
	return ret;
}

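/*
 * For writes of data that is already checksummed/compressed
 * (BCH_WRITE_DATA_ENCODED, e.g. moves): decide whether it can be written out
 * as is, or whether it first has to be decompressed/decrypted/rechecksummed
 * to match the requested write options:
 */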
static enum prep_encoded_ret {
	PREP_ENCODED_OK,
	PREP_ENCODED_ERR,
	PREP_ENCODED_CHECKSUM_ERR,
	PREP_ENCODED_DO_WRITE,
} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
	struct bch_fs *c = op->c;
	struct bio *bio = &op->wbio.bio;

	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
		return PREP_ENCODED_OK;

	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

	/* Can we just write the entire extent as is? */
	if (op->crc.uncompressed_size == op->crc.live_size &&
	    op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
	    op->crc.compressed_size <= wp->sectors_free &&
	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
	     op->incompressible)) {
		if (!crc_is_compressed(op->crc) &&
		    op->csum_type != op->crc.csum_type &&
		    bch2_write_rechecksum(c, op, op->csum_type) &&
		    !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		return PREP_ENCODED_DO_WRITE;
	}

	/*
	 * If the data is compressed and we couldn't write the entire extent as
	 * is, we have to decompress it:
	 */
	if (crc_is_compressed(op->crc)) {
		struct bch_csum csum;

		if (bch2_write_decrypt(op))
			return PREP_ENCODED_CHECKSUM_ERR;

		/* Last point we can still verify checksum: */
		csum = bch2_checksum_bio(c, op->crc.csum_type,
					 extent_nonce(op->version, op->crc),
					 bio);
		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
			return PREP_ENCODED_ERR;
	}

	/*
	 * No longer have compressed data after this point - data might be
	 * encrypted:
	 */

	/*
	 * If the data is checksummed and we're only writing a subset,
	 * rechecksum and adjust bio to point to currently live data:
	 */
	if ((op->crc.live_size != op->crc.uncompressed_size ||
	     op->crc.csum_type != op->csum_type) &&
	    bch2_write_rechecksum(c, op, op->csum_type) &&
	    !c->opts.no_data_io)
		return PREP_ENCODED_CHECKSUM_ERR;

	/*
	 * If we want to compress the data, it has to be decrypted:
	 */
	if ((op->compression_opt ||
	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
	     bch2_csum_type_is_encryption(op->csum_type)) &&
	    bch2_write_decrypt(op))
		return PREP_ENCODED_CHECKSUM_ERR;

	return PREP_ENCODED_OK;
}

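/*
 * Take data off op->wbio.bio in write point sized chunks, compressing,
 * encrypting and checksumming as requested, appending one extent key per
 * chunk to op->insert_keys. Returns 1 if there's input left to write (the
 * write point filled up), 0 if all input was consumed, or a negative error:
 */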
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
			     struct bio **_dst)
{
	struct bch_fs *c = op->c;
	struct bio *src = &op->wbio.bio, *dst = src;
	struct bvec_iter saved_iter;
	void *ec_buf;
	unsigned total_output = 0, total_input = 0;
	bool bounce = false;
	bool page_alloc_failed = false;
	int ret, more = 0;

	BUG_ON(!bio_sectors(src));

	ec_buf = bch2_writepoint_ec_buf(c, wp);

	switch (bch2_write_prep_encoded_data(op, wp)) {
	case PREP_ENCODED_OK:
		break;
	case PREP_ENCODED_ERR:
		ret = -EIO;
		goto err;
	case PREP_ENCODED_CHECKSUM_ERR:
		goto csum_err;
	case PREP_ENCODED_DO_WRITE:
		/* XXX look for bug here */
		if (ec_buf) {
			dst = bch2_write_bio_alloc(c, wp, src,
						   &page_alloc_failed,
						   ec_buf);
			bio_copy_data(dst, src);
			bounce = true;
		}
		init_append_extent(op, wp, op->version, op->crc);
		goto do_write;
	}

	if (ec_buf ||
	    op->compression_opt ||
	    (op->csum_type &&
	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
	    (bch2_csum_type_is_encryption(op->csum_type) &&
	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
		dst = bch2_write_bio_alloc(c, wp, src,
					   &page_alloc_failed,
					   ec_buf);
		bounce = true;
	}

	saved_iter = dst->bi_iter;

	do {
		struct bch_extent_crc_unpacked crc = { 0 };
		struct bversion version = op->version;
		size_t dst_len = 0, src_len = 0;

		if (page_alloc_failed &&
		    dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
		    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
			break;

		BUG_ON(op->compression_opt &&
		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
		       bch2_csum_type_is_encryption(op->crc.csum_type));
		BUG_ON(op->compression_opt && !bounce);

		crc.compression_type = op->incompressible
			? BCH_COMPRESSION_TYPE_incompressible
			: op->compression_opt
			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
					    op->compression_opt)
			: 0;
		if (!crc_is_compressed(crc)) {
			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);

			if (op->csum_type)
				dst_len = min_t(unsigned, dst_len,
						c->opts.encoded_extent_max);

			if (bounce) {
				swap(dst->bi_iter.bi_size, dst_len);
				bio_copy_data(dst, src);
				swap(dst->bi_iter.bi_size, dst_len);
			}

			src_len = dst_len;
		}

		BUG_ON(!src_len || !dst_len);

		if (bch2_csum_type_is_encryption(op->csum_type)) {
			if (bversion_zero(version)) {
				version.lo = atomic64_inc_return(&c->key_version);
			} else {
				crc.nonce = op->nonce;
				op->nonce += src_len >> 9;
			}
		}

		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
		    !crc_is_compressed(crc) &&
		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
		    bch2_csum_type_is_encryption(op->csum_type)) {
			u8 compression_type = crc.compression_type;
			u16 nonce = crc.nonce;
			/*
			 * Note: when we're using rechecksum(), we need to be
			 * checksumming @src because it has all the data our
			 * existing checksum covers - if we bounced (because we
			 * were trying to compress), @dst will only have the
			 * part of the data the new checksum will cover.
			 *
			 * But normally we want to be checksumming post bounce,
			 * because part of the reason for bouncing is so the
			 * data can't be modified (by userspace) while it's in
			 * flight.
			 */
			if (bch2_rechecksum_bio(c, src, version, op->crc,
					&crc, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->csum_type))
				goto csum_err;
			/*
			 * bch2_rechecksum_bio() sets compression_type on crc
			 * from op->crc, which isn't always correct, as
			 * sometimes we're changing an extent from uncompressed
			 * to incompressible.
			 */
			crc.compression_type = compression_type;
			crc.nonce = nonce;
		} else {
			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
			    bch2_rechecksum_bio(c, src, version, op->crc,
					NULL, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->crc.csum_type))
				goto csum_err;

			crc.compressed_size	= dst_len >> 9;
			crc.uncompressed_size	= src_len >> 9;
			crc.live_size		= src_len >> 9;

			swap(dst->bi_iter.bi_size, dst_len);
			ret = bch2_encrypt_bio(c, op->csum_type,
					       extent_nonce(version, crc), dst);
			if (ret)
				goto err;

			crc.csum = bch2_checksum_bio(c, op->csum_type,
						     extent_nonce(version, crc), dst);
			crc.csum_type = op->csum_type;
			swap(dst->bi_iter.bi_size, dst_len);
		}

		init_append_extent(op, wp, version, crc);

		if (dst != src)
			bio_advance(dst, dst_len);
		bio_advance(src, src_len);
		total_output	+= dst_len;
		total_input	+= src_len;
	} while (dst->bi_iter.bi_size &&
		 src->bi_iter.bi_size &&
		 wp->sectors_free &&
		 !bch2_keylist_realloc(&op->insert_keys,
				       op->inline_keys,
				       ARRAY_SIZE(op->inline_keys),
				       BKEY_EXTENT_U64s_MAX));

	more = src->bi_iter.bi_size != 0;

	dst->bi_iter = saved_iter;

	if (dst == src && more) {
		BUG_ON(total_output != total_input);

		dst = bio_split(src, total_input >> 9,
				GFP_NOFS, &c->bio_write);
		wbio_init(dst)->put_bio	= true;
		/* copy WRITE_SYNC flag */
		dst->bi_opf		= src->bi_opf;
	}

	dst->bi_iter.bi_size = total_output;
do_write:
	*_dst = dst;
	return more;
csum_err:
	bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
	ret = -EIO;
err:
	if (to_wbio(dst)->bounce)
		bch2_bio_free_pages_pool(c, dst);
	if (to_wbio(dst)->put_bio)
		bio_put(dst);

	return ret;
}

static bool bch2_extent_is_writeable(struct bch_write_op *op,
				     struct bkey_s_c k)
{
	struct bch_fs *c = op->c;
	struct bkey_s_c_extent e;
	struct extent_ptr_decoded p;
	const union bch_extent_entry *entry;
	unsigned replicas = 0;

	if (k.k->type != KEY_TYPE_extent)
		return false;

	e = bkey_s_c_to_extent(k);
	extent_for_each_ptr_decode(e, p, entry) {
		if (crc_is_encoded(p.crc) || p.has_ec)
			return false;

		replicas += bch2_extent_ptr_durability(c, &p);
	}

	return replicas >= op->opts.data_replicas;
}

static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	const struct bch_extent_ptr *ptr;
	struct bkey_i *k;

	for_each_keylist_key(&op->insert_keys, k) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));

		bkey_for_each_ptr(ptrs, ptr)
			bch2_bucket_nocow_unlock(&c->nocow_locks,
						 PTR_BUCKET_POS(c, ptr),
						 BUCKET_NOCOW_LOCK_UPDATE);
	}
}

static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
						  struct btree_iter *iter,
						  struct bkey_i *orig,
						  struct bkey_s_c k,
						  u64 new_i_size)
{
	struct bkey_i *new;
	struct bkey_ptrs ptrs;
	struct bch_extent_ptr *ptr;
	int ret;

	if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
		/* trace this */
		return 0;
	}

	new = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		return ret;

	bch2_cut_front(bkey_start_pos(&orig->k), new);
	bch2_cut_back(orig->k.p, new);

	ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
	bkey_for_each_ptr(ptrs, ptr)
		ptr->unwritten = 0;

	/*
	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
	 * that was done when we kicked off the write, and here it's important
	 * that we update the extent that we wrote to - even if a snapshot has
	 * since been created. The write is still outstanding, so we're ok
	 * w.r.t. snapshot atomicity:
	 */
	return  bch2_extent_update_i_size_sectors(trans, iter,
					min(new->k.p.offset << 9, new_i_size), 0) ?:
		bch2_trans_update(trans, iter, new,
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}

static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_i *orig;
	struct bkey_s_c k;
	int ret;

	for_each_keylist_key(&op->insert_keys, orig) {
		ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
				     bkey_start_pos(&orig->k), orig->k.p,
				     BTREE_ITER_INTENT, k,
				     NULL, NULL, BTREE_INSERT_NOFAIL, ({
			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
		}));

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"write error while doing btree update: %s",
				bch2_err_str(ret));
		}

		if (ret) {
			op->error = ret;
			break;
		}
	}

	bch2_trans_put(trans);
}

static void __bch2_nocow_write_done(struct bch_write_op *op)
{
	bch2_nocow_write_unlock(op);

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		op->error = -EIO;
	} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
		bch2_nocow_write_convert_unwritten(op);
}

static void bch2_nocow_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);

	__bch2_nocow_write_done(op);
	bch2_write_done(cl);
}

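/*
 * Attempt an in place (nocow) write: if the existing extents already have
 * enough replicas and can safely be overwritten, take nocow locks on the
 * buckets and write directly to the existing pointers instead of allocating
 * new space; otherwise fall back to the normal COW write path:
 */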
static void bch2_nocow_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_ptrs_c ptrs;
	const struct bch_extent_ptr *ptr;
	struct {
		struct bpos	b;
		unsigned	gen;
		struct nocow_lock_bucket *l;
	} buckets[BCH_REPLICAS_MAX];
	unsigned nr_buckets = 0;
	u32 snapshot;
	int ret, i;

	if (op->flags & BCH_WRITE_MOVE)
		return;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
	if (unlikely(ret))
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(op->pos.inode, op->pos.offset, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		struct bio *bio = &op->wbio.bio;

		nr_buckets = 0;

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		/* fall back to normal cow write path? */
		if (unlikely(k.k->p.snapshot != snapshot ||
			     !bch2_extent_is_writeable(op, k)))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 k.k->u64s))
			break;

		/* Get iorefs before dropping btree locks: */
		ptrs = bch2_bkey_ptrs_c(k);
		bkey_for_each_ptr(ptrs, ptr) {
			buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
			buckets[nr_buckets].gen = ptr->gen;
			buckets[nr_buckets].l =
				bucket_nocow_lock(&c->nocow_locks,
						  bucket_to_u64(buckets[nr_buckets].b));

			prefetch(buckets[nr_buckets].l);

			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
				goto err_get_ioref;

			nr_buckets++;

			if (ptr->unwritten)
				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
		}

		/* Unlock before taking nocow locks, doing IO: */
		bkey_reassemble(op->insert_keys.top, k);
		bch2_trans_unlock(trans);

		bch2_cut_front(op->pos, op->insert_keys.top);
		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

		for (i = 0; i < nr_buckets; i++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
			struct nocow_lock_bucket *l = buckets[i].l;
			bool stale;

			__bch2_bucket_nocow_lock(&c->nocow_locks, l,
						 bucket_to_u64(buckets[i].b),
						 BUCKET_NOCOW_LOCK_UPDATE);

			rcu_read_lock();
			stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
			rcu_read_unlock();

			if (unlikely(stale))
				goto err_bucket_stale;
		}

		bio = &op->wbio.bio;
		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
					GFP_KERNEL, &c->bio_write);
			wbio_init(bio)->put_bio = true;
			bio->bi_opf = op->wbio.bio.bi_opf;
		} else {
			op->flags |= BCH_WRITE_DONE;
		}

		op->pos.offset += bio_sectors(bio);
		op->written += bio_sectors(bio);

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;
		closure_get(&op->cl);
		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  op->insert_keys.top, true);

		bch2_keylist_push(&op->insert_keys);
		if (op->flags & BCH_WRITE_DONE)
			break;
		bch2_btree_iter_advance(&iter);
	}
out:
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode,
			op->pos.offset << 9,
			"%s: btree lookup error %s",
			__func__, bch2_err_str(ret));
		op->error = ret;
		op->flags |= BCH_WRITE_DONE;
	}

	bch2_trans_put(trans);

	/* fallback to cow write path? */
	if (!(op->flags & BCH_WRITE_DONE)) {
		closure_sync(&op->cl);
		__bch2_nocow_write_done(op);
		op->insert_keys.top = op->insert_keys.keys;
	} else if (op->flags & BCH_WRITE_SYNC) {
		closure_sync(&op->cl);
		bch2_nocow_write_done(&op->cl);
	} else {
		/*
		 * XXX
		 * needs to run out of process context because ei_quota_lock is
		 * a mutex
		 */
		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
	}
	return;
err_get_ioref:
	for (i = 0; i < nr_buckets; i++)
		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);

	/* Fall back to COW path: */
	goto out;
err_bucket_stale:
	while (i >= 0) {
		bch2_bucket_nocow_unlock(&c->nocow_locks,
					 buckets[i].b,
					 BUCKET_NOCOW_LOCK_UPDATE);
		--i;
	}
	for (i = 0; i < nr_buckets; i++)
		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);

	/* We can retry this: */
	ret = -BCH_ERR_transaction_restart;
	goto out;
}

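/*
 * Core of the write path: try the nocow path if enabled, then repeatedly
 * allocate space from a write point, write out as much data as fits, and
 * either do the index update synchronously or punt it to the write point's
 * index update work queue:
 */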
static void __bch2_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct write_point *wp = NULL;
	struct bio *bio = NULL;
	unsigned nofs_flags;
	int ret;

	nofs_flags = memalloc_nofs_save();

	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
		bch2_nocow_write(op);
		if (op->flags & BCH_WRITE_DONE)
			goto out_nofs_restore;
	}
again:
	memset(&op->failed, 0, sizeof(op->failed));

	do {
		struct bkey_i *key_to_write;
		unsigned key_to_write_offset = op->insert_keys.top_p -
			op->insert_keys.keys_p;

		/* +1 for possible cache device: */
		if (op->open_buckets.nr + op->nr_replicas + 1 >
		    ARRAY_SIZE(op->open_buckets.v))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 BKEY_EXTENT_U64s_MAX))
			break;

		/*
		 * The copygc thread is now global, which means it's no longer
		 * freeing up space on specific disks, so allocations for
		 * specific disks may hang arbitrarily long:
		 */
		ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_alloc_sectors_start_trans(trans,
				op->target,
				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
				op->write_point,
				&op->devs_have,
				op->nr_replicas,
				op->nr_replicas_required,
				op->watermark,
				op->flags,
				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
				? NULL : &op->cl, &wp));
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
				break;

			goto err;
		}

		EBUG_ON(!wp);

		bch2_open_bucket_get(c, wp, &op->open_buckets);
		ret = bch2_write_extent(op, wp, &bio);

		bch2_alloc_sectors_done_inlined(c, wp);
err:
		if (ret <= 0) {
			op->flags |= BCH_WRITE_DONE;

			if (ret < 0) {
				op->error = ret;
				break;
			}
		}

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;

		closure_get(bio->bi_private);

		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_DONE) &&
	     !(op->flags & BCH_WRITE_IN_WORKER))) {
		closure_sync(&op->cl);
		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			goto again;
		bch2_write_done(&op->cl);
	} else {
		bch2_write_queue(op, wp);
		continue_at(&op->cl, bch2_write_index, NULL);
	}
out_nofs_restore:
	memalloc_nofs_restore(nofs_flags);
}

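/*
 * The write is small enough to be stored inline in the btree: pack the data
 * directly into a KEY_TYPE_inline_data key instead of allocating space and
 * doing data IO:
 */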
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
	struct bio *bio = &op->wbio.bio;
	struct bvec_iter iter;
	struct bkey_i_inline_data *id;
	unsigned sectors;
	int ret;

	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
	op->flags |= BCH_WRITE_DONE;

	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
				   ARRAY_SIZE(op->inline_keys),
				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
	if (ret) {
		op->error = ret;
		goto err;
	}

	sectors = bio_sectors(bio);
	op->pos.offset += sectors;

	id = bkey_inline_data_init(op->insert_keys.top);
	id->k.p		= op->pos;
	id->k.version	= op->version;
	id->k.size	= sectors;

	iter = bio->bi_iter;
	iter.bi_size = data_len;
	memcpy_from_bio(id->v.data, bio, iter);

	while (data_len & 7)
		id->v.data[data_len++] = '\0';
	set_bkey_val_bytes(&id->k, data_len);
	bch2_keylist_push(&op->insert_keys);

	__bch2_write_index(op);
err:
	bch2_write_done(&op->cl);
}

/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl:		&bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it
 * could be from a normal write, or a writeback write, or a write to a flash
 * only volume - it's also used by the moving garbage collector to compact
 * data in mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be
 * inserted (if the data won't fit in a single open bucket, there will be
 * multiple keys); after the data is written it calls bch_journal, and after
 * the keys have been added to the next journal write they're inserted into
 * the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch2_write(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bio *bio = &op->wbio.bio;
	struct bch_fs *c = op->c;
	unsigned data_len;

	EBUG_ON(op->cl.parent);
	BUG_ON(!op->nr_replicas);
	BUG_ON(!op->write_point.v);
	BUG_ON(bkey_eq(op->pos, POS_MAX));

	op->start_time = local_clock();
	bch2_keylist_init(&op->insert_keys, op->inline_keys);
	wbio_init(bio)->put_bio = false;

	if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode,
			op->pos.offset << 9,
			"misaligned write");
		op->error = -EIO;
		goto err;
	}

	if (c->opts.nochanges) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	if (!(op->flags & BCH_WRITE_MOVE) &&
	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
	bch2_increment_clock(c, bio_sectors(bio), WRITE);

	data_len = min_t(u64, bio->bi_iter.bi_size,
			 op->new_i_size - (op->pos.offset << 9));

	if (c->opts.inline_data &&
	    data_len <= min(block_bytes(c) / 2, 1024U)) {
		bch2_write_data_inline(op, data_len);
		return;
	}

	__bch2_write(op);
	return;
err:
	bch2_disk_reservation_put(c, &op->res);

	closure_debug_destroy(&op->cl);
	if (op->end_io)
		op->end_io(op);
}

static const char * const bch2_write_flags[] = {
#define x(f)	#f,
	BCH_WRITE_FLAGS()
#undef x
	NULL
};

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
	prt_str(out, "pos: ");
	bch2_bpos_to_text(out, op->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "started: ");
	bch2_pr_time_units(out, local_clock() - op->start_time);
	prt_newline(out);

	prt_str(out, "flags: ");
	prt_bitflags(out, bch2_write_flags, op->flags);
	prt_newline(out);

	prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

void bch2_fs_io_write_exit(struct bch_fs *c)
{
	mempool_exit(&c->bio_bounce_pages);
	bioset_exit(&c->bio_write);
}

int bch2_fs_io_write_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_write_init;

	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;

	return 0;
}