// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

/*
 * Update the per-device congestion counter from one observed IO latency:
 * latencies well above what the device is normally capable of (taken from its
 * latency quantiles) bump ca->congested; low-latency IOs slowly decay it.
 */
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
				       u64 now, int rw)
{
	u64 latency_capable =
		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
	/* ideally we'd be taking into account the device's variance here: */
	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
	s64 latency_over = io_latency - latency_threshold;

	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so don't
		 * bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);

		ca->congested_last = now;
	} else if (atomic_read(&ca->congested) > 0) {
		atomic_dec(&ca->congested);
	}
}

/*
 * Account one completed IO's latency against @ca for direction @rw:
 * maintains an EWMA of recent latency, feeds the congestion accounting
 * above, and updates the device's time stats.
 */
void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	atomic64_t *latency = &ca->cur_latency[rw];
	u64 now = local_clock();
	/* clamp to 0 if the clock went backwards relative to submit_time: */
	u64 io_latency = time_after64(now, submit_time)
		?
now - submit_time
		: 0;
	u64 old, new;

	old = atomic64_read(latency);
	do {
		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most of
		 * the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while (!atomic64_try_cmpxchg(latency, &old, new));

	bch2_congested_acct(ca, io_latency, now, rw);

	__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}

#endif

/* Allocate, free from mempool: */

/*
 * Return a bio's bounce pages to the mempool; the shared zero page is never
 * freed. Resets bi_vcnt so the bio can be reused.
 */
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, iter)
		if (bv->bv_page != ZERO_PAGE(0))
			mempool_free(bv->bv_page, &c->bio_bounce_pages);
	bio->bi_vcnt = 0;
}

/*
 * Allocate one bounce page: tries a plain allocation first, then falls back
 * to the mempool - taking bio_bounce_pages_lock, which the caller must
 * release once done (see bch2_bio_alloc_pages_pool()).
 */
static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
	struct page *page;

	if (likely(!*using_mempool)) {
		page = alloc_page(GFP_NOFS);
		if (unlikely(!page)) {
			mutex_lock(&c->bio_bounce_pages_lock);
			*using_mempool = true;
			goto pool_alloc;

		}
	} else {
pool_alloc:
		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
	}

	return page;
}

/* Fill @bio with @size bytes worth of bounce pages: */
void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
			       size_t size)
{
	bool using_mempool = false;

	while (size) {
		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
		unsigned len = min_t(size_t, PAGE_SIZE, size);

		BUG_ON(!bio_add_page(bio, page, len, 0));
		size -= len;
	}

	/* drop the lock taken when we first fell back to the mempool: */
	if (using_mempool)
		mutex_unlock(&c->bio_bounce_pages_lock);
}

/* Extent update path: */

/*
 * Walk the existing extents overlapping @new and compute how inserting @new
 * would change i_sectors and on-disk sector usage; also report whether usage
 * would increase (new snapshot, more replicas, or losing compression).
 */
int bch2_sum_sector_overwrites(struct btree_trans *trans,
			       struct btree_iter *extent_iter,
			       struct bkey_i *new,
			       bool *usage_increasing,
			       s64 *i_sectors_delta,
			       s64
*disk_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
	int ret = 0;

	*usage_increasing	= false;
	*i_sectors_delta	= 0;
	*disk_sectors_delta	= 0;

	bch2_trans_copy_iter(&iter, extent_iter);

	for_each_btree_key_max_continue_norestart(iter,
				new->k.p, BTREE_ITER_slots, old, ret) {
		/* sectors of overlap between @new and this existing extent: */
		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
			max(bkey_start_offset(&new->k),
			    bkey_start_offset(old.k));

		*i_sectors_delta += sectors *
			(bkey_extent_is_allocation(&new->k) -
			 bkey_extent_is_allocation(old.k));

		*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
		/* only credit back the overwrite if it's in the same snapshot: */
		*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
			? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
			: 0;

		if (!*usage_increasing &&
		    (new->k.p.snapshot != old.k->p.snapshot ||
		     new_replicas > bch2_bkey_replicas(c, old) ||
		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
			*usage_increasing = true;

		if (bkey_ge(old.k->p, new->k.p))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Update the inode for an extent update: bi_size (if growing past i_size),
 * bi_sectors, and the key's snapshot field; skips journalling the inode
 * update when nothing user-visible changed (see comment below).
 */
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	/*
	 * Crazy performance optimization:
	 * Every extent update needs to also update the inode: the inode trigger
	 * will set bi->journal_seq to the journal sequence number of this
	 * transaction - for fsync.
	 *
	 * But if that's the only reason we're updating the inode (we're not
	 * updating bi_size or bi_sectors), then we don't need the inode update
	 * to be journalled - if we crash, the bi_journal_seq update will be
	 * lost, but that's fine.
	 */
	unsigned inode_update_flags = BTREE_UPDATE_nojournal;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			      SPOS(0,
				   extent_iter->pos.inode,
				   extent_iter->snapshot),
			      BTREE_ITER_intent|
			      BTREE_ITER_cached);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ret;

	/*
	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
	 * bytes past the end of the buffer:
	 */
	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
	ret = PTR_ERR_OR_ZERO(k_mut);
	if (unlikely(ret))
		goto err;

	bkey_reassemble(k_mut, k);

	/* older inode formats are upgraded to v3 on the way through: */
	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
		k_mut = bch2_inode_to_v3(trans, k_mut);
		ret = PTR_ERR_OR_ZERO(k_mut);
		if (unlikely(ret))
			goto err;
	}

	struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);

	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
		inode->v.bi_size = cpu_to_le64(new_i_size);
		inode_update_flags = 0;
	}

	if (i_sectors_delta) {
		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
		inode_update_flags = 0;
	}

	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}

	ret = bch2_trans_update(trans, &iter, &inode->k_i,
				BTREE_UPDATE_internal_snapshot_node|
				inode_update_flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Insert one extent @k at @iter, updating the inode's i_size/i_sectors and
 * adding to the disk reservation as needed; commits the transaction.
 * On success the iterator is advanced past the inserted extent.
 */
int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{
	struct bpos next_pos;
	bool usage_increasing;
	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
	int ret;

	/*
	 * This traverses us the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	/* trim @k so the update fits in one atomic transaction: */
	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
			&usage_increasing,
			&i_sectors_delta,
			&disk_sectors_delta);
	if (ret)
		return ret;

	if (disk_res &&
	    disk_sectors_delta > (s64) disk_res->sectors) {
		/*
		 * overwrites that don't increase usage may consume the
		 * reservation even past ENOSPC:
		 */
		ret = bch2_disk_reservation_add(trans->c, disk_res,
					disk_sectors_delta - disk_res->sectors,
					!check_enospc || !usage_increasing
					? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			return ret;
	}

	/*
	 * Note:
	 * We always have to do an inode update - even when i_size/i_sectors
	 * aren't changing - for fsync to work properly; fsync relies on
	 * inode->bi_journal_seq which is updated by the trigger code:
	 */
	ret =   bch2_extent_update_i_size_sectors(trans, iter,
						  min(k->k.p.offset << 9, new_i_size),
						  i_sectors_delta) ?:
		bch2_trans_update(trans, iter, k, 0) ?:
		bch2_trans_commit(trans, disk_res, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc);
	if (unlikely(ret))
		return ret;

	if (i_sectors_delta_total)
		*i_sectors_delta_total += i_sectors_delta;
	bch2_btree_iter_set_pos(iter, next_pos);
	return 0;
}

/*
 * Index update for normal (non-move) writes: insert each key on
 * op->insert_keys into the extents btree, retrying on transaction restart.
 */
static int bch2_write_index_default(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct bkey_buf sk;
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	subvol_inum inum = {
		.subvol = op->subvol,
		.inum	= k->k.p.inode,
	};
	int ret;

	BUG_ON(!inum.subvol);
	bch2_bkey_buf_init(&sk);

	do {
		bch2_trans_begin(trans);

		/* work on a copy - the key may only partially insert: */
		k = bch2_keylist_front(keys);
		bch2_bkey_buf_copy(&sk, c, k);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
						  &sk.k->k.p.snapshot);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     bkey_start_pos(&sk.k->k),
				     BTREE_ITER_slots|BTREE_ITER_intent);

		ret =   bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
			bch2_extent_update(trans, inum, &iter, sk.k,
					&op->res,
					op->new_i_size, &op->i_sectors_delta,
					op->flags & BCH_WRITE_CHECK_ENOSPC);
		bch2_trans_iter_exit(trans, &iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		/* pop the key if fully inserted, else retry the remainder: */
		if (bkey_ge(iter.pos, k->k.p))
			bch2_keylist_pop_front(&op->insert_keys);
		else
			bch2_cut_front(iter.pos, k);
	} while (!bch2_keylist_empty(keys));

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

/* Writes */

/* Format the "inum:offset write error" prefix for an error message: */
static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
				  u64 offset)
{
	bch2_inum_offset_err_msg(op->c, out,
				 (subvol_inum) { op->subvol, op->pos.inode, },
				 offset << 9);
	prt_printf(out, "write error%s: ",
		   op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
}

static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
{
	__bch2_write_op_error(out, op, op->pos.offset);
}

/*
 * Submit one bio per pointer in @k, cloning @wbio for all but the last
 * pointer, which reuses @wbio itself. Devices without an ioref complete
 * immediately with BLK_STS_REMOVED.
 */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
			       enum bch_data_type type,
			       const struct bkey_i *k,
			       bool nocow)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
	struct bch_write_bio *n;

	BUG_ON(c->opts.nochanges);

	bkey_for_each_ptr(ptrs, ptr) {
		/* nocow writes hold device refs already; cow writes take one: */
		struct bch_dev *ca = nocow
			? bch2_dev_have_ref(c, ptr->dev)
			: bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);

		if (to_entry(ptr + 1) < ptrs.end) {
			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));

			n->bio.bi_end_io	= wbio->bio.bi_end_io;
			n->bio.bi_private	= wbio->bio.bi_private;
			n->parent		= wbio;
			n->split		= true;
			n->bounce		= false;
			n->put_bio		= true;
			n->bio.bi_opf		= wbio->bio.bi_opf;
			bio_inc_remaining(&wbio->bio);
		} else {
			n = wbio;
			n->split	= false;
		}

		n->c			= c;
		n->dev			= ptr->dev;
		n->have_ioref		= ca != NULL;
		n->nocow		= nocow;
		n->submit_time		= local_clock();
		n->inode_offset		= bkey_start_offset(&k->k);
		if (nocow)
			n->nocow_bucket	= PTR_BUCKET_NR(ca, ptr);
		n->bio.bi_iter.bi_sector = ptr->offset;

		if (likely(n->have_ioref)) {
			this_cpu_add(ca->io_done->sectors[WRITE][type],
				     bio_sectors(&n->bio));

			bio_set_dev(&n->bio, ca->disk_sb.bdev);

			if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
				bio_endio(&n->bio);
				continue;
			}

			submit_bio(&n->bio);
		} else {
			n->bio.bi_status	= BLK_STS_REMOVED;
			bio_endio(&n->bio);
		}
	}
}

static void __bch2_write(struct bch_write_op *);

/*
 * Final write completion: release the disk reservation and write ref, free
 * the keylist, and invoke the caller's end_io callback.
 */
static void bch2_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_fs *c = op->c;

	EBUG_ON(op->open_buckets.nr);

	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
	bch2_disk_reservation_put(c, &op->res);

	if (!(op->flags & BCH_WRITE_MOVE))
		bch2_write_ref_put(c, BCH_WRITE_REF_write);
	bch2_keylist_free(&op->insert_keys, op->inline_keys);

	EBUG_ON(cl->parent);
	closure_debug_destroy(cl);
	if (op->end_io)
		op->end_io(op);
}

/*
 * Drop pointers to devices that saw IO errors, compacting the keylist in
 * place; returns -EIO if any key loses all of its pointers.
 */
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *src, *dst = keys->keys, *n;

	for (src = keys->keys; src != keys->top; src = n) {
		n = bkey_next(src);

		if (bkey_extent_is_direct_data(&src->k)) {
			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
					    test_bit(ptr->dev, op->failed.d));

			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
				return -EIO;
		}

		if (dst != src)
			memmove_u64s_down(dst, src, src->k.u64s);
		dst = bkey_next(dst);
	}

	keys->top = dst;
	return 0;
}

/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:		bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct keylist *keys = &op->insert_keys;
	unsigned dev;
	int ret = 0;

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		ret = bch2_write_drop_io_error_ptrs(op);
		if (ret)
			goto err;
	}

	if (!bch2_keylist_empty(keys)) {
		u64 sectors_start = keylist_sectors(keys);

		ret = !(op->flags & BCH_WRITE_MOVE)
			? bch2_write_index_default(op)
			: bch2_data_update_index_update(op);

		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
		BUG_ON(keylist_sectors(keys) && !ret);

		op->written += sectors_start - keylist_sectors(keys);

		if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			struct printbuf buf = PRINTBUF;
			__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
			prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		if (ret)
			goto err;
	}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
		bch2_open_bucket_write_error(c, &op->open_buckets, dev);

	bch2_open_buckets_put(c, &op->open_buckets);
	return;
err:
	keys->top = keys->keys;
	op->error = ret;
	op->flags |= BCH_WRITE_SUBMITTED;
	goto out;
}

/*
 * Transition @wp to @state, accumulating time spent in the previous state;
 * caller holds wp->writes_lock.
 */
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
	if (state != wp->state) {
		u64 now = ktime_get_ns();

		if (wp->last_state_change &&
		    time_after64(now, wp->last_state_change))
			wp->time[wp->state] += now - wp->last_state_change;
		wp->state = state;
		wp->last_state_change = now;
	}
}

/* Derive the write point state from whether IO is running / queued: */
static inline void wp_update_state(struct write_point *wp, bool running)
{
	enum write_point_state state;

	state = running			 ? WRITE_POINT_running :
		!list_empty(&wp->writes) ?
WRITE_POINT_waiting_io
					 : WRITE_POINT_stopped;

	__wp_update_state(wp, state);
}

/*
 * Closure callback: hand the index-update half of a write off to the write
 * point's workqueue. May be called from interrupt context, hence irqsave.
 */
static CLOSURE_CALLBACK(bch2_write_index)
{
	closure_type(op, struct bch_write_op, cl);
	struct write_point *wp = op->wp;
	struct workqueue_struct *wq = index_update_wq(op);
	unsigned long flags;

	if ((op->flags & BCH_WRITE_SUBMITTED) &&
	    (op->flags & BCH_WRITE_MOVE))
		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

	spin_lock_irqsave(&wp->writes_lock, flags);
	if (wp->state == WRITE_POINT_waiting_io)
		__wp_update_state(wp, WRITE_POINT_waiting_work);
	list_add_tail(&op->wp_list, &wp->writes);
	spin_unlock_irqrestore (&wp->writes_lock, flags);

	queue_work(wq, &wp->index_update_work);
}

/* Associate @op with @wp and mark the write point as having IO in flight: */
static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
	op->wp = wp;

	if (wp->state == WRITE_POINT_stopped) {
		spin_lock_irq(&wp->writes_lock);
		__wp_update_state(wp, WRITE_POINT_waiting_io);
		spin_unlock_irq(&wp->writes_lock);
	}
}

/* Workqueue entry point: run index updates for every write op queued on @wp: */
void bch2_write_point_do_index_updates(struct work_struct *work)
{
	struct write_point *wp =
		container_of(work, struct write_point, index_update_work);
	struct bch_write_op *op;

	while (1) {
		spin_lock_irq(&wp->writes_lock);
		op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_IN_WORKER;

		__bch2_write_index(op);

		/* more data to write, or this op is done: */
		if (!(op->flags & BCH_WRITE_SUBMITTED))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}

/*
 * Per-replica bio completion: record IO errors, release nocow bucket locks,
 * account latency, free bounce pages, and complete the parent bio/closure.
 */
static void bch2_write_endio(struct bio *bio)
{
	struct closure *cl		= bio->bi_private;
	struct bch_write_op *op		= container_of(cl, struct bch_write_op, cl);
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_fs *c		= wbio->c;
	struct bch_dev *ca		= wbio->have_ioref
		? bch2_dev_have_ref(c, wbio->dev)
		: NULL;

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
				    op->pos.inode,
				    wbio->inode_offset << 9,
				    "data write error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		set_bit(wbio->dev, op->failed.d);
		op->flags |= BCH_WRITE_IO_ERROR;
	}

	if (wbio->nocow) {
		/*
		 * NOTE(review): ca is dereferenced here, i.e. assumed non-NULL
		 * for nocow writes (bch2_submit_wbio_replicas uses
		 * bch2_dev_have_ref() for nocow) - confirm have_ioref is always
		 * set on this path.
		 */
		bch2_bucket_nocow_unlock(&c->nocow_locks,
					 POS(ca->dev_idx, wbio->nocow_bucket),
					 BUCKET_NOCOW_LOCK_UPDATE);
		set_bit(wbio->dev, op->devs_need_flush->d);
	}

	if (wbio->have_ioref) {
		bch2_latency_acct(ca, wbio->submit_time, WRITE);
		percpu_ref_put(&ca->io_ref);
	}

	if (wbio->bounce)
		bch2_bio_free_pages_pool(c, bio);

	if (wbio->put_bio)
		bio_put(bio);

	if (parent)
		bio_endio(&parent->bio);
	else
		closure_put(cl);
}

/*
 * Append a new extent key for just-written data at op->pos to the insert
 * keylist, advancing op->pos past it.
 */
static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
			       struct bch_extent_crc_unpacked crc)
{
	struct bkey_i_extent *e;

	op->pos.offset += crc.uncompressed_size;

	e = bkey_extent_init(op->insert_keys.top);
	e->k.p		= op->pos;
	e->k.size	= crc.uncompressed_size;
	e->k.bversion	= version;

	if (crc.csum_type ||
	    crc.compression_type ||
	    crc.nonce)
		bch2_extent_crc_append(&e->k_i, crc);

	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
				       op->flags & BCH_WRITE_CACHED);

	bch2_keylist_push(&op->insert_keys);
}

/*
 * Allocate the destination bio for a write: maps @buf directly if given,
 * otherwise allocates bounce pages (setting *page_alloc_failed if we
 * couldn't get all of them).
 */
static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
					struct write_point *wp,
					struct bio *src,
					bool *page_alloc_failed,
					void *buf)
{
	struct bch_write_bio *wbio;
	struct bio *bio;
	unsigned output_available =
		min(wp->sectors_free << 9, src->bi_iter.bi_size);
	unsigned pages = DIV_ROUND_UP(output_available +
				      (buf
				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
				       : 0), PAGE_SIZE);

	pages = min(pages, BIO_MAX_VECS);

	bio = bio_alloc_bioset(NULL, pages, 0,
			       GFP_NOFS, &c->bio_write);
	wbio			= wbio_init(bio);
	wbio->put_bio		= true;
	/* copy WRITE_SYNC flag */
	wbio->bio.bi_opf	= src->bi_opf;

	if (buf) {
		bch2_bio_map(bio, buf, output_available);
		return bio;
	}

	wbio->bounce		= true;

	/*
	 * We can't use mempool for more than c->sb.encoded_extent_max
	 * worth of pages, but we'd like to allocate more if we can:
	 */
	bch2_bio_alloc_pages_pool(c, bio,
				  min_t(unsigned, output_available,
					c->opts.encoded_extent_max));

	if (bio->bi_iter.bi_size < output_available)
		*page_alloc_failed =
			bch2_bio_alloc_pages(bio,
					     output_available -
					     bio->bi_iter.bi_size,
					     GFP_NOFS) != 0;

	return bio;
}

/*
 * Recompute the checksum of op's data with @new_csum_type and advance the
 * bio to the live portion, updating op->crc on success.
 */
static int bch2_write_rechecksum(struct bch_fs *c,
				 struct bch_write_op *op,
				 unsigned new_csum_type)
{
	struct bio *bio = &op->wbio.bio;
	struct bch_extent_crc_unpacked new_crc;
	int ret;

	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */

	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
	    bch2_csum_type_is_encryption(new_csum_type))
		new_csum_type = op->crc.csum_type;

	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
				  NULL, &new_crc,
				  op->crc.offset, op->crc.live_size,
				  new_csum_type);
	if (ret)
		return ret;

	bio_advance(bio, op->crc.offset << 9);
	bio->bi_iter.bi_size = op->crc.live_size << 9;
	op->crc = new_crc;
	return 0;
}

/* Verify the existing checksum, then decrypt op's data in place: */
static int bch2_write_decrypt(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct nonce nonce = extent_nonce(op->version, op->crc);
	struct bch_csum csum;
	int ret;

	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
		return 0;

	/*
	 * If we need to decrypt data in the write path, we'll no longer be able
	 * to verify the existing checksum (poly1305 mac, in this case) after
	 * it's decrypted - this is the last point we'll be able to reverify the
	 * checksum:
	 */
	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
		return -EIO;

	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	op->crc.csum_type = 0;
	op->crc.csum = (struct bch_csum) { 0, 0 };
	return ret;
}

/*
 * For writes of already-encoded (compressed/checksummed/encrypted) data:
 * decide whether the extent can be written as-is, or must first be
 * rechecksummed, decompressed and/or decrypted.
 */
static enum prep_encoded_ret {
	PREP_ENCODED_OK,
	PREP_ENCODED_ERR,
	PREP_ENCODED_CHECKSUM_ERR,
	PREP_ENCODED_DO_WRITE,
} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
	struct bch_fs *c = op->c;
	struct bio *bio = &op->wbio.bio;

	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
		return PREP_ENCODED_OK;

	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

	/* Can we just write the entire extent as is? */
	if (op->crc.uncompressed_size == op->crc.live_size &&
	    op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
	    op->crc.compressed_size <= wp->sectors_free &&
	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
	     op->incompressible)) {
		if (!crc_is_compressed(op->crc) &&
		    op->csum_type != op->crc.csum_type &&
		    bch2_write_rechecksum(c, op, op->csum_type) &&
		    !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		return PREP_ENCODED_DO_WRITE;
	}

	/*
	 * If the data is compressed and we couldn't write the entire extent as
	 * is, we have to decompress it:
	 */
	if (crc_is_compressed(op->crc)) {
		struct bch_csum csum;

		if (bch2_write_decrypt(op))
			return PREP_ENCODED_CHECKSUM_ERR;

		/* Last point we can still verify checksum: */
		csum = bch2_checksum_bio(c, op->crc.csum_type,
					 extent_nonce(op->version, op->crc),
					 bio);
		if (bch2_crc_cmp(op->crc.csum, csum) &&
!c->opts.no_data_io) 874 return PREP_ENCODED_CHECKSUM_ERR; 875 876 if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) 877 return PREP_ENCODED_ERR; 878 } 879 880 /* 881 * No longer have compressed data after this point - data might be 882 * encrypted: 883 */ 884 885 /* 886 * If the data is checksummed and we're only writing a subset, 887 * rechecksum and adjust bio to point to currently live data: 888 */ 889 if ((op->crc.live_size != op->crc.uncompressed_size || 890 op->crc.csum_type != op->csum_type) && 891 bch2_write_rechecksum(c, op, op->csum_type) && 892 !c->opts.no_data_io) 893 return PREP_ENCODED_CHECKSUM_ERR; 894 895 /* 896 * If we want to compress the data, it has to be decrypted: 897 */ 898 if ((op->compression_opt || 899 bch2_csum_type_is_encryption(op->crc.csum_type) != 900 bch2_csum_type_is_encryption(op->csum_type)) && 901 bch2_write_decrypt(op)) 902 return PREP_ENCODED_CHECKSUM_ERR; 903 904 return PREP_ENCODED_OK; 905 } 906 907 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, 908 struct bio **_dst) 909 { 910 struct bch_fs *c = op->c; 911 struct bio *src = &op->wbio.bio, *dst = src; 912 struct bvec_iter saved_iter; 913 void *ec_buf; 914 unsigned total_output = 0, total_input = 0; 915 bool bounce = false; 916 bool page_alloc_failed = false; 917 int ret, more = 0; 918 919 BUG_ON(!bio_sectors(src)); 920 921 ec_buf = bch2_writepoint_ec_buf(c, wp); 922 923 switch (bch2_write_prep_encoded_data(op, wp)) { 924 case PREP_ENCODED_OK: 925 break; 926 case PREP_ENCODED_ERR: 927 ret = -EIO; 928 goto err; 929 case PREP_ENCODED_CHECKSUM_ERR: 930 goto csum_err; 931 case PREP_ENCODED_DO_WRITE: 932 /* XXX look for bug here */ 933 if (ec_buf) { 934 dst = bch2_write_bio_alloc(c, wp, src, 935 &page_alloc_failed, 936 ec_buf); 937 bio_copy_data(dst, src); 938 bounce = true; 939 } 940 init_append_extent(op, wp, op->version, op->crc); 941 goto do_write; 942 } 943 944 if (ec_buf || 945 op->compression_opt || 946 (op->csum_type && 947 !(op->flags & 
BCH_WRITE_PAGES_STABLE)) || 948 (bch2_csum_type_is_encryption(op->csum_type) && 949 !(op->flags & BCH_WRITE_PAGES_OWNED))) { 950 dst = bch2_write_bio_alloc(c, wp, src, 951 &page_alloc_failed, 952 ec_buf); 953 bounce = true; 954 } 955 956 saved_iter = dst->bi_iter; 957 958 do { 959 struct bch_extent_crc_unpacked crc = { 0 }; 960 struct bversion version = op->version; 961 size_t dst_len = 0, src_len = 0; 962 963 if (page_alloc_failed && 964 dst->bi_iter.bi_size < (wp->sectors_free << 9) && 965 dst->bi_iter.bi_size < c->opts.encoded_extent_max) 966 break; 967 968 BUG_ON(op->compression_opt && 969 (op->flags & BCH_WRITE_DATA_ENCODED) && 970 bch2_csum_type_is_encryption(op->crc.csum_type)); 971 BUG_ON(op->compression_opt && !bounce); 972 973 crc.compression_type = op->incompressible 974 ? BCH_COMPRESSION_TYPE_incompressible 975 : op->compression_opt 976 ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, 977 op->compression_opt) 978 : 0; 979 if (!crc_is_compressed(crc)) { 980 dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); 981 dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); 982 983 if (op->csum_type) 984 dst_len = min_t(unsigned, dst_len, 985 c->opts.encoded_extent_max); 986 987 if (bounce) { 988 swap(dst->bi_iter.bi_size, dst_len); 989 bio_copy_data(dst, src); 990 swap(dst->bi_iter.bi_size, dst_len); 991 } 992 993 src_len = dst_len; 994 } 995 996 BUG_ON(!src_len || !dst_len); 997 998 if (bch2_csum_type_is_encryption(op->csum_type)) { 999 if (bversion_zero(version)) { 1000 version.lo = atomic64_inc_return(&c->key_version); 1001 } else { 1002 crc.nonce = op->nonce; 1003 op->nonce += src_len >> 9; 1004 } 1005 } 1006 1007 if ((op->flags & BCH_WRITE_DATA_ENCODED) && 1008 !crc_is_compressed(crc) && 1009 bch2_csum_type_is_encryption(op->crc.csum_type) == 1010 bch2_csum_type_is_encryption(op->csum_type)) { 1011 u8 compression_type = crc.compression_type; 1012 u16 nonce = crc.nonce; 1013 /* 1014 * Note: when we're using rechecksum(), we need to be 1015 * 
checksumming @src because it has all the data our 1016 * existing checksum covers - if we bounced (because we 1017 * were trying to compress), @dst will only have the 1018 * part of the data the new checksum will cover. 1019 * 1020 * But normally we want to be checksumming post bounce, 1021 * because part of the reason for bouncing is so the 1022 * data can't be modified (by userspace) while it's in 1023 * flight. 1024 */ 1025 if (bch2_rechecksum_bio(c, src, version, op->crc, 1026 &crc, &op->crc, 1027 src_len >> 9, 1028 bio_sectors(src) - (src_len >> 9), 1029 op->csum_type)) 1030 goto csum_err; 1031 /* 1032 * rchecksum_bio sets compression_type on crc from op->crc, 1033 * this isn't always correct as sometimes we're changing 1034 * an extent from uncompressed to incompressible. 1035 */ 1036 crc.compression_type = compression_type; 1037 crc.nonce = nonce; 1038 } else { 1039 if ((op->flags & BCH_WRITE_DATA_ENCODED) && 1040 bch2_rechecksum_bio(c, src, version, op->crc, 1041 NULL, &op->crc, 1042 src_len >> 9, 1043 bio_sectors(src) - (src_len >> 9), 1044 op->crc.csum_type)) 1045 goto csum_err; 1046 1047 crc.compressed_size = dst_len >> 9; 1048 crc.uncompressed_size = src_len >> 9; 1049 crc.live_size = src_len >> 9; 1050 1051 swap(dst->bi_iter.bi_size, dst_len); 1052 ret = bch2_encrypt_bio(c, op->csum_type, 1053 extent_nonce(version, crc), dst); 1054 if (ret) 1055 goto err; 1056 1057 crc.csum = bch2_checksum_bio(c, op->csum_type, 1058 extent_nonce(version, crc), dst); 1059 crc.csum_type = op->csum_type; 1060 swap(dst->bi_iter.bi_size, dst_len); 1061 } 1062 1063 init_append_extent(op, wp, version, crc); 1064 1065 if (dst != src) 1066 bio_advance(dst, dst_len); 1067 bio_advance(src, src_len); 1068 total_output += dst_len; 1069 total_input += src_len; 1070 } while (dst->bi_iter.bi_size && 1071 src->bi_iter.bi_size && 1072 wp->sectors_free && 1073 !bch2_keylist_realloc(&op->insert_keys, 1074 op->inline_keys, 1075 ARRAY_SIZE(op->inline_keys), 1076 BKEY_EXTENT_U64s_MAX)); 1077 
1078 more = src->bi_iter.bi_size != 0; 1079 1080 dst->bi_iter = saved_iter; 1081 1082 if (dst == src && more) { 1083 BUG_ON(total_output != total_input); 1084 1085 dst = bio_split(src, total_input >> 9, 1086 GFP_NOFS, &c->bio_write); 1087 wbio_init(dst)->put_bio = true; 1088 /* copy WRITE_SYNC flag */ 1089 dst->bi_opf = src->bi_opf; 1090 } 1091 1092 dst->bi_iter.bi_size = total_output; 1093 do_write: 1094 *_dst = dst; 1095 return more; 1096 csum_err: 1097 { 1098 struct printbuf buf = PRINTBUF; 1099 bch2_write_op_error(&buf, op); 1100 prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); 1101 bch_err_ratelimited(c, "%s", buf.buf); 1102 printbuf_exit(&buf); 1103 } 1104 1105 ret = -EIO; 1106 err: 1107 if (to_wbio(dst)->bounce) 1108 bch2_bio_free_pages_pool(c, dst); 1109 if (to_wbio(dst)->put_bio) 1110 bio_put(dst); 1111 1112 return ret; 1113 } 1114 1115 static bool bch2_extent_is_writeable(struct bch_write_op *op, 1116 struct bkey_s_c k) 1117 { 1118 struct bch_fs *c = op->c; 1119 struct bkey_s_c_extent e; 1120 struct extent_ptr_decoded p; 1121 const union bch_extent_entry *entry; 1122 unsigned replicas = 0; 1123 1124 if (k.k->type != KEY_TYPE_extent) 1125 return false; 1126 1127 e = bkey_s_c_to_extent(k); 1128 1129 rcu_read_lock(); 1130 extent_for_each_ptr_decode(e, p, entry) { 1131 if (crc_is_encoded(p.crc) || p.has_ec) { 1132 rcu_read_unlock(); 1133 return false; 1134 } 1135 1136 replicas += bch2_extent_ptr_durability(c, &p); 1137 } 1138 rcu_read_unlock(); 1139 1140 return replicas >= op->opts.data_replicas; 1141 } 1142 1143 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, 1144 struct btree_iter *iter, 1145 struct bkey_i *orig, 1146 struct bkey_s_c k, 1147 u64 new_i_size) 1148 { 1149 if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { 1150 /* trace this */ 1151 return 0; 1152 } 1153 1154 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); 1155 int ret = PTR_ERR_OR_ZERO(new); 1156 
/* continuation: error check for bch2_bkey_make_mut_noupdate() above */
	if (ret)
		return ret;

	/* Trim the copy down to just the range we actually wrote: */
	bch2_cut_front(bkey_start_pos(&orig->k), new);
	bch2_cut_back(orig->k.p, new);

	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
	bkey_for_each_ptr(ptrs, ptr)
		ptr->unwritten = 0;

	/*
	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
	 * that was done when we kicked off the write, and here it's important
	 * that we update the extent that we wrote to - even if a snapshot has
	 * since been created. The write is still outstanding, so we're ok
	 * w.r.t. snapshot atomicity:
	 */
	return bch2_extent_update_i_size_sectors(trans, iter,
					min(new->k.p.offset << 9, new_i_size), 0) ?:
		bch2_trans_update(trans, iter, new,
				  BTREE_UPDATE_internal_snapshot_node);
}

/*
 * After a nocow write to unwritten extents completes, walk the inserted keys
 * and convert each covered extent from unwritten to written, committing one
 * transaction per key range.
 */
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans = bch2_trans_get(c);

	for_each_keylist_key(&op->insert_keys, orig) {
		int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
				     bkey_start_pos(&orig->k), orig->k.p,
				     BTREE_ITER_intent, k,
				     NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
		}));

		/* EROFS is expected during emergency shutdown; don't log it: */
		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			struct printbuf buf = PRINTBUF;
			__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
			prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		if (ret) {
			op->error = ret;
			break;
		}
	}

	bch2_trans_put(trans);
}

/*
 * Finish a nocow write: record IO errors, or convert unwritten extents if any
 * of the extents we wrote to were unwritten.
 */
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		op->error = -EIO;
	} else if (unlikely(op->flags &
/* continuation of the flags test started above */
			    BCH_WRITE_CONVERT_UNWRITTEN))
		bch2_nocow_write_convert_unwritten(op);
}

/* Closure callback: complete a nocow write, then run normal write completion. */
static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
	closure_type(op, struct bch_write_op, cl);

	__bch2_nocow_write_done(op);
	bch2_write_done(cl);
}

/* A bucket we must nocow-lock before doing IO to it, with its expected gen: */
struct bucket_to_lock {
	struct bpos		b;
	unsigned		gen;
	struct nocow_lock_bucket *l;
};

/*
 * Attempt an in-place (nocow) write: look up the existing extents covering
 * op->pos, take iorefs and nocow locks on the buckets they live in, and submit
 * the IO directly to those buckets without allocating new space.
 *
 * On any obstacle (unwriteable extent, stale bucket gen, missing ioref) we
 * fall back to the normal COW write path; BCH_WRITE_SUBMITTED not being set on
 * return signals that fallback to the caller.
 */
static void bch2_nocow_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
	u32 snapshot;
	struct bucket_to_lock *stale_at;
	int stale, ret;

	if (op->flags & BCH_WRITE_MOVE)
		return;

	darray_init(&buckets);
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
	if (unlikely(ret))
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(op->pos.inode, op->pos.offset, snapshot),
			     BTREE_ITER_slots);
	while (1) {
		struct bio *bio = &op->wbio.bio;

		buckets.nr = 0;

		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		/* fall back to normal cow write path? */
		if (unlikely(k.k->p.snapshot != snapshot ||
			     !bch2_extent_is_writeable(op, k)))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 k.k->u64s))
			break;

		/* Get iorefs before dropping btree locks: */
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
			if (unlikely(!ca))
				goto err_get_ioref;

			struct bpos b = PTR_BUCKET_POS(ca, ptr);
			struct nocow_lock_bucket *l =
				bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
			prefetch(l);

			/* XXX allocating memory with btree locks held - rare */
			darray_push_gfp(&buckets, ((struct bucket_to_lock) {
						  .b = b, .gen = ptr->gen, .l = l,
						  }), GFP_KERNEL|__GFP_NOFAIL);

			if (ptr->unwritten)
				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
		}

		/* Unlock before taking nocow locks, doing IO: */
		bkey_reassemble(op->insert_keys.top, k);
		bch2_trans_unlock(trans);

		bch2_cut_front(op->pos, op->insert_keys.top);
		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

		darray_for_each(buckets, i) {
			struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);

			__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
						 bucket_to_u64(i->b),
						 BUCKET_NOCOW_LOCK_UPDATE);

			/*
			 * Recheck the bucket gen now that we hold the nocow
			 * lock: a stale gen means the data may have moved:
			 */
			int gen = bucket_gen_get(ca, i->b.offset);
			stale = gen < 0 ? gen : gen_after(gen, i->gen);
			if (unlikely(stale)) {
				stale_at = i;
				goto err_bucket_stale;
			}
		}

		bio = &op->wbio.bio;
		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
			/* This extent only covers part of the bio - split: */
			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
					GFP_KERNEL, &c->bio_write);
			wbio_init(bio)->put_bio = true;
			bio->bi_opf = op->wbio.bio.bi_opf;
		} else {
			op->flags |= BCH_WRITE_SUBMITTED;
		}

		op->pos.offset += bio_sectors(bio);
		op->written += bio_sectors(bio);

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;
		closure_get(&op->cl);
		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  op->insert_keys.top, true);

		bch2_keylist_push(&op->insert_keys);
		if (op->flags & BCH_WRITE_SUBMITTED)
			break;
		bch2_btree_iter_advance(&iter);
	}
out:
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	darray_exit(&buckets);

	if (ret) {
		struct printbuf buf = PRINTBUF;
		bch2_write_op_error(&buf, op);
		prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		op->error = ret;
		op->flags |= BCH_WRITE_SUBMITTED;
	}

	/* fallback to cow write path? */
	if (!(op->flags & BCH_WRITE_SUBMITTED)) {
		closure_sync(&op->cl);
		__bch2_nocow_write_done(op);
		op->insert_keys.top = op->insert_keys.keys;
	} else if (op->flags & BCH_WRITE_SYNC) {
		closure_sync(&op->cl);
		bch2_nocow_write_done(&op->cl.work);
	} else {
		/*
		 * XXX
		 * needs to run out of process context because ei_quota_lock is
		 * a mutex
		 */
		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
	}
	return;
err_get_ioref:
	/* Drop the iorefs we already took, then fall back to COW: */
	darray_for_each(buckets, i)
		percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);

	/* Fall back to COW path: */
	goto out;
err_bucket_stale:
	/* Unlock every bucket up to and including the stale one: */
	darray_for_each(buckets, i) {
		bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
		if (i == stale_at)
			break;
	}

	struct printbuf buf = PRINTBUF;
	if (bch2_fs_inconsistent_on(stale < 0, c,
				    "pointer to invalid bucket in nocow path on device %llu\n %s",
				    stale_at->b.inode,
				    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = -EIO;
	} else {
		/* We can retry this: */
		ret = -BCH_ERR_transaction_restart;
	}
	printbuf_exit(&buf);

	goto err_get_ioref;
}

/*
 * Core write loop: repeatedly allocate space, write out as much data as the
 * allocation allows, and submit - until the whole write has been submitted.
 */
static void __bch2_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct write_point *wp = NULL;
	struct bio *bio = NULL;
	unsigned nofs_flags;
	int ret;

	/* Avoid recursing into fs reclaim while doing IO: */
	nofs_flags = memalloc_nofs_save();

	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
		bch2_nocow_write(op);
		if (op->flags & BCH_WRITE_SUBMITTED)
			goto out_nofs_restore;
	}
again:
	memset(&op->failed, 0, sizeof(op->failed));

	do {
		struct bkey_i *key_to_write;
		unsigned key_to_write_offset = op->insert_keys.top_p -
			op->insert_keys.keys_p;

		/* +1 for possible cache device: */
		if (op->open_buckets.nr + op->nr_replicas + 1 >
/* continuation of the open_buckets capacity check started above */
		    ARRAY_SIZE(op->open_buckets.v))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 BKEY_EXTENT_U64s_MAX))
			break;

		/*
		 * The copygc thread is now global, which means it's no longer
		 * freeing up space on specific disks, which means that
		 * allocations for specific disks may hang arbitrarily long:
		 */
		ret = bch2_trans_run(c, lockrestart_do(trans,
			bch2_alloc_sectors_start_trans(trans,
				op->target,
				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
				op->write_point,
				&op->devs_have,
				op->nr_replicas,
				op->nr_replicas_required,
				op->watermark,
				op->flags,
				&op->cl, &wp)));
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
				break;

			goto err;
		}

		EBUG_ON(!wp);

		bch2_open_bucket_get(c, wp, &op->open_buckets);
		/* returns > 0 if more data remains to be written: */
		ret = bch2_write_extent(op, wp, &bio);

		bch2_alloc_sectors_done_inlined(c, wp);
err:
		if (ret <= 0) {
			op->flags |= BCH_WRITE_SUBMITTED;

			if (unlikely(ret < 0)) {
				if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
					struct printbuf buf = PRINTBUF;
					bch2_write_op_error(&buf, op);
					prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
					bch_err_ratelimited(c, "%s", buf.buf);
					printbuf_exit(&buf);
				}
				op->error = ret;
				break;
			}
		}

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;

		closure_get(bio->bi_private);

		/* keylist may have been reallocated - recompute pointer: */
		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_SUBMITTED) &&
	     !(op->flags & BCH_WRITE_IN_WORKER))) {
		bch2_wait_on_allocator(c, &op->cl);

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_SUBMITTED))
			goto again;
		bch2_write_done(&op->cl);
	} else {
		bch2_write_queue(op, wp);
		continue_at(&op->cl, bch2_write_index, NULL);
	}
out_nofs_restore:
	memalloc_nofs_restore(nofs_flags);
}

/*
 * Small writes with inline_data enabled are stored directly in the bkey value
 * instead of being written to a bucket; build the inline_data key here and run
 * the index update immediately.
 */
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
	struct bio *bio = &op->wbio.bio;
	struct bvec_iter iter;
	struct bkey_i_inline_data *id;
	unsigned sectors;
	int ret;

	memset(&op->failed, 0, sizeof(op->failed));

	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
	op->flags |= BCH_WRITE_SUBMITTED;

	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
				   ARRAY_SIZE(op->inline_keys),
				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
	if (ret) {
		op->error = ret;
		goto err;
	}

	sectors = bio_sectors(bio);
	op->pos.offset += sectors;

	id = bkey_inline_data_init(op->insert_keys.top);
	id->k.p		= op->pos;
	id->k.bversion	= op->version;
	id->k.size	= sectors;

	iter = bio->bi_iter;
	iter.bi_size = data_len;
	memcpy_from_bio(id->v.data, bio, iter);

	/* zero-pad the value out to a multiple of 8 bytes (u64s): */
	while (data_len & 7)
		id->v.data[data_len++] = '\0';
	set_bkey_val_bytes(&id->k, data_len);
	bch2_keylist_push(&op->insert_keys);

	__bch2_write_index(op);
err:
	bch2_write_done(&op->cl);
}

/**
 * bch2_write() - handle a write to a cache device or flash only volume
 *
@cl: &bch_write_op->cl 1583 * 1584 * This is the starting point for any data to end up in a cache device; it could 1585 * be from a normal write, or a writeback write, or a write to a flash only 1586 * volume - it's also used by the moving garbage collector to compact data in 1587 * mostly empty buckets. 1588 * 1589 * It first writes the data to the cache, creating a list of keys to be inserted 1590 * (if the data won't fit in a single open bucket, there will be multiple keys); 1591 * after the data is written it calls bch_journal, and after the keys have been 1592 * added to the next journal write they're inserted into the btree. 1593 * 1594 * If op->discard is true, instead of inserting the data it invalidates the 1595 * region of the cache represented by op->bio and op->inode. 1596 */ 1597 CLOSURE_CALLBACK(bch2_write) 1598 { 1599 closure_type(op, struct bch_write_op, cl); 1600 struct bio *bio = &op->wbio.bio; 1601 struct bch_fs *c = op->c; 1602 unsigned data_len; 1603 1604 EBUG_ON(op->cl.parent); 1605 BUG_ON(!op->nr_replicas); 1606 BUG_ON(!op->write_point.v); 1607 BUG_ON(bkey_eq(op->pos, POS_MAX)); 1608 1609 if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) 1610 op->flags |= BCH_WRITE_ALLOC_NOWAIT; 1611 1612 op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); 1613 op->start_time = local_clock(); 1614 bch2_keylist_init(&op->insert_keys, op->inline_keys); 1615 wbio_init(bio)->put_bio = false; 1616 1617 if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { 1618 struct printbuf buf = PRINTBUF; 1619 bch2_write_op_error(&buf, op); 1620 prt_printf(&buf, "misaligned write"); 1621 printbuf_exit(&buf); 1622 op->error = -EIO; 1623 goto err; 1624 } 1625 1626 if (c->opts.nochanges) { 1627 op->error = -BCH_ERR_erofs_no_writes; 1628 goto err; 1629 } 1630 1631 if (!(op->flags & BCH_WRITE_MOVE) && 1632 !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { 1633 op->error = -BCH_ERR_erofs_no_writes; 1634 goto err; 1635 } 1636 1637 
this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); 1638 bch2_increment_clock(c, bio_sectors(bio), WRITE); 1639 1640 data_len = min_t(u64, bio->bi_iter.bi_size, 1641 op->new_i_size - (op->pos.offset << 9)); 1642 1643 if (c->opts.inline_data && 1644 data_len <= min(block_bytes(c) / 2, 1024U)) { 1645 bch2_write_data_inline(op, data_len); 1646 return; 1647 } 1648 1649 __bch2_write(op); 1650 return; 1651 err: 1652 bch2_disk_reservation_put(c, &op->res); 1653 1654 closure_debug_destroy(&op->cl); 1655 if (op->end_io) 1656 op->end_io(op); 1657 } 1658 1659 static const char * const bch2_write_flags[] = { 1660 #define x(f) #f, 1661 BCH_WRITE_FLAGS() 1662 #undef x 1663 NULL 1664 }; 1665 1666 void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) 1667 { 1668 prt_str(out, "pos: "); 1669 bch2_bpos_to_text(out, op->pos); 1670 prt_newline(out); 1671 printbuf_indent_add(out, 2); 1672 1673 prt_str(out, "started: "); 1674 bch2_pr_time_units(out, local_clock() - op->start_time); 1675 prt_newline(out); 1676 1677 prt_str(out, "flags: "); 1678 prt_bitflags(out, bch2_write_flags, op->flags); 1679 prt_newline(out); 1680 1681 prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); 1682 1683 printbuf_indent_sub(out, 2); 1684 } 1685 1686 void bch2_fs_io_write_exit(struct bch_fs *c) 1687 { 1688 mempool_exit(&c->bio_bounce_pages); 1689 bioset_exit(&c->replica_set); 1690 bioset_exit(&c->bio_write); 1691 } 1692 1693 int bch2_fs_io_write_init(struct bch_fs *c) 1694 { 1695 if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || 1696 bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) 1697 return -BCH_ERR_ENOMEM_bio_write_init; 1698 1699 if (mempool_init_page_pool(&c->bio_bounce_pages, 1700 max_t(unsigned, 1701 c->opts.btree_node_size, 1702 c->opts.encoded_extent_max) / 1703 PAGE_SIZE, 0)) 1704 return -BCH_ERR_ENOMEM_bio_bounce_pages_init; 1705 1706 return 0; 1707 } 1708