1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 4 * Copyright 2012 Google, Inc. 5 */ 6 7 #include "bcachefs.h" 8 #include "alloc_foreground.h" 9 #include "bkey_buf.h" 10 #include "bset.h" 11 #include "btree_update.h" 12 #include "buckets.h" 13 #include "checksum.h" 14 #include "clock.h" 15 #include "compress.h" 16 #include "debug.h" 17 #include "ec.h" 18 #include "error.h" 19 #include "extent_update.h" 20 #include "inode.h" 21 #include "io_write.h" 22 #include "journal.h" 23 #include "keylist.h" 24 #include "move.h" 25 #include "nocow_locking.h" 26 #include "rebalance.h" 27 #include "subvolume.h" 28 #include "super.h" 29 #include "super-io.h" 30 #include "trace.h" 31 32 #include <linux/blkdev.h> 33 #include <linux/prefetch.h> 34 #include <linux/random.h> 35 #include <linux/sched/mm.h> 36 37 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT 38 39 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, 40 u64 now, int rw) 41 { 42 u64 latency_capable = 43 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; 44 /* ideally we'd be taking into account the device's variance here: */ 45 u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); 46 s64 latency_over = io_latency - latency_threshold; 47 48 if (latency_threshold && latency_over > 0) { 49 /* 50 * bump up congested by approximately latency_over * 4 / 51 * latency_threshold - we don't need much accuracy here so don't 52 * bother with the divide: 53 */ 54 if (atomic_read(&ca->congested) < CONGESTED_MAX) 55 atomic_add(latency_over >> 56 max_t(int, ilog2(latency_threshold) - 2, 0), 57 &ca->congested); 58 59 ca->congested_last = now; 60 } else if (atomic_read(&ca->congested) > 0) { 61 atomic_dec(&ca->congested); 62 } 63 } 64 65 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) 66 { 67 atomic64_t *latency = &ca->cur_latency[rw]; 68 u64 now = local_clock(); 69 u64 io_latency = time_after64(now, submit_time) 70 ? 
now - submit_time 71 : 0; 72 u64 old, new; 73 74 old = atomic64_read(latency); 75 do { 76 /* 77 * If the io latency was reasonably close to the current 78 * latency, skip doing the update and atomic operation - most of 79 * the time: 80 */ 81 if (abs((int) (old - io_latency)) < (old >> 1) && 82 now & ~(~0U << 5)) 83 break; 84 85 new = ewma_add(old, io_latency, 5); 86 } while (!atomic64_try_cmpxchg(latency, &old, new)); 87 88 bch2_congested_acct(ca, io_latency, now, rw); 89 90 __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); 91 } 92 93 #endif 94 95 /* Allocate, free from mempool: */ 96 97 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) 98 { 99 struct bvec_iter_all iter; 100 struct bio_vec *bv; 101 102 bio_for_each_segment_all(bv, bio, iter) 103 if (bv->bv_page != ZERO_PAGE(0)) 104 mempool_free(bv->bv_page, &c->bio_bounce_pages); 105 bio->bi_vcnt = 0; 106 } 107 108 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) 109 { 110 struct page *page; 111 112 if (likely(!*using_mempool)) { 113 page = alloc_page(GFP_NOFS); 114 if (unlikely(!page)) { 115 mutex_lock(&c->bio_bounce_pages_lock); 116 *using_mempool = true; 117 goto pool_alloc; 118 119 } 120 } else { 121 pool_alloc: 122 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); 123 } 124 125 return page; 126 } 127 128 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, 129 size_t size) 130 { 131 bool using_mempool = false; 132 133 while (size) { 134 struct page *page = __bio_alloc_page_pool(c, &using_mempool); 135 unsigned len = min_t(size_t, PAGE_SIZE, size); 136 137 BUG_ON(!bio_add_page(bio, page, len, 0)); 138 size -= len; 139 } 140 141 if (using_mempool) 142 mutex_unlock(&c->bio_bounce_pages_lock); 143 } 144 145 /* Extent update path: */ 146 147 int bch2_sum_sector_overwrites(struct btree_trans *trans, 148 struct btree_iter *extent_iter, 149 struct bkey_i *new, 150 bool *usage_increasing, 151 s64 *i_sectors_delta, 152 s64 *disk_sectors_delta) 153 { 154 struct bch_fs *c = trans->c; 155 struct btree_iter iter; 156 struct bkey_s_c old; 157 unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); 158 bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); 159 int ret = 0; 160 161 *usage_increasing = false; 162 *i_sectors_delta = 0; 163 *disk_sectors_delta = 0; 164 165 bch2_trans_copy_iter(&iter, extent_iter); 166 167 for_each_btree_key_max_continue_norestart(iter, 168 new->k.p, BTREE_ITER_slots, old, ret) { 169 s64 sectors = min(new->k.p.offset, old.k->p.offset) - 170 max(bkey_start_offset(&new->k), 171 bkey_start_offset(old.k)); 172 173 *i_sectors_delta += sectors * 174 (bkey_extent_is_allocation(&new->k) - 175 bkey_extent_is_allocation(old.k)); 176 177 *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); 178 *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot 179 ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
			: 0;

		if (!*usage_increasing &&
		    (new->k.p.snapshot != old.k->p.snapshot ||
		     new_replicas > bch2_bkey_replicas(c, old) ||
		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
			*usage_increasing = true;

		if (bkey_ge(old.k->p, new->k.p))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	/*
	 * Crazy performance optimization:
	 * Every extent update needs to also update the inode: the inode trigger
	 * will set bi->journal_seq to the journal sequence number of this
	 * transaction - for fsync.
	 *
	 * But if that's the only reason we're updating the inode (we're not
	 * updating bi_size or bi_sectors), then we don't need the inode update
	 * to be journalled - if we crash, the bi_journal_seq update will be
	 * lost, but that's fine.
	 */
	unsigned inode_update_flags = BTREE_UPDATE_nojournal;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0,
						    extent_iter->pos.inode,
						    extent_iter->snapshot),
					       BTREE_ITER_intent|
					       BTREE_ITER_cached);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ret;

	/*
	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
	 * bytes past the end of the buffer:
	 */
	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
	ret = PTR_ERR_OR_ZERO(k_mut);
	if (unlikely(ret))
		goto err;

	bkey_reassemble(k_mut, k);

	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
		k_mut = bch2_inode_to_v3(trans, k_mut);
		ret = PTR_ERR_OR_ZERO(k_mut);
		if (unlikely(ret))
			goto err;
	}

	struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);

	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
		inode->v.bi_size = cpu_to_le64(new_i_size);
		inode_update_flags = 0;
	}

	if (i_sectors_delta) {
		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
		inode_update_flags = 0;
	}

	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}

	ret = bch2_trans_update(trans, &iter, &inode->k_i,
				BTREE_UPDATE_internal_snapshot_node|
				inode_update_flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{
	struct bpos next_pos;
	bool usage_increasing;
	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
	int ret;

	/*
	 * This traverses the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
					 &usage_increasing,
&i_sectors_delta, 302 &disk_sectors_delta); 303 if (ret) 304 return ret; 305 306 if (disk_res && 307 disk_sectors_delta > (s64) disk_res->sectors) { 308 ret = bch2_disk_reservation_add(trans->c, disk_res, 309 disk_sectors_delta - disk_res->sectors, 310 !check_enospc || !usage_increasing 311 ? BCH_DISK_RESERVATION_NOFAIL : 0); 312 if (ret) 313 return ret; 314 } 315 316 /* 317 * Note: 318 * We always have to do an inode update - even when i_size/i_sectors 319 * aren't changing - for fsync to work properly; fsync relies on 320 * inode->bi_journal_seq which is updated by the trigger code: 321 */ 322 ret = bch2_extent_update_i_size_sectors(trans, iter, 323 min(k->k.p.offset << 9, new_i_size), 324 i_sectors_delta) ?: 325 bch2_trans_update(trans, iter, k, 0) ?: 326 bch2_trans_commit(trans, disk_res, NULL, 327 BCH_TRANS_COMMIT_no_check_rw| 328 BCH_TRANS_COMMIT_no_enospc); 329 if (unlikely(ret)) 330 return ret; 331 332 if (i_sectors_delta_total) 333 *i_sectors_delta_total += i_sectors_delta; 334 bch2_btree_iter_set_pos(iter, next_pos); 335 return 0; 336 } 337 338 static int bch2_write_index_default(struct bch_write_op *op) 339 { 340 struct bch_fs *c = op->c; 341 struct bkey_buf sk; 342 struct keylist *keys = &op->insert_keys; 343 struct bkey_i *k = bch2_keylist_front(keys); 344 struct btree_trans *trans = bch2_trans_get(c); 345 struct btree_iter iter; 346 subvol_inum inum = { 347 .subvol = op->subvol, 348 .inum = k->k.p.inode, 349 }; 350 int ret; 351 352 BUG_ON(!inum.subvol); 353 354 bch2_bkey_buf_init(&sk); 355 356 do { 357 bch2_trans_begin(trans); 358 359 k = bch2_keylist_front(keys); 360 bch2_bkey_buf_copy(&sk, c, k); 361 362 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, 363 &sk.k->k.p.snapshot); 364 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 365 continue; 366 if (ret) 367 break; 368 369 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 370 bkey_start_pos(&sk.k->k), 371 BTREE_ITER_slots|BTREE_ITER_intent); 372 373 ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: 374 bch2_extent_update(trans, inum, &iter, sk.k, 375 &op->res, 376 op->new_i_size, &op->i_sectors_delta, 377 op->flags & BCH_WRITE_CHECK_ENOSPC); 378 bch2_trans_iter_exit(trans, &iter); 379 380 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 381 continue; 382 if (ret) 383 break; 384 385 if (bkey_ge(iter.pos, k->k.p)) 386 bch2_keylist_pop_front(&op->insert_keys); 387 else 388 bch2_cut_front(iter.pos, k); 389 } while (!bch2_keylist_empty(keys)); 390 391 bch2_trans_put(trans); 392 bch2_bkey_buf_exit(&sk, c); 393 394 return ret; 395 } 396 397 /* Writes */ 398 399 static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, 400 u64 offset) 401 { 402 bch2_inum_offset_err_msg(op->c, out, 403 (subvol_inum) { op->subvol, op->pos.inode, }, 404 offset << 9); 405 prt_printf(out, "write error%s: ", 406 op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); 407 } 408 409 void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) 410 { 411 __bch2_write_op_error(out, op, op->pos.offset); 412 } 413 414 static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, 415 struct bch_write_op *op, u64 offset) 416 { 417 bch2_inum_offset_err_msg_trans(trans, out, 418 (subvol_inum) { op->subvol, op->pos.inode, }, 419 offset << 9); 420 prt_printf(out, "write error%s: ", 421 op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); 422 } 423 424 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, 425 enum bch_data_type type, 426 const struct bkey_i *k, 427 bool nocow) 428 { 429 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); 430 struct bch_write_bio *n; 431 432 BUG_ON(c->opts.nochanges); 433 434 bkey_for_each_ptr(ptrs, ptr) { 435 struct bch_dev *ca = nocow 436 ? bch2_dev_have_ref(c, ptr->dev) 437 : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); 438 439 if (to_entry(ptr + 1) < ptrs.end) { 440 n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); 441 442 n->bio.bi_end_io = wbio->bio.bi_end_io; 443 n->bio.bi_private = wbio->bio.bi_private; 444 n->parent = wbio; 445 n->split = true; 446 n->bounce = false; 447 n->put_bio = true; 448 n->bio.bi_opf = wbio->bio.bi_opf; 449 bio_inc_remaining(&wbio->bio); 450 } else { 451 n = wbio; 452 n->split = false; 453 } 454 455 n->c = c; 456 n->dev = ptr->dev; 457 n->have_ioref = ca != NULL; 458 n->nocow = nocow; 459 n->submit_time = local_clock(); 460 n->inode_offset = bkey_start_offset(&k->k); 461 if (nocow) 462 n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); 463 n->bio.bi_iter.bi_sector = ptr->offset; 464 465 if (likely(n->have_ioref)) { 466 this_cpu_add(ca->io_done->sectors[WRITE][type], 467 bio_sectors(&n->bio)); 468 469 bio_set_dev(&n->bio, ca->disk_sb.bdev); 470 471 if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { 472 bio_endio(&n->bio); 473 continue; 474 } 475 476 submit_bio(&n->bio); 477 } else { 478 n->bio.bi_status = BLK_STS_REMOVED; 479 bio_endio(&n->bio); 480 } 481 } 482 } 483 484 static void __bch2_write(struct bch_write_op *); 485 486 static void bch2_write_done(struct closure *cl) 487 { 488 struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); 489 struct bch_fs *c = op->c; 490 491 EBUG_ON(op->open_buckets.nr); 492 493 bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); 494 bch2_disk_reservation_put(c, &op->res); 495 496 if (!(op->flags & BCH_WRITE_MOVE)) 497 bch2_write_ref_put(c, BCH_WRITE_REF_write); 498 bch2_keylist_free(&op->insert_keys, op->inline_keys); 499 500 EBUG_ON(cl->parent); 501 closure_debug_destroy(cl); 502 if (op->end_io) 503 op->end_io(op); 504 } 505 506 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) 507 { 508 struct keylist *keys = &op->insert_keys; 509 struct bkey_i *src, *dst = keys->keys, *n; 510 511 for (src = keys->keys; src != keys->top; src = n) { 512 n = bkey_next(src); 513 514 if (bkey_extent_is_direct_data(&src->k)) { 515 bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, 516 test_bit(ptr->dev, op->failed.d)); 517 518 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) 519 return -EIO; 520 } 521 522 if (dst != src) 523 memmove_u64s_down(dst, src, src->k.u64s); 524 dst = bkey_next(dst); 525 } 526 527 keys->top = dst; 528 return 0; 529 } 530 531 /** 532 * __bch2_write_index - after a write, update index to point to new data 533 * @op: bch_write_op to process 534 */ 535 static void __bch2_write_index(struct bch_write_op *op) 536 { 537 struct bch_fs *c = op->c; 538 struct keylist *keys = &op->insert_keys; 539 unsigned dev; 540 int ret = 0; 541 542 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { 543 ret = bch2_write_drop_io_error_ptrs(op); 544 if (ret) 545 goto err; 546 } 547 548 if (!bch2_keylist_empty(keys)) { 549 u64 sectors_start = keylist_sectors(keys); 550 551 ret = !(op->flags & BCH_WRITE_MOVE) 552 ? 
bch2_write_index_default(op)
			: bch2_data_update_index_update(op);

		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
		BUG_ON(keylist_sectors(keys) && !ret);

		op->written += sectors_start - keylist_sectors(keys);

		if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			struct printbuf buf = PRINTBUF;
			__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
			prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		if (ret)
			goto err;
	}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
		bch2_open_bucket_write_error(c, &op->open_buckets, dev);

	bch2_open_buckets_put(c, &op->open_buckets);
	return;
err:
	keys->top = keys->keys;
	op->error = ret;
	op->flags |= BCH_WRITE_SUBMITTED;
	goto out;
}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
	if (state != wp->state) {
		u64 now = ktime_get_ns();

		if (wp->last_state_change &&
		    time_after64(now, wp->last_state_change))
			wp->time[wp->state] += now - wp->last_state_change;
		wp->state = state;
		wp->last_state_change = now;
	}
}

static inline void wp_update_state(struct write_point *wp, bool running)
{
	enum write_point_state state;

	state = running ? WRITE_POINT_running :
		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
					 : WRITE_POINT_stopped;

	__wp_update_state(wp, state);
}

static CLOSURE_CALLBACK(bch2_write_index)
{
	closure_type(op, struct bch_write_op, cl);
	struct write_point *wp = op->wp;
	struct workqueue_struct *wq = index_update_wq(op);
	unsigned long flags;

	if ((op->flags & BCH_WRITE_SUBMITTED) &&
	    (op->flags & BCH_WRITE_MOVE))
		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

	spin_lock_irqsave(&wp->writes_lock, flags);
	if (wp->state == WRITE_POINT_waiting_io)
		__wp_update_state(wp, WRITE_POINT_waiting_work);
	list_add_tail(&op->wp_list, &wp->writes);
	spin_unlock_irqrestore(&wp->writes_lock, flags);

	queue_work(wq, &wp->index_update_work);
}

static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
	op->wp = wp;

	if (wp->state == WRITE_POINT_stopped) {
		spin_lock_irq(&wp->writes_lock);
		__wp_update_state(wp, WRITE_POINT_waiting_io);
		spin_unlock_irq(&wp->writes_lock);
	}
}

void bch2_write_point_do_index_updates(struct work_struct *work)
{
	struct write_point *wp =
		container_of(work, struct write_point, index_update_work);
	struct bch_write_op *op;

	while (1) {
		spin_lock_irq(&wp->writes_lock);
		op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_IN_WORKER;

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_SUBMITTED))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}

static void bch2_write_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_write_bio *wbio = to_wbio(bio);
	struct bch_write_bio *parent =
wbio->split ? wbio->parent : NULL; 674 struct bch_fs *c = wbio->c; 675 struct bch_dev *ca = wbio->have_ioref 676 ? bch2_dev_have_ref(c, wbio->dev) 677 : NULL; 678 679 if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 680 op->pos.inode, 681 wbio->inode_offset << 9, 682 "data write error: %s", 683 bch2_blk_status_to_str(bio->bi_status))) { 684 set_bit(wbio->dev, op->failed.d); 685 op->flags |= BCH_WRITE_IO_ERROR; 686 } 687 688 if (wbio->nocow) { 689 bch2_bucket_nocow_unlock(&c->nocow_locks, 690 POS(ca->dev_idx, wbio->nocow_bucket), 691 BUCKET_NOCOW_LOCK_UPDATE); 692 set_bit(wbio->dev, op->devs_need_flush->d); 693 } 694 695 if (wbio->have_ioref) { 696 bch2_latency_acct(ca, wbio->submit_time, WRITE); 697 percpu_ref_put(&ca->io_ref); 698 } 699 700 if (wbio->bounce) 701 bch2_bio_free_pages_pool(c, bio); 702 703 if (wbio->put_bio) 704 bio_put(bio); 705 706 if (parent) 707 bio_endio(&parent->bio); 708 else 709 closure_put(cl); 710 } 711 712 static void init_append_extent(struct bch_write_op *op, 713 struct write_point *wp, 714 struct bversion version, 715 struct bch_extent_crc_unpacked crc) 716 { 717 struct bkey_i_extent *e; 718 719 op->pos.offset += crc.uncompressed_size; 720 721 e = bkey_extent_init(op->insert_keys.top); 722 e->k.p = op->pos; 723 e->k.size = crc.uncompressed_size; 724 e->k.bversion = version; 725 726 if (crc.csum_type || 727 crc.compression_type || 728 crc.nonce) 729 bch2_extent_crc_append(&e->k_i, crc); 730 731 bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, 732 op->flags & BCH_WRITE_CACHED); 733 734 bch2_keylist_push(&op->insert_keys); 735 } 736 737 static struct bio *bch2_write_bio_alloc(struct bch_fs *c, 738 struct write_point *wp, 739 struct bio *src, 740 bool *page_alloc_failed, 741 void *buf) 742 { 743 struct bch_write_bio *wbio; 744 struct bio *bio; 745 unsigned output_available = 746 min(wp->sectors_free << 9, src->bi_iter.bi_size); 747 unsigned pages = DIV_ROUND_UP(output_available + 748 (buf 749 ? 
((unsigned long) buf & (PAGE_SIZE - 1)) 750 : 0), PAGE_SIZE); 751 752 pages = min(pages, BIO_MAX_VECS); 753 754 bio = bio_alloc_bioset(NULL, pages, 0, 755 GFP_NOFS, &c->bio_write); 756 wbio = wbio_init(bio); 757 wbio->put_bio = true; 758 /* copy WRITE_SYNC flag */ 759 wbio->bio.bi_opf = src->bi_opf; 760 761 if (buf) { 762 bch2_bio_map(bio, buf, output_available); 763 return bio; 764 } 765 766 wbio->bounce = true; 767 768 /* 769 * We can't use mempool for more than c->sb.encoded_extent_max 770 * worth of pages, but we'd like to allocate more if we can: 771 */ 772 bch2_bio_alloc_pages_pool(c, bio, 773 min_t(unsigned, output_available, 774 c->opts.encoded_extent_max)); 775 776 if (bio->bi_iter.bi_size < output_available) 777 *page_alloc_failed = 778 bch2_bio_alloc_pages(bio, 779 output_available - 780 bio->bi_iter.bi_size, 781 GFP_NOFS) != 0; 782 783 return bio; 784 } 785 786 static int bch2_write_rechecksum(struct bch_fs *c, 787 struct bch_write_op *op, 788 unsigned new_csum_type) 789 { 790 struct bio *bio = &op->wbio.bio; 791 struct bch_extent_crc_unpacked new_crc; 792 int ret; 793 794 /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ 795 796 if (bch2_csum_type_is_encryption(op->crc.csum_type) != 797 bch2_csum_type_is_encryption(new_csum_type)) 798 new_csum_type = op->crc.csum_type; 799 800 ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, 801 NULL, &new_crc, 802 op->crc.offset, op->crc.live_size, 803 new_csum_type); 804 if (ret) 805 return ret; 806 807 bio_advance(bio, op->crc.offset << 9); 808 bio->bi_iter.bi_size = op->crc.live_size << 9; 809 op->crc = new_crc; 810 return 0; 811 } 812 813 static int bch2_write_decrypt(struct bch_write_op *op) 814 { 815 struct bch_fs *c = op->c; 816 struct nonce nonce = extent_nonce(op->version, op->crc); 817 struct bch_csum csum; 818 int ret; 819 820 if (!bch2_csum_type_is_encryption(op->crc.csum_type)) 821 return 0; 822 823 /* 824 * If we need to decrypt data in the write path, we'll no longer be able 825 * to verify the existing checksum (poly1305 mac, in this case) after 826 * it's decrypted - this is the last point we'll be able to reverify the 827 * checksum: 828 */ 829 csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); 830 if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) 831 return -EIO; 832 833 ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); 834 op->crc.csum_type = 0; 835 op->crc.csum = (struct bch_csum) { 0, 0 }; 836 return ret; 837 } 838 839 static enum prep_encoded_ret { 840 PREP_ENCODED_OK, 841 PREP_ENCODED_ERR, 842 PREP_ENCODED_CHECKSUM_ERR, 843 PREP_ENCODED_DO_WRITE, 844 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) 845 { 846 struct bch_fs *c = op->c; 847 struct bio *bio = &op->wbio.bio; 848 849 if (!(op->flags & BCH_WRITE_DATA_ENCODED)) 850 return PREP_ENCODED_OK; 851 852 BUG_ON(bio_sectors(bio) != op->crc.compressed_size); 853 854 /* Can we just write the entire extent as is? 
*/ 855 if (op->crc.uncompressed_size == op->crc.live_size && 856 op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && 857 op->crc.compressed_size <= wp->sectors_free && 858 (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || 859 op->incompressible)) { 860 if (!crc_is_compressed(op->crc) && 861 op->csum_type != op->crc.csum_type && 862 bch2_write_rechecksum(c, op, op->csum_type) && 863 !c->opts.no_data_io) 864 return PREP_ENCODED_CHECKSUM_ERR; 865 866 return PREP_ENCODED_DO_WRITE; 867 } 868 869 /* 870 * If the data is compressed and we couldn't write the entire extent as 871 * is, we have to decompress it: 872 */ 873 if (crc_is_compressed(op->crc)) { 874 struct bch_csum csum; 875 876 if (bch2_write_decrypt(op)) 877 return PREP_ENCODED_CHECKSUM_ERR; 878 879 /* Last point we can still verify checksum: */ 880 csum = bch2_checksum_bio(c, op->crc.csum_type, 881 extent_nonce(op->version, op->crc), 882 bio); 883 if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) 884 return PREP_ENCODED_CHECKSUM_ERR; 885 886 if (bch2_bio_uncompress_inplace(op, bio)) 887 return PREP_ENCODED_ERR; 888 } 889 890 /* 891 * No longer have compressed data after this point - data might be 892 * encrypted: 893 */ 894 895 /* 896 * If the data is checksummed and we're only writing a subset, 897 * rechecksum and adjust bio to point to currently live data: 898 */ 899 if ((op->crc.live_size != op->crc.uncompressed_size || 900 op->crc.csum_type != op->csum_type) && 901 bch2_write_rechecksum(c, op, op->csum_type) && 902 !c->opts.no_data_io) 903 return PREP_ENCODED_CHECKSUM_ERR; 904 905 /* 906 * If we want to compress the data, it has to be decrypted: 907 */ 908 if ((op->compression_opt || 909 bch2_csum_type_is_encryption(op->crc.csum_type) != 910 bch2_csum_type_is_encryption(op->csum_type)) && 911 bch2_write_decrypt(op)) 912 return PREP_ENCODED_CHECKSUM_ERR; 913 914 return PREP_ENCODED_OK; 915 } 916 917 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, 918 struct bio **_dst) 919 { 920 struct bch_fs *c = op->c; 921 struct bio *src = &op->wbio.bio, *dst = src; 922 struct bvec_iter saved_iter; 923 void *ec_buf; 924 unsigned total_output = 0, total_input = 0; 925 bool bounce = false; 926 bool page_alloc_failed = false; 927 int ret, more = 0; 928 929 BUG_ON(!bio_sectors(src)); 930 931 ec_buf = bch2_writepoint_ec_buf(c, wp); 932 933 switch (bch2_write_prep_encoded_data(op, wp)) { 934 case PREP_ENCODED_OK: 935 break; 936 case PREP_ENCODED_ERR: 937 ret = -EIO; 938 goto err; 939 case PREP_ENCODED_CHECKSUM_ERR: 940 goto csum_err; 941 case PREP_ENCODED_DO_WRITE: 942 /* XXX look for bug here */ 943 if (ec_buf) { 944 dst = bch2_write_bio_alloc(c, wp, src, 945 &page_alloc_failed, 946 ec_buf); 947 bio_copy_data(dst, src); 948 bounce = true; 949 } 950 init_append_extent(op, wp, op->version, op->crc); 951 goto do_write; 952 } 953 954 if (ec_buf || 955 op->compression_opt || 956 (op->csum_type && 957 !(op->flags & BCH_WRITE_PAGES_STABLE)) || 958 (bch2_csum_type_is_encryption(op->csum_type) && 959 !(op->flags & BCH_WRITE_PAGES_OWNED))) { 960 dst = bch2_write_bio_alloc(c, wp, src, 961 &page_alloc_failed, 962 ec_buf); 963 bounce = true; 964 } 965 966 saved_iter = dst->bi_iter; 967 968 do { 969 struct bch_extent_crc_unpacked crc = { 0 }; 970 struct bversion version = op->version; 971 size_t dst_len = 0, src_len = 0; 972 973 if (page_alloc_failed && 974 dst->bi_iter.bi_size < (wp->sectors_free << 9) && 975 dst->bi_iter.bi_size < c->opts.encoded_extent_max) 976 break; 977 978 
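		/*
		 * If we're compressing, we must be writing into a bounce
		 * buffer, and pre-encoded data that's still encrypted can't be
		 * recompressed here:
		 */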
		BUG_ON(op->compression_opt &&
		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
		       bch2_csum_type_is_encryption(op->crc.csum_type));
		BUG_ON(op->compression_opt && !bounce);

		crc.compression_type = op->incompressible
			? BCH_COMPRESSION_TYPE_incompressible
			: op->compression_opt
			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
					    op->compression_opt)
			: 0;
		if (!crc_is_compressed(crc)) {
			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);

			if (op->csum_type)
				dst_len = min_t(unsigned, dst_len,
						c->opts.encoded_extent_max);

			if (bounce) {
				swap(dst->bi_iter.bi_size, dst_len);
				bio_copy_data(dst, src);
				swap(dst->bi_iter.bi_size, dst_len);
			}

			src_len = dst_len;
		}

		BUG_ON(!src_len || !dst_len);

		if (bch2_csum_type_is_encryption(op->csum_type)) {
			if (bversion_zero(version)) {
				version.lo = atomic64_inc_return(&c->key_version);
			} else {
				crc.nonce = op->nonce;
				op->nonce += src_len >> 9;
			}
		}

		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
		    !crc_is_compressed(crc) &&
		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
		    bch2_csum_type_is_encryption(op->csum_type)) {
			u8 compression_type = crc.compression_type;
			u16 nonce = crc.nonce;
			/*
			 * Note: when we're using rechecksum(), we need to be
			 * checksumming @src because it has all the data our
			 * existing checksum covers - if we bounced (because we
			 * were trying to compress), @dst will only have the
			 * part of the data the new checksum will cover.
			 *
			 * But normally we want to be checksumming post bounce,
			 * because part of the reason for bouncing is so the
			 * data can't be modified (by userspace) while it's in
			 * flight.
			 */
			if (bch2_rechecksum_bio(c, src, version, op->crc,
					&crc, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->csum_type))
				goto csum_err;
			/*
			 * bch2_rechecksum_bio() sets compression_type on crc from
			 * op->crc; this isn't always correct, as sometimes we're
			 * changing an extent from uncompressed to incompressible.
1045 */ 1046 crc.compression_type = compression_type; 1047 crc.nonce = nonce; 1048 } else { 1049 if ((op->flags & BCH_WRITE_DATA_ENCODED) && 1050 bch2_rechecksum_bio(c, src, version, op->crc, 1051 NULL, &op->crc, 1052 src_len >> 9, 1053 bio_sectors(src) - (src_len >> 9), 1054 op->crc.csum_type)) 1055 goto csum_err; 1056 1057 crc.compressed_size = dst_len >> 9; 1058 crc.uncompressed_size = src_len >> 9; 1059 crc.live_size = src_len >> 9; 1060 1061 swap(dst->bi_iter.bi_size, dst_len); 1062 ret = bch2_encrypt_bio(c, op->csum_type, 1063 extent_nonce(version, crc), dst); 1064 if (ret) 1065 goto err; 1066 1067 crc.csum = bch2_checksum_bio(c, op->csum_type, 1068 extent_nonce(version, crc), dst); 1069 crc.csum_type = op->csum_type; 1070 swap(dst->bi_iter.bi_size, dst_len); 1071 } 1072 1073 init_append_extent(op, wp, version, crc); 1074 1075 if (dst != src) 1076 bio_advance(dst, dst_len); 1077 bio_advance(src, src_len); 1078 total_output += dst_len; 1079 total_input += src_len; 1080 } while (dst->bi_iter.bi_size && 1081 src->bi_iter.bi_size && 1082 wp->sectors_free && 1083 !bch2_keylist_realloc(&op->insert_keys, 1084 op->inline_keys, 1085 ARRAY_SIZE(op->inline_keys), 1086 BKEY_EXTENT_U64s_MAX)); 1087 1088 more = src->bi_iter.bi_size != 0; 1089 1090 dst->bi_iter = saved_iter; 1091 1092 if (dst == src && more) { 1093 BUG_ON(total_output != total_input); 1094 1095 dst = bio_split(src, total_input >> 9, 1096 GFP_NOFS, &c->bio_write); 1097 wbio_init(dst)->put_bio = true; 1098 /* copy WRITE_SYNC flag */ 1099 dst->bi_opf = src->bi_opf; 1100 } 1101 1102 dst->bi_iter.bi_size = total_output; 1103 do_write: 1104 *_dst = dst; 1105 return more; 1106 csum_err: 1107 { 1108 struct printbuf buf = PRINTBUF; 1109 bch2_write_op_error(&buf, op); 1110 prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); 1111 bch_err_ratelimited(c, "%s", buf.buf); 1112 printbuf_exit(&buf); 1113 } 1114 1115 ret = -EIO; 1116 err: 1117 if (to_wbio(dst)->bounce) 1118 bch2_bio_free_pages_pool(c, dst); 1119 if (to_wbio(dst)->put_bio) 1120 bio_put(dst); 1121 1122 return ret; 1123 } 1124 1125 static bool bch2_extent_is_writeable(struct bch_write_op *op, 1126 struct bkey_s_c k) 1127 { 1128 struct bch_fs *c = op->c; 1129 struct bkey_s_c_extent e; 1130 struct extent_ptr_decoded p; 1131 const union bch_extent_entry *entry; 1132 unsigned replicas = 0; 1133 1134 if (k.k->type != KEY_TYPE_extent) 1135 return false; 1136 1137 e = bkey_s_c_to_extent(k); 1138 1139 rcu_read_lock(); 1140 extent_for_each_ptr_decode(e, p, entry) { 1141 if (crc_is_encoded(p.crc) || p.has_ec) { 1142 rcu_read_unlock(); 1143 return false; 1144 } 1145 1146 replicas += bch2_extent_ptr_durability(c, &p); 1147 } 1148 rcu_read_unlock(); 1149 1150 return replicas >= op->opts.data_replicas; 1151 } 1152 1153 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, 1154 struct btree_iter *iter, 1155 struct bkey_i *orig, 1156 struct bkey_s_c k, 1157 u64 new_i_size) 1158 { 1159 if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { 1160 /* trace this */ 1161 return 0; 1162 } 1163 1164 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); 1165 int ret = PTR_ERR_OR_ZERO(new); 1166 if (ret) 1167 return ret; 1168 1169 bch2_cut_front(bkey_start_pos(&orig->k), new); 1170 bch2_cut_back(orig->k.p, new); 1171 1172 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); 1173 bkey_for_each_ptr(ptrs, ptr) 1174 ptr->unwritten = 0; 1175 1176 /* 1177 * Note that we're not calling bch2_subvol_get_snapshot() in this path - 1178 * 
that was done when we kicked off the write, and here it's important 1179 * that we update the extent that we wrote to - even if a snapshot has 1180 * since been created. The write is still outstanding, so we're ok 1181 * w.r.t. snapshot atomicity: 1182 */ 1183 return bch2_extent_update_i_size_sectors(trans, iter, 1184 min(new->k.p.offset << 9, new_i_size), 0) ?: 1185 bch2_trans_update(trans, iter, new, 1186 BTREE_UPDATE_internal_snapshot_node); 1187 } 1188 1189 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) 1190 { 1191 struct bch_fs *c = op->c; 1192 struct btree_trans *trans = bch2_trans_get(c); 1193 1194 for_each_keylist_key(&op->insert_keys, orig) { 1195 int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, 1196 bkey_start_pos(&orig->k), orig->k.p, 1197 BTREE_ITER_intent, k, 1198 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 1199 bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); 1200 })); 1201 1202 if (ret && !bch2_err_matches(ret, EROFS)) { 1203 struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); 1204 1205 struct printbuf buf = PRINTBUF; 1206 bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); 1207 prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); 1208 bch_err_ratelimited(c, "%s", buf.buf); 1209 printbuf_exit(&buf); 1210 } 1211 1212 if (ret) { 1213 op->error = ret; 1214 break; 1215 } 1216 } 1217 1218 bch2_trans_put(trans); 1219 } 1220 1221 static void __bch2_nocow_write_done(struct bch_write_op *op) 1222 { 1223 if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { 1224 op->error = -EIO; 1225 } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) 1226 bch2_nocow_write_convert_unwritten(op); 1227 } 1228 1229 static CLOSURE_CALLBACK(bch2_nocow_write_done) 1230 { 1231 closure_type(op, struct bch_write_op, cl); 1232 1233 __bch2_nocow_write_done(op); 1234 bch2_write_done(cl); 1235 } 1236 1237 struct bucket_to_lock { 1238 struct bpos b; 1239 unsigned gen; 1240 struct nocow_lock_bucket *l; 1241 }; 1242 1243 static void bch2_nocow_write(struct bch_write_op *op) 1244 { 1245 struct bch_fs *c = op->c; 1246 struct btree_trans *trans; 1247 struct btree_iter iter; 1248 struct bkey_s_c k; 1249 DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; 1250 u32 snapshot; 1251 struct bucket_to_lock *stale_at; 1252 int stale, ret; 1253 1254 if (op->flags & BCH_WRITE_MOVE) 1255 return; 1256 1257 darray_init(&buckets); 1258 trans = bch2_trans_get(c); 1259 retry: 1260 bch2_trans_begin(trans); 1261 1262 ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); 1263 if (unlikely(ret)) 1264 goto err; 1265 1266 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1267 SPOS(op->pos.inode, op->pos.offset, snapshot), 1268 BTREE_ITER_slots); 1269 while (1) { 1270 struct bio *bio = &op->wbio.bio; 1271 1272 buckets.nr = 0; 1273 1274 ret = bch2_trans_relock(trans); 1275 if (ret) 1276 break; 1277 1278 k = bch2_btree_iter_peek_slot(&iter); 1279 ret = bkey_err(k); 1280 if (ret) 1281 break; 1282 1283 /* fall back to normal cow write path? 
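		 * (we do so if the extent is in a different snapshot, or if
		 * bch2_extent_is_writeable() says it can't safely be overwritten
		 * in place; __bch2_write() then continues with the COW path)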
*/ 1284 if (unlikely(k.k->p.snapshot != snapshot || 1285 !bch2_extent_is_writeable(op, k))) 1286 break; 1287 1288 if (bch2_keylist_realloc(&op->insert_keys, 1289 op->inline_keys, 1290 ARRAY_SIZE(op->inline_keys), 1291 k.k->u64s)) 1292 break; 1293 1294 /* Get iorefs before dropping btree locks: */ 1295 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1296 bkey_for_each_ptr(ptrs, ptr) { 1297 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1298 if (unlikely(!ca)) 1299 goto err_get_ioref; 1300 1301 struct bpos b = PTR_BUCKET_POS(ca, ptr); 1302 struct nocow_lock_bucket *l = 1303 bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); 1304 prefetch(l); 1305 1306 /* XXX allocating memory with btree locks held - rare */ 1307 darray_push_gfp(&buckets, ((struct bucket_to_lock) { 1308 .b = b, .gen = ptr->gen, .l = l, 1309 }), GFP_KERNEL|__GFP_NOFAIL); 1310 1311 if (ptr->unwritten) 1312 op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; 1313 } 1314 1315 /* Unlock before taking nocow locks, doing IO: */ 1316 bkey_reassemble(op->insert_keys.top, k); 1317 bch2_trans_unlock(trans); 1318 1319 bch2_cut_front(op->pos, op->insert_keys.top); 1320 if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) 1321 bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); 1322 1323 darray_for_each(buckets, i) { 1324 struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); 1325 1326 __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, 1327 bucket_to_u64(i->b), 1328 BUCKET_NOCOW_LOCK_UPDATE); 1329 1330 int gen = bucket_gen_get(ca, i->b.offset); 1331 stale = gen < 0 ? gen : gen_after(gen, i->gen); 1332 if (unlikely(stale)) { 1333 stale_at = i; 1334 goto err_bucket_stale; 1335 } 1336 } 1337 1338 bio = &op->wbio.bio; 1339 if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { 1340 bio = bio_split(bio, k.k->p.offset - op->pos.offset, 1341 GFP_KERNEL, &c->bio_write); 1342 wbio_init(bio)->put_bio = true; 1343 bio->bi_opf = op->wbio.bio.bi_opf; 1344 } else { 1345 op->flags |= BCH_WRITE_SUBMITTED; 1346 } 1347 1348 op->pos.offset += bio_sectors(bio); 1349 op->written += bio_sectors(bio); 1350 1351 bio->bi_end_io = bch2_write_endio; 1352 bio->bi_private = &op->cl; 1353 bio->bi_opf |= REQ_OP_WRITE; 1354 closure_get(&op->cl); 1355 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, 1356 op->insert_keys.top, true); 1357 1358 bch2_keylist_push(&op->insert_keys); 1359 if (op->flags & BCH_WRITE_SUBMITTED) 1360 break; 1361 bch2_btree_iter_advance(&iter); 1362 } 1363 out: 1364 bch2_trans_iter_exit(trans, &iter); 1365 err: 1366 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1367 goto retry; 1368 1369 bch2_trans_put(trans); 1370 darray_exit(&buckets); 1371 1372 if (ret) { 1373 struct printbuf buf = PRINTBUF; 1374 bch2_write_op_error(&buf, op); 1375 prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); 1376 bch_err_ratelimited(c, "%s", buf.buf); 1377 printbuf_exit(&buf); 1378 op->error = ret; 1379 op->flags |= BCH_WRITE_SUBMITTED; 1380 } 1381 1382 /* fallback to cow write path? 
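	 * (if the nocow path didn't submit everything, wait on the IO we did
	 * submit, then return with BCH_WRITE_SUBMITTED unset so that
	 * __bch2_write() continues with the normal COW path)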
*/ 1383 if (!(op->flags & BCH_WRITE_SUBMITTED)) { 1384 closure_sync(&op->cl); 1385 __bch2_nocow_write_done(op); 1386 op->insert_keys.top = op->insert_keys.keys; 1387 } else if (op->flags & BCH_WRITE_SYNC) { 1388 closure_sync(&op->cl); 1389 bch2_nocow_write_done(&op->cl.work); 1390 } else { 1391 /* 1392 * XXX 1393 * needs to run out of process context because ei_quota_lock is 1394 * a mutex 1395 */ 1396 continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); 1397 } 1398 return; 1399 err_get_ioref: 1400 darray_for_each(buckets, i) 1401 percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); 1402 1403 /* Fall back to COW path: */ 1404 goto out; 1405 err_bucket_stale: 1406 darray_for_each(buckets, i) { 1407 bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); 1408 if (i == stale_at) 1409 break; 1410 } 1411 1412 struct printbuf buf = PRINTBUF; 1413 if (bch2_fs_inconsistent_on(stale < 0, c, 1414 "pointer to invalid bucket in nocow path on device %llu\n %s", 1415 stale_at->b.inode, 1416 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1417 ret = -EIO; 1418 } else { 1419 /* We can retry this: */ 1420 ret = -BCH_ERR_transaction_restart; 1421 } 1422 printbuf_exit(&buf); 1423 1424 goto err_get_ioref; 1425 } 1426 1427 static void __bch2_write(struct bch_write_op *op) 1428 { 1429 struct bch_fs *c = op->c; 1430 struct write_point *wp = NULL; 1431 struct bio *bio = NULL; 1432 unsigned nofs_flags; 1433 int ret; 1434 1435 nofs_flags = memalloc_nofs_save(); 1436 1437 if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { 1438 bch2_nocow_write(op); 1439 if (op->flags & BCH_WRITE_SUBMITTED) 1440 goto out_nofs_restore; 1441 } 1442 again: 1443 memset(&op->failed, 0, sizeof(op->failed)); 1444 1445 do { 1446 struct bkey_i *key_to_write; 1447 unsigned key_to_write_offset = op->insert_keys.top_p - 1448 op->insert_keys.keys_p; 1449 1450 /* +1 for possible cache device: */ 1451 if (op->open_buckets.nr + op->nr_replicas + 1 > 1452 ARRAY_SIZE(op->open_buckets.v)) 1453 break; 1454 1455 if (bch2_keylist_realloc(&op->insert_keys, 1456 op->inline_keys, 1457 ARRAY_SIZE(op->inline_keys), 1458 BKEY_EXTENT_U64s_MAX)) 1459 break; 1460 1461 /* 1462 * The copygc thread is now global, which means it's no longer 1463 * freeing up space on specific disks, which means that 1464 * allocations for specific disks may hang arbitrarily long: 1465 */ 1466 ret = bch2_trans_run(c, lockrestart_do(trans, 1467 bch2_alloc_sectors_start_trans(trans, 1468 op->target, 1469 op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), 1470 op->write_point, 1471 &op->devs_have, 1472 op->nr_replicas, 1473 op->nr_replicas_required, 1474 op->watermark, 1475 op->flags, 1476 &op->cl, &wp))); 1477 if (unlikely(ret)) { 1478 if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) 1479 break; 1480 1481 goto err; 1482 } 1483 1484 EBUG_ON(!wp); 1485 1486 bch2_open_bucket_get(c, wp, &op->open_buckets); 1487 ret = bch2_write_extent(op, wp, &bio); 1488 1489 bch2_alloc_sectors_done_inlined(c, wp); 1490 err: 1491 if (ret <= 0) { 1492 op->flags |= BCH_WRITE_SUBMITTED; 1493 1494 if (unlikely(ret < 0)) { 1495 if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { 1496 struct printbuf buf = PRINTBUF; 1497 bch2_write_op_error(&buf, op); 1498 prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); 1499 bch_err_ratelimited(c, "%s", buf.buf); 1500 printbuf_exit(&buf); 1501 } 1502 op->error = ret; 1503 break; 1504 } 1505 } 1506 1507 bio->bi_end_io = bch2_write_endio; 1508 bio->bi_private = &op->cl; 1509 bio->bi_opf |= REQ_OP_WRITE; 1510 1511 
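		/*
		 * Take a ref on the write op's closure for the bio we're about
		 * to submit; bch2_write_endio() drops it with closure_put()
		 * once all the replica bios have completed:
		 */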
		closure_get(bio->bi_private);

		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_SUBMITTED) &&
	     !(op->flags & BCH_WRITE_IN_WORKER))) {
		bch2_wait_on_allocator(c, &op->cl);

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_SUBMITTED))
			goto again;
		bch2_write_done(&op->cl);
	} else {
		bch2_write_queue(op, wp);
		continue_at(&op->cl, bch2_write_index, NULL);
	}
out_nofs_restore:
	memalloc_nofs_restore(nofs_flags);
}

static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
	struct bio *bio = &op->wbio.bio;
	struct bvec_iter iter;
	struct bkey_i_inline_data *id;
	unsigned sectors;
	int ret;

	memset(&op->failed, 0, sizeof(op->failed));

	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
	op->flags |= BCH_WRITE_SUBMITTED;

	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
				   ARRAY_SIZE(op->inline_keys),
				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
	if (ret) {
		op->error = ret;
		goto err;
	}

	sectors = bio_sectors(bio);
	op->pos.offset += sectors;

	id = bkey_inline_data_init(op->insert_keys.top);
	id->k.p = op->pos;
	id->k.bversion = op->version;
	id->k.size = sectors;

	iter = bio->bi_iter;
	iter.bi_size = data_len;
	memcpy_from_bio(id->v.data, bio, iter);

	while (data_len & 7)
		id->v.data[data_len++] = '\0';
	set_bkey_val_bytes(&id->k, data_len);
	bch2_keylist_push(&op->insert_keys);

	__bch2_write_index(op);
err:
	bch2_write_done(&op->cl);
}

/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl: &bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
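 *
 * Completion is signalled by calling op->end_io(), if set; errors are recorded
 * in op->error rather than returned.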
1606 */ 1607 CLOSURE_CALLBACK(bch2_write) 1608 { 1609 closure_type(op, struct bch_write_op, cl); 1610 struct bio *bio = &op->wbio.bio; 1611 struct bch_fs *c = op->c; 1612 unsigned data_len; 1613 1614 EBUG_ON(op->cl.parent); 1615 BUG_ON(!op->nr_replicas); 1616 BUG_ON(!op->write_point.v); 1617 BUG_ON(bkey_eq(op->pos, POS_MAX)); 1618 1619 if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) 1620 op->flags |= BCH_WRITE_ALLOC_NOWAIT; 1621 1622 op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); 1623 op->start_time = local_clock(); 1624 bch2_keylist_init(&op->insert_keys, op->inline_keys); 1625 wbio_init(bio)->put_bio = false; 1626 1627 if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { 1628 struct printbuf buf = PRINTBUF; 1629 bch2_write_op_error(&buf, op); 1630 prt_printf(&buf, "misaligned write"); 1631 printbuf_exit(&buf); 1632 op->error = -EIO; 1633 goto err; 1634 } 1635 1636 if (c->opts.nochanges) { 1637 op->error = -BCH_ERR_erofs_no_writes; 1638 goto err; 1639 } 1640 1641 if (!(op->flags & BCH_WRITE_MOVE) && 1642 !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { 1643 op->error = -BCH_ERR_erofs_no_writes; 1644 goto err; 1645 } 1646 1647 this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); 1648 bch2_increment_clock(c, bio_sectors(bio), WRITE); 1649 1650 data_len = min_t(u64, bio->bi_iter.bi_size, 1651 op->new_i_size - (op->pos.offset << 9)); 1652 1653 if (c->opts.inline_data && 1654 data_len <= min(block_bytes(c) / 2, 1024U)) { 1655 bch2_write_data_inline(op, data_len); 1656 return; 1657 } 1658 1659 __bch2_write(op); 1660 return; 1661 err: 1662 bch2_disk_reservation_put(c, &op->res); 1663 1664 closure_debug_destroy(&op->cl); 1665 if (op->end_io) 1666 op->end_io(op); 1667 } 1668 1669 static const char * const bch2_write_flags[] = { 1670 #define x(f) #f, 1671 BCH_WRITE_FLAGS() 1672 #undef x 1673 NULL 1674 }; 1675 1676 void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) 1677 { 1678 prt_str(out, "pos: "); 1679 bch2_bpos_to_text(out, op->pos); 1680 prt_newline(out); 1681 printbuf_indent_add(out, 2); 1682 1683 prt_str(out, "started: "); 1684 bch2_pr_time_units(out, local_clock() - op->start_time); 1685 prt_newline(out); 1686 1687 prt_str(out, "flags: "); 1688 prt_bitflags(out, bch2_write_flags, op->flags); 1689 prt_newline(out); 1690 1691 prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); 1692 1693 printbuf_indent_sub(out, 2); 1694 } 1695 1696 void bch2_fs_io_write_exit(struct bch_fs *c) 1697 { 1698 mempool_exit(&c->bio_bounce_pages); 1699 bioset_exit(&c->replica_set); 1700 bioset_exit(&c->bio_write); 1701 } 1702 1703 int bch2_fs_io_write_init(struct bch_fs *c) 1704 { 1705 if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || 1706 bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) 1707 return -BCH_ERR_ENOMEM_bio_write_init; 1708 1709 if (mempool_init_page_pool(&c->bio_bounce_pages, 1710 max_t(unsigned, 1711 c->opts.btree_node_size, 1712 c->opts.encoded_extent_max) / 1713 PAGE_SIZE, 0)) 1714 return -BCH_ERR_ENOMEM_bio_bounce_pages_init; 1715 1716 return 0; 1717 } 1718