1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 4 * 5 * Code for managing the extent btree and dynamically updating the writeback 6 * dirty sector count. 7 */ 8 9 #include "bcachefs.h" 10 #include "bkey_methods.h" 11 #include "btree_cache.h" 12 #include "btree_gc.h" 13 #include "btree_io.h" 14 #include "btree_iter.h" 15 #include "buckets.h" 16 #include "checksum.h" 17 #include "compress.h" 18 #include "debug.h" 19 #include "disk_groups.h" 20 #include "error.h" 21 #include "extents.h" 22 #include "inode.h" 23 #include "journal.h" 24 #include "rebalance.h" 25 #include "replicas.h" 26 #include "super.h" 27 #include "super-io.h" 28 #include "trace.h" 29 #include "util.h" 30 31 static unsigned bch2_crc_field_size_max[] = { 32 [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, 33 [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, 34 [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, 35 }; 36 37 static void bch2_extent_crc_pack(union bch_extent_crc *, 38 struct bch_extent_crc_unpacked, 39 enum bch_extent_entry_type); 40 41 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, 42 unsigned dev) 43 { 44 struct bch_dev_io_failures *i; 45 46 for (i = f->devs; i < f->devs + f->nr; i++) 47 if (i->dev == dev) 48 return i; 49 50 return NULL; 51 } 52 53 void bch2_mark_io_failure(struct bch_io_failures *failed, 54 struct extent_ptr_decoded *p) 55 { 56 struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); 57 58 if (!f) { 59 BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); 60 61 f = &failed->devs[failed->nr++]; 62 f->dev = p->ptr.dev; 63 f->idx = p->idx; 64 f->nr_failed = 1; 65 f->nr_retries = 0; 66 } else if (p->idx != f->idx) { 67 f->idx = p->idx; 68 f->nr_failed = 1; 69 f->nr_retries = 0; 70 } else { 71 f->nr_failed++; 72 } 73 } 74 75 static inline u64 dev_latency(struct bch_fs *c, unsigned dev) 76 { 77 struct bch_dev *ca = bch2_dev_rcu(c, dev); 78 return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; 79 } 80 81 /* 82 * returns true if p1 is better than p2: 83 */ 84 static inline bool ptr_better(struct bch_fs *c, 85 const struct extent_ptr_decoded p1, 86 const struct extent_ptr_decoded p2) 87 { 88 if (likely(!p1.idx && !p2.idx)) { 89 u64 l1 = dev_latency(c, p1.ptr.dev); 90 u64 l2 = dev_latency(c, p2.ptr.dev); 91 92 /* 93 * Square the latencies, to bias more in favor of the faster 94 * device - we never want to stop issuing reads to the slower 95 * device altogether, so that we can update our latency numbers: 96 */ 97 l1 *= l1; 98 l2 *= l2; 99 100 /* Pick at random, biased in favor of the faster device: */ 101 102 return bch2_rand_range(l1 + l2) > l1; 103 } 104 105 if (bch2_force_reconstruct_read) 106 return p1.idx > p2.idx; 107 108 return p1.idx < p2.idx; 109 } 110 111 /* 112 * This picks a non-stale pointer, preferably from a device other than @avoid. 113 * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to 114 * other devices, it will still pick a pointer from avoid. 115 */ 116 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, 117 struct bch_io_failures *failed, 118 struct extent_ptr_decoded *pick) 119 { 120 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 121 const union bch_extent_entry *entry; 122 struct extent_ptr_decoded p; 123 struct bch_dev_io_failures *f; 124 int ret = 0; 125 126 if (k.k->type == KEY_TYPE_error) 127 return -BCH_ERR_key_type_error; 128 129 rcu_read_lock(); 130 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 131 /* 132 * Unwritten extent: no need to actually read, treat it as a 133 * hole and return 0s: 134 */ 135 if (p.ptr.unwritten) { 136 ret = 0; 137 break; 138 } 139 140 /* 141 * If there are any dirty pointers it's an error if we can't 142 * read: 143 */ 144 if (!ret && !p.ptr.cached) 145 ret = -BCH_ERR_no_device_to_read_from; 146 147 struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); 148 149 if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) 150 continue; 151 152 f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; 153 if (f) 154 p.idx = f->nr_failed < f->nr_retries 155 ? f->idx 156 : f->idx + 1; 157 158 if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) 159 p.idx++; 160 161 if (!p.idx && p.has_ec && bch2_force_reconstruct_read) 162 p.idx++; 163 164 if (p.idx > (unsigned) p.has_ec) 165 continue; 166 167 if (ret > 0 && !ptr_better(c, p, *pick)) 168 continue; 169 170 *pick = p; 171 ret = 1; 172 } 173 rcu_read_unlock(); 174 175 return ret; 176 } 177 178 /* KEY_TYPE_btree_ptr: */ 179 180 int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, 181 struct bkey_validate_context from) 182 { 183 int ret = 0; 184 185 bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, 186 c, btree_ptr_val_too_big, 187 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); 188 189 ret = bch2_bkey_ptrs_validate(c, k, from); 190 fsck_err: 191 return ret; 192 } 193 194 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, 195 struct bkey_s_c k) 196 { 197 bch2_bkey_ptrs_to_text(out, c, k); 198 } 199 200 int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, 201 struct bkey_validate_context from) 202 { 203 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); 204 int ret = 0; 205 206 bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, 207 c, btree_ptr_v2_val_too_big, 208 "value too big (%zu > %zu)", 209 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); 210 211 bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), 212 c, btree_ptr_v2_min_key_bad, 213 "min_key > key"); 214 215 if ((from.flags & BCH_VALIDATE_write) && 216 c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) 217 bkey_fsck_err_on(!bp.v->sectors_written, 218 c, btree_ptr_v2_written_0, 219 "sectors_written == 0"); 220 221 ret = bch2_bkey_ptrs_validate(c, k, from); 222 fsck_err: 223 return ret; 224 } 225 226 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, 227 struct bkey_s_c k) 228 { 229 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); 230 231 prt_printf(out, "seq %llx written %u min_key %s", 232 le64_to_cpu(bp.v->seq), 233 le16_to_cpu(bp.v->sectors_written), 234 BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); 235 236 bch2_bpos_to_text(out, bp.v->min_key); 237 prt_printf(out, " "); 238 bch2_bkey_ptrs_to_text(out, c, k); 239 } 240 241 void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, 242 unsigned big_endian, int write, 243 struct bkey_s k) 244 { 245 struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); 246 247 compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); 248 249 if (version < bcachefs_metadata_version_inode_btree_change && 250 btree_id_is_extents(btree_id) && 251 !bkey_eq(bp.v->min_key, POS_MIN)) 252 bp.v->min_key = write 253 ? bpos_nosnap_predecessor(bp.v->min_key) 254 : bpos_nosnap_successor(bp.v->min_key); 255 } 256 257 /* KEY_TYPE_extent: */ 258 259 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) 260 { 261 struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); 262 struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); 263 union bch_extent_entry *en_l; 264 const union bch_extent_entry *en_r; 265 struct extent_ptr_decoded lp, rp; 266 bool use_right_ptr; 267 268 en_l = l_ptrs.start; 269 en_r = r_ptrs.start; 270 while (en_l < l_ptrs.end && en_r < r_ptrs.end) { 271 if (extent_entry_type(en_l) != extent_entry_type(en_r)) 272 return false; 273 274 en_l = extent_entry_next(en_l); 275 en_r = extent_entry_next(en_r); 276 } 277 278 if (en_l < l_ptrs.end || en_r < r_ptrs.end) 279 return false; 280 281 en_l = l_ptrs.start; 282 en_r = r_ptrs.start; 283 lp.crc = bch2_extent_crc_unpack(l.k, NULL); 284 rp.crc = bch2_extent_crc_unpack(r.k, NULL); 285 286 while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && 287 __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { 288 if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != 289 rp.ptr.offset + rp.crc.offset || 290 lp.ptr.dev != rp.ptr.dev || 291 lp.ptr.gen != rp.ptr.gen || 292 lp.ptr.unwritten != rp.ptr.unwritten || 293 lp.has_ec != rp.has_ec) 294 return false; 295 296 /* Extents may not straddle buckets: */ 297 rcu_read_lock(); 298 struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); 299 bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); 300 rcu_read_unlock(); 301 302 if (!same_bucket) 303 return false; 304 305 if (lp.has_ec != rp.has_ec || 306 (lp.has_ec && 307 (lp.ec.block != rp.ec.block || 308 lp.ec.redundancy != rp.ec.redundancy || 309 lp.ec.idx != rp.ec.idx))) 310 return false; 311 312 if (lp.crc.compression_type != rp.crc.compression_type || 313 lp.crc.nonce != rp.crc.nonce) 314 return false; 315 316 if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= 317 lp.crc.uncompressed_size) { 318 /* can use left extent's crc entry */ 319 } else if (lp.crc.live_size <= rp.crc.offset) { 320 /* can use right extent's crc entry */ 321 } else { 322 /* check if checksums can be merged: */ 323 if (lp.crc.csum_type != rp.crc.csum_type || 324 lp.crc.nonce != rp.crc.nonce || 325 crc_is_compressed(lp.crc) || 326 !bch2_checksum_mergeable(lp.crc.csum_type)) 327 return false; 328 329 if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || 330 rp.crc.offset) 331 return false; 332 333 if (lp.crc.csum_type && 334 lp.crc.uncompressed_size + 335 rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) 336 return false; 337 } 338 339 en_l = extent_entry_next(en_l); 340 en_r = extent_entry_next(en_r); 341 } 342 343 en_l = l_ptrs.start; 344 en_r = r_ptrs.start; 345 while (en_l < l_ptrs.end && en_r < r_ptrs.end) { 346 if (extent_entry_is_crc(en_l)) { 347 struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); 348 struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); 349 350 if (crc_l.uncompressed_size + crc_r.uncompressed_size > 351 bch2_crc_field_size_max[extent_entry_type(en_l)]) 352 return false; 353 } 354 355 en_l = extent_entry_next(en_l); 356 en_r = extent_entry_next(en_r); 357 } 358 359 use_right_ptr = false; 360 en_l = l_ptrs.start; 361 en_r = r_ptrs.start; 362 while (en_l < l_ptrs.end) { 363 if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && 364 use_right_ptr) 365 en_l->ptr = en_r->ptr; 366 367 if (extent_entry_is_crc(en_l)) { 368 struct bch_extent_crc_unpacked crc_l = 369 bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); 370 struct bch_extent_crc_unpacked crc_r = 371 bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); 372 373 use_right_ptr = false; 374 375 if (crc_l.offset + crc_l.live_size + crc_r.live_size <= 376 crc_l.uncompressed_size) { 377 /* can use left extent's crc entry */ 378 } else if (crc_l.live_size <= crc_r.offset) { 379 /* can use right extent's crc entry */ 380 crc_r.offset -= crc_l.live_size; 381 bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, 382 extent_entry_type(en_l)); 383 use_right_ptr = true; 384 } else { 385 crc_l.csum = bch2_checksum_merge(crc_l.csum_type, 386 crc_l.csum, 387 crc_r.csum, 388 crc_r.uncompressed_size << 9); 389 390 crc_l.uncompressed_size += crc_r.uncompressed_size; 391 crc_l.compressed_size += crc_r.compressed_size; 392 bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, 393 extent_entry_type(en_l)); 394 } 395 } 396 397 en_l = extent_entry_next(en_l); 398 en_r = extent_entry_next(en_r); 399 } 400 401 bch2_key_resize(l.k, l.k->size + r.k->size); 402 return true; 403 } 404 405 /* KEY_TYPE_reservation: */ 406 407 int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, 408 struct bkey_validate_context from) 409 { 410 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); 411 int ret = 0; 412 413 bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, 414 c, reservation_key_nr_replicas_invalid, 415 "invalid nr_replicas (%u)", r.v->nr_replicas); 416 fsck_err: 417 return ret; 418 } 419 420 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, 421 struct bkey_s_c k) 422 { 423 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); 424 425 prt_printf(out, "generation %u replicas %u", 426 le32_to_cpu(r.v->generation), 427 r.v->nr_replicas); 428 } 429 430 bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) 431 { 432 struct bkey_s_reservation l = bkey_s_to_reservation(_l); 433 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); 434 435 if (l.v->generation != r.v->generation || 436 l.v->nr_replicas != r.v->nr_replicas) 437 return false; 438 439 bch2_key_resize(l.k, l.k->size + r.k->size); 440 return true; 441 } 442 443 /* Extent checksum entries: */ 444 445 /* returns true if not equal */ 446 static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, 447 struct bch_extent_crc_unpacked r) 448 { 449 return (l.csum_type != r.csum_type || 450 l.compression_type != r.compression_type || 451 l.compressed_size != r.compressed_size || 452 l.uncompressed_size != r.uncompressed_size || 453 l.offset != r.offset || 454 l.live_size != r.live_size || 455 l.nonce != r.nonce || 456 bch2_crc_cmp(l.csum, r.csum)); 457 } 458 459 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, 460 struct bch_extent_crc_unpacked n) 461 { 462 return !crc_is_compressed(u) && 463 u.csum_type && 464 u.uncompressed_size > u.live_size && 465 bch2_csum_type_is_encryption(u.csum_type) == 466 bch2_csum_type_is_encryption(n.csum_type); 467 } 468 469 bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, 470 struct bch_extent_crc_unpacked n) 471 { 472 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 473 struct bch_extent_crc_unpacked crc; 474 const union bch_extent_entry *i; 475 476 if (!n.csum_type) 477 return false; 478 479 bkey_for_each_crc(k.k, ptrs, crc, i) 480 if (can_narrow_crc(crc, n)) 481 return true; 482 483 return false; 484 } 485 486 /* 487 * We're writing another replica for this extent, so while we've got the data in 488 * memory we'll be computing a new checksum for the currently live data. 489 * 490 * If there are other replicas we aren't moving, and they are checksummed but 491 * not compressed, we can modify them to point to only the data that is 492 * currently live (so that readers won't have to bounce) while we've got the 493 * checksum we need: 494 */ 495 bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) 496 { 497 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); 498 struct bch_extent_crc_unpacked u; 499 struct extent_ptr_decoded p; 500 union bch_extent_entry *i; 501 bool ret = false; 502 503 /* Find a checksum entry that covers only live data: */ 504 if (!n.csum_type) { 505 bkey_for_each_crc(&k->k, ptrs, u, i) 506 if (!crc_is_compressed(u) && 507 u.csum_type && 508 u.live_size == u.uncompressed_size) { 509 n = u; 510 goto found; 511 } 512 return false; 513 } 514 found: 515 BUG_ON(crc_is_compressed(n)); 516 BUG_ON(n.offset); 517 BUG_ON(n.live_size != k->k.size); 518 519 restart_narrow_pointers: 520 ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); 521 522 bkey_for_each_ptr_decode(&k->k, ptrs, p, i) 523 if (can_narrow_crc(p.crc, n)) { 524 bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); 525 p.ptr.offset += p.crc.offset; 526 p.crc = n; 527 bch2_extent_ptr_decoded_append(k, &p); 528 ret = true; 529 goto restart_narrow_pointers; 530 } 531 532 return ret; 533 } 534 535 static void bch2_extent_crc_pack(union bch_extent_crc *dst, 536 struct bch_extent_crc_unpacked src, 537 enum bch_extent_entry_type type) 538 { 539 #define set_common_fields(_dst, _src) \ 540 _dst.type = 1 << type; \ 541 _dst.csum_type = _src.csum_type, \ 542 _dst.compression_type = _src.compression_type, \ 543 _dst._compressed_size = _src.compressed_size - 1, \ 544 _dst._uncompressed_size = _src.uncompressed_size - 1, \ 545 _dst.offset = _src.offset 546 547 switch (type) { 548 case BCH_EXTENT_ENTRY_crc32: 549 set_common_fields(dst->crc32, src); 550 dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); 551 break; 552 case BCH_EXTENT_ENTRY_crc64: 553 set_common_fields(dst->crc64, src); 554 dst->crc64.nonce = src.nonce; 555 dst->crc64.csum_lo = (u64 __force) src.csum.lo; 556 dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); 557 break; 558 case BCH_EXTENT_ENTRY_crc128: 559 set_common_fields(dst->crc128, src); 560 dst->crc128.nonce = src.nonce; 561 dst->crc128.csum = src.csum; 562 break; 563 default: 564 BUG(); 565 } 566 #undef set_common_fields 567 } 568 569 void bch2_extent_crc_append(struct bkey_i *k, 570 struct bch_extent_crc_unpacked new) 571 { 572 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); 573 union bch_extent_crc *crc = (void *) ptrs.end; 574 enum bch_extent_entry_type type; 575 576 if (bch_crc_bytes[new.csum_type] <= 4 && 577 new.uncompressed_size <= CRC32_SIZE_MAX && 578 new.nonce <= CRC32_NONCE_MAX) 579 type = BCH_EXTENT_ENTRY_crc32; 580 else if (bch_crc_bytes[new.csum_type] <= 10 && 581 new.uncompressed_size <= CRC64_SIZE_MAX && 582 new.nonce <= CRC64_NONCE_MAX) 583 type = BCH_EXTENT_ENTRY_crc64; 584 else if (bch_crc_bytes[new.csum_type] <= 16 && 585 new.uncompressed_size <= CRC128_SIZE_MAX && 586 new.nonce <= CRC128_NONCE_MAX) 587 type = BCH_EXTENT_ENTRY_crc128; 588 else 589 BUG(); 590 591 bch2_extent_crc_pack(crc, new, type); 592 593 k->k.u64s += extent_entry_u64s(ptrs.end); 594 595 EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); 596 } 597 598 /* Generic code for keys with pointers: */ 599 600 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) 601 { 602 return bch2_bkey_devs(k).nr; 603 } 604 605 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) 606 { 607 return k.k->type == KEY_TYPE_reservation 608 ? bkey_s_c_to_reservation(k).v->nr_replicas 609 : bch2_bkey_dirty_devs(k).nr; 610 } 611 612 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) 613 { 614 unsigned ret = 0; 615 616 if (k.k->type == KEY_TYPE_reservation) { 617 ret = bkey_s_c_to_reservation(k).v->nr_replicas; 618 } else { 619 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 620 const union bch_extent_entry *entry; 621 struct extent_ptr_decoded p; 622 623 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 624 ret += !p.ptr.cached && !crc_is_compressed(p.crc); 625 } 626 627 return ret; 628 } 629 630 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) 631 { 632 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 633 const union bch_extent_entry *entry; 634 struct extent_ptr_decoded p; 635 unsigned ret = 0; 636 637 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 638 if (!p.ptr.cached && crc_is_compressed(p.crc)) 639 ret += p.crc.compressed_size; 640 641 return ret; 642 } 643 644 bool bch2_bkey_is_incompressible(struct bkey_s_c k) 645 { 646 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 647 const union bch_extent_entry *entry; 648 struct bch_extent_crc_unpacked crc; 649 650 bkey_for_each_crc(k.k, ptrs, crc, entry) 651 if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) 652 return true; 653 return false; 654 } 655 656 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) 657 { 658 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 659 const union bch_extent_entry *entry; 660 struct extent_ptr_decoded p = { 0 }; 661 unsigned replicas = 0; 662 663 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 664 if (p.ptr.cached) 665 continue; 666 667 if (p.has_ec) 668 replicas += p.ec.redundancy; 669 670 replicas++; 671 672 } 673 674 return replicas; 675 } 676 677 static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) 678 { 679 if (p->ptr.cached) 680 return 0; 681 682 return p->has_ec 683 ? p->ec.redundancy + 1 684 : ca->mi.durability; 685 } 686 687 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) 688 { 689 struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); 690 691 return ca ? __extent_ptr_durability(ca, p) : 0; 692 } 693 694 unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) 695 { 696 struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); 697 698 if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) 699 return 0; 700 701 return __extent_ptr_durability(ca, p); 702 } 703 704 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) 705 { 706 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 707 const union bch_extent_entry *entry; 708 struct extent_ptr_decoded p; 709 unsigned durability = 0; 710 711 rcu_read_lock(); 712 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 713 durability += bch2_extent_ptr_durability(c, &p); 714 rcu_read_unlock(); 715 716 return durability; 717 } 718 719 static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) 720 { 721 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 722 const union bch_extent_entry *entry; 723 struct extent_ptr_decoded p; 724 unsigned durability = 0; 725 726 rcu_read_lock(); 727 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 728 if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) 729 durability += bch2_extent_ptr_durability(c, &p); 730 rcu_read_unlock(); 731 732 return durability; 733 } 734 735 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) 736 { 737 union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); 738 union bch_extent_entry *next = extent_entry_next(entry); 739 740 memmove_u64s(entry, next, (u64 *) end - (u64 *) next); 741 k->k.u64s -= extent_entry_u64s(entry); 742 } 743 744 void bch2_extent_ptr_decoded_append(struct bkey_i *k, 745 struct extent_ptr_decoded *p) 746 { 747 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); 748 struct bch_extent_crc_unpacked crc = 749 bch2_extent_crc_unpack(&k->k, NULL); 750 union bch_extent_entry *pos; 751 752 if (!bch2_crc_unpacked_cmp(crc, p->crc)) { 753 pos = ptrs.start; 754 goto found; 755 } 756 757 bkey_for_each_crc(&k->k, ptrs, crc, pos) 758 if (!bch2_crc_unpacked_cmp(crc, p->crc)) { 759 pos = extent_entry_next(pos); 760 goto found; 761 } 762 763 bch2_extent_crc_append(k, p->crc); 764 pos = bkey_val_end(bkey_i_to_s(k)); 765 found: 766 p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; 767 __extent_entry_insert(k, pos, to_entry(&p->ptr)); 768 769 if (p->has_ec) { 770 p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; 771 __extent_entry_insert(k, pos, to_entry(&p->ec)); 772 } 773 } 774 775 static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, 776 union bch_extent_entry *entry) 777 { 778 union bch_extent_entry *i = ptrs.start; 779 780 if (i == entry) 781 return NULL; 782 783 while (extent_entry_next(i) != entry) 784 i = extent_entry_next(i); 785 return i; 786 } 787 788 /* 789 * Returns pointer to the next entry after the one being dropped: 790 */ 791 void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) 792 { 793 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); 794 union bch_extent_entry *entry = to_entry(ptr), *next; 795 bool drop_crc = true; 796 797 if (k.k->type == KEY_TYPE_stripe) { 798 ptr->dev = BCH_SB_MEMBER_INVALID; 799 return; 800 } 801 802 EBUG_ON(ptr < &ptrs.start->ptr || 803 ptr >= &ptrs.end->ptr); 804 EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); 805 806 for (next = extent_entry_next(entry); 807 next != ptrs.end; 808 next = extent_entry_next(next)) { 809 if (extent_entry_is_crc(next)) { 810 break; 811 } else if (extent_entry_is_ptr(next)) { 812 drop_crc = false; 813 break; 814 } 815 } 816 817 extent_entry_drop(k, entry); 818 819 while ((entry = extent_entry_prev(ptrs, entry))) { 820 if (extent_entry_is_ptr(entry)) 821 break; 822 823 if ((extent_entry_is_crc(entry) && drop_crc) || 824 extent_entry_is_stripe_ptr(entry)) 825 extent_entry_drop(k, entry); 826 } 827 } 828 829 void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) 830 { 831 if (k.k->type != KEY_TYPE_stripe) { 832 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); 833 const union bch_extent_entry *entry; 834 struct extent_ptr_decoded p; 835 836 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 837 if (p.ptr.dev == ptr->dev && p.has_ec) { 838 ptr->dev = BCH_SB_MEMBER_INVALID; 839 return; 840 } 841 } 842 843 bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; 844 845 bch2_bkey_drop_ptr_noerror(k, ptr); 846 847 /* 848 * If we deleted all the dirty pointers and there's still cached 849 * pointers, we could set the cached pointers to dirty if they're not 850 * stale - but to do that correctly we'd need to grab an open_bucket 851 * reference so that we don't race with bucket reuse: 852 */ 853 if (have_dirty && 854 !bch2_bkey_dirty_devs(k.s_c).nr) { 855 k.k->type = KEY_TYPE_error; 856 set_bkey_val_u64s(k.k, 0); 857 } else if (!bch2_bkey_nr_ptrs(k.s_c)) { 858 k.k->type = KEY_TYPE_deleted; 859 set_bkey_val_u64s(k.k, 0); 860 } 861 } 862 863 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) 864 { 865 bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); 866 } 867 868 void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) 869 { 870 bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); 871 } 872 873 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) 874 { 875 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 876 877 bkey_for_each_ptr(ptrs, ptr) 878 if (ptr->dev == dev) 879 return ptr; 880 881 return NULL; 882 } 883 884 bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) 885 { 886 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 887 struct bch_dev *ca; 888 bool ret = false; 889 890 rcu_read_lock(); 891 bkey_for_each_ptr(ptrs, ptr) 892 if (bch2_dev_in_target(c, ptr->dev, target) && 893 (ca = bch2_dev_rcu(c, ptr->dev)) && 894 (!ptr->cached || 895 !dev_ptr_stale_rcu(ca, ptr))) { 896 ret = true; 897 break; 898 } 899 rcu_read_unlock(); 900 901 return ret; 902 } 903 904 bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, 905 struct bch_extent_ptr m, u64 offset) 906 { 907 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 908 const union bch_extent_entry *entry; 909 struct extent_ptr_decoded p; 910 911 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 912 if (p.ptr.dev == m.dev && 913 p.ptr.gen == m.gen && 914 (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == 915 (s64) m.offset - offset) 916 return true; 917 918 return false; 919 } 920 921 /* 922 * Returns true if two extents refer to the same data: 923 */ 924 bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) 925 { 926 if (k1.k->type != k2.k->type) 927 return false; 928 929 if (bkey_extent_is_direct_data(k1.k)) { 930 struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); 931 struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); 932 const union bch_extent_entry *entry1, *entry2; 933 struct extent_ptr_decoded p1, p2; 934 935 if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) 936 return false; 937 938 bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) 939 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 940 if (p1.ptr.dev == p2.ptr.dev && 941 p1.ptr.gen == p2.ptr.gen && 942 943 /* 944 * This checks that the two pointers point 945 * to the same region on disk - adjusting 946 * for the difference in where the extents 947 * start, since one may have been trimmed: 948 */ 949 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == 950 (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && 951 952 /* 953 * This additionally checks that the 954 * extents overlap on disk, since the 955 * previous check may trigger spuriously 956 * when one extent is immediately partially 957 * overwritten with another extent (so that 958 * on disk they are adjacent) and 959 * compression is in use: 960 */ 961 ((p1.ptr.offset >= p2.ptr.offset && 962 p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || 963 (p2.ptr.offset >= p1.ptr.offset && 964 p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) 965 return true; 966 967 return false; 968 } else { 969 /* KEY_TYPE_deleted, etc. */ 970 return true; 971 } 972 } 973 974 struct bch_extent_ptr * 975 bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) 976 { 977 struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); 978 union bch_extent_entry *entry2; 979 struct extent_ptr_decoded p2; 980 981 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 982 if (p1.ptr.dev == p2.ptr.dev && 983 p1.ptr.gen == p2.ptr.gen && 984 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == 985 (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) 986 return &entry2->ptr; 987 988 return NULL; 989 } 990 991 static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, 992 struct bch_extent_ptr *ptr) 993 { 994 if (!opts->promote_target || 995 !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) 996 return false; 997 998 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); 999 1000 return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); 1001 } 1002 1003 void bch2_extent_ptr_set_cached(struct bch_fs *c, 1004 struct bch_io_opts *opts, 1005 struct bkey_s k, 1006 struct bch_extent_ptr *ptr) 1007 { 1008 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); 1009 union bch_extent_entry *entry; 1010 struct extent_ptr_decoded p; 1011 1012 rcu_read_lock(); 1013 if (!want_cached_ptr(c, opts, ptr)) { 1014 bch2_bkey_drop_ptr_noerror(k, ptr); 1015 goto out; 1016 } 1017 1018 /* 1019 * Stripes can't contain cached data, for - reasons. 1020 * 1021 * Possibly something we can fix in the future? 1022 */ 1023 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 1024 if (&entry->ptr == ptr) { 1025 if (p.has_ec) 1026 bch2_bkey_drop_ptr_noerror(k, ptr); 1027 else 1028 ptr->cached = true; 1029 goto out; 1030 } 1031 1032 BUG(); 1033 out: 1034 rcu_read_unlock(); 1035 } 1036 1037 /* 1038 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. 1039 * 1040 * Returns true if @k should be dropped entirely 1041 * 1042 * For existing keys, only called when btree nodes are being rewritten, not when 1043 * they're merely being compacted/resorted in memory. 1044 */ 1045 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) 1046 { 1047 struct bch_dev *ca; 1048 1049 rcu_read_lock(); 1050 bch2_bkey_drop_ptrs(k, ptr, 1051 ptr->cached && 1052 (!(ca = bch2_dev_rcu(c, ptr->dev)) || 1053 dev_ptr_stale_rcu(ca, ptr) > 0)); 1054 rcu_read_unlock(); 1055 1056 return bkey_deleted(k.k); 1057 } 1058 1059 /* 1060 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. 1061 * 1062 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on 1063 * the promote target. 1064 */ 1065 bool bch2_extent_normalize_by_opts(struct bch_fs *c, 1066 struct bch_io_opts *opts, 1067 struct bkey_s k) 1068 { 1069 struct bkey_ptrs ptrs; 1070 bool have_cached_ptr; 1071 1072 rcu_read_lock(); 1073 restart_drop_ptrs: 1074 ptrs = bch2_bkey_ptrs(k); 1075 have_cached_ptr = false; 1076 1077 bkey_for_each_ptr(ptrs, ptr) 1078 if (ptr->cached) { 1079 if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { 1080 bch2_bkey_drop_ptr(k, ptr); 1081 goto restart_drop_ptrs; 1082 } 1083 have_cached_ptr = true; 1084 } 1085 rcu_read_unlock(); 1086 1087 return bkey_deleted(k.k); 1088 } 1089 1090 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) 1091 { 1092 out->atomic++; 1093 rcu_read_lock(); 1094 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); 1095 if (!ca) { 1096 prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, 1097 (u64) ptr->offset, ptr->gen, 1098 ptr->cached ? " cached" : ""); 1099 } else { 1100 u32 offset; 1101 u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); 1102 1103 prt_printf(out, "ptr: %u:%llu:%u gen %u", 1104 ptr->dev, b, offset, ptr->gen); 1105 if (ca->mi.durability != 1) 1106 prt_printf(out, " d=%u", ca->mi.durability); 1107 if (ptr->cached) 1108 prt_str(out, " cached"); 1109 if (ptr->unwritten) 1110 prt_str(out, " unwritten"); 1111 int stale = dev_ptr_stale_rcu(ca, ptr); 1112 if (stale > 0) 1113 prt_printf(out, " stale"); 1114 else if (stale) 1115 prt_printf(out, " invalid"); 1116 } 1117 rcu_read_unlock(); 1118 --out->atomic; 1119 } 1120 1121 void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) 1122 { 1123 prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", 1124 crc->compressed_size, 1125 crc->uncompressed_size, 1126 crc->offset, crc->nonce); 1127 bch2_prt_csum_type(out, crc->csum_type); 1128 prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo); 1129 prt_str(out, " compress "); 1130 bch2_prt_compression_type(out, crc->compression_type); 1131 } 1132 1133 static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, 1134 const struct bch_extent_rebalance *r) 1135 { 1136 prt_str(out, "rebalance:"); 1137 1138 prt_printf(out, " replicas=%u", r->data_replicas); 1139 if (r->data_replicas_from_inode) 1140 prt_str(out, " (inode)"); 1141 1142 prt_str(out, " checksum="); 1143 bch2_prt_csum_opt(out, r->data_checksum); 1144 if (r->data_checksum_from_inode) 1145 prt_str(out, " (inode)"); 1146 1147 if (r->background_compression || r->background_compression_from_inode) { 1148 prt_str(out, " background_compression="); 1149 bch2_compression_opt_to_text(out, r->background_compression); 1150 1151 if (r->background_compression_from_inode) 1152 prt_str(out, " (inode)"); 1153 } 1154 1155 if (r->background_target || r->background_target_from_inode) { 1156 prt_str(out, " background_target="); 1157 if (c) 1158 bch2_target_to_text(out, c, r->background_target); 1159 else 1160 prt_printf(out, "%u", r->background_target); 1161 1162 if (r->background_target_from_inode) 1163 prt_str(out, " (inode)"); 1164 } 1165 1166 if (r->promote_target || r->promote_target_from_inode) { 1167 prt_str(out, " promote_target="); 1168 if (c) 1169 bch2_target_to_text(out, c, r->promote_target); 1170 else 1171 prt_printf(out, "%u", r->promote_target); 1172 1173 if (r->promote_target_from_inode) 1174 prt_str(out, " (inode)"); 1175 } 1176 1177 if (r->erasure_code || r->erasure_code_from_inode) { 1178 prt_printf(out, " ec=%u", r->erasure_code); 1179 if (r->erasure_code_from_inode) 1180 prt_str(out, " (inode)"); 1181 } 1182 } 1183 1184 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1185 struct bkey_s_c k) 1186 { 1187 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1188 const union bch_extent_entry *entry; 1189 bool first = true; 1190 1191 if (c) 1192 prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); 1193 1194 bkey_extent_entry_for_each(ptrs, entry) { 1195 if (!first) 1196 prt_printf(out, " "); 1197 1198 switch (__extent_entry_type(entry)) { 1199 case BCH_EXTENT_ENTRY_ptr: 1200 bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); 1201 break; 1202 1203 case BCH_EXTENT_ENTRY_crc32: 1204 case BCH_EXTENT_ENTRY_crc64: 1205 case BCH_EXTENT_ENTRY_crc128: { 1206 struct bch_extent_crc_unpacked crc = 1207 bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); 1208 1209 bch2_extent_crc_unpacked_to_text(out, &crc); 1210 break; 1211 } 1212 case BCH_EXTENT_ENTRY_stripe_ptr: { 1213 const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; 1214 1215 prt_printf(out, "ec: idx %llu block %u", 1216 (u64) ec->idx, ec->block); 1217 break; 1218 } 1219 case BCH_EXTENT_ENTRY_rebalance: 1220 bch2_extent_rebalance_to_text(out, c, &entry->rebalance); 1221 break; 1222 1223 default: 1224 prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); 1225 return; 1226 } 1227 1228 first = false; 1229 } 1230 } 1231 1232 static int extent_ptr_validate(struct bch_fs *c, 1233 struct bkey_s_c k, 1234 struct bkey_validate_context from, 1235 const struct bch_extent_ptr *ptr, 1236 unsigned size_ondisk, 1237 bool metadata) 1238 { 1239 int ret = 0; 1240 1241 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1242 bkey_for_each_ptr(ptrs, ptr2) 1243 bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, 1244 c, ptr_to_duplicate_device, 1245 "multiple pointers to same device (%u)", ptr->dev); 1246 1247 /* bad pointers are repaired by check_fix_ptrs(): */ 1248 rcu_read_lock(); 1249 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); 1250 if (!ca) { 1251 rcu_read_unlock(); 1252 return 0; 1253 } 1254 u32 bucket_offset; 1255 u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); 1256 unsigned first_bucket = ca->mi.first_bucket; 1257 u64 nbuckets = ca->mi.nbuckets; 1258 unsigned bucket_size = ca->mi.bucket_size; 1259 rcu_read_unlock(); 1260 1261 bkey_fsck_err_on(bucket >= nbuckets, 1262 c, ptr_after_last_bucket, 1263 "pointer past last bucket (%llu > %llu)", bucket, nbuckets); 1264 bkey_fsck_err_on(bucket < first_bucket, 1265 c, ptr_before_first_bucket, 1266 "pointer before first bucket (%llu < %u)", bucket, first_bucket); 1267 bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, 1268 c, ptr_spans_multiple_buckets, 1269 "pointer spans multiple buckets (%u + %u > %u)", 1270 bucket_offset, size_ondisk, bucket_size); 1271 fsck_err: 1272 return ret; 1273 } 1274 1275 int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, 1276 struct bkey_validate_context from) 1277 { 1278 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1279 const union bch_extent_entry *entry; 1280 struct bch_extent_crc_unpacked crc; 1281 unsigned size_ondisk = k.k->size; 1282 unsigned nonce = UINT_MAX; 1283 unsigned nr_ptrs = 0; 1284 bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; 1285 int ret = 0; 1286 1287 if (bkey_is_btree_ptr(k.k)) 1288 size_ondisk = btree_sectors(c); 1289 1290 bkey_extent_entry_for_each(ptrs, entry) { 1291 bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, 1292 c, extent_ptrs_invalid_entry, 1293 "invalid extent entry type (got %u, max %u)", 1294 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); 1295 1296 bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && 1297 !extent_entry_is_ptr(entry), 1298 c, btree_ptr_has_non_ptr, 1299 "has non ptr field"); 1300 1301 switch (extent_entry_type(entry)) { 1302 case BCH_EXTENT_ENTRY_ptr: 1303 ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); 1304 if (ret) 1305 return ret; 1306 1307 bkey_fsck_err_on(entry->ptr.cached && have_ec, 1308 c, ptr_cached_and_erasure_coded, 1309 "cached, erasure coded ptr"); 1310 1311 if (!entry->ptr.unwritten) 1312 have_written = true; 1313 else 1314 have_unwritten = true; 1315 1316 have_ec = false; 1317 crc_since_last_ptr = false; 1318 nr_ptrs++; 1319 break; 1320 case BCH_EXTENT_ENTRY_crc32: 1321 case BCH_EXTENT_ENTRY_crc64: 1322 case BCH_EXTENT_ENTRY_crc128: 1323 crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); 1324 1325 bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), 1326 c, ptr_crc_csum_type_unknown, 1327 "invalid checksum type"); 1328 bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, 1329 c, ptr_crc_compression_type_unknown, 1330 "invalid compression type"); 1331 1332 bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, 1333 c, ptr_crc_uncompressed_size_too_small, 1334 "checksum offset + key size > uncompressed size"); 1335 bkey_fsck_err_on(crc_is_encoded(crc) && 1336 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && 1337 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), 1338 c, ptr_crc_uncompressed_size_too_big, 1339 "too large encoded extent"); 1340 bkey_fsck_err_on(!crc_is_compressed(crc) && 1341 crc.compressed_size != crc.uncompressed_size, 1342 c, ptr_crc_uncompressed_size_mismatch, 1343 "not compressed but compressed != uncompressed size"); 1344 1345 if (bch2_csum_type_is_encryption(crc.csum_type)) { 1346 if (nonce == UINT_MAX) 1347 nonce = crc.offset + crc.nonce; 1348 else if (nonce != crc.offset + crc.nonce) 1349 bkey_fsck_err(c, ptr_crc_nonce_mismatch, 1350 "incorrect nonce"); 1351 } 1352 1353 bkey_fsck_err_on(crc_since_last_ptr, 1354 c, ptr_crc_redundant, 1355 "redundant crc entry"); 1356 crc_since_last_ptr = true; 1357 1358 size_ondisk = crc.compressed_size; 1359 break; 1360 case BCH_EXTENT_ENTRY_stripe_ptr: 1361 bkey_fsck_err_on(have_ec, 1362 c, ptr_stripe_redundant, 1363 "redundant stripe entry"); 1364 have_ec = true; 1365 break; 1366 case BCH_EXTENT_ENTRY_rebalance: { 1367 /* 1368 * this shouldn't be a fsck error, for forward 1369 * compatibility; the rebalance code should just refetch 1370 * the compression opt if it's unknown 1371 */ 1372 #if 0 1373 const struct bch_extent_rebalance *r = &entry->rebalance; 1374 1375 if (!bch2_compression_opt_valid(r->compression)) { 1376 struct bch_compression_opt opt = __bch2_compression_decode(r->compression); 1377 prt_printf(err, "invalid compression opt %u:%u", 1378 opt.type, opt.level); 1379 return -BCH_ERR_invalid_bkey; 1380 } 1381 #endif 1382 break; 1383 } 1384 } 1385 } 1386 1387 bkey_fsck_err_on(!nr_ptrs, 1388 c, extent_ptrs_no_ptrs, 1389 "no ptrs"); 1390 bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, 1391 c, extent_ptrs_too_many_ptrs, 1392 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); 1393 bkey_fsck_err_on(have_written && have_unwritten, 1394 c, extent_ptrs_written_and_unwritten, 1395 "extent with unwritten and written ptrs"); 1396 bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, 1397 c, extent_ptrs_unwritten, 1398 "has unwritten ptrs"); 1399 bkey_fsck_err_on(crc_since_last_ptr, 1400 c, extent_ptrs_redundant_crc, 1401 "redundant crc entry"); 1402 bkey_fsck_err_on(have_ec, 1403 c, extent_ptrs_redundant_stripe, 1404 "redundant stripe entry"); 1405 fsck_err: 1406 return ret; 1407 } 1408 1409 void bch2_ptr_swab(struct bkey_s k) 1410 { 1411 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); 1412 union bch_extent_entry *entry; 1413 u64 *d; 1414 1415 for (d = (u64 *) ptrs.start; 1416 d != (u64 *) ptrs.end; 1417 d++) 1418 *d = swab64(*d); 1419 1420 for (entry = ptrs.start; 1421 entry < ptrs.end; 1422 entry = extent_entry_next(entry)) { 1423 switch (__extent_entry_type(entry)) { 1424 case BCH_EXTENT_ENTRY_ptr: 1425 break; 1426 case BCH_EXTENT_ENTRY_crc32: 1427 entry->crc32.csum = swab32(entry->crc32.csum); 1428 break; 1429 case BCH_EXTENT_ENTRY_crc64: 1430 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); 1431 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); 1432 break; 1433 case BCH_EXTENT_ENTRY_crc128: 1434 entry->crc128.csum.hi = (__force __le64) 1435 swab64((__force u64) entry->crc128.csum.hi); 1436 entry->crc128.csum.lo = (__force __le64) 1437 swab64((__force u64) entry->crc128.csum.lo); 1438 break; 1439 case BCH_EXTENT_ENTRY_stripe_ptr: 1440 break; 1441 case BCH_EXTENT_ENTRY_rebalance: 1442 break; 1443 default: 1444 /* Bad entry type: will be caught by validate() */ 1445 return; 1446 } 1447 } 1448 } 1449 1450 /* Generic extent code: */ 1451 1452 int bch2_cut_front_s(struct bpos where, struct bkey_s k) 1453 { 1454 unsigned new_val_u64s = bkey_val_u64s(k.k); 1455 int val_u64s_delta; 1456 u64 sub; 1457 1458 if (bkey_le(where, bkey_start_pos(k.k))) 1459 return 0; 1460 1461 EBUG_ON(bkey_gt(where, k.k->p)); 1462 1463 sub = where.offset - bkey_start_offset(k.k); 1464 1465 k.k->size -= sub; 1466 1467 if (!k.k->size) { 1468 k.k->type = KEY_TYPE_deleted; 1469 new_val_u64s = 0; 1470 } 1471 1472 switch (k.k->type) { 1473 case KEY_TYPE_extent: 1474 case KEY_TYPE_reflink_v: { 1475 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); 1476 union bch_extent_entry *entry; 1477 bool seen_crc = false; 1478 1479 bkey_extent_entry_for_each(ptrs, entry) { 1480 switch (extent_entry_type(entry)) { 1481 case BCH_EXTENT_ENTRY_ptr: 1482 if (!seen_crc) 1483 entry->ptr.offset += sub; 1484 break; 1485 case BCH_EXTENT_ENTRY_crc32: 1486 entry->crc32.offset += sub; 1487 break; 1488 case BCH_EXTENT_ENTRY_crc64: 1489 entry->crc64.offset += sub; 1490 break; 1491 case BCH_EXTENT_ENTRY_crc128: 1492 entry->crc128.offset += sub; 1493 break; 1494 case BCH_EXTENT_ENTRY_stripe_ptr: 1495 break; 1496 case BCH_EXTENT_ENTRY_rebalance: 1497 break; 1498 } 1499 1500 if (extent_entry_is_crc(entry)) 1501 seen_crc = true; 1502 } 1503 1504 break; 1505 } 1506 case KEY_TYPE_reflink_p: { 1507 struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); 1508 1509 SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); 1510 break; 1511 } 1512 case KEY_TYPE_inline_data: 1513 case KEY_TYPE_indirect_inline_data: { 1514 void *p = bkey_inline_data_p(k); 1515 unsigned bytes = bkey_inline_data_bytes(k.k); 1516 1517 sub = min_t(u64, sub << 9, bytes); 1518 1519 memmove(p, p + sub, bytes - sub); 1520 1521 new_val_u64s -= sub >> 3; 1522 break; 1523 } 1524 } 1525 1526 val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; 1527 BUG_ON(val_u64s_delta < 0); 1528 1529 set_bkey_val_u64s(k.k, new_val_u64s); 1530 memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); 1531 return -val_u64s_delta; 1532 } 1533 1534 int bch2_cut_back_s(struct bpos where, struct bkey_s k) 1535 { 1536 unsigned new_val_u64s = bkey_val_u64s(k.k); 1537 int val_u64s_delta; 1538 u64 len = 0; 1539 1540 if (bkey_ge(where, k.k->p)) 1541 return 0; 1542 1543 EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); 1544 1545 len = where.offset - bkey_start_offset(k.k); 1546 1547 k.k->p.offset = where.offset; 1548 k.k->size = len; 1549 1550 if (!len) { 1551 k.k->type = KEY_TYPE_deleted; 1552 new_val_u64s = 0; 1553 } 1554 1555 switch (k.k->type) { 1556 case KEY_TYPE_inline_data: 1557 case KEY_TYPE_indirect_inline_data: 1558 new_val_u64s = (bkey_inline_data_offset(k.k) + 1559 min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; 1560 break; 1561 } 1562 1563 val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; 1564 BUG_ON(val_u64s_delta < 0); 1565 1566 set_bkey_val_u64s(k.k, new_val_u64s); 1567 memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); 1568 return -val_u64s_delta; 1569 } 1570