1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_key_cache.h" 5 #include "btree_write_buffer.h" 6 #include "bkey_methods.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "compress.h" 10 #include "dirent.h" 11 #include "disk_accounting.h" 12 #include "error.h" 13 #include "extents.h" 14 #include "extent_update.h" 15 #include "fs.h" 16 #include "inode.h" 17 #include "str_hash.h" 18 #include "snapshot.h" 19 #include "subvolume.h" 20 #include "varint.h" 21 22 #include <linux/random.h> 23 24 #include <linux/unaligned.h> 25 26 #define x(name, ...) #name, 27 const char * const bch2_inode_opts[] = { 28 BCH_INODE_OPTS() 29 NULL, 30 }; 31 32 static const char * const bch2_inode_flag_strs[] = { 33 BCH_INODE_FLAGS() 34 NULL 35 }; 36 #undef x 37 38 static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); 39 40 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; 41 42 static int inode_decode_field(const u8 *in, const u8 *end, 43 u64 out[2], unsigned *out_bits) 44 { 45 __be64 be[2] = { 0, 0 }; 46 unsigned bytes, shift; 47 u8 *p; 48 49 if (in >= end) 50 return -1; 51 52 if (!*in) 53 return -1; 54 55 /* 56 * position of highest set bit indicates number of bytes: 57 * shift = number of bits to remove in high byte: 58 */ 59 shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ 60 bytes = byte_table[shift - 1]; 61 62 if (in + bytes > end) 63 return -1; 64 65 p = (u8 *) be + 16 - bytes; 66 memcpy(p, in, bytes); 67 *p ^= (1 << 8) >> shift; 68 69 out[0] = be64_to_cpu(be[0]); 70 out[1] = be64_to_cpu(be[1]); 71 *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); 72 73 return bytes; 74 } 75 76 static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, 77 const struct bch_inode_unpacked *inode) 78 { 79 struct bkey_i_inode_v3 *k = &packed->inode; 80 u8 *out = k->v.fields; 81 u8 *end = (void *) &packed[1]; 82 u8 *last_nonzero_field = out; 83 unsigned nr_fields = 0, last_nonzero_fieldnr = 0; 84 unsigned bytes; 85 int ret; 86 87 bkey_inode_v3_init(&packed->inode.k_i); 88 packed->inode.k.p.offset = inode->bi_inum; 89 packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); 90 packed->inode.v.bi_hash_seed = inode->bi_hash_seed; 91 packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); 92 packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); 93 packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); 94 packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); 95 SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); 96 SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); 97 98 99 #define x(_name, _bits) \ 100 nr_fields++; \ 101 \ 102 if (inode->_name) { \ 103 ret = bch2_varint_encode_fast(out, inode->_name); \ 104 out += ret; \ 105 \ 106 if (_bits > 64) \ 107 *out++ = 0; \ 108 \ 109 last_nonzero_field = out; \ 110 last_nonzero_fieldnr = nr_fields; \ 111 } else { \ 112 *out++ = 0; \ 113 \ 114 if (_bits > 64) \ 115 *out++ = 0; \ 116 } 117 118 BCH_INODE_FIELDS_v3() 119 #undef x 120 BUG_ON(out > end); 121 122 out = last_nonzero_field; 123 nr_fields = last_nonzero_fieldnr; 124 125 bytes = out - (u8 *) &packed->inode.v; 126 set_bkey_val_bytes(&packed->inode.k, bytes); 127 memset_u64s_tail(&packed->inode.v, 0, bytes); 128 129 SET_INODEv3_NR_FIELDS(&k->v, nr_fields); 130 131 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { 132 struct bch_inode_unpacked unpacked; 133 134 ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); 135 BUG_ON(ret); 136 BUG_ON(unpacked.bi_inum != inode->bi_inum); 137 BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); 
138 BUG_ON(unpacked.bi_sectors != inode->bi_sectors); 139 BUG_ON(unpacked.bi_size != inode->bi_size); 140 BUG_ON(unpacked.bi_version != inode->bi_version); 141 BUG_ON(unpacked.bi_mode != inode->bi_mode); 142 143 #define x(_name, _bits) if (unpacked._name != inode->_name) \ 144 panic("unpacked %llu should be %llu", \ 145 (u64) unpacked._name, (u64) inode->_name); 146 BCH_INODE_FIELDS_v3() 147 #undef x 148 } 149 } 150 151 void bch2_inode_pack(struct bkey_inode_buf *packed, 152 const struct bch_inode_unpacked *inode) 153 { 154 bch2_inode_pack_inlined(packed, inode); 155 } 156 157 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, 158 struct bch_inode_unpacked *unpacked) 159 { 160 const u8 *in = inode.v->fields; 161 const u8 *end = bkey_val_end(inode); 162 u64 field[2]; 163 unsigned fieldnr = 0, field_bits; 164 int ret; 165 166 #define x(_name, _bits) \ 167 if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ 168 unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ 169 memset((void *) unpacked + offset, 0, \ 170 sizeof(*unpacked) - offset); \ 171 return 0; \ 172 } \ 173 \ 174 ret = inode_decode_field(in, end, field, &field_bits); \ 175 if (ret < 0) \ 176 return ret; \ 177 \ 178 if (field_bits > sizeof(unpacked->_name) * 8) \ 179 return -1; \ 180 \ 181 unpacked->_name = field[1]; \ 182 in += ret; 183 184 BCH_INODE_FIELDS_v2() 185 #undef x 186 187 /* XXX: signal if there were more fields than expected? 
*/ 188 return 0; 189 } 190 191 static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, 192 const u8 *in, const u8 *end, 193 unsigned nr_fields) 194 { 195 unsigned fieldnr = 0; 196 int ret; 197 u64 v[2]; 198 199 #define x(_name, _bits) \ 200 if (fieldnr < nr_fields) { \ 201 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 202 if (ret < 0) \ 203 return ret; \ 204 in += ret; \ 205 \ 206 if (_bits > 64) { \ 207 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 208 if (ret < 0) \ 209 return ret; \ 210 in += ret; \ 211 } else { \ 212 v[1] = 0; \ 213 } \ 214 } else { \ 215 v[0] = v[1] = 0; \ 216 } \ 217 \ 218 unpacked->_name = v[0]; \ 219 if (v[1] || v[0] != unpacked->_name) \ 220 return -1; \ 221 fieldnr++; 222 223 BCH_INODE_FIELDS_v2() 224 #undef x 225 226 /* XXX: signal if there were more fields than expected? */ 227 return 0; 228 } 229 230 static int bch2_inode_unpack_v3(struct bkey_s_c k, 231 struct bch_inode_unpacked *unpacked) 232 { 233 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 234 const u8 *in = inode.v->fields; 235 const u8 *end = bkey_val_end(inode); 236 unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); 237 unsigned fieldnr = 0; 238 int ret; 239 u64 v[2]; 240 241 unpacked->bi_inum = inode.k->p.offset; 242 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 243 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 244 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 245 unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); 246 unpacked->bi_size = le64_to_cpu(inode.v->bi_size); 247 unpacked->bi_version = le64_to_cpu(inode.v->bi_version); 248 unpacked->bi_mode = INODEv3_MODE(inode.v); 249 250 #define x(_name, _bits) \ 251 if (fieldnr < nr_fields) { \ 252 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 253 if (ret < 0) \ 254 return ret; \ 255 in += ret; \ 256 \ 257 if (_bits > 64) { \ 258 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 259 if (ret < 0) \ 260 return ret; \ 261 in += ret; \ 262 } else { \ 263 v[1] = 0; \ 264 } 
\ 265 } else { \ 266 v[0] = v[1] = 0; \ 267 } \ 268 \ 269 unpacked->_name = v[0]; \ 270 if (v[1] || v[0] != unpacked->_name) \ 271 return -1; \ 272 fieldnr++; 273 274 BCH_INODE_FIELDS_v3() 275 #undef x 276 277 /* XXX: signal if there were more fields than expected? */ 278 return 0; 279 } 280 281 static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, 282 struct bch_inode_unpacked *unpacked) 283 { 284 memset(unpacked, 0, sizeof(*unpacked)); 285 286 unpacked->bi_snapshot = k.k->p.snapshot; 287 288 switch (k.k->type) { 289 case KEY_TYPE_inode: { 290 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 291 292 unpacked->bi_inum = inode.k->p.offset; 293 unpacked->bi_journal_seq= 0; 294 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 295 unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); 296 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 297 298 if (INODEv1_NEW_VARINT(inode.v)) { 299 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 300 bkey_val_end(inode), 301 INODEv1_NR_FIELDS(inode.v)); 302 } else { 303 return bch2_inode_unpack_v1(inode, unpacked); 304 } 305 break; 306 } 307 case KEY_TYPE_inode_v2: { 308 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 309 310 unpacked->bi_inum = inode.k->p.offset; 311 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 312 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 313 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 314 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 315 316 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 317 bkey_val_end(inode), 318 INODEv2_NR_FIELDS(inode.v)); 319 } 320 default: 321 BUG(); 322 } 323 } 324 325 int bch2_inode_unpack(struct bkey_s_c k, 326 struct bch_inode_unpacked *unpacked) 327 { 328 unpacked->bi_snapshot = k.k->p.snapshot; 329 330 return likely(k.k->type == KEY_TYPE_inode_v3) 331 ? 
bch2_inode_unpack_v3(k, unpacked) 332 : bch2_inode_unpack_slowpath(k, unpacked); 333 } 334 335 int __bch2_inode_peek(struct btree_trans *trans, 336 struct btree_iter *iter, 337 struct bch_inode_unpacked *inode, 338 subvol_inum inum, unsigned flags, 339 bool warn) 340 { 341 u32 snapshot; 342 int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); 343 if (ret) 344 return ret; 345 346 struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, 347 SPOS(0, inum.inum, snapshot), 348 flags|BTREE_ITER_cached); 349 ret = bkey_err(k); 350 if (ret) 351 return ret; 352 353 ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; 354 if (ret) 355 goto err; 356 357 ret = bch2_inode_unpack(k, inode); 358 if (ret) 359 goto err; 360 361 return 0; 362 err: 363 if (warn) 364 bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); 365 bch2_trans_iter_exit(trans, iter); 366 return ret; 367 } 368 369 int bch2_inode_write_flags(struct btree_trans *trans, 370 struct btree_iter *iter, 371 struct bch_inode_unpacked *inode, 372 enum btree_iter_update_trigger_flags flags) 373 { 374 struct bkey_inode_buf *inode_p; 375 376 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 377 if (IS_ERR(inode_p)) 378 return PTR_ERR(inode_p); 379 380 bch2_inode_pack_inlined(inode_p, inode); 381 inode_p->inode.k.p.snapshot = iter->snapshot; 382 return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); 383 } 384 385 int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) 386 { 387 struct bkey_inode_buf *inode_p = 388 bch2_trans_kmalloc(trans, sizeof(*inode_p)); 389 390 if (IS_ERR(inode_p)) 391 return PTR_ERR(inode_p); 392 393 bch2_inode_pack(inode_p, inode); 394 inode_p->inode.k.p.snapshot = inode->bi_snapshot; 395 396 return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 397 &inode_p->inode.k_i, 398 BTREE_UPDATE_internal_snapshot_node); 399 } 400 401 int bch2_fsck_write_inode(struct btree_trans *trans, 
struct bch_inode_unpacked *inode) 402 { 403 int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 404 __bch2_fsck_write_inode(trans, inode)); 405 bch_err_fn(trans->c, ret); 406 return ret; 407 } 408 409 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) 410 { 411 struct bch_inode_unpacked u; 412 struct bkey_inode_buf *inode_p; 413 int ret; 414 415 if (!bkey_is_inode(&k->k)) 416 return ERR_PTR(-ENOENT); 417 418 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 419 if (IS_ERR(inode_p)) 420 return ERR_CAST(inode_p); 421 422 ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); 423 if (ret) 424 return ERR_PTR(ret); 425 426 bch2_inode_pack(inode_p, &u); 427 return &inode_p->inode.k_i; 428 } 429 430 static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, 431 enum bch_validate_flags flags) 432 { 433 struct bch_inode_unpacked unpacked; 434 int ret = 0; 435 436 bkey_fsck_err_on(k.k->p.inode, 437 c, inode_pos_inode_nonzero, 438 "nonzero k.p.inode"); 439 440 bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, 441 c, inode_pos_blockdev_range, 442 "fs inode in blockdev range"); 443 444 bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), 445 c, inode_unpack_error, 446 "invalid variable length fields"); 447 448 bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, 449 c, inode_checksum_type_invalid, 450 "invalid data checksum type (%u >= %u", 451 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); 452 453 bkey_fsck_err_on(unpacked.bi_compression && 454 !bch2_compression_opt_valid(unpacked.bi_compression - 1), 455 c, inode_compression_type_invalid, 456 "invalid compression opt %u", unpacked.bi_compression - 1); 457 458 bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && 459 unpacked.bi_nlink != 0, 460 c, inode_unlinked_but_nlink_nonzero, 461 "flagged as unlinked but bi_nlink != 0"); 462 463 bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), 464 c, inode_subvol_root_but_not_dir, 465 "subvolume root 
but not a directory"); 466 fsck_err: 467 return ret; 468 } 469 470 int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, 471 enum bch_validate_flags flags) 472 { 473 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 474 int ret = 0; 475 476 bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 477 c, inode_str_hash_invalid, 478 "invalid str hash type (%llu >= %u)", 479 INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); 480 481 ret = __bch2_inode_validate(c, k, flags); 482 fsck_err: 483 return ret; 484 } 485 486 int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, 487 enum bch_validate_flags flags) 488 { 489 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 490 int ret = 0; 491 492 bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 493 c, inode_str_hash_invalid, 494 "invalid str hash type (%llu >= %u)", 495 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); 496 497 ret = __bch2_inode_validate(c, k, flags); 498 fsck_err: 499 return ret; 500 } 501 502 int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, 503 enum bch_validate_flags flags) 504 { 505 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 506 int ret = 0; 507 508 bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || 509 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), 510 c, inode_v3_fields_start_bad, 511 "invalid fields_start (got %llu, min %u max %zu)", 512 INODEv3_FIELDS_START(inode.v), 513 INODEv3_FIELDS_START_INITIAL, 514 bkey_val_u64s(inode.k)); 515 516 bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 517 c, inode_str_hash_invalid, 518 "invalid str hash type (%llu >= %u)", 519 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); 520 521 ret = __bch2_inode_validate(c, k, flags); 522 fsck_err: 523 return ret; 524 } 525 526 static void __bch2_inode_unpacked_to_text(struct printbuf *out, 527 struct bch_inode_unpacked *inode) 528 { 529 prt_printf(out, "\n"); 530 printbuf_indent_add(out, 2); 531 prt_printf(out, 
"mode=%o\n", inode->bi_mode); 532 533 prt_str(out, "flags="); 534 prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); 535 prt_printf(out, "(%x)\n", inode->bi_flags); 536 537 prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); 538 prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); 539 prt_printf(out, "hash_type="); 540 bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); 541 prt_newline(out); 542 prt_printf(out, "bi_size=%llu\n", inode->bi_size); 543 prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); 544 prt_printf(out, "bi_version=%llu\n", inode->bi_version); 545 546 #define x(_name, _bits) \ 547 prt_printf(out, #_name "=%llu\n", (u64) inode->_name); 548 BCH_INODE_FIELDS_v3() 549 #undef x 550 551 bch2_printbuf_strip_trailing_newline(out); 552 printbuf_indent_sub(out, 2); 553 } 554 555 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) 556 { 557 prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot); 558 __bch2_inode_unpacked_to_text(out, inode); 559 } 560 561 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 562 { 563 struct bch_inode_unpacked inode; 564 565 if (bch2_inode_unpack(k, &inode)) { 566 prt_printf(out, "(unpack error)"); 567 return; 568 } 569 570 __bch2_inode_unpacked_to_text(out, &inode); 571 } 572 573 static inline u64 bkey_inode_flags(struct bkey_s_c k) 574 { 575 switch (k.k->type) { 576 case KEY_TYPE_inode: 577 return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); 578 case KEY_TYPE_inode_v2: 579 return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); 580 case KEY_TYPE_inode_v3: 581 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); 582 default: 583 return 0; 584 } 585 } 586 587 static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) 588 { 589 switch (k.k->type) { 590 case KEY_TYPE_inode: 591 bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); 592 return; 593 case KEY_TYPE_inode_v2: 594 
bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); 595 return; 596 case KEY_TYPE_inode_v3: 597 bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); 598 return; 599 default: 600 BUG(); 601 } 602 } 603 604 static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) 605 { 606 unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; 607 608 return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); 609 } 610 611 static struct bkey_s_c 612 bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 613 enum btree_id btree, struct bpos pos, 614 unsigned flags) 615 { 616 struct bch_fs *c = trans->c; 617 struct bkey_s_c k; 618 int ret = 0; 619 620 for_each_btree_key_upto_norestart(trans, *iter, btree, 621 bpos_successor(pos), 622 SPOS(pos.inode, pos.offset, U32_MAX), 623 flags|BTREE_ITER_all_snapshots, k, ret) 624 if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) 625 return k; 626 627 bch2_trans_iter_exit(trans, iter); 628 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 629 } 630 631 static struct bkey_s_c 632 bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 633 struct bpos pos, unsigned flags) 634 { 635 struct bkey_s_c k; 636 again: 637 k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); 638 if (!k.k || 639 bkey_err(k) || 640 bkey_is_inode(k.k)) 641 return k; 642 643 bch2_trans_iter_exit(trans, iter); 644 pos = k.k->p; 645 goto again; 646 } 647 648 int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) 649 { 650 struct bch_fs *c = trans->c; 651 struct btree_iter iter; 652 struct bkey_s_c k; 653 int ret = 0; 654 655 for_each_btree_key_upto_norestart(trans, iter, 656 BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), 657 BTREE_ITER_all_snapshots| 658 BTREE_ITER_with_updates, k, ret) 659 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && 660 bkey_is_inode(k.k)) { 661 ret = 1; 662 break; 663 } 664 
bch2_trans_iter_exit(trans, &iter); 665 return ret; 666 } 667 668 static int update_inode_has_children(struct btree_trans *trans, 669 struct bkey_s k, 670 bool have_child) 671 { 672 if (!have_child) { 673 int ret = bch2_inode_has_child_snapshots(trans, k.k->p); 674 if (ret) 675 return ret < 0 ? ret : 0; 676 } 677 678 u64 f = bkey_inode_flags(k.s_c); 679 if (have_child != !!(f & BCH_INODE_has_child_snapshot)) 680 bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); 681 682 return 0; 683 } 684 685 static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, 686 bool have_child) 687 { 688 struct btree_iter iter; 689 struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, 690 &iter, pos, BTREE_ITER_with_updates); 691 int ret = bkey_err(k); 692 if (ret) 693 return ret; 694 if (!k.k) 695 return 0; 696 697 if (!have_child) { 698 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 699 if (ret) { 700 ret = ret < 0 ? ret : 0; 701 goto err; 702 } 703 } 704 705 u64 f = bkey_inode_flags(k); 706 if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { 707 struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, 708 BTREE_UPDATE_internal_snapshot_node); 709 ret = PTR_ERR_OR_ZERO(update); 710 if (ret) 711 goto err; 712 713 bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); 714 } 715 err: 716 bch2_trans_iter_exit(trans, &iter); 717 return ret; 718 } 719 720 int bch2_trigger_inode(struct btree_trans *trans, 721 enum btree_id btree_id, unsigned level, 722 struct bkey_s_c old, 723 struct bkey_s new, 724 enum btree_iter_update_trigger_flags flags) 725 { 726 struct bch_fs *c = trans->c; 727 728 if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { 729 BUG_ON(!trans->journal_res.seq); 730 bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); 731 } 732 733 s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); 734 if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) 
&& nr) { 735 struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; 736 int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); 737 if (ret) 738 return ret; 739 } 740 741 if (flags & BTREE_TRIGGER_transactional) { 742 int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - 743 (int) bkey_is_unlinked_inode(old); 744 if (unlinked_delta) { 745 int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, 746 new.k->p, unlinked_delta > 0); 747 if (ret) 748 return ret; 749 } 750 751 /* 752 * If we're creating or deleting an inode at this snapshot ID, 753 * and there might be an inode in a parent snapshot ID, we might 754 * need to set or clear the has_child_snapshot flag on the 755 * parent. 756 */ 757 int deleted_delta = (int) bkey_is_inode(new.k) - 758 (int) bkey_is_inode(old.k); 759 if (deleted_delta && 760 bch2_snapshot_parent(c, new.k->p.snapshot)) { 761 int ret = update_parent_inode_has_children(trans, new.k->p, 762 deleted_delta > 0); 763 if (ret) 764 return ret; 765 } 766 767 /* 768 * When an inode is first updated in a new snapshot, we may need 769 * to clear has_child_snapshot 770 */ 771 if (deleted_delta > 0) { 772 int ret = update_inode_has_children(trans, new, false); 773 if (ret) 774 return ret; 775 } 776 } 777 778 return 0; 779 } 780 781 int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, 782 enum bch_validate_flags flags) 783 { 784 int ret = 0; 785 786 bkey_fsck_err_on(k.k->p.inode, 787 c, inode_pos_inode_nonzero, 788 "nonzero k.p.inode"); 789 fsck_err: 790 return ret; 791 } 792 793 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, 794 struct bkey_s_c k) 795 { 796 struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); 797 798 prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); 799 } 800 801 void bch2_inode_init_early(struct bch_fs *c, 802 struct bch_inode_unpacked *inode_u) 803 { 804 enum bch_str_hash_type str_hash 
= 805 bch2_str_hash_opt_to_type(c, c->opts.str_hash); 806 807 memset(inode_u, 0, sizeof(*inode_u)); 808 809 SET_INODE_STR_HASH(inode_u, str_hash); 810 get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); 811 } 812 813 void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, 814 uid_t uid, gid_t gid, umode_t mode, dev_t rdev, 815 struct bch_inode_unpacked *parent) 816 { 817 inode_u->bi_mode = mode; 818 inode_u->bi_uid = uid; 819 inode_u->bi_gid = gid; 820 inode_u->bi_dev = rdev; 821 inode_u->bi_atime = now; 822 inode_u->bi_mtime = now; 823 inode_u->bi_ctime = now; 824 inode_u->bi_otime = now; 825 826 if (parent && parent->bi_mode & S_ISGID) { 827 inode_u->bi_gid = parent->bi_gid; 828 if (S_ISDIR(mode)) 829 inode_u->bi_mode |= S_ISGID; 830 } 831 832 if (parent) { 833 #define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; 834 BCH_INODE_OPTS() 835 #undef x 836 } 837 } 838 839 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, 840 uid_t uid, gid_t gid, umode_t mode, dev_t rdev, 841 struct bch_inode_unpacked *parent) 842 { 843 bch2_inode_init_early(c, inode_u); 844 bch2_inode_init_late(inode_u, bch2_current_time(c), 845 uid, gid, mode, rdev, parent); 846 } 847 848 static inline u32 bkey_generation(struct bkey_s_c k) 849 { 850 switch (k.k->type) { 851 case KEY_TYPE_inode: 852 case KEY_TYPE_inode_v2: 853 BUG(); 854 case KEY_TYPE_inode_generation: 855 return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); 856 default: 857 return 0; 858 } 859 } 860 861 /* 862 * This just finds an empty slot: 863 */ 864 int bch2_inode_create(struct btree_trans *trans, 865 struct btree_iter *iter, 866 struct bch_inode_unpacked *inode_u, 867 u32 snapshot, u64 cpu) 868 { 869 struct bch_fs *c = trans->c; 870 struct bkey_s_c k; 871 u64 min, max, start, pos, *hint; 872 int ret = 0; 873 unsigned bits = (c->opts.inodes_32bit ? 
31 : 63); 874 875 if (c->opts.shard_inode_numbers) { 876 bits -= c->inode_shard_bits; 877 878 min = (cpu << bits); 879 max = (cpu << bits) | ~(ULLONG_MAX << bits); 880 881 min = max_t(u64, min, BLOCKDEV_INODE_MAX); 882 hint = c->unused_inode_hints + cpu; 883 } else { 884 min = BLOCKDEV_INODE_MAX; 885 max = ~(ULLONG_MAX << bits); 886 hint = c->unused_inode_hints; 887 } 888 889 start = READ_ONCE(*hint); 890 891 if (start >= max || start < min) 892 start = min; 893 894 pos = start; 895 bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), 896 BTREE_ITER_all_snapshots| 897 BTREE_ITER_intent); 898 again: 899 while ((k = bch2_btree_iter_peek(iter)).k && 900 !(ret = bkey_err(k)) && 901 bkey_lt(k.k->p, POS(0, max))) { 902 if (pos < iter->pos.offset) 903 goto found_slot; 904 905 /* 906 * We don't need to iterate over keys in every snapshot once 907 * we've found just one: 908 */ 909 pos = iter->pos.offset + 1; 910 bch2_btree_iter_set_pos(iter, POS(0, pos)); 911 } 912 913 if (!ret && pos < max) 914 goto found_slot; 915 916 if (!ret && start == min) 917 ret = -BCH_ERR_ENOSPC_inode_create; 918 919 if (ret) { 920 bch2_trans_iter_exit(trans, iter); 921 return ret; 922 } 923 924 /* Retry from start */ 925 pos = start = min; 926 bch2_btree_iter_set_pos(iter, POS(0, pos)); 927 goto again; 928 found_slot: 929 bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); 930 k = bch2_btree_iter_peek_slot(iter); 931 ret = bkey_err(k); 932 if (ret) { 933 bch2_trans_iter_exit(trans, iter); 934 return ret; 935 } 936 937 *hint = k.k->p.offset; 938 inode_u->bi_inum = k.k->p.offset; 939 inode_u->bi_generation = bkey_generation(k); 940 return 0; 941 } 942 943 static int bch2_inode_delete_keys(struct btree_trans *trans, 944 subvol_inum inum, enum btree_id id) 945 { 946 struct btree_iter iter; 947 struct bkey_s_c k; 948 struct bkey_i delete; 949 struct bpos end = POS(inum.inum, U64_MAX); 950 u32 snapshot; 951 int ret = 0; 952 953 /* 954 * We're never going to be deleting partial extents, no 
need to use an 955 * extent iterator: 956 */ 957 bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), 958 BTREE_ITER_intent); 959 960 while (1) { 961 bch2_trans_begin(trans); 962 963 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 964 if (ret) 965 goto err; 966 967 bch2_btree_iter_set_snapshot(&iter, snapshot); 968 969 k = bch2_btree_iter_peek_upto(&iter, end); 970 ret = bkey_err(k); 971 if (ret) 972 goto err; 973 974 if (!k.k) 975 break; 976 977 bkey_init(&delete.k); 978 delete.k.p = iter.pos; 979 980 if (iter.flags & BTREE_ITER_is_extents) 981 bch2_key_resize(&delete.k, 982 bpos_min(end, k.k->p).offset - 983 iter.pos.offset); 984 985 ret = bch2_trans_update(trans, &iter, &delete, 0) ?: 986 bch2_trans_commit(trans, NULL, NULL, 987 BCH_TRANS_COMMIT_no_enospc); 988 err: 989 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 990 break; 991 } 992 993 bch2_trans_iter_exit(trans, &iter); 994 return ret; 995 } 996 997 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) 998 { 999 struct btree_trans *trans = bch2_trans_get(c); 1000 struct btree_iter iter = { NULL }; 1001 struct bkey_i_inode_generation delete; 1002 struct bch_inode_unpacked inode_u; 1003 struct bkey_s_c k; 1004 u32 snapshot; 1005 int ret; 1006 1007 /* 1008 * If this was a directory, there shouldn't be any real dirents left - 1009 * but there could be whiteouts (from hash collisions) that we should 1010 * delete: 1011 * 1012 * XXX: the dirent could ideally would delete whiteouts when they're no 1013 * longer needed 1014 */ 1015 ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: 1016 bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: 1017 bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); 1018 if (ret) 1019 goto err; 1020 retry: 1021 bch2_trans_begin(trans); 1022 1023 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 1024 if (ret) 1025 goto err; 1026 1027 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 1028 SPOS(0, inum.inum, 
				snapshot),
			       BTREE_ITER_intent|BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum.inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Replace the inode key with a bch_inode_generation key, bumping the
	 * generation so a future reuse of this inode number is distinguishable: */
	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret)
		goto err2;

	/* Inode deleted in this snapshot; older snapshot versions may now be
	 * deletable too: */
	ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
err2:
	bch2_trans_put(trans);
	return ret;
}

/*
 * Look up an inode by subvolume/inode number without logging a warning if it is
 * missing; on success the iterator taken by the peek is released immediately
 * since only the unpacked inode is wanted.
 */
int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
					 subvol_inum inum,
					 struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * As above, but via bch2_inode_peek(), which (unlike the _nowarn variant)
 * reports a missing inode.
 */
int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
				  subvol_inum inum,
				  struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Convenience wrapper: run the lookup in its own transaction. */
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
			    struct bch_inode_unpacked *inode)
{
	return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}

/*
 * Increment an inode's link count.
 *
 * nlink 0 is encoded as the BCH_INODE_unlinked flag rather than bi_nlink == 0,
 * so going from 0 -> 1 just clears the flag; otherwise bi_nlink is bumped,
 * failing with -EINVAL at U32_MAX to avoid overflow.
 */
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
	if (bi->bi_flags & BCH_INODE_unlinked)
		bi->bi_flags &= ~BCH_INODE_unlinked;
	else {
		if (bi->bi_nlink == U32_MAX)
			return -EINVAL;

		bi->bi_nlink++;
	}

	return 0;
}

/*
 * Decrement an inode's link count, using the same encoding as
 * bch2_inode_nlink_inc(): reaching zero sets BCH_INODE_unlinked instead of
 * storing bi_nlink == 0.  Inconsistent states (unlinked with nonzero nlink,
 * or decrementing an already-unlinked inode) are reported, not modified.
 */
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
					bi->bi_inum);
		return;
	}

	if (bi->bi_flags & BCH_INODE_unlinked) {
		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
		return;
	}

	if (bi->bi_nlink)
		bi->bi_nlink--;
	else
		bi->bi_flags |= BCH_INODE_unlinked;
}

/*
 * Convert per-inode option overrides to a bch_opts struct.  An inode option
 * field of 0 means "not set"; set fields are stored biased by +1, hence the
 * "- 1" when copying out.
 */
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
{
	struct bch_opts ret = { 0 };
#define x(_name, _bits)							\
	if (inode->bi_##_name)						\
		opt_set(ret, _name, inode->bi_##_name - 1);
	BCH_INODE_OPTS()
#undef x
	return ret;
}

/*
 * Compute effective IO options for an inode: each option comes from the inode
 * override if present, otherwise the filesystem default (via inode_opt_get()).
 * nocow is incompatible with compression, checksumming and erasure coding, so
 * those are forced off when it's set.
 */
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
			 struct bch_inode_unpacked *inode)
{
#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
	BCH_INODE_OPTS()
#undef x

	if (opts->nocow)
		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}

/*
 * Look up an inode by number and return its effective IO options, retrying the
 * lookup on transaction restart via lockrestart_do().
 */
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
	struct bch_inode_unpacked inode;
	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));

	if (ret)
		return ret;

	bch2_inode_opts_get(opts, trans->c, &inode);
	return 0;
}

/*
 * Delete a single snapshot version of an inode: first delete all its extents,
 * dirents and xattrs for that snapshot, then replace the inode key with a
 * bumped-generation inode_generation key.
 *
 * On success this deliberately returns -BCH_ERR_transaction_restart_nested:
 * the caller must treat the transaction as restarted since we committed inside.
 */
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	int ret;

	/* Delete range commits internally; just retry on nested restarts: */
	do {
		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL);
	} while (ret == -BCH_ERR_transaction_restart_nested);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum, snapshot), BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Subvolume root? */
	if (inode_u.bi_subvol)
		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);

	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	return ret ?: -BCH_ERR_transaction_restart_nested;
}

/*
 * After deleting an inode, there may be versions in older snapshots that should
 * also be deleted - if they're not referenced by sibling snapshots and not open
 * in other subvolumes:
 */
static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;
next_parent:
	/* Walk up to the version of this inode in the parent snapshot: */
	ret = lockrestart_do(trans,
		bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
	if (ret || !k.k)
		return ret;

	bool unlinked = bkey_is_unlinked_inode(k);
	pos = k.k->p;
	bch2_trans_iter_exit(trans, &iter);

	if (!unlinked)
		return 0;

	/* Don't delete while the inode (or a descendant version) is still open;
	 * positive return here means "open", which is not an error: */
	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
	if (ret)
		return ret < 0 ? ret : 0;

	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
	if (ret)
		return ret;
	goto next_parent;
}

/*
 * Delete one snapshot version of an inode, then sweep up any ancestor snapshot
 * versions that became deletable as a result.
 */
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}

/*
 * Check whether an entry in the deleted_inodes btree is actually deletable.
 *
 * Returns 1 if the inode should be deleted, 0 to skip it, negative on error.
 * Stale or inconsistent entries detected via fsck_err_on() are removed from
 * the deleted_inodes btree (the "delete" label) rather than acted upon.
 *
 * NOTE(review): need_another_pass is never written in this version of the
 * function — confirm whether the out-param is vestigial.
 */
static int may_delete_deleted_inode(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bpos pos,
				    bool *need_another_pass)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	struct printbuf buf = PRINTBUF;
	int ret;

	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		return ret;

	/* Preload the error for the !inode case; only reported if the fsck_err
	 * below isn't acted on: */
	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
	if (fsck_err_on(!bkey_is_inode(k.k),
			trans, deleted_inode_missing,
			"nonexistent inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		goto out;

	/* Directories must be empty before they can be deleted: */
	if (S_ISDIR(inode.bi_mode)) {
		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
				trans, deleted_inode_is_dir,
				"non empty directory %llu:%u in deleted_inodes btree",
				pos.offset, pos.snapshot))
			goto delete;
		if (ret)
			goto out;
	}

	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
			trans, deleted_inode_not_unlinked,
			"non-deleted inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
			trans, deleted_inode_has_child_snapshots,
			"inode with child snapshots %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	/* Flag may be stale; recheck against the snapshot tree: */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto out;

	if (ret) {
		/* Flag was wrong: fix it, and don't delete an inode that has
		 * child snapshot versions referencing it: */
		if (fsck_err(trans, inode_has_child_snapshots_wrong,
			     "inode has_child_snapshots flag wrong (should be set)\n%s",
			     (printbuf_reset(&buf),
			      bch2_inode_unpacked_to_text(&buf, &inode),
			      buf.buf))) {
			inode.bi_flags |= BCH_INODE_has_child_snapshot;
			ret = __bch2_fsck_write_inode(trans, &inode);
			if (ret)
				goto out;
		}
		goto delete;

	}

	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
	    !fsck_err(trans, deleted_inode_but_clean,
		      "filesystem marked as clean but have deleted inode %llu:%u",
		      pos.offset, pos.snapshot)) {
		ret = 0;
		goto out;
	}

	ret = 1;
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	/* Drop the stale entry from the deleted_inodes bitset btree: */
	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
	goto out;
}

/*
 * Walk the deleted_inodes btree and delete every inode that
 * may_delete_deleted_inode() approves, repeating until a pass completes with
 * nothing left to do.
 */
int bch2_delete_dead_inodes(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	bool need_another_pass;
	int ret;
again:
	/*
	 * if we ran check_inodes() unlinked inodes will have already been
	 * cleaned up but the write buffer will be out of sync; therefore we
	 * always need a write buffer flush
	 */
	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	need_another_pass = false;

	/*
	 * Weird transaction restart handling here because on successful delete,
	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
	 * but we can't retry because the btree write buffer won't have been
	 * flushed and we'd spin:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
		if (ret > 0) {
			bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);

			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
			/*
			 * We don't want to loop here: a transaction restart
			 * error here means we handled a transaction restart and
			 * we're actually done, but if we loop we'll retry the
			 * same key because the write buffer hasn't been flushed
			 * yet
			 */
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
				ret = 0;
				continue;
			}
		}

		ret;
	}));

	if (!ret && need_another_pass)
		goto again;
err:
	bch2_trans_put(trans);
	return ret;
}