1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_key_cache.h" 5 #include "btree_write_buffer.h" 6 #include "bkey_methods.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "compress.h" 10 #include "dirent.h" 11 #include "disk_accounting.h" 12 #include "error.h" 13 #include "extents.h" 14 #include "extent_update.h" 15 #include "fs.h" 16 #include "inode.h" 17 #include "opts.h" 18 #include "str_hash.h" 19 #include "snapshot.h" 20 #include "subvolume.h" 21 #include "varint.h" 22 23 #include <linux/random.h> 24 25 #include <linux/unaligned.h> 26 27 #define x(name, ...) #name, 28 const char * const bch2_inode_opts[] = { 29 BCH_INODE_OPTS() 30 NULL, 31 }; 32 33 static const char * const bch2_inode_flag_strs[] = { 34 BCH_INODE_FLAGS() 35 NULL 36 }; 37 #undef x 38 39 static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); 40 41 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; 42 43 static int inode_decode_field(const u8 *in, const u8 *end, 44 u64 out[2], unsigned *out_bits) 45 { 46 __be64 be[2] = { 0, 0 }; 47 unsigned bytes, shift; 48 u8 *p; 49 50 if (in >= end) 51 return -BCH_ERR_inode_unpack_error; 52 53 if (!*in) 54 return -BCH_ERR_inode_unpack_error; 55 56 /* 57 * position of highest set bit indicates number of bytes: 58 * shift = number of bits to remove in high byte: 59 */ 60 shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ 61 bytes = byte_table[shift - 1]; 62 63 if (in + bytes > end) 64 return -BCH_ERR_inode_unpack_error; 65 66 p = (u8 *) be + 16 - bytes; 67 memcpy(p, in, bytes); 68 *p ^= (1 << 8) >> shift; 69 70 out[0] = be64_to_cpu(be[0]); 71 out[1] = be64_to_cpu(be[1]); 72 *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); 73 74 return bytes; 75 } 76 77 static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, 78 const struct bch_inode_unpacked *inode) 79 { 80 struct bkey_i_inode_v3 *k = &packed->inode; 81 u8 *out = k->v.fields; 82 u8 *end = (void *) &packed[1]; 83 u8 *last_nonzero_field = out; 84 unsigned nr_fields = 0, last_nonzero_fieldnr = 0; 85 unsigned bytes; 86 int ret; 87 88 bkey_inode_v3_init(&packed->inode.k_i); 89 packed->inode.k.p.offset = inode->bi_inum; 90 packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); 91 packed->inode.v.bi_hash_seed = inode->bi_hash_seed; 92 packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); 93 packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); 94 packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); 95 packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); 96 SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); 97 SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); 98 99 100 #define x(_name, _bits) \ 101 nr_fields++; \ 102 \ 103 if (inode->_name) { \ 104 ret = bch2_varint_encode_fast(out, inode->_name); \ 105 out += ret; \ 106 \ 107 if (_bits > 64) \ 108 *out++ = 0; \ 109 \ 110 last_nonzero_field = out; \ 111 last_nonzero_fieldnr = nr_fields; \ 112 } else { \ 113 *out++ = 0; \ 114 \ 115 if (_bits > 64) \ 116 *out++ = 0; \ 117 } 118 119 BCH_INODE_FIELDS_v3() 120 #undef x 121 BUG_ON(out > end); 122 123 out = last_nonzero_field; 124 nr_fields = last_nonzero_fieldnr; 125 126 bytes = out - (u8 *) &packed->inode.v; 127 set_bkey_val_bytes(&packed->inode.k, bytes); 128 memset_u64s_tail(&packed->inode.v, 0, bytes); 129 130 SET_INODEv3_NR_FIELDS(&k->v, nr_fields); 131 132 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { 133 struct bch_inode_unpacked unpacked; 134 135 ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); 136 BUG_ON(ret); 137 BUG_ON(unpacked.bi_inum != inode->bi_inum); 138 BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); 
139 BUG_ON(unpacked.bi_sectors != inode->bi_sectors); 140 BUG_ON(unpacked.bi_size != inode->bi_size); 141 BUG_ON(unpacked.bi_version != inode->bi_version); 142 BUG_ON(unpacked.bi_mode != inode->bi_mode); 143 144 #define x(_name, _bits) if (unpacked._name != inode->_name) \ 145 panic("unpacked %llu should be %llu", \ 146 (u64) unpacked._name, (u64) inode->_name); 147 BCH_INODE_FIELDS_v3() 148 #undef x 149 } 150 } 151 152 void bch2_inode_pack(struct bkey_inode_buf *packed, 153 const struct bch_inode_unpacked *inode) 154 { 155 bch2_inode_pack_inlined(packed, inode); 156 } 157 158 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, 159 struct bch_inode_unpacked *unpacked) 160 { 161 const u8 *in = inode.v->fields; 162 const u8 *end = bkey_val_end(inode); 163 u64 field[2]; 164 unsigned fieldnr = 0, field_bits; 165 int ret; 166 167 #define x(_name, _bits) \ 168 if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ 169 unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ 170 memset((void *) unpacked + offset, 0, \ 171 sizeof(*unpacked) - offset); \ 172 return 0; \ 173 } \ 174 \ 175 ret = inode_decode_field(in, end, field, &field_bits); \ 176 if (ret < 0) \ 177 return ret; \ 178 \ 179 if (field_bits > sizeof(unpacked->_name) * 8) \ 180 return -BCH_ERR_inode_unpack_error; \ 181 \ 182 unpacked->_name = field[1]; \ 183 in += ret; 184 185 BCH_INODE_FIELDS_v2() 186 #undef x 187 188 /* XXX: signal if there were more fields than expected? 
*/ 189 return 0; 190 } 191 192 static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, 193 const u8 *in, const u8 *end, 194 unsigned nr_fields) 195 { 196 unsigned fieldnr = 0; 197 int ret; 198 u64 v[2]; 199 200 #define x(_name, _bits) \ 201 if (fieldnr < nr_fields) { \ 202 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 203 if (ret < 0) \ 204 return ret; \ 205 in += ret; \ 206 \ 207 if (_bits > 64) { \ 208 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 209 if (ret < 0) \ 210 return ret; \ 211 in += ret; \ 212 } else { \ 213 v[1] = 0; \ 214 } \ 215 } else { \ 216 v[0] = v[1] = 0; \ 217 } \ 218 \ 219 unpacked->_name = v[0]; \ 220 if (v[1] || v[0] != unpacked->_name) \ 221 return -BCH_ERR_inode_unpack_error; \ 222 fieldnr++; 223 224 BCH_INODE_FIELDS_v2() 225 #undef x 226 227 /* XXX: signal if there were more fields than expected? */ 228 return 0; 229 } 230 231 static int bch2_inode_unpack_v3(struct bkey_s_c k, 232 struct bch_inode_unpacked *unpacked) 233 { 234 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 235 const u8 *in = inode.v->fields; 236 const u8 *end = bkey_val_end(inode); 237 unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); 238 unsigned fieldnr = 0; 239 int ret; 240 u64 v[2]; 241 242 unpacked->bi_inum = inode.k->p.offset; 243 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 244 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 245 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 246 unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); 247 unpacked->bi_size = le64_to_cpu(inode.v->bi_size); 248 unpacked->bi_version = le64_to_cpu(inode.v->bi_version); 249 unpacked->bi_mode = INODEv3_MODE(inode.v); 250 251 #define x(_name, _bits) \ 252 if (fieldnr < nr_fields) { \ 253 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 254 if (ret < 0) \ 255 return ret; \ 256 in += ret; \ 257 \ 258 if (_bits > 64) { \ 259 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 260 if (ret < 0) \ 261 return ret; \ 262 in += ret; \ 263 } else 
{ \ 264 v[1] = 0; \ 265 } \ 266 } else { \ 267 v[0] = v[1] = 0; \ 268 } \ 269 \ 270 unpacked->_name = v[0]; \ 271 if (v[1] || v[0] != unpacked->_name) \ 272 return -BCH_ERR_inode_unpack_error; \ 273 fieldnr++; 274 275 BCH_INODE_FIELDS_v3() 276 #undef x 277 278 /* XXX: signal if there were more fields than expected? */ 279 return 0; 280 } 281 282 static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, 283 struct bch_inode_unpacked *unpacked) 284 { 285 memset(unpacked, 0, sizeof(*unpacked)); 286 287 unpacked->bi_snapshot = k.k->p.snapshot; 288 289 switch (k.k->type) { 290 case KEY_TYPE_inode: { 291 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 292 293 unpacked->bi_inum = inode.k->p.offset; 294 unpacked->bi_journal_seq= 0; 295 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 296 unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); 297 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 298 299 if (INODEv1_NEW_VARINT(inode.v)) { 300 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 301 bkey_val_end(inode), 302 INODEv1_NR_FIELDS(inode.v)); 303 } else { 304 return bch2_inode_unpack_v1(inode, unpacked); 305 } 306 break; 307 } 308 case KEY_TYPE_inode_v2: { 309 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 310 311 unpacked->bi_inum = inode.k->p.offset; 312 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 313 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 314 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 315 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 316 317 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 318 bkey_val_end(inode), 319 INODEv2_NR_FIELDS(inode.v)); 320 } 321 default: 322 BUG(); 323 } 324 } 325 326 int bch2_inode_unpack(struct bkey_s_c k, 327 struct bch_inode_unpacked *unpacked) 328 { 329 unpacked->bi_snapshot = k.k->p.snapshot; 330 331 return likely(k.k->type == KEY_TYPE_inode_v3) 332 ? 
bch2_inode_unpack_v3(k, unpacked) 333 : bch2_inode_unpack_slowpath(k, unpacked); 334 } 335 336 int __bch2_inode_peek(struct btree_trans *trans, 337 struct btree_iter *iter, 338 struct bch_inode_unpacked *inode, 339 subvol_inum inum, unsigned flags, 340 bool warn) 341 { 342 u32 snapshot; 343 int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn); 344 if (ret) 345 return ret; 346 347 struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, 348 SPOS(0, inum.inum, snapshot), 349 flags|BTREE_ITER_cached); 350 ret = bkey_err(k); 351 if (ret) 352 return ret; 353 354 ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; 355 if (ret) 356 goto err; 357 358 ret = bch2_inode_unpack(k, inode); 359 if (ret) 360 goto err; 361 362 return 0; 363 err: 364 if (warn) 365 bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); 366 bch2_trans_iter_exit(trans, iter); 367 return ret; 368 } 369 370 int bch2_inode_write_flags(struct btree_trans *trans, 371 struct btree_iter *iter, 372 struct bch_inode_unpacked *inode, 373 enum btree_iter_update_trigger_flags flags) 374 { 375 struct bkey_inode_buf *inode_p; 376 377 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 378 if (IS_ERR(inode_p)) 379 return PTR_ERR(inode_p); 380 381 bch2_inode_pack_inlined(inode_p, inode); 382 inode_p->inode.k.p.snapshot = iter->snapshot; 383 return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); 384 } 385 386 int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) 387 { 388 struct bkey_inode_buf *inode_p = 389 bch2_trans_kmalloc(trans, sizeof(*inode_p)); 390 391 if (IS_ERR(inode_p)) 392 return PTR_ERR(inode_p); 393 394 bch2_inode_pack(inode_p, inode); 395 inode_p->inode.k.p.snapshot = inode->bi_snapshot; 396 397 return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 398 &inode_p->inode.k_i, 399 BTREE_UPDATE_internal_snapshot_node); 400 } 401 402 int bch2_fsck_write_inode(struct btree_trans *trans, 
struct bch_inode_unpacked *inode) 403 { 404 int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 405 __bch2_fsck_write_inode(trans, inode)); 406 bch_err_fn(trans->c, ret); 407 return ret; 408 } 409 410 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) 411 { 412 struct bch_inode_unpacked u; 413 struct bkey_inode_buf *inode_p; 414 int ret; 415 416 if (!bkey_is_inode(&k->k)) 417 return ERR_PTR(-ENOENT); 418 419 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 420 if (IS_ERR(inode_p)) 421 return ERR_CAST(inode_p); 422 423 ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); 424 if (ret) 425 return ERR_PTR(ret); 426 427 bch2_inode_pack(inode_p, &u); 428 return &inode_p->inode.k_i; 429 } 430 431 static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, 432 struct bkey_validate_context from) 433 { 434 struct bch_inode_unpacked unpacked; 435 int ret = 0; 436 437 bkey_fsck_err_on(k.k->p.inode, 438 c, inode_pos_inode_nonzero, 439 "nonzero k.p.inode"); 440 441 bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, 442 c, inode_pos_blockdev_range, 443 "fs inode in blockdev range"); 444 445 bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), 446 c, inode_unpack_error, 447 "invalid variable length fields"); 448 449 bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, 450 c, inode_checksum_type_invalid, 451 "invalid data checksum type (%u >= %u", 452 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); 453 454 bkey_fsck_err_on(unpacked.bi_compression && 455 !bch2_compression_opt_valid(unpacked.bi_compression - 1), 456 c, inode_compression_type_invalid, 457 "invalid compression opt %u", unpacked.bi_compression - 1); 458 459 bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && 460 unpacked.bi_nlink != 0, 461 c, inode_unlinked_but_nlink_nonzero, 462 "flagged as unlinked but bi_nlink != 0"); 463 464 bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), 465 c, inode_subvol_root_but_not_dir, 466 "subvolume 
root but not a directory"); 467 fsck_err: 468 return ret; 469 } 470 471 int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, 472 struct bkey_validate_context from) 473 { 474 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 475 int ret = 0; 476 477 bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 478 c, inode_str_hash_invalid, 479 "invalid str hash type (%llu >= %u)", 480 INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); 481 482 ret = __bch2_inode_validate(c, k, from); 483 fsck_err: 484 return ret; 485 } 486 487 int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, 488 struct bkey_validate_context from) 489 { 490 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 491 int ret = 0; 492 493 bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 494 c, inode_str_hash_invalid, 495 "invalid str hash type (%llu >= %u)", 496 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); 497 498 ret = __bch2_inode_validate(c, k, from); 499 fsck_err: 500 return ret; 501 } 502 503 int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, 504 struct bkey_validate_context from) 505 { 506 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 507 int ret = 0; 508 509 bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || 510 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), 511 c, inode_v3_fields_start_bad, 512 "invalid fields_start (got %llu, min %u max %zu)", 513 INODEv3_FIELDS_START(inode.v), 514 INODEv3_FIELDS_START_INITIAL, 515 bkey_val_u64s(inode.k)); 516 517 bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, 518 c, inode_str_hash_invalid, 519 "invalid str hash type (%llu >= %u)", 520 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); 521 522 ret = __bch2_inode_validate(c, k, from); 523 fsck_err: 524 return ret; 525 } 526 527 static void __bch2_inode_unpacked_to_text(struct printbuf *out, 528 struct bch_inode_unpacked *inode) 529 { 530 prt_printf(out, "\n"); 531 printbuf_indent_add(out, 2); 532 
prt_printf(out, "mode=%o\n", inode->bi_mode); 533 534 prt_str(out, "flags="); 535 prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); 536 prt_printf(out, "(%x)\n", inode->bi_flags); 537 538 prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); 539 prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); 540 prt_printf(out, "hash_type="); 541 bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); 542 prt_newline(out); 543 prt_printf(out, "bi_size=%llu\n", inode->bi_size); 544 prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); 545 prt_printf(out, "bi_version=%llu\n", inode->bi_version); 546 547 #define x(_name, _bits) \ 548 prt_printf(out, #_name "=%llu\n", (u64) inode->_name); 549 BCH_INODE_FIELDS_v3() 550 #undef x 551 552 bch2_printbuf_strip_trailing_newline(out); 553 printbuf_indent_sub(out, 2); 554 } 555 556 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) 557 { 558 prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot); 559 __bch2_inode_unpacked_to_text(out, inode); 560 } 561 562 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 563 { 564 struct bch_inode_unpacked inode; 565 566 if (bch2_inode_unpack(k, &inode)) { 567 prt_printf(out, "(unpack error)"); 568 return; 569 } 570 571 __bch2_inode_unpacked_to_text(out, &inode); 572 } 573 574 static inline u64 bkey_inode_flags(struct bkey_s_c k) 575 { 576 switch (k.k->type) { 577 case KEY_TYPE_inode: 578 return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); 579 case KEY_TYPE_inode_v2: 580 return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); 581 case KEY_TYPE_inode_v3: 582 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); 583 default: 584 return 0; 585 } 586 } 587 588 static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) 589 { 590 switch (k.k->type) { 591 case KEY_TYPE_inode: 592 bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); 593 return; 594 case KEY_TYPE_inode_v2: 595 
bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); 596 return; 597 case KEY_TYPE_inode_v3: 598 bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); 599 return; 600 default: 601 BUG(); 602 } 603 } 604 605 static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) 606 { 607 unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; 608 609 return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); 610 } 611 612 static struct bkey_s_c 613 bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 614 enum btree_id btree, struct bpos pos, 615 unsigned flags) 616 { 617 struct bch_fs *c = trans->c; 618 struct bkey_s_c k; 619 int ret = 0; 620 621 for_each_btree_key_max_norestart(trans, *iter, btree, 622 bpos_successor(pos), 623 SPOS(pos.inode, pos.offset, U32_MAX), 624 flags|BTREE_ITER_all_snapshots, k, ret) 625 if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) 626 return k; 627 628 bch2_trans_iter_exit(trans, iter); 629 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 630 } 631 632 static struct bkey_s_c 633 bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, 634 struct bpos pos, unsigned flags) 635 { 636 struct bkey_s_c k; 637 again: 638 k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); 639 if (!k.k || 640 bkey_err(k) || 641 bkey_is_inode(k.k)) 642 return k; 643 644 bch2_trans_iter_exit(trans, iter); 645 pos = k.k->p; 646 goto again; 647 } 648 649 int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) 650 { 651 struct bch_fs *c = trans->c; 652 struct btree_iter iter; 653 struct bkey_s_c k; 654 int ret = 0; 655 656 for_each_btree_key_max_norestart(trans, iter, 657 BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), 658 BTREE_ITER_all_snapshots| 659 BTREE_ITER_with_updates, k, ret) 660 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && 661 bkey_is_inode(k.k)) { 662 ret = 1; 663 break; 664 } 665 
bch2_trans_iter_exit(trans, &iter); 666 return ret; 667 } 668 669 static int update_inode_has_children(struct btree_trans *trans, 670 struct bkey_s k, 671 bool have_child) 672 { 673 if (!have_child) { 674 int ret = bch2_inode_has_child_snapshots(trans, k.k->p); 675 if (ret) 676 return ret < 0 ? ret : 0; 677 } 678 679 u64 f = bkey_inode_flags(k.s_c); 680 if (have_child != !!(f & BCH_INODE_has_child_snapshot)) 681 bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); 682 683 return 0; 684 } 685 686 static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, 687 bool have_child) 688 { 689 struct btree_iter iter; 690 struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, 691 &iter, pos, BTREE_ITER_with_updates); 692 int ret = bkey_err(k); 693 if (ret) 694 return ret; 695 if (!k.k) 696 return 0; 697 698 if (!have_child) { 699 ret = bch2_inode_has_child_snapshots(trans, k.k->p); 700 if (ret) { 701 ret = ret < 0 ? ret : 0; 702 goto err; 703 } 704 } 705 706 u64 f = bkey_inode_flags(k); 707 if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { 708 struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, 709 BTREE_UPDATE_internal_snapshot_node); 710 ret = PTR_ERR_OR_ZERO(update); 711 if (ret) 712 goto err; 713 714 bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); 715 } 716 err: 717 bch2_trans_iter_exit(trans, &iter); 718 return ret; 719 } 720 721 int bch2_trigger_inode(struct btree_trans *trans, 722 enum btree_id btree_id, unsigned level, 723 struct bkey_s_c old, 724 struct bkey_s new, 725 enum btree_iter_update_trigger_flags flags) 726 { 727 struct bch_fs *c = trans->c; 728 729 if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { 730 BUG_ON(!trans->journal_res.seq); 731 bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); 732 } 733 734 s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); 735 if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) 
&& nr) { 736 struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; 737 int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); 738 if (ret) 739 return ret; 740 } 741 742 if (flags & BTREE_TRIGGER_transactional) { 743 int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - 744 (int) bkey_is_unlinked_inode(old); 745 if (unlinked_delta) { 746 int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, 747 new.k->p, unlinked_delta > 0); 748 if (ret) 749 return ret; 750 } 751 752 /* 753 * If we're creating or deleting an inode at this snapshot ID, 754 * and there might be an inode in a parent snapshot ID, we might 755 * need to set or clear the has_child_snapshot flag on the 756 * parent. 757 */ 758 int deleted_delta = (int) bkey_is_inode(new.k) - 759 (int) bkey_is_inode(old.k); 760 if (deleted_delta && 761 bch2_snapshot_parent(c, new.k->p.snapshot)) { 762 int ret = update_parent_inode_has_children(trans, new.k->p, 763 deleted_delta > 0); 764 if (ret) 765 return ret; 766 } 767 768 /* 769 * When an inode is first updated in a new snapshot, we may need 770 * to clear has_child_snapshot 771 */ 772 if (deleted_delta > 0) { 773 int ret = update_inode_has_children(trans, new, false); 774 if (ret) 775 return ret; 776 } 777 } 778 779 return 0; 780 } 781 782 int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, 783 struct bkey_validate_context from) 784 { 785 int ret = 0; 786 787 bkey_fsck_err_on(k.k->p.inode, 788 c, inode_pos_inode_nonzero, 789 "nonzero k.p.inode"); 790 fsck_err: 791 return ret; 792 } 793 794 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, 795 struct bkey_s_c k) 796 { 797 struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); 798 799 prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); 800 } 801 802 int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, 803 struct bkey_validate_context from) 804 { 805 
int ret = 0; 806 807 bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, 808 c, inode_alloc_cursor_inode_bad, 809 "k.p.inode bad"); 810 fsck_err: 811 return ret; 812 } 813 814 void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, 815 struct bkey_s_c k) 816 { 817 struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); 818 819 prt_printf(out, "idx %llu generation %llu", 820 le64_to_cpu(i.v->idx), 821 le64_to_cpu(i.v->gen)); 822 } 823 824 void bch2_inode_init_early(struct bch_fs *c, 825 struct bch_inode_unpacked *inode_u) 826 { 827 enum bch_str_hash_type str_hash = 828 bch2_str_hash_opt_to_type(c, c->opts.str_hash); 829 830 memset(inode_u, 0, sizeof(*inode_u)); 831 832 SET_INODE_STR_HASH(inode_u, str_hash); 833 get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); 834 } 835 836 void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, 837 uid_t uid, gid_t gid, umode_t mode, dev_t rdev, 838 struct bch_inode_unpacked *parent) 839 { 840 inode_u->bi_mode = mode; 841 inode_u->bi_uid = uid; 842 inode_u->bi_gid = gid; 843 inode_u->bi_dev = rdev; 844 inode_u->bi_atime = now; 845 inode_u->bi_mtime = now; 846 inode_u->bi_ctime = now; 847 inode_u->bi_otime = now; 848 849 if (parent && parent->bi_mode & S_ISGID) { 850 inode_u->bi_gid = parent->bi_gid; 851 if (S_ISDIR(mode)) 852 inode_u->bi_mode |= S_ISGID; 853 } 854 855 if (parent) { 856 #define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; 857 BCH_INODE_OPTS() 858 #undef x 859 } 860 } 861 862 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, 863 uid_t uid, gid_t gid, umode_t mode, dev_t rdev, 864 struct bch_inode_unpacked *parent) 865 { 866 bch2_inode_init_early(c, inode_u); 867 bch2_inode_init_late(inode_u, bch2_current_time(c), 868 uid, gid, mode, rdev, parent); 869 } 870 871 static inline u32 bkey_generation(struct bkey_s_c k) 872 { 873 switch (k.k->type) { 874 case KEY_TYPE_inode: 875 case KEY_TYPE_inode_v2: 876 BUG(); 877 case KEY_TYPE_inode_generation: 878 return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); 879 default: 880 return 0; 881 } 882 } 883 884 static struct bkey_i_inode_alloc_cursor * 885 bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) 886 { 887 struct bch_fs *c = trans->c; 888 889 u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; 890 891 cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); 892 893 struct btree_iter iter; 894 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, 895 BTREE_ID_logged_ops, 896 POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), 897 BTREE_ITER_cached); 898 int ret = bkey_err(k); 899 if (ret) 900 return ERR_PTR(ret); 901 902 struct bkey_i_inode_alloc_cursor *cursor = 903 k.k->type == KEY_TYPE_inode_alloc_cursor 904 ? 
bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) 905 : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); 906 ret = PTR_ERR_OR_ZERO(cursor); 907 if (ret) 908 goto err; 909 910 if (c->opts.inodes_32bit) { 911 *min = BLOCKDEV_INODE_MAX; 912 *max = INT_MAX; 913 } else { 914 cursor->v.bits = c->opts.shard_inode_numbers_bits; 915 916 unsigned bits = 63 - c->opts.shard_inode_numbers_bits; 917 918 *min = max(cpu << bits, (u64) INT_MAX + 1); 919 *max = (cpu << bits) | ~(ULLONG_MAX << bits); 920 } 921 922 if (le64_to_cpu(cursor->v.idx) < *min) 923 cursor->v.idx = cpu_to_le64(*min); 924 925 if (le64_to_cpu(cursor->v.idx) >= *max) { 926 cursor->v.idx = cpu_to_le64(*min); 927 le32_add_cpu(&cursor->v.gen, 1); 928 } 929 err: 930 bch2_trans_iter_exit(trans, &iter); 931 return ret ? ERR_PTR(ret) : cursor; 932 } 933 934 /* 935 * This just finds an empty slot: 936 */ 937 int bch2_inode_create(struct btree_trans *trans, 938 struct btree_iter *iter, 939 struct bch_inode_unpacked *inode_u, 940 u32 snapshot, u64 cpu) 941 { 942 u64 min, max; 943 struct bkey_i_inode_alloc_cursor *cursor = 944 bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); 945 int ret = PTR_ERR_OR_ZERO(cursor); 946 if (ret) 947 return ret; 948 949 u64 start = le64_to_cpu(cursor->v.idx); 950 u64 pos = start; 951 952 bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), 953 BTREE_ITER_all_snapshots| 954 BTREE_ITER_intent); 955 struct bkey_s_c k; 956 again: 957 while ((k = bch2_btree_iter_peek(iter)).k && 958 !(ret = bkey_err(k)) && 959 bkey_lt(k.k->p, POS(0, max))) { 960 if (pos < iter->pos.offset) 961 goto found_slot; 962 963 /* 964 * We don't need to iterate over keys in every snapshot once 965 * we've found just one: 966 */ 967 pos = iter->pos.offset + 1; 968 bch2_btree_iter_set_pos(iter, POS(0, pos)); 969 } 970 971 if (!ret && pos < max) 972 goto found_slot; 973 974 if (!ret && start == min) 975 ret = -BCH_ERR_ENOSPC_inode_create; 976 977 if (ret) { 978 bch2_trans_iter_exit(trans, 
iter); 979 return ret; 980 } 981 982 /* Retry from start */ 983 pos = start = min; 984 bch2_btree_iter_set_pos(iter, POS(0, pos)); 985 le32_add_cpu(&cursor->v.gen, 1); 986 goto again; 987 found_slot: 988 bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); 989 k = bch2_btree_iter_peek_slot(iter); 990 ret = bkey_err(k); 991 if (ret) { 992 bch2_trans_iter_exit(trans, iter); 993 return ret; 994 } 995 996 inode_u->bi_inum = k.k->p.offset; 997 inode_u->bi_generation = le64_to_cpu(cursor->v.gen); 998 cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); 999 return 0; 1000 } 1001 1002 static int bch2_inode_delete_keys(struct btree_trans *trans, 1003 subvol_inum inum, enum btree_id id) 1004 { 1005 struct btree_iter iter; 1006 struct bkey_s_c k; 1007 struct bkey_i delete; 1008 struct bpos end = POS(inum.inum, U64_MAX); 1009 u32 snapshot; 1010 int ret = 0; 1011 1012 /* 1013 * We're never going to be deleting partial extents, no need to use an 1014 * extent iterator: 1015 */ 1016 bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), 1017 BTREE_ITER_intent); 1018 1019 while (1) { 1020 bch2_trans_begin(trans); 1021 1022 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 1023 if (ret) 1024 goto err; 1025 1026 bch2_btree_iter_set_snapshot(&iter, snapshot); 1027 1028 k = bch2_btree_iter_peek_max(&iter, end); 1029 ret = bkey_err(k); 1030 if (ret) 1031 goto err; 1032 1033 if (!k.k) 1034 break; 1035 1036 bkey_init(&delete.k); 1037 delete.k.p = iter.pos; 1038 1039 if (iter.flags & BTREE_ITER_is_extents) 1040 bch2_key_resize(&delete.k, 1041 bpos_min(end, k.k->p).offset - 1042 iter.pos.offset); 1043 1044 ret = bch2_trans_update(trans, &iter, &delete, 0) ?: 1045 bch2_trans_commit(trans, NULL, NULL, 1046 BCH_TRANS_COMMIT_no_enospc); 1047 err: 1048 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1049 break; 1050 } 1051 1052 bch2_trans_iter_exit(trans, &iter); 1053 return ret; 1054 } 1055 1056 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) 1057 { 1058 
struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	/*
	 * If this was a directory, there shouldn't be any real dirents left -
	 * but there could be whiteouts (from hash collisions) that we should
	 * delete:
	 *
	 * XXX: the dirent code would ideally delete whiteouts when they're no
	 * longer needed
	 */
	ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum.inum, snapshot),
			       BTREE_ITER_intent|BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum.inum, snapshot);
		ret = -EIO;
		goto err;
	}

	ret = bch2_btree_delete_at(trans, &iter, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret)
		goto err2;

	/* older snapshot versions of this inode may now be deletable too: */
	ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
err2:
	bch2_trans_put(trans);
	return ret;
}

/*
 * Look up and unpack an inode by subvolume:inum; like
 * bch2_inode_find_by_inum_trans() below, but doesn't log an error on lookup
 * failure - for callers that handle ENOENT themselves.
 */
int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
					 subvol_inum inum,
					 struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int
bch2_inode_find_by_inum_trans(struct btree_trans *trans,
			      subvol_inum inum,
			      struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	/* peek takes the iter; we only needed the unpacked inode, so drop it: */
	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Non-transactional wrapper around bch2_inode_find_by_inum_trans(): */
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
			    struct bch_inode_unpacked *inode)
{
	return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}

/*
 * Increment an inode's link count.
 *
 * The "zero links" state is represented by the BCH_INODE_unlinked flag rather
 * than bi_nlink == 0, so the 0 -> 1 transition just clears the flag.
 * Returns -EINVAL on link count overflow.
 */
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
	if (bi->bi_flags & BCH_INODE_unlinked)
		bi->bi_flags &= ~BCH_INODE_unlinked;
	else {
		if (bi->bi_nlink == U32_MAX)
			return -EINVAL;

		bi->bi_nlink++;
	}

	return 0;
}

/*
 * Decrement an inode's link count; the 1 -> 0 transition sets
 * BCH_INODE_unlinked.  Flags an inconsistency (instead of underflowing) if the
 * count is already zero or the unlinked flag doesn't agree with bi_nlink.
 */
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
					bi->bi_inum);
		return;
	}

	if (bi->bi_flags & BCH_INODE_unlinked) {
		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
		return;
	}

	if (bi->bi_nlink)
		bi->bi_nlink--;
	else
		bi->bi_flags |= BCH_INODE_unlinked;
}

/*
 * Per-inode option fields are stored biased by one so that 0 means "unset,
 * use the filesystem default"; convert the set ones to a bch_opts:
 */
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
{
	struct bch_opts ret = { 0 };
#define x(_name, _bits)							\
	if (inode->bi_##_name)						\
		opt_set(ret, _name, inode->bi_##_name - 1);
	BCH_INODE_OPTS()
#undef x
	return ret;
}

/*
 * Compute effective IO options for an inode: per-inode options (stored +1
 * biased, see above) override the filesystem-wide defaults, and
 * _name_from_inode records which ones came from the inode:
 */
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
			 struct bch_inode_unpacked *inode)
{
#define x(_name, _bits)							\
	if ((inode)->bi_##_name) {					\
		opts->_name = inode->bi_##_name - 1;			\
		opts->_name##_from_inode = true;			\
	} else {							\
		opts->_name = c->opts._name;				\
	}
BCH_INODE_OPTS()
#undef x

	bch2_io_opts_fixups(opts);
}

/* Look up an inode by subvolume:inum and compute its effective IO options: */
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
	struct bch_inode_unpacked inode;
	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));

	if (ret)
		return ret;

	bch2_inode_opts_get(opts, trans->c, &inode);
	return 0;
}

/*
 * Delete a single snapshot version of an inode: all its keys in one snapshot,
 * then replace the inode key with a bch_inode_generation key that remembers
 * the (incremented) generation number.
 */
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	int ret;

	/* retry the whole sequence if a range delete restarted partway: */
	do {
		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
						    SPOS(inum, 0, snapshot),
						    SPOS(inum, U64_MAX, snapshot),
						    0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
						    SPOS(inum, 0, snapshot),
						    SPOS(inum, U64_MAX, snapshot),
						    0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
						    SPOS(inum, 0, snapshot),
						    SPOS(inum, U64_MAX, snapshot),
						    0, NULL);
	} while (ret == -BCH_ERR_transaction_restart_nested);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum, snapshot), BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Subvolume root?
*/
	if (inode_u.bi_subvol)
		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);

	/* leave a generation key behind so the inum isn't reused with gen 0: */
	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/*
	 * We committed in multiple transactions: on success, signal callers
	 * with a nested transaction restart so they re-traverse:
	 */
	return ret ?: -BCH_ERR_transaction_restart_nested;
}

/*
 * After deleting an inode, there may be versions in older snapshots that should
 * also be deleted - if they're not referenced by sibling snapshots and not open
 * in other subvolumes:
 */
static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;
next_parent:
	ret = lockrestart_do(trans,
		bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
	if (ret || !k.k)
		return ret;

	bool unlinked = bkey_is_unlinked_inode(k);
	pos = k.k->p;
	bch2_trans_iter_exit(trans, &iter);

	/* a live parent version terminates the walk: */
	if (!unlinked)
		return 0;

	/* positive return means the inode is still in use - stop, not an error: */
	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
	if (ret)
		return ret < 0 ?
ret : 0;

	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
	if (ret)
		return ret;
	/* keep walking up the snapshot tree: */
	goto next_parent;
}

/* Delete one snapshot version of an inode, then any deletable ancestors: */
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}

/*
 * Check whether an entry in the deleted_inodes btree still refers to an inode
 * that should actually be deleted: returns 1 if it should, 0 if not, negative
 * on error; stale entries are removed from the deleted_inodes btree (the
 * "delete" label below).  Repairs inconsistencies via the fsck_err machinery.
 *
 * NOTE(review): @need_another_pass isn't written in this function as visible
 * here - confirm whether it's vestigial.
 */
static int may_delete_deleted_inode(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bpos pos,
				    bool *need_another_pass)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	struct printbuf buf = PRINTBUF;
	int ret;

	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		return ret;

	/* if the user declines the repair below, return ENOENT: */
	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
	if (fsck_err_on(!bkey_is_inode(k.k),
			trans, deleted_inode_missing,
			"nonexistent inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		goto out;

	if (S_ISDIR(inode.bi_mode)) {
		/* only empty directories may be deleted: */
		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
				trans, deleted_inode_is_dir,
				"non empty directory %llu:%u in deleted_inodes btree",
				pos.offset, pos.snapshot))
			goto delete;
		if (ret)
			goto out;
	}

	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
			trans, deleted_inode_not_unlinked,
			"non-deleted inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
			trans, deleted_inode_has_child_snapshots,
			"inode with child snapshots %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;
	/* flag said no child snapshots - double-check against the btree: */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto out;

	if (ret) {
		/* flag was wrong: fix it, and don't delete this version */
		if (fsck_err(trans, inode_has_child_snapshots_wrong,
			     "inode has_child_snapshots flag wrong (should be set)\n%s",
			     (printbuf_reset(&buf),
			      bch2_inode_unpacked_to_text(&buf, &inode),
			      buf.buf))) {
			inode.bi_flags |= BCH_INODE_has_child_snapshot;
			ret = __bch2_fsck_write_inode(trans, &inode);
			if (ret)
				goto out;
		}
		goto delete;

	}

	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
	    !fsck_err(trans, deleted_inode_but_clean,
		      "filesystem marked as clean but have deleted inode %llu:%u",
		      pos.offset, pos.snapshot)) {
		ret = 0;
		goto out;
	}

	/* entry is valid: caller should delete the inode */
	ret = 1;
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	/* drop the stale entry from the deleted_inodes btree: */
	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
	goto out;
}

/*
 * Walk the deleted_inodes btree and delete every inode that is still
 * legitimately unlinked (as vetted by may_delete_deleted_inode()):
 */
int bch2_delete_dead_inodes(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	bool need_another_pass;
	int ret;
again:
	/*
	 * if we ran check_inodes() unlinked inodes will have already been
	 * cleaned up but the write buffer will be out of sync; therefore we
	 * always need a write buffer flush
	 */
	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	need_another_pass = false;

	/*
	 * Weird transaction restart handling here because on successful delete,
	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
	 * but we can't retry because the btree write buffer won't have been
	 * flushed and we'd spin:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		ret = may_delete_deleted_inode(trans, &iter,
k.k->p, &need_another_pass);
		if (ret > 0) {
			bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
						k.k->p.offset, k.k->p.snapshot);

			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
			/*
			 * We don't want to loop here: a transaction restart
			 * error here means we handled a transaction restart and
			 * we're actually done, but if we loop we'll retry the
			 * same key because the write buffer hasn't been flushed
			 * yet
			 */
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
				ret = 0;
				continue;
			}
		}

		ret;
	}));

	/* entries may have been added while we were running: */
	if (!ret && need_another_pass)
		goto again;
err:
	bch2_trans_put(trans);
	return ret;
}