1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_key_cache.h" 5 #include "btree_write_buffer.h" 6 #include "bkey_methods.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "error.h" 10 #include "extents.h" 11 #include "extent_update.h" 12 #include "inode.h" 13 #include "str_hash.h" 14 #include "snapshot.h" 15 #include "subvolume.h" 16 #include "varint.h" 17 18 #include <linux/random.h> 19 20 #include <asm/unaligned.h> 21 22 const char * const bch2_inode_opts[] = { 23 #define x(name, ...) #name, 24 BCH_INODE_OPTS() 25 #undef x 26 NULL, 27 }; 28 29 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; 30 31 static int inode_decode_field(const u8 *in, const u8 *end, 32 u64 out[2], unsigned *out_bits) 33 { 34 __be64 be[2] = { 0, 0 }; 35 unsigned bytes, shift; 36 u8 *p; 37 38 if (in >= end) 39 return -1; 40 41 if (!*in) 42 return -1; 43 44 /* 45 * position of highest set bit indicates number of bytes: 46 * shift = number of bits to remove in high byte: 47 */ 48 shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ 49 bytes = byte_table[shift - 1]; 50 51 if (in + bytes > end) 52 return -1; 53 54 p = (u8 *) be + 16 - bytes; 55 memcpy(p, in, bytes); 56 *p ^= (1 << 8) >> shift; 57 58 out[0] = be64_to_cpu(be[0]); 59 out[1] = be64_to_cpu(be[1]); 60 *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); 61 62 return bytes; 63 } 64 65 static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, 66 const struct bch_inode_unpacked *inode) 67 { 68 struct bkey_i_inode_v3 *k = &packed->inode; 69 u8 *out = k->v.fields; 70 u8 *end = (void *) &packed[1]; 71 u8 *last_nonzero_field = out; 72 unsigned nr_fields = 0, last_nonzero_fieldnr = 0; 73 unsigned bytes; 74 int ret; 75 76 bkey_inode_v3_init(&packed->inode.k_i); 77 packed->inode.k.p.offset = inode->bi_inum; 78 packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); 79 packed->inode.v.bi_hash_seed = inode->bi_hash_seed; 80 packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); 81 packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); 82 packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); 83 packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); 84 SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); 85 SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); 86 87 88 #define x(_name, _bits) \ 89 nr_fields++; \ 90 \ 91 if (inode->_name) { \ 92 ret = bch2_varint_encode_fast(out, inode->_name); \ 93 out += ret; \ 94 \ 95 if (_bits > 64) \ 96 *out++ = 0; \ 97 \ 98 last_nonzero_field = out; \ 99 last_nonzero_fieldnr = nr_fields; \ 100 } else { \ 101 *out++ = 0; \ 102 \ 103 if (_bits > 64) \ 104 *out++ = 0; \ 105 } 106 107 BCH_INODE_FIELDS_v3() 108 #undef x 109 BUG_ON(out > end); 110 111 out = last_nonzero_field; 112 nr_fields = last_nonzero_fieldnr; 113 114 bytes = out - (u8 *) &packed->inode.v; 115 set_bkey_val_bytes(&packed->inode.k, bytes); 116 memset_u64s_tail(&packed->inode.v, 0, bytes); 117 118 SET_INODEv3_NR_FIELDS(&k->v, nr_fields); 119 120 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { 121 struct bch_inode_unpacked unpacked; 122 123 ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); 124 BUG_ON(ret); 125 BUG_ON(unpacked.bi_inum != inode->bi_inum); 126 BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); 127 
BUG_ON(unpacked.bi_sectors != inode->bi_sectors); 128 BUG_ON(unpacked.bi_size != inode->bi_size); 129 BUG_ON(unpacked.bi_version != inode->bi_version); 130 BUG_ON(unpacked.bi_mode != inode->bi_mode); 131 132 #define x(_name, _bits) if (unpacked._name != inode->_name) \ 133 panic("unpacked %llu should be %llu", \ 134 (u64) unpacked._name, (u64) inode->_name); 135 BCH_INODE_FIELDS_v3() 136 #undef x 137 } 138 } 139 140 void bch2_inode_pack(struct bkey_inode_buf *packed, 141 const struct bch_inode_unpacked *inode) 142 { 143 bch2_inode_pack_inlined(packed, inode); 144 } 145 146 static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, 147 struct bch_inode_unpacked *unpacked) 148 { 149 const u8 *in = inode.v->fields; 150 const u8 *end = bkey_val_end(inode); 151 u64 field[2]; 152 unsigned fieldnr = 0, field_bits; 153 int ret; 154 155 #define x(_name, _bits) \ 156 if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ 157 unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ 158 memset((void *) unpacked + offset, 0, \ 159 sizeof(*unpacked) - offset); \ 160 return 0; \ 161 } \ 162 \ 163 ret = inode_decode_field(in, end, field, &field_bits); \ 164 if (ret < 0) \ 165 return ret; \ 166 \ 167 if (field_bits > sizeof(unpacked->_name) * 8) \ 168 return -1; \ 169 \ 170 unpacked->_name = field[1]; \ 171 in += ret; 172 173 BCH_INODE_FIELDS_v2() 174 #undef x 175 176 /* XXX: signal if there were more fields than expected? 
*/ 177 return 0; 178 } 179 180 static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, 181 const u8 *in, const u8 *end, 182 unsigned nr_fields) 183 { 184 unsigned fieldnr = 0; 185 int ret; 186 u64 v[2]; 187 188 #define x(_name, _bits) \ 189 if (fieldnr < nr_fields) { \ 190 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 191 if (ret < 0) \ 192 return ret; \ 193 in += ret; \ 194 \ 195 if (_bits > 64) { \ 196 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 197 if (ret < 0) \ 198 return ret; \ 199 in += ret; \ 200 } else { \ 201 v[1] = 0; \ 202 } \ 203 } else { \ 204 v[0] = v[1] = 0; \ 205 } \ 206 \ 207 unpacked->_name = v[0]; \ 208 if (v[1] || v[0] != unpacked->_name) \ 209 return -1; \ 210 fieldnr++; 211 212 BCH_INODE_FIELDS_v2() 213 #undef x 214 215 /* XXX: signal if there were more fields than expected? */ 216 return 0; 217 } 218 219 static int bch2_inode_unpack_v3(struct bkey_s_c k, 220 struct bch_inode_unpacked *unpacked) 221 { 222 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 223 const u8 *in = inode.v->fields; 224 const u8 *end = bkey_val_end(inode); 225 unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); 226 unsigned fieldnr = 0; 227 int ret; 228 u64 v[2]; 229 230 unpacked->bi_inum = inode.k->p.offset; 231 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 232 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 233 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 234 unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); 235 unpacked->bi_size = le64_to_cpu(inode.v->bi_size); 236 unpacked->bi_version = le64_to_cpu(inode.v->bi_version); 237 unpacked->bi_mode = INODEv3_MODE(inode.v); 238 239 #define x(_name, _bits) \ 240 if (fieldnr < nr_fields) { \ 241 ret = bch2_varint_decode_fast(in, end, &v[0]); \ 242 if (ret < 0) \ 243 return ret; \ 244 in += ret; \ 245 \ 246 if (_bits > 64) { \ 247 ret = bch2_varint_decode_fast(in, end, &v[1]); \ 248 if (ret < 0) \ 249 return ret; \ 250 in += ret; \ 251 } else { \ 252 v[1] = 0; \ 253 } 
\ 254 } else { \ 255 v[0] = v[1] = 0; \ 256 } \ 257 \ 258 unpacked->_name = v[0]; \ 259 if (v[1] || v[0] != unpacked->_name) \ 260 return -1; \ 261 fieldnr++; 262 263 BCH_INODE_FIELDS_v3() 264 #undef x 265 266 /* XXX: signal if there were more fields than expected? */ 267 return 0; 268 } 269 270 static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, 271 struct bch_inode_unpacked *unpacked) 272 { 273 memset(unpacked, 0, sizeof(*unpacked)); 274 275 switch (k.k->type) { 276 case KEY_TYPE_inode: { 277 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 278 279 unpacked->bi_inum = inode.k->p.offset; 280 unpacked->bi_journal_seq= 0; 281 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 282 unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); 283 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 284 285 if (INODE_NEW_VARINT(inode.v)) { 286 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 287 bkey_val_end(inode), 288 INODE_NR_FIELDS(inode.v)); 289 } else { 290 return bch2_inode_unpack_v1(inode, unpacked); 291 } 292 break; 293 } 294 case KEY_TYPE_inode_v2: { 295 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 296 297 unpacked->bi_inum = inode.k->p.offset; 298 unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); 299 unpacked->bi_hash_seed = inode.v->bi_hash_seed; 300 unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); 301 unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); 302 303 return bch2_inode_unpack_v2(unpacked, inode.v->fields, 304 bkey_val_end(inode), 305 INODEv2_NR_FIELDS(inode.v)); 306 } 307 default: 308 BUG(); 309 } 310 } 311 312 int bch2_inode_unpack(struct bkey_s_c k, 313 struct bch_inode_unpacked *unpacked) 314 { 315 if (likely(k.k->type == KEY_TYPE_inode_v3)) 316 return bch2_inode_unpack_v3(k, unpacked); 317 return bch2_inode_unpack_slowpath(k, unpacked); 318 } 319 320 static int bch2_inode_peek_nowarn(struct btree_trans *trans, 321 struct btree_iter *iter, 322 struct bch_inode_unpacked *inode, 323 subvol_inum inum, 
unsigned flags) 324 { 325 struct bkey_s_c k; 326 u32 snapshot; 327 int ret; 328 329 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 330 if (ret) 331 return ret; 332 333 k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, 334 SPOS(0, inum.inum, snapshot), 335 flags|BTREE_ITER_CACHED); 336 ret = bkey_err(k); 337 if (ret) 338 return ret; 339 340 ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; 341 if (ret) 342 goto err; 343 344 ret = bch2_inode_unpack(k, inode); 345 if (ret) 346 goto err; 347 348 return 0; 349 err: 350 bch2_trans_iter_exit(trans, iter); 351 return ret; 352 } 353 354 int bch2_inode_peek(struct btree_trans *trans, 355 struct btree_iter *iter, 356 struct bch_inode_unpacked *inode, 357 subvol_inum inum, unsigned flags) 358 { 359 int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); 360 bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); 361 return ret; 362 } 363 364 int bch2_inode_write(struct btree_trans *trans, 365 struct btree_iter *iter, 366 struct bch_inode_unpacked *inode) 367 { 368 struct bkey_inode_buf *inode_p; 369 370 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 371 if (IS_ERR(inode_p)) 372 return PTR_ERR(inode_p); 373 374 bch2_inode_pack_inlined(inode_p, inode); 375 inode_p->inode.k.p.snapshot = iter->snapshot; 376 return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); 377 } 378 379 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) 380 { 381 struct bch_inode_unpacked u; 382 struct bkey_inode_buf *inode_p; 383 int ret; 384 385 if (!bkey_is_inode(&k->k)) 386 return ERR_PTR(-ENOENT); 387 388 inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); 389 if (IS_ERR(inode_p)) 390 return ERR_CAST(inode_p); 391 392 ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); 393 if (ret) 394 return ERR_PTR(ret); 395 396 bch2_inode_pack(inode_p, &u); 397 return &inode_p->inode.k_i; 398 } 399 400 static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf 
*err) 401 { 402 struct bch_inode_unpacked unpacked; 403 404 if (k.k->p.inode) { 405 prt_printf(err, "nonzero k.p.inode"); 406 return -BCH_ERR_invalid_bkey; 407 } 408 409 if (k.k->p.offset < BLOCKDEV_INODE_MAX) { 410 prt_printf(err, "fs inode in blockdev range"); 411 return -BCH_ERR_invalid_bkey; 412 } 413 414 if (bch2_inode_unpack(k, &unpacked)) { 415 prt_printf(err, "invalid variable length fields"); 416 return -BCH_ERR_invalid_bkey; 417 } 418 419 if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { 420 prt_printf(err, "invalid data checksum type (%u >= %u", 421 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); 422 return -BCH_ERR_invalid_bkey; 423 } 424 425 if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { 426 prt_printf(err, "invalid data checksum type (%u >= %u)", 427 unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); 428 return -BCH_ERR_invalid_bkey; 429 } 430 431 if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && 432 unpacked.bi_nlink != 0) { 433 prt_printf(err, "flagged as unlinked but bi_nlink != 0"); 434 return -BCH_ERR_invalid_bkey; 435 } 436 437 if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { 438 prt_printf(err, "subvolume root but not a directory"); 439 return -BCH_ERR_invalid_bkey; 440 } 441 442 return 0; 443 } 444 445 int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, 446 enum bkey_invalid_flags flags, 447 struct printbuf *err) 448 { 449 struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); 450 451 if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { 452 prt_printf(err, "invalid str hash type (%llu >= %u)", 453 INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); 454 return -BCH_ERR_invalid_bkey; 455 } 456 457 return __bch2_inode_invalid(k, err); 458 } 459 460 int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, 461 enum bkey_invalid_flags flags, 462 struct printbuf *err) 463 { 464 struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); 465 466 if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { 467 
prt_printf(err, "invalid str hash type (%llu >= %u)", 468 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); 469 return -BCH_ERR_invalid_bkey; 470 } 471 472 return __bch2_inode_invalid(k, err); 473 } 474 475 int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, 476 enum bkey_invalid_flags flags, 477 struct printbuf *err) 478 { 479 struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); 480 481 if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || 482 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { 483 prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", 484 INODEv3_FIELDS_START(inode.v), 485 INODEv3_FIELDS_START_INITIAL, 486 bkey_val_u64s(inode.k)); 487 return -BCH_ERR_invalid_bkey; 488 } 489 490 if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { 491 prt_printf(err, "invalid str hash type (%llu >= %u)", 492 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); 493 return -BCH_ERR_invalid_bkey; 494 } 495 496 return __bch2_inode_invalid(k, err); 497 } 498 499 static void __bch2_inode_unpacked_to_text(struct printbuf *out, 500 struct bch_inode_unpacked *inode) 501 { 502 prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", 503 inode->bi_mode, inode->bi_flags, 504 inode->bi_journal_seq, 505 inode->bi_size, 506 inode->bi_sectors, 507 inode->bi_version); 508 509 #define x(_name, _bits) \ 510 prt_printf(out, " "#_name " %llu", (u64) inode->_name); 511 BCH_INODE_FIELDS_v3() 512 #undef x 513 } 514 515 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) 516 { 517 prt_printf(out, "inum: %llu ", inode->bi_inum); 518 __bch2_inode_unpacked_to_text(out, inode); 519 } 520 521 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 522 { 523 struct bch_inode_unpacked inode; 524 525 if (bch2_inode_unpack(k, &inode)) { 526 prt_printf(out, "(unpack error)"); 527 return; 528 } 529 530 __bch2_inode_unpacked_to_text(out, &inode); 531 } 532 
/* Read bi_flags from any inode key version (0 for non-inode keys): */
static inline u64 bkey_inode_flags(struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_inode:
		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
	case KEY_TYPE_inode_v2:
		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
	case KEY_TYPE_inode_v3:
		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
	default:
		return 0;
	}
}

static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
{
	return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
}

/*
 * Transactional trigger for inode keys: accounts nr_inodes deltas and keeps
 * the deleted_inodes btree in sync with the UNLINKED flag.
 */
int bch2_trans_mark_inode(struct btree_trans *trans,
			  enum btree_id btree_id, unsigned level,
			  struct bkey_s_c old,
			  struct bkey_i *new,
			  unsigned flags)
{
	/* +1 inode created, -1 inode deleted, 0 inode overwritten in place: */
	int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
	bool old_deleted = bkey_is_deleted_inode(old);
	bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));

	if (nr) {
		int ret = bch2_replicas_deltas_realloc(trans, 0);
		struct replicas_delta_list *d = trans->fs_usage_deltas;

		if (ret)
			return ret;

		d->nr_inodes += nr;
	}

	if (old_deleted != new_deleted) {
		/* Add/remove this inode in the deleted_inodes bitset btree: */
		int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Non-transactional (mark) trigger: stamps bi_journal_seq on insert and
 * maintains nr_inodes in fs usage during GC.
 */
int bch2_mark_inode(struct btree_trans *trans,
		    enum btree_id btree_id, unsigned level,
		    struct bkey_s_c old, struct bkey_s_c new,
		    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bch_fs_usage *fs_usage;
	u64 journal_seq = trans->journal_res.seq;

	if (flags & BTREE_TRIGGER_INSERT) {
		struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;

		BUG_ON(!journal_seq);
		BUG_ON(new.k->type != KEY_TYPE_inode_v3);

		v->bi_journal_seq = cpu_to_le64(journal_seq);
	}

	if (flags & BTREE_TRIGGER_GC) {
		percpu_down_read(&c->mark_lock);
		preempt_disable();

		fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
		fs_usage->nr_inodes += bkey_is_inode(new.k);
		fs_usage->nr_inodes -= bkey_is_inode(old.k);

		preempt_enable();
		percpu_up_read(&c->mark_lock);
	}
	return 0;
}

int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
				  enum bkey_invalid_flags flags,
				  struct printbuf *err)
{
	if (k.k->p.inode) {
		prt_printf(err, "nonzero k.p.inode");
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
				   struct bkey_s_c k)
{
	struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);

	prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}

/* Zero a new in-memory inode and set up str_hash type and hash seed: */
void bch2_inode_init_early(struct bch_fs *c,
			   struct bch_inode_unpacked *inode_u)
{
	enum bch_str_hash_type str_hash =
		bch2_str_hash_opt_to_type(c, c->opts.str_hash);

	memset(inode_u, 0, sizeof(*inode_u));

	/* ick */
	inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
	get_random_bytes(&inode_u->bi_hash_seed,
			 sizeof(inode_u->bi_hash_seed));
}

/*
 * Fill in ownership, mode, timestamps; apply setgid semantics and inherit
 * IO options from @parent if present.
 */
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
			  struct bch_inode_unpacked *parent)
{
	inode_u->bi_mode	= mode;
	inode_u->bi_uid		= uid;
	inode_u->bi_gid		= gid;
	inode_u->bi_dev		= rdev;
	inode_u->bi_atime	= now;
	inode_u->bi_mtime	= now;
	inode_u->bi_ctime	= now;
	inode_u->bi_otime	= now;

	if (parent && parent->bi_mode & S_ISGID) {
		inode_u->bi_gid = parent->bi_gid;
		if (S_ISDIR(mode))
			inode_u->bi_mode |= S_ISGID;
	}

	if (parent) {
		/* Inherit all per-inode IO options from the parent: */
#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
		BCH_INODE_OPTS()
#undef x
	}
}

void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
		     struct bch_inode_unpacked *parent)
{
	bch2_inode_init_early(c, inode_u);
	bch2_inode_init_late(inode_u, bch2_current_time(c),
			     uid, gid, mode, rdev, parent);
}

static inline u32 bkey_generation(struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_inode:
	case KEY_TYPE_inode_v2:
		BUG();
	case KEY_TYPE_inode_generation:
		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
	default:
		return 0;
	}
}

/*
 * This just finds an empty slot:
 */
int bch2_inode_create(struct btree_trans *trans,
		      struct btree_iter *iter,
		      struct bch_inode_unpacked *inode_u,
		      u32 snapshot, u64 cpu)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	u64 min, max, start, pos, *hint;
	int ret = 0;
	unsigned bits = (c->opts.inodes_32bit ? 31 : 63);

	if (c->opts.shard_inode_numbers) {
		/* Per-cpu shards: high bits of the inum select the shard: */
		bits -= c->inode_shard_bits;

		min = (cpu << bits);
		max = (cpu << bits) | ~(ULLONG_MAX << bits);

		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
		hint = c->unused_inode_hints + cpu;
	} else {
		min = BLOCKDEV_INODE_MAX;
		max = ~(ULLONG_MAX << bits);
		hint = c->unused_inode_hints;
	}

	start = READ_ONCE(*hint);

	if (start >= max || start < min)
		start = min;

	pos = start;
	bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
			     BTREE_ITER_ALL_SNAPSHOTS|
			     BTREE_ITER_INTENT);
again:
	/* Scan forward for a gap between existing inode keys: */
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_lt(k.k->p, POS(0, max))) {
		if (pos < iter->pos.offset)
			goto found_slot;

		/*
		 * We don't need to iterate over keys in every snapshot once
		 * we've found just one:
		 */
		pos = iter->pos.offset + 1;
		bch2_btree_iter_set_pos(iter, POS(0, pos));
	}

	if (!ret && pos < max)
		goto found_slot;

	if (!ret && start == min)
		ret = -BCH_ERR_ENOSPC_inode_create;

	if (ret) {
		bch2_trans_iter_exit(trans, iter);
		return ret;
	}

	/* Retry from start */
	pos = start = min;
	bch2_btree_iter_set_pos(iter, POS(0, pos));
	goto again;
found_slot:
	bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret) {
		bch2_trans_iter_exit(trans, iter);
		return ret;
	}

	*hint			= k.k->p.offset;
	inode_u->bi_inum	= k.k->p.offset;
	inode_u->bi_generation	= bkey_generation(k);
	return 0;
}

/* Delete all keys belonging to @inum in btree @id, committing as we go: */
static int bch2_inode_delete_keys(struct btree_trans *trans,
				  subvol_inum inum, enum btree_id id)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i delete;
	struct bpos end = POS(inum.inum, U64_MAX);
	u32 snapshot;
	int ret = 0;

	/*
	 * We're never going to be deleting partial extents, no need to use an
	 * extent iterator:
	 */
	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
			     BTREE_ITER_INTENT);

	while (1) {
		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k)
			break;

		bkey_init(&delete.k);
		delete.k.p = iter.pos;

		if (iter.flags & BTREE_ITER_IS_EXTENTS)
			bch2_key_resize(&delete.k,
					bpos_min(end, k.k->p).offset -
					iter.pos.offset);

		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
		      bch2_trans_commit(trans, NULL, NULL,
					BTREE_INSERT_NOFAIL);
err:
		/* Transaction restarts just loop and retry: */
		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Fully remove an inode: delete its extents, xattrs and dirents, then
 * replace the inode key with a bumped inode_generation key.
 */
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	/*
	 * If this was a directory, there shouldn't be any real dirents left -
	 * but there could be whiteouts (from hash collisions) that we should
	 * delete:
	 *
	 * XXX: the dirent could ideally would delete whiteouts when they're no
	 * longer needed
	 */
	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum.inum, snapshot),
			       BTREE_ITER_INTENT|BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum.inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Leave behind a generation key so the inum can be safely reused: */
	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	return ret;
}

int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
				  subvol_inum inum,
				  struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
				  subvol_inum inum,
				  struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
			    struct bch_inode_unpacked *inode)
{
	return bch2_trans_do(c, NULL, NULL, 0,
		bch2_inode_find_by_inum_trans(trans, inum, inode));
}

/*
 * Increment the link count; the UNLINKED flag represents "link count would
 * be -1", so clearing it takes the count from -1 to 0.
 */
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
	if (bi->bi_flags & BCH_INODE_UNLINKED)
		bi->bi_flags &= ~BCH_INODE_UNLINKED;
	else {
		if (bi->bi_nlink == U32_MAX)
			return -EINVAL;

		bi->bi_nlink++;
	}

	return 0;
}

/* Decrement the link count, flagging UNLINKED when it would go below 0: */
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
					bi->bi_inum);
		return;
	}

	if (bi->bi_flags & BCH_INODE_UNLINKED) {
		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
		return;
	}

	if (bi->bi_nlink)
		bi->bi_nlink--;
	else
		bi->bi_flags |= BCH_INODE_UNLINKED;
}

/*
 * Convert per-inode option overrides to a bch_opts struct; inode option
 * values are stored off by one so 0 can mean "not set".
 */
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
{
	struct bch_opts ret = { 0 };
#define x(_name, _bits)							\
	if (inode->bi_##_name)						\
		opt_set(ret, _name, inode->bi_##_name - 1);
	BCH_INODE_OPTS()
#undef x
	return ret;
}

/* Compute effective IO options: inode overrides on top of fs defaults: */
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
			 struct bch_inode_unpacked *inode)
{
#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
	BCH_INODE_OPTS()
#undef x

	if (opts->nocow)
		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}

/*
 * Remove one snapshot-specific inode and its keys; returns a nested
 * transaction restart on success so the caller's scan starts over.
 */
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	int ret;

	do {
		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL);
	} while (ret == -BCH_ERR_transaction_restart_nested);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Subvolume root? */
	if (inode_u.bi_subvol)
		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);

	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	return ret ?: -BCH_ERR_transaction_restart_nested;
}

/*
 * Decide whether a deleted_inodes entry should actually be deleted:
 * returns 1 to delete the inode, 0 to skip, negative on error. Bogus
 * entries (fsck_err cases) are removed from the deleted_inodes btree.
 */
static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	int ret;

	if (bch2_snapshot_is_internal_node(c, pos.snapshot))
		return 0;

	if (!fsck_err_on(c->sb.clean, c,
			 "filesystem marked as clean but have deleted inode %llu:%u",
			 pos.offset, pos.snapshot))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret)
		return ret;

	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
	if (fsck_err_on(!bkey_is_inode(k.k), c,
			"nonexistent inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
			"directory %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
			"non-deleted inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	return 1;
err:
fsck_err:
	return ret;
delete:
	/* Drop the bogus entry from the deleted_inodes btree: */
	return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
}

/* Walk the deleted_inodes btree and remove each still-unlinked inode: */
int bch2_delete_dead_inodes(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	/*
	 * Weird transaction restart handling here because on successful delete,
	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
	 * but we can't retry because the btree write buffer won't have been
	 * flushed and we'd spin:
	 */
	for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
			   BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
		if (ret < 0)
			break;

		if (ret) {
			if (!test_bit(BCH_FS_RW, &c->flags)) {
				bch2_trans_unlock(trans);
				bch2_fs_lazy_rw(c);
			}

			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	bch2_trans_put(trans);

	return ret;
}