1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "acl.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "chardev.h" 10 #include "dirent.h" 11 #include "errcode.h" 12 #include "extents.h" 13 #include "fs.h" 14 #include "fs-common.h" 15 #include "fs-io.h" 16 #include "fs-ioctl.h" 17 #include "fs-io-buffered.h" 18 #include "fs-io-direct.h" 19 #include "fs-io-pagecache.h" 20 #include "fsck.h" 21 #include "inode.h" 22 #include "io_read.h" 23 #include "journal.h" 24 #include "keylist.h" 25 #include "quota.h" 26 #include "rebalance.h" 27 #include "snapshot.h" 28 #include "super.h" 29 #include "xattr.h" 30 #include "trace.h" 31 32 #include <linux/aio.h> 33 #include <linux/backing-dev.h> 34 #include <linux/exportfs.h> 35 #include <linux/fiemap.h> 36 #include <linux/fs_context.h> 37 #include <linux/module.h> 38 #include <linux/pagemap.h> 39 #include <linux/posix_acl.h> 40 #include <linux/random.h> 41 #include <linux/seq_file.h> 42 #include <linux/siphash.h> 43 #include <linux/statfs.h> 44 #include <linux/string.h> 45 #include <linux/xattr.h> 46 47 static struct kmem_cache *bch2_inode_cache; 48 49 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, 50 struct bch_inode_info *, 51 struct bch_inode_unpacked *, 52 struct bch_subvolume *); 53 54 void bch2_inode_update_after_write(struct btree_trans *trans, 55 struct bch_inode_info *inode, 56 struct bch_inode_unpacked *bi, 57 unsigned fields) 58 { 59 struct bch_fs *c = trans->c; 60 61 BUG_ON(bi->bi_inum != inode->v.i_ino); 62 63 bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); 64 65 set_nlink(&inode->v, bch2_inode_nlink_get(bi)); 66 i_uid_write(&inode->v, bi->bi_uid); 67 i_gid_write(&inode->v, bi->bi_gid); 68 inode->v.i_mode = bi->bi_mode; 69 70 if (fields & ATTR_SIZE) 71 i_size_write(&inode->v, bi->bi_size); 72 73 if (fields & ATTR_ATIME) 74 inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); 75 if (fields & ATTR_MTIME) 76 inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); 77 if (fields & ATTR_CTIME) 78 inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); 79 80 inode->ei_inode = *bi; 81 82 bch2_inode_flags_to_vfs(inode); 83 } 84 85 int __must_check bch2_write_inode(struct bch_fs *c, 86 struct bch_inode_info *inode, 87 inode_set_fn set, 88 void *p, unsigned fields) 89 { 90 struct btree_trans *trans = bch2_trans_get(c); 91 struct btree_iter iter = { NULL }; 92 struct bch_inode_unpacked inode_u; 93 int ret; 94 retry: 95 bch2_trans_begin(trans); 96 97 ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); 98 if (ret) 99 goto err; 100 101 struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); 102 103 ret = (set ? set(trans, inode, &inode_u, p) : 0); 104 if (ret) 105 goto err; 106 107 struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); 108 109 if (memcmp(&old_r, &new_r, sizeof(new_r))) { 110 ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); 111 if (ret) 112 goto err; 113 } 114 115 ret = bch2_inode_write(trans, &iter, &inode_u) ?: 116 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 117 118 /* 119 * the btree node lock protects inode->ei_inode, not ei_update_lock; 120 * this is important for inode updates via bchfs_write_index_update 121 */ 122 if (!ret) 123 bch2_inode_update_after_write(trans, inode, &inode_u, fields); 124 err: 125 bch2_trans_iter_exit(trans, &iter); 126 127 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 128 goto retry; 129 130 bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, 131 "%s: inode %llu:%llu not found when updating", 132 bch2_err_str(ret), 133 inode_inum(inode).subvol, 134 inode_inum(inode).inum); 135 136 bch2_trans_put(trans); 137 return ret < 0 ? ret : 0; 138 } 139 140 int bch2_fs_quota_transfer(struct bch_fs *c, 141 struct bch_inode_info *inode, 142 struct bch_qid new_qid, 143 unsigned qtypes, 144 enum quota_acct_mode mode) 145 { 146 unsigned i; 147 int ret; 148 149 qtypes &= enabled_qtypes(c); 150 151 for (i = 0; i < QTYP_NR; i++) 152 if (new_qid.q[i] == inode->ei_qid.q[i]) 153 qtypes &= ~(1U << i); 154 155 if (!qtypes) 156 return 0; 157 158 mutex_lock(&inode->ei_quota_lock); 159 160 ret = bch2_quota_transfer(c, qtypes, new_qid, 161 inode->ei_qid, 162 inode->v.i_blocks + 163 inode->ei_quota_reserved, 164 mode); 165 if (!ret) 166 for (i = 0; i < QTYP_NR; i++) 167 if (qtypes & (1 << i)) 168 inode->ei_qid.q[i] = new_qid.q[i]; 169 170 mutex_unlock(&inode->ei_quota_lock); 171 172 return ret; 173 } 174 175 static bool subvol_inum_eq(subvol_inum a, subvol_inum b) 176 { 177 return a.subvol == b.subvol && a.inum == b.inum; 178 } 179 180 static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) 181 { 182 const subvol_inum *inum = data; 183 siphash_key_t k = { .key[0] = seed }; 184 185 return siphash_2u64(inum->subvol, inum->inum, &k); 186 } 187 188 static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) 189 { 190 const struct bch_inode_info *inode = data; 191 192 return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); 193 } 194 195 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, 196 const void *obj) 197 { 198 const struct bch_inode_info *inode = obj; 199 const subvol_inum *v = arg->key; 200 201 return !subvol_inum_eq(inode->ei_inum, *v); 202 } 203 204 static const struct rhashtable_params bch2_vfs_inodes_params = { 205 .head_offset = offsetof(struct bch_inode_info, hash), 206 .key_offset = offsetof(struct bch_inode_info, ei_inum), 207 .key_len = sizeof(subvol_inum), 208 .hashfn = bch2_vfs_inode_hash_fn, 209 .obj_hashfn = bch2_vfs_inode_obj_hash_fn, 210 .obj_cmpfn = bch2_vfs_inode_cmp_fn, 211 .automatic_shrinking = true, 212 }; 213 214 static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { 215 .head_offset = offsetof(struct bch_inode_info, by_inum_hash), 216 .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), 217 .key_len = sizeof(u64), 218 .automatic_shrinking = true, 219 }; 220 221 int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) 222 { 223 struct bch_fs *c = trans->c; 224 struct rhltable *ht = &c->vfs_inodes_by_inum_table; 225 u64 inum = p.offset; 226 DARRAY(u32) subvols; 227 int ret = 0; 228 229 if (!test_bit(BCH_FS_started, &c->flags)) 230 return false; 231 232 darray_init(&subvols); 233 restart_from_top: 234 235 /* 236 * Tweaked version of __rhashtable_lookup(); we need to get a list of 237 * subvolumes in which the given inode number is open. 238 * 239 * For this to work, we don't include the subvolume ID in the key that 240 * we hash - all inodes with the same inode number regardless of 241 * subvolume will hash to the same slot. 242 * 243 * This will be less than ideal if the same file is ever open 244 * simultaneously in many different snapshots: 245 */ 246 rcu_read_lock(); 247 struct rhash_lock_head __rcu *const *bkt; 248 struct rhash_head *he; 249 unsigned int hash; 250 struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); 251 restart: 252 hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); 253 bkt = rht_bucket(tbl, hash); 254 do { 255 struct bch_inode_info *inode; 256 257 rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { 258 if (inode->ei_inum.inum == inum) { 259 ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, 260 GFP_NOWAIT|__GFP_NOWARN); 261 if (ret) { 262 rcu_read_unlock(); 263 ret = darray_make_room(&subvols, 1); 264 if (ret) 265 goto err; 266 subvols.nr = 0; 267 goto restart_from_top; 268 } 269 } 270 } 271 /* An object might have been moved to a different hash chain, 272 * while we walk along it - better check and retry. 273 */ 274 } while (he != RHT_NULLS_MARKER(bkt)); 275 276 /* Ensure we see any new tables. */ 277 smp_rmb(); 278 279 tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); 280 if (unlikely(tbl)) 281 goto restart; 282 rcu_read_unlock(); 283 284 darray_for_each(subvols, i) { 285 u32 snap; 286 ret = bch2_subvolume_get_snapshot(trans, *i, &snap); 287 if (ret) 288 goto err; 289 290 ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); 291 if (ret) 292 break; 293 } 294 err: 295 darray_exit(&subvols); 296 return ret; 297 } 298 299 static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 300 { 301 return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); 302 } 303 304 static void __wait_on_freeing_inode(struct bch_fs *c, 305 struct bch_inode_info *inode, 306 subvol_inum inum) 307 { 308 wait_queue_head_t *wq; 309 struct wait_bit_queue_entry wait; 310 311 wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); 312 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 313 spin_unlock(&inode->v.i_lock); 314 315 if (__bch2_inode_hash_find(c, inum) == inode) 316 schedule_timeout(HZ * 10); 317 finish_wait(wq, &wait.wq_entry); 318 } 319 320 static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, 321 subvol_inum inum) 322 { 323 struct bch_inode_info *inode; 324 repeat: 325 inode = __bch2_inode_hash_find(c, inum); 326 if (inode) { 327 spin_lock(&inode->v.i_lock); 328 if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { 329 spin_unlock(&inode->v.i_lock); 330 return NULL; 331 } 332 if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { 333 if (!trans) { 334 __wait_on_freeing_inode(c, inode, inum); 335 } else { 336 bch2_trans_unlock(trans); 337 __wait_on_freeing_inode(c, inode, inum); 338 int ret = bch2_trans_relock(trans); 339 if (ret) 340 return ERR_PTR(ret); 341 } 342 goto repeat; 343 } 344 __iget(&inode->v); 345 spin_unlock(&inode->v.i_lock); 346 } 347 348 return inode; 349 } 350 351 static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) 352 { 353 spin_lock(&inode->v.i_lock); 354 bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); 355 spin_unlock(&inode->v.i_lock); 356 357 if (remove) { 358 int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, 359 &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); 360 BUG_ON(ret); 361 362 ret = rhashtable_remove_fast(&c->vfs_inodes_table, 363 &inode->hash, bch2_vfs_inodes_params); 364 BUG_ON(ret); 365 inode->v.i_hash.pprev = NULL; 366 /* 367 * This pairs with the bch2_inode_hash_find() -> 368 * __wait_on_freeing_inode() path 369 */ 370 inode_wake_up_bit(&inode->v, __I_NEW); 371 } 372 } 373 374 static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, 375 struct btree_trans *trans, 376 struct bch_inode_info *inode) 377 { 378 struct bch_inode_info *old = inode; 379 380 set_bit(EI_INODE_HASHED, &inode->ei_flags); 381 retry: 382 if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, 383 &inode->ei_inum, 384 &inode->hash, 385 bch2_vfs_inodes_params))) { 386 old = bch2_inode_hash_find(c, trans, inode->ei_inum); 387 if (!old) 388 goto retry; 389 390 clear_bit(EI_INODE_HASHED, &inode->ei_flags); 391 392 /* 393 * bcachefs doesn't use I_NEW; we have no use for it since we 394 * only insert fully created inodes in the inode hash table. But 395 * discard_new_inode() expects it to be set... 396 */ 397 inode->v.i_state |= I_NEW; 398 /* 399 * We don't want bch2_evict_inode() to delete the inode on disk, 400 * we just raced and had another inode in cache. Normally new 401 * inodes don't have nlink == 0 - except tmpfiles do... 402 */ 403 set_nlink(&inode->v, 1); 404 discard_new_inode(&inode->v); 405 return old; 406 } else { 407 int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, 408 &inode->by_inum_hash, 409 bch2_vfs_inodes_by_inum_params); 410 BUG_ON(ret); 411 412 inode_fake_hash(&inode->v); 413 414 inode_sb_list_add(&inode->v); 415 416 mutex_lock(&c->vfs_inodes_lock); 417 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 418 mutex_unlock(&c->vfs_inodes_lock); 419 return inode; 420 } 421 } 422 423 #define memalloc_flags_do(_flags, _do) \ 424 ({ \ 425 unsigned _saved_flags = memalloc_flags_save(_flags); \ 426 typeof(_do) _ret = _do; \ 427 memalloc_noreclaim_restore(_saved_flags); \ 428 _ret; \ 429 }) 430 431 static struct inode *bch2_alloc_inode(struct super_block *sb) 432 { 433 BUG(); 434 } 435 436 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) 437 { 438 struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, 439 bch2_inode_cache, gfp); 440 if (!inode) 441 return NULL; 442 443 inode_init_once(&inode->v); 444 mutex_init(&inode->ei_update_lock); 445 two_state_lock_init(&inode->ei_pagecache_lock); 446 INIT_LIST_HEAD(&inode->ei_vfs_inode_list); 447 inode->ei_flags = 0; 448 mutex_init(&inode->ei_quota_lock); 449 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 450 451 if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { 452 kmem_cache_free(bch2_inode_cache, inode); 453 return NULL; 454 } 455 456 return inode; 457 } 458 459 /* 460 * Allocate a new inode, dropping/retaking btree locks if necessary: 461 */ 462 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) 463 { 464 struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); 465 466 if (unlikely(!inode)) { 467 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); 468 if (ret && inode) { 469 __destroy_inode(&inode->v); 470 kmem_cache_free(bch2_inode_cache, inode); 471 } 472 if (ret) 473 return ERR_PTR(ret); 474 } 475 476 return inode; 477 } 478 479 static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, 480 subvol_inum inum, 481 struct bch_inode_unpacked *bi, 482 struct bch_subvolume *subvol) 483 { 484 struct bch_inode_info *inode = bch2_new_inode(trans); 485 if (IS_ERR(inode)) 486 return inode; 487 488 bch2_vfs_inode_init(trans, inum, inode, bi, subvol); 489 490 return bch2_inode_hash_insert(trans->c, trans, inode); 491 492 } 493 494 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) 495 { 496 struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); 497 if (inode) 498 return &inode->v; 499 500 struct btree_trans *trans = bch2_trans_get(c); 501 502 struct bch_inode_unpacked inode_u; 503 struct bch_subvolume subvol; 504 int ret = lockrestart_do(trans, 505 bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 506 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: 507 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 508 bch2_trans_put(trans); 509 510 return ret ? ERR_PTR(ret) : &inode->v; 511 } 512 513 struct bch_inode_info * 514 __bch2_create(struct mnt_idmap *idmap, 515 struct bch_inode_info *dir, struct dentry *dentry, 516 umode_t mode, dev_t rdev, subvol_inum snapshot_src, 517 unsigned flags) 518 { 519 struct bch_fs *c = dir->v.i_sb->s_fs_info; 520 struct btree_trans *trans; 521 struct bch_inode_unpacked dir_u; 522 struct bch_inode_info *inode; 523 struct bch_inode_unpacked inode_u; 524 struct posix_acl *default_acl = NULL, *acl = NULL; 525 subvol_inum inum; 526 struct bch_subvolume subvol; 527 u64 journal_seq = 0; 528 kuid_t kuid; 529 kgid_t kgid; 530 int ret; 531 532 /* 533 * preallocate acls + vfs inode before btree transaction, so that 534 * nothing can fail after the transaction succeeds: 535 */ 536 #ifdef CONFIG_BCACHEFS_POSIX_ACL 537 ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); 538 if (ret) 539 return ERR_PTR(ret); 540 #endif 541 inode = __bch2_new_inode(c, GFP_NOFS); 542 if (unlikely(!inode)) { 543 inode = ERR_PTR(-ENOMEM); 544 goto err; 545 } 546 547 bch2_inode_init_early(c, &inode_u); 548 549 if (!(flags & BCH_CREATE_TMPFILE)) 550 mutex_lock(&dir->ei_update_lock); 551 552 trans = bch2_trans_get(c); 553 retry: 554 bch2_trans_begin(trans); 555 556 kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); 557 kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); 558 ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: 559 bch2_create_trans(trans, 560 inode_inum(dir), &dir_u, &inode_u, 561 !(flags & BCH_CREATE_TMPFILE) 562 ? &dentry->d_name : NULL, 563 from_kuid(i_user_ns(&dir->v), kuid), 564 from_kgid(i_user_ns(&dir->v), kgid), 565 mode, rdev, 566 default_acl, acl, snapshot_src, flags) ?: 567 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, 568 KEY_TYPE_QUOTA_PREALLOC); 569 if (unlikely(ret)) 570 goto err_before_quota; 571 572 inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; 573 inum.inum = inode_u.bi_inum; 574 575 ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 576 bch2_trans_commit(trans, NULL, &journal_seq, 0); 577 if (unlikely(ret)) { 578 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, 579 KEY_TYPE_QUOTA_WARN); 580 err_before_quota: 581 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 582 goto retry; 583 goto err_trans; 584 } 585 586 if (!(flags & BCH_CREATE_TMPFILE)) { 587 bch2_inode_update_after_write(trans, dir, &dir_u, 588 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 589 mutex_unlock(&dir->ei_update_lock); 590 } 591 592 bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 593 594 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 595 set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); 596 597 /* 598 * we must insert the new inode into the inode cache before calling 599 * bch2_trans_exit() and dropping locks, else we could race with another 600 * thread pulling the inode in and modifying it: 601 * 602 * also, calling bch2_inode_hash_insert() without passing in the 603 * transaction object is sketchy - if we could ever end up in 604 * __wait_on_freeing_inode(), we'd risk deadlock. 605 * 606 * But that shouldn't be possible, since we still have the inode locked 607 * that we just created, and we _really_ can't take a transaction 608 * restart here. 609 */ 610 inode = bch2_inode_hash_insert(c, NULL, inode); 611 bch2_trans_put(trans); 612 err: 613 posix_acl_release(default_acl); 614 posix_acl_release(acl); 615 return inode; 616 err_trans: 617 if (!(flags & BCH_CREATE_TMPFILE)) 618 mutex_unlock(&dir->ei_update_lock); 619 620 bch2_trans_put(trans); 621 make_bad_inode(&inode->v); 622 iput(&inode->v); 623 inode = ERR_PTR(ret); 624 goto err; 625 } 626 627 /* methods */ 628 629 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, 630 subvol_inum dir, struct bch_hash_info *dir_hash_info, 631 const struct qstr *name) 632 { 633 struct bch_fs *c = trans->c; 634 struct btree_iter dirent_iter = {}; 635 subvol_inum inum = {}; 636 struct printbuf buf = PRINTBUF; 637 638 struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, 639 dir_hash_info, dir, name, 0); 640 int ret = bkey_err(k); 641 if (ret) 642 return ERR_PTR(ret); 643 644 ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); 645 if (ret > 0) 646 ret = -ENOENT; 647 if (ret) 648 goto err; 649 650 struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); 651 if (inode) 652 goto out; 653 654 struct bch_subvolume subvol; 655 struct bch_inode_unpacked inode_u; 656 ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 657 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: 658 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 659 660 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), 661 c, "dirent to missing inode:\n %s", 662 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 663 if (ret) 664 goto err; 665 666 /* regular files may have hardlinks: */ 667 if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && 668 !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), 669 c, 670 "dirent points to inode that does not point back:\n %s", 671 (bch2_bkey_val_to_text(&buf, c, k), 672 prt_printf(&buf, "\n "), 673 bch2_inode_unpacked_to_text(&buf, &inode_u), 674 buf.buf))) { 675 ret = -ENOENT; 676 goto err; 677 } 678 out: 679 bch2_trans_iter_exit(trans, &dirent_iter); 680 printbuf_exit(&buf); 681 return inode; 682 err: 683 inode = ERR_PTR(ret); 684 goto out; 685 } 686 687 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, 688 unsigned int flags) 689 { 690 struct bch_fs *c = vdir->i_sb->s_fs_info; 691 struct bch_inode_info *dir = to_bch_ei(vdir); 692 struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); 693 694 struct bch_inode_info *inode; 695 bch2_trans_do(c, 696 PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), 697 &hash, &dentry->d_name))); 698 if (IS_ERR(inode)) 699 inode = NULL; 700 701 return d_splice_alias(&inode->v, dentry); 702 } 703 704 static int bch2_mknod(struct mnt_idmap *idmap, 705 struct inode *vdir, struct dentry *dentry, 706 umode_t mode, dev_t rdev) 707 { 708 struct bch_inode_info *inode = 709 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 710 (subvol_inum) { 0 }, 0); 711 712 if (IS_ERR(inode)) 713 return bch2_err_class(PTR_ERR(inode)); 714 715 d_instantiate(dentry, &inode->v); 716 return 0; 717 } 718 719 static int bch2_create(struct mnt_idmap *idmap, 720 struct inode *vdir, struct dentry *dentry, 721 umode_t mode, bool excl) 722 { 723 return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); 724 } 725 726 static int __bch2_link(struct bch_fs *c, 727 struct bch_inode_info *inode, 728 struct bch_inode_info *dir, 729 struct dentry *dentry) 730 { 731 struct bch_inode_unpacked dir_u, inode_u; 732 int ret; 733 734 mutex_lock(&inode->ei_update_lock); 735 struct btree_trans *trans = bch2_trans_get(c); 736 737 ret = commit_do(trans, NULL, NULL, 0, 738 bch2_link_trans(trans, 739 inode_inum(dir), &dir_u, 740 inode_inum(inode), &inode_u, 741 &dentry->d_name)); 742 743 if (likely(!ret)) { 744 bch2_inode_update_after_write(trans, dir, &dir_u, 745 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 746 bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); 747 } 748 749 bch2_trans_put(trans); 750 mutex_unlock(&inode->ei_update_lock); 751 return ret; 752 } 753 754 static int bch2_link(struct dentry *old_dentry, struct inode *vdir, 755 struct dentry *dentry) 756 { 757 struct bch_fs *c = vdir->i_sb->s_fs_info; 758 struct bch_inode_info *dir = to_bch_ei(vdir); 759 struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); 760 int ret; 761 762 lockdep_assert_held(&inode->v.i_rwsem); 763 764 ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 765 bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 766 __bch2_link(c, inode, dir, dentry); 767 if (unlikely(ret)) 768 return bch2_err_class(ret); 769 770 ihold(&inode->v); 771 d_instantiate(dentry, &inode->v); 772 return 0; 773 } 774 775 int __bch2_unlink(struct inode *vdir, struct dentry *dentry, 776 bool deleting_snapshot) 777 { 778 struct bch_fs *c = vdir->i_sb->s_fs_info; 779 struct bch_inode_info *dir = to_bch_ei(vdir); 780 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 781 struct bch_inode_unpacked dir_u, inode_u; 782 int ret; 783 784 bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); 785 786 struct btree_trans *trans = bch2_trans_get(c); 787 788 ret = commit_do(trans, NULL, NULL, 789 BCH_TRANS_COMMIT_no_enospc, 790 bch2_unlink_trans(trans, 791 inode_inum(dir), &dir_u, 792 &inode_u, &dentry->d_name, 793 deleting_snapshot)); 794 if (unlikely(ret)) 795 goto err; 796 797 bch2_inode_update_after_write(trans, dir, &dir_u, 798 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 799 bch2_inode_update_after_write(trans, inode, &inode_u, 800 ATTR_MTIME); 801 802 if (inode_u.bi_subvol) { 803 /* 804 * Subvolume deletion is asynchronous, but we still want to tell 805 * the VFS that it's been deleted here: 806 */ 807 set_nlink(&inode->v, 0); 808 } 809 err: 810 bch2_trans_put(trans); 811 bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); 812 813 return ret; 814 } 815 816 static int bch2_unlink(struct inode *vdir, struct dentry *dentry) 817 { 818 struct bch_inode_info *dir= to_bch_ei(vdir); 819 struct bch_fs *c = dir->v.i_sb->s_fs_info; 820 821 int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 822 __bch2_unlink(vdir, dentry, false); 823 return bch2_err_class(ret); 824 } 825 826 static int bch2_symlink(struct mnt_idmap *idmap, 827 struct inode *vdir, struct dentry *dentry, 828 const char *symname) 829 { 830 struct bch_fs *c = vdir->i_sb->s_fs_info; 831 struct bch_inode_info *dir = to_bch_ei(vdir), *inode; 832 int ret; 833 834 inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, 835 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 836 if (IS_ERR(inode)) 837 return bch2_err_class(PTR_ERR(inode)); 838 839 inode_lock(&inode->v); 840 ret = page_symlink(&inode->v, symname, strlen(symname) + 1); 841 inode_unlock(&inode->v); 842 843 if (unlikely(ret)) 844 goto err; 845 846 ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); 847 if (unlikely(ret)) 848 goto err; 849 850 ret = __bch2_link(c, inode, dir, dentry); 851 if (unlikely(ret)) 852 goto err; 853 854 d_instantiate(dentry, &inode->v); 855 return 0; 856 err: 857 iput(&inode->v); 858 return bch2_err_class(ret); 859 } 860 861 static int bch2_mkdir(struct mnt_idmap *idmap, 862 struct inode *vdir, struct dentry *dentry, umode_t mode) 863 { 864 return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); 865 } 866 867 static int bch2_rename2(struct mnt_idmap *idmap, 868 struct inode *src_vdir, struct dentry *src_dentry, 869 struct inode *dst_vdir, struct dentry *dst_dentry, 870 unsigned flags) 871 { 872 struct bch_fs *c = src_vdir->i_sb->s_fs_info; 873 struct bch_inode_info *src_dir = to_bch_ei(src_vdir); 874 struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); 875 struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); 876 struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); 877 struct bch_inode_unpacked dst_dir_u, src_dir_u; 878 struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; 879 struct btree_trans *trans; 880 enum bch_rename_mode mode = flags & RENAME_EXCHANGE 881 ? BCH_RENAME_EXCHANGE 882 : dst_dentry->d_inode 883 ? BCH_RENAME_OVERWRITE : BCH_RENAME; 884 bool whiteout = !!(flags & RENAME_WHITEOUT); 885 int ret; 886 887 if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) 888 return -EINVAL; 889 890 if (mode == BCH_RENAME_OVERWRITE) { 891 ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 892 0, LLONG_MAX); 893 if (ret) 894 return ret; 895 } 896 897 bch2_lock_inodes(INODE_UPDATE_LOCK, 898 src_dir, 899 dst_dir, 900 src_inode, 901 dst_inode); 902 903 trans = bch2_trans_get(c); 904 905 ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: 906 bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); 907 if (ret) 908 goto err_tx_restart; 909 910 if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { 911 ret = bch2_fs_quota_transfer(c, src_inode, 912 dst_dir->ei_qid, 913 1 << QTYP_PRJ, 914 KEY_TYPE_QUOTA_PREALLOC); 915 if (ret) 916 goto err; 917 } 918 919 if (mode == BCH_RENAME_EXCHANGE && 920 inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { 921 ret = bch2_fs_quota_transfer(c, dst_inode, 922 src_dir->ei_qid, 923 1 << QTYP_PRJ, 924 KEY_TYPE_QUOTA_PREALLOC); 925 if (ret) 926 goto err; 927 } 928 retry: 929 bch2_trans_begin(trans); 930 931 ret = bch2_rename_trans(trans, 932 inode_inum(src_dir), &src_dir_u, 933 inode_inum(dst_dir), &dst_dir_u, 934 &src_inode_u, 935 &dst_inode_u, 936 &src_dentry->d_name, 937 &dst_dentry->d_name, 938 mode); 939 if (unlikely(ret)) 940 goto err_tx_restart; 941 942 if (whiteout) { 943 whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); 944 ret = PTR_ERR_OR_ZERO(whiteout_inode_u); 945 if (unlikely(ret)) 946 goto err_tx_restart; 947 bch2_inode_init_early(c, whiteout_inode_u); 948 949 ret = bch2_create_trans(trans, 950 inode_inum(src_dir), &src_dir_u, 951 whiteout_inode_u, 952 &src_dentry->d_name, 953 from_kuid(i_user_ns(&src_dir->v), current_fsuid()), 954 from_kgid(i_user_ns(&src_dir->v), current_fsgid()), 955 S_IFCHR|WHITEOUT_MODE, 0, 956 NULL, NULL, (subvol_inum) { 0 }, 0) ?: 957 bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, 958 KEY_TYPE_QUOTA_PREALLOC); 959 if (unlikely(ret)) 960 goto err_tx_restart; 961 } 962 963 ret = bch2_trans_commit(trans, NULL, NULL, 0); 964 if (unlikely(ret)) { 965 err_tx_restart: 966 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 967 goto retry; 968 goto err; 969 } 970 971 BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); 972 BUG_ON(dst_inode && 973 dst_inode->v.i_ino != dst_inode_u.bi_inum); 974 975 bch2_inode_update_after_write(trans, src_dir, &src_dir_u, 976 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 977 978 if (src_dir != dst_dir) 979 bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, 980 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 981 982 bch2_inode_update_after_write(trans, src_inode, &src_inode_u, 983 ATTR_CTIME); 984 985 if (dst_inode) 986 bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, 987 ATTR_CTIME); 988 err: 989 bch2_trans_put(trans); 990 991 bch2_fs_quota_transfer(c, src_inode, 992 bch_qid(&src_inode->ei_inode), 993 1 << QTYP_PRJ, 994 KEY_TYPE_QUOTA_NOCHECK); 995 if (dst_inode) 996 bch2_fs_quota_transfer(c, dst_inode, 997 bch_qid(&dst_inode->ei_inode), 998 1 << QTYP_PRJ, 999 KEY_TYPE_QUOTA_NOCHECK); 1000 1001 bch2_unlock_inodes(INODE_UPDATE_LOCK, 1002 src_dir, 1003 dst_dir, 1004 src_inode, 1005 dst_inode); 1006 1007 return bch2_err_class(ret); 1008 } 1009 1010 static void bch2_setattr_copy(struct mnt_idmap *idmap, 1011 struct bch_inode_info *inode, 1012 struct bch_inode_unpacked *bi, 1013 struct iattr *attr) 1014 { 1015 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1016 unsigned int ia_valid = attr->ia_valid; 1017 kuid_t kuid; 1018 kgid_t kgid; 1019 1020 if (ia_valid & ATTR_UID) { 1021 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 1022 bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); 1023 } 1024 if (ia_valid & ATTR_GID) { 1025 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 1026 bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); 1027 } 1028 1029 if (ia_valid & ATTR_SIZE) 1030 bi->bi_size = attr->ia_size; 1031 1032 if (ia_valid & ATTR_ATIME) 1033 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); 1034 if (ia_valid & ATTR_MTIME) 1035 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); 1036 if (ia_valid & ATTR_CTIME) 1037 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); 1038 1039 if (ia_valid & ATTR_MODE) { 1040 umode_t mode = attr->ia_mode; 1041 kgid_t gid = ia_valid & ATTR_GID 1042 ? kgid 1043 : inode->v.i_gid; 1044 1045 if (!in_group_or_capable(idmap, &inode->v, 1046 make_vfsgid(idmap, i_user_ns(&inode->v), gid))) 1047 mode &= ~S_ISGID; 1048 bi->bi_mode = mode; 1049 } 1050 } 1051 1052 int bch2_setattr_nonsize(struct mnt_idmap *idmap, 1053 struct bch_inode_info *inode, 1054 struct iattr *attr) 1055 { 1056 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1057 struct bch_qid qid; 1058 struct btree_trans *trans; 1059 struct btree_iter inode_iter = { NULL }; 1060 struct bch_inode_unpacked inode_u; 1061 struct posix_acl *acl = NULL; 1062 kuid_t kuid; 1063 kgid_t kgid; 1064 int ret; 1065 1066 mutex_lock(&inode->ei_update_lock); 1067 1068 qid = inode->ei_qid; 1069 1070 if (attr->ia_valid & ATTR_UID) { 1071 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 1072 qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); 1073 } 1074 1075 if (attr->ia_valid & ATTR_GID) { 1076 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 1077 qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); 1078 } 1079 1080 ret = bch2_fs_quota_transfer(c, inode, qid, ~0, 1081 KEY_TYPE_QUOTA_PREALLOC); 1082 if (ret) 1083 goto err; 1084 1085 trans = bch2_trans_get(c); 1086 retry: 1087 bch2_trans_begin(trans); 1088 kfree(acl); 1089 acl = NULL; 1090 1091 ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), 1092 BTREE_ITER_intent); 1093 if (ret) 1094 goto btree_err; 1095 1096 bch2_setattr_copy(idmap, inode, &inode_u, attr); 1097 1098 if (attr->ia_valid & ATTR_MODE) { 1099 ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, 1100 inode_u.bi_mode, &acl); 1101 if (ret) 1102 goto btree_err; 1103 } 1104 1105 ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: 1106 bch2_trans_commit(trans, NULL, NULL, 1107 BCH_TRANS_COMMIT_no_enospc); 1108 btree_err: 1109 bch2_trans_iter_exit(trans, &inode_iter); 1110 1111 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1112 goto retry; 1113 if (unlikely(ret)) 1114 goto err_trans; 1115 1116 bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); 1117 1118 if (acl) 1119 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 1120 err_trans: 1121 bch2_trans_put(trans); 1122 err: 1123 mutex_unlock(&inode->ei_update_lock); 1124 1125 return bch2_err_class(ret); 1126 } 1127 1128 static int bch2_getattr(struct mnt_idmap *idmap, 1129 const struct path *path, struct kstat *stat, 1130 u32 request_mask, unsigned query_flags) 1131 { 1132 struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); 1133 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1134 vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); 1135 vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); 1136 1137 stat->dev = inode->v.i_sb->s_dev; 1138 stat->ino = inode->v.i_ino; 1139 stat->mode = inode->v.i_mode; 1140 stat->nlink = inode->v.i_nlink; 1141 stat->uid = vfsuid_into_kuid(vfsuid); 1142 stat->gid = vfsgid_into_kgid(vfsgid); 1143 stat->rdev = inode->v.i_rdev; 1144 stat->size = i_size_read(&inode->v); 1145 stat->atime = inode_get_atime(&inode->v); 1146 stat->mtime = inode_get_mtime(&inode->v); 1147 stat->ctime = inode_get_ctime(&inode->v); 1148 stat->blksize = block_bytes(c); 1149 stat->blocks = inode->v.i_blocks; 1150 1151 stat->subvol = inode->ei_inum.subvol; 1152 stat->result_mask |= STATX_SUBVOL; 1153 1154 if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { 1155 stat->result_mask |= STATX_DIOALIGN; 1156 /* 1157 * this is incorrect; we should be tracking this in superblock, 1158 * and checking the alignment of open devices 1159 */ 1160 stat->dio_mem_align = SECTOR_SIZE; 1161 stat->dio_offset_align = block_bytes(c); 1162 } 1163 1164 if (request_mask & STATX_BTIME) { 1165 stat->result_mask |= STATX_BTIME; 1166 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); 1167 } 1168 1169 if (inode->ei_inode.bi_flags & BCH_INODE_immutable) 1170 stat->attributes |= STATX_ATTR_IMMUTABLE; 1171 stat->attributes_mask |= STATX_ATTR_IMMUTABLE; 1172 1173 if (inode->ei_inode.bi_flags & BCH_INODE_append) 1174 stat->attributes |= STATX_ATTR_APPEND; 1175 stat->attributes_mask |= STATX_ATTR_APPEND; 1176 1177 if (inode->ei_inode.bi_flags & BCH_INODE_nodump) 1178 stat->attributes |= STATX_ATTR_NODUMP; 1179 stat->attributes_mask |= STATX_ATTR_NODUMP; 1180 1181 return 0; 1182 } 1183 1184 static int bch2_setattr(struct mnt_idmap *idmap, 1185 struct dentry *dentry, struct iattr *iattr) 1186 { 1187 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 1188 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1189 int ret; 1190 1191 lockdep_assert_held(&inode->v.i_rwsem); 1192 1193 ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 1194 setattr_prepare(idmap, dentry, iattr); 1195 if (ret) 1196 return ret; 1197 1198 return iattr->ia_valid & ATTR_SIZE 1199 ? bchfs_truncate(idmap, inode, iattr) 1200 : bch2_setattr_nonsize(idmap, inode, iattr); 1201 } 1202 1203 static int bch2_tmpfile(struct mnt_idmap *idmap, 1204 struct inode *vdir, struct file *file, umode_t mode) 1205 { 1206 struct bch_inode_info *inode = 1207 __bch2_create(idmap, to_bch_ei(vdir), 1208 file->f_path.dentry, mode, 0, 1209 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 1210 1211 if (IS_ERR(inode)) 1212 return bch2_err_class(PTR_ERR(inode)); 1213 1214 d_mark_tmpfile(file, &inode->v); 1215 d_instantiate(file->f_path.dentry, &inode->v); 1216 return finish_open_simple(file, 0); 1217 } 1218 1219 static int bch2_fill_extent(struct bch_fs *c, 1220 struct fiemap_extent_info *info, 1221 struct bkey_s_c k, unsigned flags) 1222 { 1223 if (bkey_extent_is_direct_data(k.k)) { 1224 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1225 const union bch_extent_entry *entry; 1226 struct extent_ptr_decoded p; 1227 int ret; 1228 1229 if (k.k->type == KEY_TYPE_reflink_v) 1230 flags |= FIEMAP_EXTENT_SHARED; 1231 1232 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1233 int flags2 = 0; 1234 u64 offset = p.ptr.offset; 1235 1236 if (p.ptr.unwritten) 1237 flags2 |= FIEMAP_EXTENT_UNWRITTEN; 1238 1239 if (p.crc.compression_type) 1240 flags2 |= FIEMAP_EXTENT_ENCODED; 1241 else 1242 offset += p.crc.offset; 1243 1244 if ((offset & (block_sectors(c) - 1)) || 1245 (k.k->size & (block_sectors(c) - 1))) 1246 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; 1247 1248 ret = fiemap_fill_next_extent(info, 1249 bkey_start_offset(k.k) << 9, 1250 offset << 9, 1251 k.k->size << 9, flags|flags2); 1252 if (ret) 1253 return ret; 1254 } 1255 1256 return 0; 1257 } else if (bkey_extent_is_inline_data(k.k)) { 1258 return fiemap_fill_next_extent(info, 1259 bkey_start_offset(k.k) << 9, 1260 0, k.k->size << 9, 1261 flags| 1262 FIEMAP_EXTENT_DATA_INLINE); 1263 } else if (k.k->type == KEY_TYPE_reservation) { 1264 return fiemap_fill_next_extent(info, 1265 bkey_start_offset(k.k) << 9, 1266 0, k.k->size << 9, 1267 flags| 1268 FIEMAP_EXTENT_DELALLOC| 1269 FIEMAP_EXTENT_UNWRITTEN); 1270 } else { 1271 BUG(); 1272 } 1273 } 1274 1275 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, 1276 u64 start, u64 len) 1277 { 1278 struct bch_fs *c = vinode->i_sb->s_fs_info; 1279 struct bch_inode_info *ei = to_bch_ei(vinode); 1280 struct btree_trans *trans; 1281 struct btree_iter iter; 1282 struct bkey_s_c k; 1283 struct bkey_buf cur, prev; 1284 bool have_extent = false; 1285 int ret = 0; 1286 1287 ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); 1288 if (ret) 1289 return ret; 1290 1291 struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); 1292 if (start + len < start) 1293 return -EINVAL; 1294 1295 start >>= 9; 1296 1297 bch2_bkey_buf_init(&cur); 1298 bch2_bkey_buf_init(&prev); 1299 trans = bch2_trans_get(c); 1300 1301 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1302 POS(ei->v.i_ino, start), 0); 1303 1304 while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 1305 enum btree_id data_btree = BTREE_ID_extents; 1306 1307 bch2_trans_begin(trans); 1308 1309 u32 snapshot; 1310 ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); 1311 if (ret) 1312 continue; 1313 1314 bch2_btree_iter_set_snapshot(&iter, snapshot); 1315 1316 k = bch2_btree_iter_peek_max(&iter, end); 1317 ret = bkey_err(k); 1318 if (ret) 1319 continue; 1320 1321 if (!k.k) 1322 break; 1323 1324 if (!bkey_extent_is_data(k.k) && 1325 k.k->type != KEY_TYPE_reservation) { 1326 bch2_btree_iter_advance(&iter); 1327 continue; 1328 } 1329 1330 s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); 1331 unsigned sectors = k.k->size - offset_into_extent; 1332 1333 bch2_bkey_buf_reassemble(&cur, c, k); 1334 1335 ret = bch2_read_indirect_extent(trans, &data_btree, 1336 &offset_into_extent, &cur); 1337 if (ret) 1338 continue; 1339 1340 k = bkey_i_to_s_c(cur.k); 1341 bch2_bkey_buf_realloc(&prev, c, k.k->u64s); 1342 1343 sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); 1344 1345 bch2_cut_front(POS(k.k->p.inode, 1346 bkey_start_offset(k.k) + 1347 offset_into_extent), 1348 cur.k); 1349 bch2_key_resize(&cur.k->k, sectors); 1350 cur.k->k.p = iter.pos; 1351 cur.k->k.p.offset += cur.k->k.size; 1352 1353 if (have_extent) { 1354 bch2_trans_unlock(trans); 1355 ret = bch2_fill_extent(c, info, 1356 bkey_i_to_s_c(prev.k), 0); 1357 if (ret) 1358 break; 1359 } 1360 1361 bkey_copy(prev.k, cur.k); 1362 have_extent = true; 1363 1364 bch2_btree_iter_set_pos(&iter, 1365 POS(iter.pos.inode, iter.pos.offset + sectors)); 1366 } 1367 bch2_trans_iter_exit(trans, &iter); 1368 1369 if (!ret && have_extent) { 1370 bch2_trans_unlock(trans); 1371 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 1372 FIEMAP_EXTENT_LAST); 1373 } 1374 1375 bch2_trans_put(trans); 1376 bch2_bkey_buf_exit(&cur, c); 1377 bch2_bkey_buf_exit(&prev, c); 1378 return ret < 0 ? ret : 0; 1379 } 1380 1381 static const struct vm_operations_struct bch_vm_ops = { 1382 .fault = bch2_page_fault, 1383 .map_pages = filemap_map_pages, 1384 .page_mkwrite = bch2_page_mkwrite, 1385 }; 1386 1387 static int bch2_mmap(struct file *file, struct vm_area_struct *vma) 1388 { 1389 file_accessed(file); 1390 1391 vma->vm_ops = &bch_vm_ops; 1392 return 0; 1393 } 1394 1395 /* Directories: */ 1396 1397 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) 1398 { 1399 return generic_file_llseek_size(file, offset, whence, 1400 S64_MAX, S64_MAX); 1401 } 1402 1403 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) 1404 { 1405 struct bch_inode_info *inode = file_bch_inode(file); 1406 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1407 1408 if (!dir_emit_dots(file, ctx)) 1409 return 0; 1410 1411 int ret = bch2_readdir(c, inode_inum(inode), ctx); 1412 1413 bch_err_fn(c, ret); 1414 return bch2_err_class(ret); 1415 } 1416 1417 static int bch2_open(struct inode *vinode, struct file *file) 1418 { 1419 if (file->f_flags & (O_WRONLY|O_RDWR)) { 1420 struct bch_inode_info *inode = to_bch_ei(vinode); 1421 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1422 1423 int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); 1424 if (ret) 1425 return ret; 1426 } 1427 1428 file->f_mode |= FMODE_CAN_ODIRECT; 1429 1430 return generic_file_open(vinode, file); 1431 } 1432 1433 static const struct file_operations bch_file_operations = { 1434 .open = bch2_open, 1435 .llseek = bch2_llseek, 1436 .read_iter = bch2_read_iter, 1437 .write_iter = bch2_write_iter, 1438 .mmap = bch2_mmap, 1439 .get_unmapped_area = thp_get_unmapped_area, 1440 .fsync = bch2_fsync, 1441 .splice_read = filemap_splice_read, 1442 .splice_write = iter_file_splice_write, 1443 .fallocate = bch2_fallocate_dispatch, 1444 .unlocked_ioctl = bch2_fs_file_ioctl, 1445 #ifdef CONFIG_COMPAT 1446 .compat_ioctl = bch2_compat_fs_ioctl, 1447 #endif 1448 .remap_file_range = bch2_remap_file_range, 1449 }; 1450 1451 static const struct inode_operations bch_file_inode_operations = { 1452 .getattr = bch2_getattr, 1453 .setattr = bch2_setattr, 1454 .fiemap = bch2_fiemap, 1455 .listxattr = bch2_xattr_list, 1456 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1457 .get_inode_acl = bch2_get_acl, 1458 .set_acl = bch2_set_acl, 1459 #endif 1460 }; 1461 1462 static const struct inode_operations bch_dir_inode_operations = { 1463 .lookup = bch2_lookup, 1464 .create = bch2_create, 1465 .link = bch2_link, 1466 .unlink = bch2_unlink, 1467 .symlink = bch2_symlink, 1468 .mkdir = bch2_mkdir, 1469 .rmdir = bch2_unlink, 1470 .mknod = bch2_mknod, 1471 .rename = bch2_rename2, 1472 .getattr = bch2_getattr, 1473 .setattr = bch2_setattr, 1474 .tmpfile = bch2_tmpfile, 1475 .listxattr = bch2_xattr_list, 1476 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1477 .get_inode_acl = bch2_get_acl, 1478 .set_acl = bch2_set_acl, 1479 #endif 1480 }; 1481 1482 static const struct file_operations bch_dir_file_operations = { 1483 .llseek = bch2_dir_llseek, 1484 .read = generic_read_dir, 1485 .iterate_shared = bch2_vfs_readdir, 1486 .fsync = bch2_fsync, 1487 .unlocked_ioctl = bch2_fs_file_ioctl, 1488 #ifdef CONFIG_COMPAT 1489 .compat_ioctl = bch2_compat_fs_ioctl, 1490 #endif 1491 }; 1492 1493 static const struct inode_operations bch_symlink_inode_operations = { 1494 .get_link = page_get_link, 1495 .getattr = bch2_getattr, 1496 .setattr = bch2_setattr, 1497 .listxattr = bch2_xattr_list, 1498 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1499 .get_inode_acl = bch2_get_acl, 1500 .set_acl = bch2_set_acl, 1501 #endif 1502 }; 1503 1504 static const struct inode_operations bch_special_inode_operations = { 1505 .getattr = bch2_getattr, 1506 .setattr = bch2_setattr, 1507 .listxattr = bch2_xattr_list, 1508 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1509 .get_inode_acl = bch2_get_acl, 1510 .set_acl = bch2_set_acl, 1511 #endif 1512 }; 1513 1514 static const struct address_space_operations bch_address_space_operations = { 1515 .read_folio = bch2_read_folio, 1516 .writepages = bch2_writepages, 1517 .readahead = bch2_readahead, 1518 .dirty_folio = filemap_dirty_folio, 1519 .write_begin = bch2_write_begin, 1520 .write_end = bch2_write_end, 1521 .invalidate_folio = bch2_invalidate_folio, 1522 .release_folio = bch2_release_folio, 1523 #ifdef CONFIG_MIGRATION 1524 .migrate_folio = filemap_migrate_folio, 1525 #endif 1526 .error_remove_folio = generic_error_remove_folio, 1527 }; 1528 1529 struct bcachefs_fid { 1530 u64 inum; 1531 u32 subvol; 1532 u32 gen; 1533 } __packed; 1534 1535 struct bcachefs_fid_with_parent { 1536 struct bcachefs_fid fid; 1537 struct bcachefs_fid dir; 1538 } __packed; 1539 1540 static int bcachefs_fid_valid(int fh_len, int fh_type) 1541 { 1542 switch (fh_type) { 1543 case FILEID_BCACHEFS_WITHOUT_PARENT: 1544 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); 1545 case FILEID_BCACHEFS_WITH_PARENT: 1546 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); 1547 default: 1548 return false; 1549 } 1550 } 1551 1552 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) 1553 { 1554 return (struct bcachefs_fid) { 1555 .inum = inode->ei_inum.inum, 1556 .subvol = inode->ei_inum.subvol, 1557 .gen = inode->ei_inode.bi_generation, 1558 }; 1559 } 1560 1561 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, 1562 struct inode *vdir) 1563 { 1564 struct bch_inode_info *inode = to_bch_ei(vinode); 1565 struct bch_inode_info *dir = to_bch_ei(vdir); 1566 int min_len; 1567 1568 if (!S_ISDIR(inode->v.i_mode) && dir) { 1569 struct bcachefs_fid_with_parent *fid = (void *) fh; 1570 1571 min_len = sizeof(*fid) / sizeof(u32); 1572 if (*len < min_len) { 1573 *len = min_len; 1574 return FILEID_INVALID; 1575 } 1576 1577 fid->fid = bch2_inode_to_fid(inode); 1578 fid->dir = bch2_inode_to_fid(dir); 1579 1580 *len = min_len; 1581 return FILEID_BCACHEFS_WITH_PARENT; 1582 } else { 1583 struct bcachefs_fid *fid = (void *) fh; 1584 1585 min_len = sizeof(*fid) / sizeof(u32); 1586 if (*len < min_len) { 1587 *len = min_len; 1588 return FILEID_INVALID; 1589 } 1590 *fid = bch2_inode_to_fid(inode); 1591 1592 *len = min_len; 1593 return FILEID_BCACHEFS_WITHOUT_PARENT; 1594 } 1595 } 1596 1597 static struct inode *bch2_nfs_get_inode(struct super_block *sb, 1598 struct bcachefs_fid fid) 1599 { 1600 struct bch_fs *c = sb->s_fs_info; 1601 struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { 1602 .subvol = fid.subvol, 1603 .inum = fid.inum, 1604 }); 1605 if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { 1606 iput(vinode); 1607 vinode = ERR_PTR(-ESTALE); 1608 } 1609 return vinode; 1610 } 1611 1612 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, 1613 int fh_len, int fh_type) 1614 { 1615 struct bcachefs_fid *fid = (void *) _fid; 1616 1617 if (!bcachefs_fid_valid(fh_len, fh_type)) 1618 return NULL; 1619 1620 return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); 1621 } 1622 1623 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, 1624 int fh_len, int fh_type) 1625 { 1626 struct bcachefs_fid_with_parent *fid = (void *) _fid; 1627 1628 if (!bcachefs_fid_valid(fh_len, fh_type) || 1629 fh_type != FILEID_BCACHEFS_WITH_PARENT) 1630 return NULL; 1631 1632 return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); 1633 } 1634 1635 static struct dentry *bch2_get_parent(struct dentry *child) 1636 { 1637 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1638 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1639 subvol_inum parent_inum = { 1640 .subvol = inode->ei_inode.bi_parent_subvol ?: 1641 inode->ei_inum.subvol, 1642 .inum = inode->ei_inode.bi_dir, 1643 }; 1644 1645 return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); 1646 } 1647 1648 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) 1649 { 1650 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1651 struct bch_inode_info *dir = to_bch_ei(parent->d_inode); 1652 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1653 struct btree_trans *trans; 1654 struct btree_iter iter1; 1655 struct btree_iter iter2; 1656 struct bkey_s_c k; 1657 struct bkey_s_c_dirent d; 1658 struct bch_inode_unpacked inode_u; 1659 subvol_inum target; 1660 u32 snapshot; 1661 struct qstr dirent_name; 1662 unsigned name_len = 0; 1663 int ret; 1664 1665 if (!S_ISDIR(dir->v.i_mode)) 1666 return -EINVAL; 1667 1668 trans = bch2_trans_get(c); 1669 1670 bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, 1671 POS(dir->ei_inode.bi_inum, 0), 0); 1672 bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, 1673 POS(dir->ei_inode.bi_inum, 0), 0); 1674 retry: 1675 bch2_trans_begin(trans); 1676 1677 ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); 1678 if (ret) 1679 goto err; 1680 1681 bch2_btree_iter_set_snapshot(&iter1, snapshot); 1682 bch2_btree_iter_set_snapshot(&iter2, snapshot); 1683 1684 ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); 1685 if (ret) 1686 goto err; 1687 1688 if (inode_u.bi_dir == dir->ei_inode.bi_inum) { 1689 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); 1690 1691 k = bch2_btree_iter_peek_slot(&iter1); 1692 ret = bkey_err(k); 1693 if (ret) 1694 goto err; 1695 1696 if (k.k->type != KEY_TYPE_dirent) { 1697 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1698 goto err; 1699 } 1700 1701 d = bkey_s_c_to_dirent(k); 1702 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1703 if (ret > 0) 1704 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1705 if (ret) 1706 goto err; 1707 1708 if (subvol_inum_eq(target, inode->ei_inum)) 1709 goto found; 1710 } else { 1711 /* 1712 * File with multiple hardlinks and our backref is to the wrong 1713 * directory - linear search: 1714 */ 1715 for_each_btree_key_continue_norestart(iter2, 0, k, ret) { 1716 if (k.k->p.inode > dir->ei_inode.bi_inum) 1717 break; 1718 1719 if (k.k->type != KEY_TYPE_dirent) 1720 continue; 1721 1722 d = bkey_s_c_to_dirent(k); 1723 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1724 if (ret < 0) 1725 break; 1726 if (ret) 1727 continue; 1728 1729 if (subvol_inum_eq(target, inode->ei_inum)) 1730 goto found; 1731 } 1732 } 1733 1734 ret = -ENOENT; 1735 goto err; 1736 found: 1737 dirent_name = bch2_dirent_get_name(d); 1738 1739 name_len = min_t(unsigned, dirent_name.len, NAME_MAX); 1740 memcpy(name, dirent_name.name, name_len); 1741 name[name_len] = '\0'; 1742 err: 1743 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1744 goto retry; 1745 1746 bch2_trans_iter_exit(trans, &iter1); 1747 bch2_trans_iter_exit(trans, &iter2); 1748 bch2_trans_put(trans); 1749 1750 return ret; 1751 } 1752 1753 static const struct export_operations bch_export_ops = { 1754 .encode_fh = bch2_encode_fh, 1755 .fh_to_dentry = bch2_fh_to_dentry, 1756 .fh_to_parent = bch2_fh_to_parent, 1757 .get_parent = bch2_get_parent, 1758 .get_name = bch2_get_name, 1759 }; 1760 1761 static void bch2_vfs_inode_init(struct btree_trans *trans, 1762 subvol_inum inum, 1763 struct bch_inode_info *inode, 1764 struct bch_inode_unpacked *bi, 1765 struct bch_subvolume *subvol) 1766 { 1767 inode->v.i_ino = inum.inum; 1768 inode->ei_inum = inum; 1769 inode->ei_inode.bi_inum = inum.inum; 1770 bch2_inode_update_after_write(trans, inode, bi, ~0); 1771 1772 inode->v.i_blocks = bi->bi_sectors; 1773 inode->v.i_rdev = bi->bi_dev; 1774 inode->v.i_generation = bi->bi_generation; 1775 inode->v.i_size = bi->bi_size; 1776 1777 inode->ei_flags = 0; 1778 inode->ei_quota_reserved = 0; 1779 inode->ei_qid = bch_qid(bi); 1780 1781 if (BCH_SUBVOLUME_SNAP(subvol)) 1782 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); 1783 1784 inode->v.i_mapping->a_ops = &bch_address_space_operations; 1785 1786 switch (inode->v.i_mode & S_IFMT) { 1787 case S_IFREG: 1788 inode->v.i_op = &bch_file_inode_operations; 1789 inode->v.i_fop = &bch_file_operations; 1790 break; 1791 case S_IFDIR: 1792 inode->v.i_op = &bch_dir_inode_operations; 1793 inode->v.i_fop = &bch_dir_file_operations; 1794 break; 1795 case S_IFLNK: 1796 inode_nohighmem(&inode->v); 1797 inode->v.i_op = &bch_symlink_inode_operations; 1798 break; 1799 default: 1800 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); 1801 inode->v.i_op = &bch_special_inode_operations; 1802 break; 1803 } 1804 1805 mapping_set_large_folios(inode->v.i_mapping); 1806 } 1807 1808 static void bch2_free_inode(struct inode *vinode) 1809 { 1810 kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); 1811 } 1812 1813 static int inode_update_times_fn(struct btree_trans *trans, 1814 struct bch_inode_info *inode, 1815 struct bch_inode_unpacked *bi, 1816 void *p) 1817 { 1818 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1819 1820 bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); 1821 bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); 1822 bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); 1823 1824 return 0; 1825 } 1826 1827 static int bch2_vfs_write_inode(struct inode *vinode, 1828 struct writeback_control *wbc) 1829 { 1830 struct bch_fs *c = vinode->i_sb->s_fs_info; 1831 struct bch_inode_info *inode = to_bch_ei(vinode); 1832 int ret; 1833 1834 mutex_lock(&inode->ei_update_lock); 1835 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 1836 ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); 1837 mutex_unlock(&inode->ei_update_lock); 1838 1839 return bch2_err_class(ret); 1840 } 1841 1842 static void bch2_evict_inode(struct inode *vinode) 1843 { 1844 struct bch_fs *c = vinode->i_sb->s_fs_info; 1845 struct bch_inode_info *inode = to_bch_ei(vinode); 1846 bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); 1847 1848 /* 1849 * evict() has waited for outstanding writeback, we'll do no more IO 1850 * through this inode: it's safe to remove from VFS inode hashtable here 1851 * 1852 * Do that now so that other threads aren't blocked from pulling it back 1853 * in, there's no reason for them to be: 1854 */ 1855 if (!delete) 1856 bch2_inode_hash_remove(c, inode); 1857 1858 truncate_inode_pages_final(&inode->v.i_data); 1859 1860 clear_inode(&inode->v); 1861 1862 BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); 1863 1864 if (delete) { 1865 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), 1866 KEY_TYPE_QUOTA_WARN); 1867 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, 1868 KEY_TYPE_QUOTA_WARN); 1869 bch2_inode_rm(c, inode_inum(inode)); 1870 1871 /* 1872 * If we are deleting, we need it present in the vfs hash table 1873 * so that fsck can check if unlinked inodes are still open: 1874 */ 1875 bch2_inode_hash_remove(c, inode); 1876 } 1877 1878 mutex_lock(&c->vfs_inodes_lock); 1879 list_del_init(&inode->ei_vfs_inode_list); 1880 mutex_unlock(&c->vfs_inodes_lock); 1881 } 1882 1883 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) 1884 { 1885 struct bch_inode_info *inode; 1886 DARRAY(struct bch_inode_info *) grabbed; 1887 bool clean_pass = false, this_pass_clean; 1888 1889 /* 1890 * Initially, we scan for inodes without I_DONTCACHE, then mark them to 1891 * be pruned with d_mark_dontcache(). 1892 * 1893 * Once we've had a clean pass where we didn't find any inodes without 1894 * I_DONTCACHE, we wait for them to be freed: 1895 */ 1896 1897 darray_init(&grabbed); 1898 darray_make_room(&grabbed, 1024); 1899 again: 1900 cond_resched(); 1901 this_pass_clean = true; 1902 1903 mutex_lock(&c->vfs_inodes_lock); 1904 list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { 1905 if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) 1906 continue; 1907 1908 if (!(inode->v.i_state & I_DONTCACHE) && 1909 !(inode->v.i_state & I_FREEING) && 1910 igrab(&inode->v)) { 1911 this_pass_clean = false; 1912 1913 if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { 1914 iput(&inode->v); 1915 break; 1916 } 1917 } else if (clean_pass && this_pass_clean) { 1918 struct wait_bit_queue_entry wqe; 1919 struct wait_queue_head *wq_head; 1920 1921 wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); 1922 prepare_to_wait_event(wq_head, &wqe.wq_entry, 1923 TASK_UNINTERRUPTIBLE); 1924 mutex_unlock(&c->vfs_inodes_lock); 1925 1926 schedule(); 1927 finish_wait(wq_head, &wqe.wq_entry); 1928 goto again; 1929 } 1930 } 1931 mutex_unlock(&c->vfs_inodes_lock); 1932 1933 darray_for_each(grabbed, i) { 1934 inode = *i; 1935 d_mark_dontcache(&inode->v); 1936 d_prune_aliases(&inode->v); 1937 iput(&inode->v); 1938 } 1939 grabbed.nr = 0; 1940 1941 if (!clean_pass || !this_pass_clean) { 1942 clean_pass = this_pass_clean; 1943 goto again; 1944 } 1945 1946 darray_exit(&grabbed); 1947 } 1948 1949 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) 1950 { 1951 struct super_block *sb = dentry->d_sb; 1952 struct bch_fs *c = sb->s_fs_info; 1953 struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); 1954 unsigned shift = sb->s_blocksize_bits - 9; 1955 /* 1956 * this assumes inodes take up 64 bytes, which is a decent average 1957 * number: 1958 */ 1959 u64 avail_inodes = ((usage.capacity - usage.used) << 3); 1960 1961 buf->f_type = BCACHEFS_STATFS_MAGIC; 1962 buf->f_bsize = sb->s_blocksize; 1963 buf->f_blocks = usage.capacity >> shift; 1964 buf->f_bfree = usage.free >> shift; 1965 buf->f_bavail = avail_factor(usage.free) >> shift; 1966 1967 buf->f_files = usage.nr_inodes + avail_inodes; 1968 buf->f_ffree = avail_inodes; 1969 1970 buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); 1971 buf->f_namelen = BCH_NAME_MAX; 1972 1973 return 0; 1974 } 1975 1976 static int bch2_sync_fs(struct super_block *sb, int wait) 1977 { 1978 struct bch_fs *c = sb->s_fs_info; 1979 int ret; 1980 1981 trace_bch2_sync_fs(sb, wait); 1982 1983 if (c->opts.journal_flush_disabled) 1984 return 0; 1985 1986 if (!wait) { 1987 bch2_journal_flush_async(&c->journal, NULL); 1988 return 0; 1989 } 1990 1991 ret = bch2_journal_flush(&c->journal); 1992 return bch2_err_class(ret); 1993 } 1994 1995 static struct bch_fs *bch2_path_to_fs(const char *path) 1996 { 1997 struct bch_fs *c; 1998 dev_t dev; 1999 int ret; 2000 2001 ret = lookup_bdev(path, &dev); 2002 if (ret) 2003 return ERR_PTR(ret); 2004 2005 c = bch2_dev_to_fs(dev); 2006 if (c) 2007 closure_put(&c->cl); 2008 return c ?: ERR_PTR(-ENOENT); 2009 } 2010 2011 static int bch2_remount(struct super_block *sb, int *flags, 2012 struct bch_opts opts) 2013 { 2014 struct bch_fs *c = sb->s_fs_info; 2015 int ret = 0; 2016 2017 opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); 2018 2019 if (opts.read_only != c->opts.read_only) { 2020 down_write(&c->state_lock); 2021 2022 if (opts.read_only) { 2023 bch2_fs_read_only(c); 2024 2025 sb->s_flags |= SB_RDONLY; 2026 } else { 2027 ret = bch2_fs_read_write(c); 2028 if (ret) { 2029 bch_err(c, "error going rw: %i", ret); 2030 up_write(&c->state_lock); 2031 ret = -EINVAL; 2032 goto err; 2033 } 2034 2035 sb->s_flags &= ~SB_RDONLY; 2036 } 2037 2038 c->opts.read_only = opts.read_only; 2039 2040 up_write(&c->state_lock); 2041 } 2042 2043 if (opt_defined(opts, errors)) 2044 c->opts.errors = opts.errors; 2045 err: 2046 return bch2_err_class(ret); 2047 } 2048 2049 static int bch2_show_devname(struct seq_file *seq, struct dentry *root) 2050 { 2051 struct bch_fs *c = root->d_sb->s_fs_info; 2052 bool first = true; 2053 2054 for_each_online_member(c, ca) { 2055 if (!first) 2056 seq_putc(seq, ':'); 2057 first = false; 2058 seq_puts(seq, ca->disk_sb.sb_name); 2059 } 2060 2061 return 0; 2062 } 2063 2064 static int bch2_show_options(struct seq_file *seq, struct dentry *root) 2065 { 2066 struct bch_fs *c = root->d_sb->s_fs_info; 2067 struct printbuf buf = PRINTBUF; 2068 2069 bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, 2070 OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); 2071 printbuf_nul_terminate(&buf); 2072 seq_printf(seq, ",%s", buf.buf); 2073 2074 int ret = buf.allocation_failure ? -ENOMEM : 0; 2075 printbuf_exit(&buf); 2076 return ret; 2077 } 2078 2079 static void bch2_put_super(struct super_block *sb) 2080 { 2081 struct bch_fs *c = sb->s_fs_info; 2082 2083 __bch2_fs_stop(c); 2084 } 2085 2086 /* 2087 * bcachefs doesn't currently integrate intwrite freeze protection but the 2088 * internal write references serve the same purpose. Therefore reuse the 2089 * read-only transition code to perform the quiesce. The caveat is that we don't 2090 * currently have the ability to block tasks that want a write reference while 2091 * the superblock is frozen. This is fine for now, but we should either add 2092 * blocking support or find a way to integrate sb_start_intwrite() and friends. 2093 */ 2094 static int bch2_freeze(struct super_block *sb) 2095 { 2096 struct bch_fs *c = sb->s_fs_info; 2097 2098 down_write(&c->state_lock); 2099 bch2_fs_read_only(c); 2100 up_write(&c->state_lock); 2101 return 0; 2102 } 2103 2104 static int bch2_unfreeze(struct super_block *sb) 2105 { 2106 struct bch_fs *c = sb->s_fs_info; 2107 int ret; 2108 2109 if (test_bit(BCH_FS_emergency_ro, &c->flags)) 2110 return 0; 2111 2112 down_write(&c->state_lock); 2113 ret = bch2_fs_read_write(c); 2114 up_write(&c->state_lock); 2115 return ret; 2116 } 2117 2118 static const struct super_operations bch_super_operations = { 2119 .alloc_inode = bch2_alloc_inode, 2120 .free_inode = bch2_free_inode, 2121 .write_inode = bch2_vfs_write_inode, 2122 .evict_inode = bch2_evict_inode, 2123 .sync_fs = bch2_sync_fs, 2124 .statfs = bch2_statfs, 2125 .show_devname = bch2_show_devname, 2126 .show_options = bch2_show_options, 2127 .put_super = bch2_put_super, 2128 .freeze_fs = bch2_freeze, 2129 .unfreeze_fs = bch2_unfreeze, 2130 }; 2131 2132 static int bch2_set_super(struct super_block *s, void *data) 2133 { 2134 s->s_fs_info = data; 2135 return 0; 2136 } 2137 2138 static int bch2_noset_super(struct super_block *s, void *data) 2139 { 2140 return -EBUSY; 2141 } 2142 2143 typedef DARRAY(struct bch_fs *) darray_fs; 2144 2145 static int bch2_test_super(struct super_block *s, void *data) 2146 { 2147 struct bch_fs *c = s->s_fs_info; 2148 darray_fs *d = data; 2149 2150 if (!c) 2151 return false; 2152 2153 darray_for_each(*d, i) 2154 if (c != *i) 2155 return false; 2156 return true; 2157 } 2158 2159 static int bch2_fs_get_tree(struct fs_context *fc) 2160 { 2161 struct bch_fs *c; 2162 struct super_block *sb; 2163 struct inode *vinode; 2164 struct bch2_opts_parse *opts_parse = fc->fs_private; 2165 struct bch_opts opts = opts_parse->opts; 2166 darray_str devs; 2167 darray_fs devs_to_fs = {}; 2168 int ret; 2169 2170 opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); 2171 opt_set(opts, nostart, true); 2172 2173 if (!fc->source || strlen(fc->source) == 0) 2174 return -EINVAL; 2175 2176 ret = bch2_split_devs(fc->source, &devs); 2177 if (ret) 2178 return ret; 2179 2180 darray_for_each(devs, i) { 2181 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); 2182 if (ret) 2183 goto err; 2184 } 2185 2186 sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); 2187 if (!IS_ERR(sb)) 2188 goto got_sb; 2189 2190 c = bch2_fs_open(devs.data, devs.nr, opts); 2191 ret = PTR_ERR_OR_ZERO(c); 2192 if (ret) 2193 goto err; 2194 2195 /* Some options can't be parsed until after the fs is started: */ 2196 opts = bch2_opts_empty(); 2197 ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); 2198 if (ret) 2199 goto err_stop_fs; 2200 2201 bch2_opts_apply(&c->opts, opts); 2202 2203 ret = bch2_fs_start(c); 2204 if (ret) 2205 goto err_stop_fs; 2206 2207 sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); 2208 ret = PTR_ERR_OR_ZERO(sb); 2209 if (ret) 2210 goto err_stop_fs; 2211 got_sb: 2212 c = sb->s_fs_info; 2213 2214 if (sb->s_root) { 2215 if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { 2216 ret = -EBUSY; 2217 goto err_put_super; 2218 } 2219 goto out; 2220 } 2221 2222 sb->s_blocksize = block_bytes(c); 2223 sb->s_blocksize_bits = ilog2(block_bytes(c)); 2224 sb->s_maxbytes = MAX_LFS_FILESIZE; 2225 sb->s_op = &bch_super_operations; 2226 sb->s_export_op = &bch_export_ops; 2227 #ifdef CONFIG_BCACHEFS_QUOTA 2228 sb->s_qcop = &bch2_quotactl_operations; 2229 sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; 2230 #endif 2231 sb->s_xattr = bch2_xattr_handlers; 2232 sb->s_magic = BCACHEFS_STATFS_MAGIC; 2233 sb->s_time_gran = c->sb.nsec_per_time_unit; 2234 sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; 2235 sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); 2236 super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); 2237 super_set_sysfs_name_uuid(sb); 2238 sb->s_shrink->seeks = 0; 2239 c->vfs_sb = sb; 2240 strscpy(sb->s_id, c->name, sizeof(sb->s_id)); 2241 2242 ret = super_setup_bdi(sb); 2243 if (ret) 2244 goto err_put_super; 2245 2246 sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 2247 2248 for_each_online_member(c, ca) { 2249 struct block_device *bdev = ca->disk_sb.bdev; 2250 2251 /* XXX: create an anonymous device for multi device filesystems */ 2252 sb->s_bdev = bdev; 2253 sb->s_dev = bdev->bd_dev; 2254 percpu_ref_put(&ca->io_ref); 2255 break; 2256 } 2257 2258 c->dev = sb->s_dev; 2259 2260 #ifdef CONFIG_BCACHEFS_POSIX_ACL 2261 if (c->opts.acl) 2262 sb->s_flags |= SB_POSIXACL; 2263 #endif 2264 2265 sb->s_shrink->seeks = 0; 2266 2267 vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); 2268 ret = PTR_ERR_OR_ZERO(vinode); 2269 bch_err_msg(c, ret, "mounting: error getting root inode"); 2270 if (ret) 2271 goto err_put_super; 2272 2273 sb->s_root = d_make_root(vinode); 2274 if (!sb->s_root) { 2275 bch_err(c, "error mounting: error allocating root dentry"); 2276 ret = -ENOMEM; 2277 goto err_put_super; 2278 } 2279 2280 sb->s_flags |= SB_ACTIVE; 2281 out: 2282 fc->root = dget(sb->s_root); 2283 err: 2284 darray_exit(&devs_to_fs); 2285 bch2_darray_str_exit(&devs); 2286 if (ret) 2287 pr_err("error: %s", bch2_err_str(ret)); 2288 /* 2289 * On an inconsistency error in recovery we might see an -EROFS derived 2290 * errorcode (from the journal), but we don't want to return that to 2291 * userspace as that causes util-linux to retry the mount RO - which is 2292 * confusing: 2293 */ 2294 if (bch2_err_matches(ret, EROFS) && ret != -EROFS) 2295 ret = -EIO; 2296 return bch2_err_class(ret); 2297 2298 err_stop_fs: 2299 bch2_fs_stop(c); 2300 goto err; 2301 2302 err_put_super: 2303 __bch2_fs_stop(c); 2304 deactivate_locked_super(sb); 2305 goto err; 2306 } 2307 2308 static void bch2_kill_sb(struct super_block *sb) 2309 { 2310 struct bch_fs *c = sb->s_fs_info; 2311 2312 generic_shutdown_super(sb); 2313 bch2_fs_free(c); 2314 } 2315 2316 static void bch2_fs_context_free(struct fs_context *fc) 2317 { 2318 struct bch2_opts_parse *opts = fc->fs_private; 2319 2320 if (opts) { 2321 printbuf_exit(&opts->parse_later); 2322 kfree(opts); 2323 } 2324 } 2325 2326 static int bch2_fs_parse_param(struct fs_context *fc, 2327 struct fs_parameter *param) 2328 { 2329 /* 2330 * the "source" param, i.e., the name of the device(s) to mount, 2331 * is handled by the VFS layer. 2332 */ 2333 if (!strcmp(param->key, "source")) 2334 return -ENOPARAM; 2335 2336 struct bch2_opts_parse *opts = fc->fs_private; 2337 struct bch_fs *c = NULL; 2338 2339 /* for reconfigure, we already have a struct bch_fs */ 2340 if (fc->root) 2341 c = fc->root->d_sb->s_fs_info; 2342 2343 int ret = bch2_parse_one_mount_opt(c, &opts->opts, 2344 &opts->parse_later, param->key, 2345 param->string); 2346 2347 return bch2_err_class(ret); 2348 } 2349 2350 static int bch2_fs_reconfigure(struct fs_context *fc) 2351 { 2352 struct super_block *sb = fc->root->d_sb; 2353 struct bch2_opts_parse *opts = fc->fs_private; 2354 2355 return bch2_remount(sb, &fc->sb_flags, opts->opts); 2356 } 2357 2358 static const struct fs_context_operations bch2_context_ops = { 2359 .free = bch2_fs_context_free, 2360 .parse_param = bch2_fs_parse_param, 2361 .get_tree = bch2_fs_get_tree, 2362 .reconfigure = bch2_fs_reconfigure, 2363 }; 2364 2365 static int bch2_init_fs_context(struct fs_context *fc) 2366 { 2367 struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); 2368 2369 if (!opts) 2370 return -ENOMEM; 2371 2372 opts->parse_later = PRINTBUF; 2373 2374 fc->ops = &bch2_context_ops; 2375 fc->fs_private = opts; 2376 2377 return 0; 2378 } 2379 2380 void bch2_fs_vfs_exit(struct bch_fs *c) 2381 { 2382 if (c->vfs_inodes_by_inum_table.ht.tbl) 2383 rhltable_destroy(&c->vfs_inodes_by_inum_table); 2384 if (c->vfs_inodes_table.tbl) 2385 rhashtable_destroy(&c->vfs_inodes_table); 2386 } 2387 2388 int bch2_fs_vfs_init(struct bch_fs *c) 2389 { 2390 return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: 2391 rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); 2392 } 2393 2394 static struct file_system_type bcache_fs_type = { 2395 .owner = THIS_MODULE, 2396 .name = "bcachefs", 2397 .init_fs_context = bch2_init_fs_context, 2398 .kill_sb = bch2_kill_sb, 2399 .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 2400 }; 2401 2402 MODULE_ALIAS_FS("bcachefs"); 2403 2404 void bch2_vfs_exit(void) 2405 { 2406 unregister_filesystem(&bcache_fs_type); 2407 kmem_cache_destroy(bch2_inode_cache); 2408 } 2409 2410 int __init bch2_vfs_init(void) 2411 { 2412 int ret = -ENOMEM; 2413 2414 bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | 2415 SLAB_ACCOUNT); 2416 if (!bch2_inode_cache) 2417 goto err; 2418 2419 ret = register_filesystem(&bcache_fs_type); 2420 if (ret) 2421 goto err; 2422 2423 return 0; 2424 err: 2425 bch2_vfs_exit(); 2426 return ret; 2427 } 2428 2429 #endif /* NO_BCACHEFS_FS */ 2430