1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "acl.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "chardev.h" 10 #include "dirent.h" 11 #include "errcode.h" 12 #include "extents.h" 13 #include "fs.h" 14 #include "fs-io.h" 15 #include "fs-ioctl.h" 16 #include "fs-io-buffered.h" 17 #include "fs-io-direct.h" 18 #include "fs-io-pagecache.h" 19 #include "fsck.h" 20 #include "inode.h" 21 #include "io_read.h" 22 #include "journal.h" 23 #include "keylist.h" 24 #include "namei.h" 25 #include "quota.h" 26 #include "rebalance.h" 27 #include "snapshot.h" 28 #include "super.h" 29 #include "xattr.h" 30 #include "trace.h" 31 32 #include <linux/aio.h> 33 #include <linux/backing-dev.h> 34 #include <linux/exportfs.h> 35 #include <linux/fiemap.h> 36 #include <linux/fs_context.h> 37 #include <linux/module.h> 38 #include <linux/pagemap.h> 39 #include <linux/posix_acl.h> 40 #include <linux/random.h> 41 #include <linux/seq_file.h> 42 #include <linux/siphash.h> 43 #include <linux/statfs.h> 44 #include <linux/string.h> 45 #include <linux/xattr.h> 46 47 static struct kmem_cache *bch2_inode_cache; 48 49 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, 50 struct bch_inode_info *, 51 struct bch_inode_unpacked *, 52 struct bch_subvolume *); 53 54 void bch2_inode_update_after_write(struct btree_trans *trans, 55 struct bch_inode_info *inode, 56 struct bch_inode_unpacked *bi, 57 unsigned fields) 58 { 59 struct bch_fs *c = trans->c; 60 61 BUG_ON(bi->bi_inum != inode->v.i_ino); 62 63 bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); 64 65 set_nlink(&inode->v, bch2_inode_nlink_get(bi)); 66 i_uid_write(&inode->v, bi->bi_uid); 67 i_gid_write(&inode->v, bi->bi_gid); 68 inode->v.i_mode = bi->bi_mode; 69 70 if (fields & ATTR_SIZE) 71 i_size_write(&inode->v, bi->bi_size); 72 73 if (fields & ATTR_ATIME) 74 inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); 75 if (fields & ATTR_MTIME) 76 inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); 77 if (fields & ATTR_CTIME) 78 inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); 79 80 inode->ei_inode = *bi; 81 82 bch2_inode_flags_to_vfs(inode); 83 } 84 85 int __must_check bch2_write_inode(struct bch_fs *c, 86 struct bch_inode_info *inode, 87 inode_set_fn set, 88 void *p, unsigned fields) 89 { 90 struct btree_trans *trans = bch2_trans_get(c); 91 struct btree_iter iter = {}; 92 struct bch_inode_unpacked inode_u; 93 int ret; 94 retry: 95 bch2_trans_begin(trans); 96 97 ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); 98 if (ret) 99 goto err; 100 101 struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); 102 103 ret = (set ? set(trans, inode, &inode_u, p) : 0); 104 if (ret) 105 goto err; 106 107 struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); 108 109 if (memcmp(&old_r, &new_r, sizeof(new_r))) { 110 ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); 111 if (ret) 112 goto err; 113 } 114 115 ret = bch2_inode_write(trans, &iter, &inode_u) ?: 116 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 117 118 /* 119 * the btree node lock protects inode->ei_inode, not ei_update_lock; 120 * this is important for inode updates via bchfs_write_index_update 121 */ 122 if (!ret) 123 bch2_inode_update_after_write(trans, inode, &inode_u, fields); 124 err: 125 bch2_trans_iter_exit(trans, &iter); 126 127 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 128 goto retry; 129 130 bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, 131 "%s: inode %llu:%llu not found when updating", 132 bch2_err_str(ret), 133 inode_inum(inode).subvol, 134 inode_inum(inode).inum); 135 136 bch2_trans_put(trans); 137 return ret < 0 ? ret : 0; 138 } 139 140 int bch2_fs_quota_transfer(struct bch_fs *c, 141 struct bch_inode_info *inode, 142 struct bch_qid new_qid, 143 unsigned qtypes, 144 enum quota_acct_mode mode) 145 { 146 unsigned i; 147 int ret; 148 149 qtypes &= enabled_qtypes(c); 150 151 for (i = 0; i < QTYP_NR; i++) 152 if (new_qid.q[i] == inode->ei_qid.q[i]) 153 qtypes &= ~(1U << i); 154 155 if (!qtypes) 156 return 0; 157 158 mutex_lock(&inode->ei_quota_lock); 159 160 ret = bch2_quota_transfer(c, qtypes, new_qid, 161 inode->ei_qid, 162 inode->v.i_blocks + 163 inode->ei_quota_reserved, 164 mode); 165 if (!ret) 166 for (i = 0; i < QTYP_NR; i++) 167 if (qtypes & (1 << i)) 168 inode->ei_qid.q[i] = new_qid.q[i]; 169 170 mutex_unlock(&inode->ei_quota_lock); 171 172 return ret; 173 } 174 175 static bool subvol_inum_eq(subvol_inum a, subvol_inum b) 176 { 177 return a.subvol == b.subvol && a.inum == b.inum; 178 } 179 180 static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) 181 { 182 const subvol_inum *inum = data; 183 siphash_key_t k = { .key[0] = seed }; 184 185 return siphash_2u64(inum->subvol, inum->inum, &k); 186 } 187 188 static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) 189 { 190 const struct bch_inode_info *inode = data; 191 192 return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); 193 } 194 195 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, 196 const void *obj) 197 { 198 const struct bch_inode_info *inode = obj; 199 const subvol_inum *v = arg->key; 200 201 return !subvol_inum_eq(inode->ei_inum, *v); 202 } 203 204 static const struct rhashtable_params bch2_vfs_inodes_params = { 205 .head_offset = offsetof(struct bch_inode_info, hash), 206 .key_offset = offsetof(struct bch_inode_info, ei_inum), 207 .key_len = sizeof(subvol_inum), 208 .hashfn = bch2_vfs_inode_hash_fn, 209 .obj_hashfn = bch2_vfs_inode_obj_hash_fn, 210 .obj_cmpfn = bch2_vfs_inode_cmp_fn, 211 .automatic_shrinking = true, 212 }; 213 214 static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { 215 .head_offset = offsetof(struct bch_inode_info, by_inum_hash), 216 .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), 217 .key_len = sizeof(u64), 218 .automatic_shrinking = true, 219 }; 220 221 int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) 222 { 223 struct bch_fs *c = trans->c; 224 struct rhltable *ht = &c->vfs_inodes_by_inum_table; 225 u64 inum = p.offset; 226 DARRAY(u32) subvols; 227 int ret = 0; 228 229 if (!test_bit(BCH_FS_started, &c->flags)) 230 return false; 231 232 darray_init(&subvols); 233 restart_from_top: 234 235 /* 236 * Tweaked version of __rhashtable_lookup(); we need to get a list of 237 * subvolumes in which the given inode number is open. 238 * 239 * For this to work, we don't include the subvolume ID in the key that 240 * we hash - all inodes with the same inode number regardless of 241 * subvolume will hash to the same slot. 242 * 243 * This will be less than ideal if the same file is ever open 244 * simultaneously in many different snapshots: 245 */ 246 rcu_read_lock(); 247 struct rhash_lock_head __rcu *const *bkt; 248 struct rhash_head *he; 249 unsigned int hash; 250 struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); 251 restart: 252 hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); 253 bkt = rht_bucket(tbl, hash); 254 do { 255 struct bch_inode_info *inode; 256 257 rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { 258 if (inode->ei_inum.inum == inum) { 259 ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, 260 GFP_NOWAIT|__GFP_NOWARN); 261 if (ret) { 262 rcu_read_unlock(); 263 ret = darray_make_room(&subvols, 1); 264 if (ret) 265 goto err; 266 subvols.nr = 0; 267 goto restart_from_top; 268 } 269 } 270 } 271 /* An object might have been moved to a different hash chain, 272 * while we walk along it - better check and retry. 273 */ 274 } while (he != RHT_NULLS_MARKER(bkt)); 275 276 /* Ensure we see any new tables. */ 277 smp_rmb(); 278 279 tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); 280 if (unlikely(tbl)) 281 goto restart; 282 rcu_read_unlock(); 283 284 darray_for_each(subvols, i) { 285 u32 snap; 286 ret = bch2_subvolume_get_snapshot(trans, *i, &snap); 287 if (ret) 288 goto err; 289 290 ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); 291 if (ret) 292 break; 293 } 294 err: 295 darray_exit(&subvols); 296 return ret; 297 } 298 299 static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 300 { 301 return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); 302 } 303 304 static void __wait_on_freeing_inode(struct bch_fs *c, 305 struct bch_inode_info *inode, 306 subvol_inum inum) 307 { 308 wait_queue_head_t *wq; 309 struct wait_bit_queue_entry wait; 310 311 wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); 312 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 313 spin_unlock(&inode->v.i_lock); 314 315 if (__bch2_inode_hash_find(c, inum) == inode) 316 schedule_timeout(HZ * 10); 317 finish_wait(wq, &wait.wq_entry); 318 } 319 320 static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, 321 subvol_inum inum) 322 { 323 struct bch_inode_info *inode; 324 repeat: 325 inode = __bch2_inode_hash_find(c, inum); 326 if (inode) { 327 spin_lock(&inode->v.i_lock); 328 if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { 329 spin_unlock(&inode->v.i_lock); 330 return NULL; 331 } 332 if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { 333 if (!trans) { 334 __wait_on_freeing_inode(c, inode, inum); 335 } else { 336 bch2_trans_unlock(trans); 337 __wait_on_freeing_inode(c, inode, inum); 338 int ret = bch2_trans_relock(trans); 339 if (ret) 340 return ERR_PTR(ret); 341 } 342 goto repeat; 343 } 344 __iget(&inode->v); 345 spin_unlock(&inode->v.i_lock); 346 } 347 348 return inode; 349 } 350 351 static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) 352 { 353 spin_lock(&inode->v.i_lock); 354 bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); 355 spin_unlock(&inode->v.i_lock); 356 357 if (remove) { 358 int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, 359 &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); 360 BUG_ON(ret); 361 362 ret = rhashtable_remove_fast(&c->vfs_inodes_table, 363 &inode->hash, bch2_vfs_inodes_params); 364 BUG_ON(ret); 365 inode->v.i_hash.pprev = NULL; 366 /* 367 * This pairs with the bch2_inode_hash_find() -> 368 * __wait_on_freeing_inode() path 369 */ 370 inode_wake_up_bit(&inode->v, __I_NEW); 371 } 372 } 373 374 static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, 375 struct btree_trans *trans, 376 struct bch_inode_info *inode) 377 { 378 struct bch_inode_info *old = inode; 379 380 set_bit(EI_INODE_HASHED, &inode->ei_flags); 381 retry: 382 if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, 383 &inode->ei_inum, 384 &inode->hash, 385 bch2_vfs_inodes_params))) { 386 old = bch2_inode_hash_find(c, trans, inode->ei_inum); 387 if (!old) 388 goto retry; 389 390 clear_bit(EI_INODE_HASHED, &inode->ei_flags); 391 392 /* 393 * bcachefs doesn't use I_NEW; we have no use for it since we 394 * only insert fully created inodes in the inode hash table. But 395 * discard_new_inode() expects it to be set... 396 */ 397 inode->v.i_state |= I_NEW; 398 /* 399 * We don't want bch2_evict_inode() to delete the inode on disk, 400 * we just raced and had another inode in cache. Normally new 401 * inodes don't have nlink == 0 - except tmpfiles do... 402 */ 403 set_nlink(&inode->v, 1); 404 discard_new_inode(&inode->v); 405 return old; 406 } else { 407 int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, 408 &inode->by_inum_hash, 409 bch2_vfs_inodes_by_inum_params); 410 BUG_ON(ret); 411 412 inode_fake_hash(&inode->v); 413 414 inode_sb_list_add(&inode->v); 415 416 mutex_lock(&c->vfs_inodes_lock); 417 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 418 mutex_unlock(&c->vfs_inodes_lock); 419 return inode; 420 } 421 } 422 423 #define memalloc_flags_do(_flags, _do) \ 424 ({ \ 425 unsigned _saved_flags = memalloc_flags_save(_flags); \ 426 typeof(_do) _ret = _do; \ 427 memalloc_noreclaim_restore(_saved_flags); \ 428 _ret; \ 429 }) 430 431 static struct inode *bch2_alloc_inode(struct super_block *sb) 432 { 433 BUG(); 434 } 435 436 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) 437 { 438 struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, 439 bch2_inode_cache, gfp); 440 if (!inode) 441 return NULL; 442 443 inode_init_once(&inode->v); 444 mutex_init(&inode->ei_update_lock); 445 two_state_lock_init(&inode->ei_pagecache_lock); 446 INIT_LIST_HEAD(&inode->ei_vfs_inode_list); 447 inode->ei_flags = 0; 448 mutex_init(&inode->ei_quota_lock); 449 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 450 451 if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { 452 kmem_cache_free(bch2_inode_cache, inode); 453 return NULL; 454 } 455 456 return inode; 457 } 458 459 /* 460 * Allocate a new inode, dropping/retaking btree locks if necessary: 461 */ 462 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) 463 { 464 struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); 465 466 if (unlikely(!inode)) { 467 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); 468 if (ret && inode) { 469 __destroy_inode(&inode->v); 470 kmem_cache_free(bch2_inode_cache, inode); 471 } 472 if (ret) 473 return ERR_PTR(ret); 474 } 475 476 return inode; 477 } 478 479 static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, 480 subvol_inum inum, 481 struct bch_inode_unpacked *bi, 482 struct bch_subvolume *subvol) 483 { 484 struct bch_inode_info *inode = bch2_new_inode(trans); 485 if (IS_ERR(inode)) 486 return inode; 487 488 bch2_vfs_inode_init(trans, inum, inode, bi, subvol); 489 490 return bch2_inode_hash_insert(trans->c, trans, inode); 491 492 } 493 494 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) 495 { 496 struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); 497 if (inode) 498 return &inode->v; 499 500 struct btree_trans *trans = bch2_trans_get(c); 501 502 struct bch_inode_unpacked inode_u; 503 struct bch_subvolume subvol; 504 int ret = lockrestart_do(trans, 505 bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 506 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: 507 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 508 bch2_trans_put(trans); 509 510 return ret ? ERR_PTR(ret) : &inode->v; 511 } 512 513 struct bch_inode_info * 514 __bch2_create(struct mnt_idmap *idmap, 515 struct bch_inode_info *dir, struct dentry *dentry, 516 umode_t mode, dev_t rdev, subvol_inum snapshot_src, 517 unsigned flags) 518 { 519 struct bch_fs *c = dir->v.i_sb->s_fs_info; 520 struct btree_trans *trans; 521 struct bch_inode_unpacked dir_u; 522 struct bch_inode_info *inode; 523 struct bch_inode_unpacked inode_u; 524 struct posix_acl *default_acl = NULL, *acl = NULL; 525 subvol_inum inum; 526 struct bch_subvolume subvol; 527 u64 journal_seq = 0; 528 kuid_t kuid; 529 kgid_t kgid; 530 int ret; 531 532 /* 533 * preallocate acls + vfs inode before btree transaction, so that 534 * nothing can fail after the transaction succeeds: 535 */ 536 #ifdef CONFIG_BCACHEFS_POSIX_ACL 537 ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); 538 if (ret) 539 return ERR_PTR(ret); 540 #endif 541 inode = __bch2_new_inode(c, GFP_NOFS); 542 if (unlikely(!inode)) { 543 inode = ERR_PTR(-ENOMEM); 544 goto err; 545 } 546 547 bch2_inode_init_early(c, &inode_u); 548 549 if (!(flags & BCH_CREATE_TMPFILE)) 550 mutex_lock(&dir->ei_update_lock); 551 552 trans = bch2_trans_get(c); 553 retry: 554 bch2_trans_begin(trans); 555 556 kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); 557 kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); 558 ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: 559 bch2_create_trans(trans, 560 inode_inum(dir), &dir_u, &inode_u, 561 !(flags & BCH_CREATE_TMPFILE) 562 ? &dentry->d_name : NULL, 563 from_kuid(i_user_ns(&dir->v), kuid), 564 from_kgid(i_user_ns(&dir->v), kgid), 565 mode, rdev, 566 default_acl, acl, snapshot_src, flags) ?: 567 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, 568 KEY_TYPE_QUOTA_PREALLOC); 569 if (unlikely(ret)) 570 goto err_before_quota; 571 572 inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; 573 inum.inum = inode_u.bi_inum; 574 575 ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 576 bch2_trans_commit(trans, NULL, &journal_seq, 0); 577 if (unlikely(ret)) { 578 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, 579 KEY_TYPE_QUOTA_WARN); 580 err_before_quota: 581 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 582 goto retry; 583 goto err_trans; 584 } 585 586 if (!(flags & BCH_CREATE_TMPFILE)) { 587 bch2_inode_update_after_write(trans, dir, &dir_u, 588 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 589 mutex_unlock(&dir->ei_update_lock); 590 } 591 592 bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 593 594 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 595 set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); 596 597 /* 598 * we must insert the new inode into the inode cache before calling 599 * bch2_trans_exit() and dropping locks, else we could race with another 600 * thread pulling the inode in and modifying it: 601 * 602 * also, calling bch2_inode_hash_insert() without passing in the 603 * transaction object is sketchy - if we could ever end up in 604 * __wait_on_freeing_inode(), we'd risk deadlock. 605 * 606 * But that shouldn't be possible, since we still have the inode locked 607 * that we just created, and we _really_ can't take a transaction 608 * restart here. 609 */ 610 inode = bch2_inode_hash_insert(c, NULL, inode); 611 bch2_trans_put(trans); 612 err: 613 posix_acl_release(default_acl); 614 posix_acl_release(acl); 615 return inode; 616 err_trans: 617 if (!(flags & BCH_CREATE_TMPFILE)) 618 mutex_unlock(&dir->ei_update_lock); 619 620 bch2_trans_put(trans); 621 make_bad_inode(&inode->v); 622 iput(&inode->v); 623 inode = ERR_PTR(ret); 624 goto err; 625 } 626 627 /* methods */ 628 629 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, 630 subvol_inum dir, struct bch_hash_info *dir_hash_info, 631 const struct qstr *name) 632 { 633 struct bch_fs *c = trans->c; 634 struct btree_iter dirent_iter = {}; 635 subvol_inum inum = {}; 636 struct printbuf buf = PRINTBUF; 637 638 struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, 639 dir_hash_info, dir, name, 0); 640 int ret = bkey_err(k); 641 if (ret) 642 return ERR_PTR(ret); 643 644 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 645 646 ret = bch2_dirent_read_target(trans, dir, d, &inum); 647 if (ret > 0) 648 ret = -ENOENT; 649 if (ret) 650 goto err; 651 652 struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); 653 if (inode) 654 goto out; 655 656 /* 657 * Note: if check/repair needs it, we commit before 658 * bch2_inode_hash_init_insert(), as after that point we can't take a 659 * restart - not in the top level loop with a commit_do(), like we 660 * usually do: 661 */ 662 663 struct bch_subvolume subvol; 664 struct bch_inode_unpacked inode_u; 665 ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: 666 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: 667 bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: 668 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: 669 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 670 671 /* 672 * don't remove it: check_inodes might find another inode that points 673 * back to this dirent 674 */ 675 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), 676 c, "dirent to missing inode:\n%s", 677 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); 678 if (ret) 679 goto err; 680 out: 681 bch2_trans_iter_exit(trans, &dirent_iter); 682 printbuf_exit(&buf); 683 return inode; 684 err: 685 inode = ERR_PTR(ret); 686 goto out; 687 } 688 689 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, 690 unsigned int flags) 691 { 692 struct bch_fs *c = vdir->i_sb->s_fs_info; 693 struct bch_inode_info *dir = to_bch_ei(vdir); 694 struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); 695 696 struct bch_inode_info *inode; 697 bch2_trans_do(c, 698 PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), 699 &hash, &dentry->d_name))); 700 if (IS_ERR(inode)) 701 inode = NULL; 702 703 #ifdef CONFIG_UNICODE 704 if (!inode && IS_CASEFOLDED(vdir)) { 705 /* 706 * Do not cache a negative dentry in casefolded directories 707 * as it would need to be invalidated in the following situation: 708 * - Lookup file "blAH" in a casefolded directory 709 * - Creation of file "BLAH" in a casefolded directory 710 * - Lookup file "blAH" in a casefolded directory 711 * which would fail if we had a negative dentry. 712 * 713 * We should come back to this when VFS has a method to handle 714 * this edgecase. 715 */ 716 return NULL; 717 } 718 #endif 719 720 return d_splice_alias(&inode->v, dentry); 721 } 722 723 static int bch2_mknod(struct mnt_idmap *idmap, 724 struct inode *vdir, struct dentry *dentry, 725 umode_t mode, dev_t rdev) 726 { 727 struct bch_inode_info *inode = 728 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 729 (subvol_inum) { 0 }, 0); 730 731 if (IS_ERR(inode)) 732 return bch2_err_class(PTR_ERR(inode)); 733 734 d_instantiate(dentry, &inode->v); 735 return 0; 736 } 737 738 static int bch2_create(struct mnt_idmap *idmap, 739 struct inode *vdir, struct dentry *dentry, 740 umode_t mode, bool excl) 741 { 742 return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); 743 } 744 745 static int __bch2_link(struct bch_fs *c, 746 struct bch_inode_info *inode, 747 struct bch_inode_info *dir, 748 struct dentry *dentry) 749 { 750 struct bch_inode_unpacked dir_u, inode_u; 751 int ret; 752 753 mutex_lock(&inode->ei_update_lock); 754 struct btree_trans *trans = bch2_trans_get(c); 755 756 ret = commit_do(trans, NULL, NULL, 0, 757 bch2_link_trans(trans, 758 inode_inum(dir), &dir_u, 759 inode_inum(inode), &inode_u, 760 &dentry->d_name)); 761 762 if (likely(!ret)) { 763 bch2_inode_update_after_write(trans, dir, &dir_u, 764 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 765 bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); 766 } 767 768 bch2_trans_put(trans); 769 mutex_unlock(&inode->ei_update_lock); 770 return ret; 771 } 772 773 static int bch2_link(struct dentry *old_dentry, struct inode *vdir, 774 struct dentry *dentry) 775 { 776 struct bch_fs *c = vdir->i_sb->s_fs_info; 777 struct bch_inode_info *dir = to_bch_ei(vdir); 778 struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); 779 int ret; 780 781 lockdep_assert_held(&inode->v.i_rwsem); 782 783 ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 784 bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 785 __bch2_link(c, inode, dir, dentry); 786 if (unlikely(ret)) 787 return bch2_err_class(ret); 788 789 ihold(&inode->v); 790 d_instantiate(dentry, &inode->v); 791 return 0; 792 } 793 794 int __bch2_unlink(struct inode *vdir, struct dentry *dentry, 795 bool deleting_snapshot) 796 { 797 struct bch_fs *c = vdir->i_sb->s_fs_info; 798 struct bch_inode_info *dir = to_bch_ei(vdir); 799 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 800 struct bch_inode_unpacked dir_u, inode_u; 801 int ret; 802 803 bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); 804 805 struct btree_trans *trans = bch2_trans_get(c); 806 807 ret = commit_do(trans, NULL, NULL, 808 BCH_TRANS_COMMIT_no_enospc, 809 bch2_unlink_trans(trans, 810 inode_inum(dir), &dir_u, 811 &inode_u, &dentry->d_name, 812 deleting_snapshot)); 813 if (unlikely(ret)) 814 goto err; 815 816 bch2_inode_update_after_write(trans, dir, &dir_u, 817 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 818 bch2_inode_update_after_write(trans, inode, &inode_u, 819 ATTR_MTIME); 820 821 if (inode_u.bi_subvol) { 822 /* 823 * Subvolume deletion is asynchronous, but we still want to tell 824 * the VFS that it's been deleted here: 825 */ 826 set_nlink(&inode->v, 0); 827 } 828 err: 829 bch2_trans_put(trans); 830 bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); 831 832 return ret; 833 } 834 835 static int bch2_unlink(struct inode *vdir, struct dentry *dentry) 836 { 837 struct bch_inode_info *dir= to_bch_ei(vdir); 838 struct bch_fs *c = dir->v.i_sb->s_fs_info; 839 840 int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 841 __bch2_unlink(vdir, dentry, false); 842 return bch2_err_class(ret); 843 } 844 845 static int bch2_symlink(struct mnt_idmap *idmap, 846 struct inode *vdir, struct dentry *dentry, 847 const char *symname) 848 { 849 struct bch_fs *c = vdir->i_sb->s_fs_info; 850 struct bch_inode_info *dir = to_bch_ei(vdir), *inode; 851 int ret; 852 853 inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, 854 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 855 if (IS_ERR(inode)) 856 return bch2_err_class(PTR_ERR(inode)); 857 858 inode_lock(&inode->v); 859 ret = page_symlink(&inode->v, symname, strlen(symname) + 1); 860 inode_unlock(&inode->v); 861 862 if (unlikely(ret)) 863 goto err; 864 865 ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); 866 if (unlikely(ret)) 867 goto err; 868 869 ret = __bch2_link(c, inode, dir, dentry); 870 if (unlikely(ret)) 871 goto err; 872 873 d_instantiate(dentry, &inode->v); 874 return 0; 875 err: 876 iput(&inode->v); 877 return bch2_err_class(ret); 878 } 879 880 static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, 881 struct inode *vdir, struct dentry *dentry, umode_t mode) 882 { 883 return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); 884 } 885 886 static int bch2_rename2(struct mnt_idmap *idmap, 887 struct inode *src_vdir, struct dentry *src_dentry, 888 struct inode *dst_vdir, struct dentry *dst_dentry, 889 unsigned flags) 890 { 891 struct bch_fs *c = src_vdir->i_sb->s_fs_info; 892 struct bch_inode_info *src_dir = to_bch_ei(src_vdir); 893 struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); 894 struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); 895 struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); 896 struct bch_inode_unpacked dst_dir_u, src_dir_u; 897 struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; 898 struct btree_trans *trans; 899 enum bch_rename_mode mode = flags & RENAME_EXCHANGE 900 ? BCH_RENAME_EXCHANGE 901 : dst_dentry->d_inode 902 ? BCH_RENAME_OVERWRITE : BCH_RENAME; 903 bool whiteout = !!(flags & RENAME_WHITEOUT); 904 int ret; 905 906 if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) 907 return -EINVAL; 908 909 if (mode == BCH_RENAME_OVERWRITE) { 910 ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 911 0, LLONG_MAX); 912 if (ret) 913 return ret; 914 } 915 916 bch2_lock_inodes(INODE_UPDATE_LOCK, 917 src_dir, 918 dst_dir, 919 src_inode, 920 dst_inode); 921 922 trans = bch2_trans_get(c); 923 924 ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: 925 bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); 926 if (ret) 927 goto err_tx_restart; 928 929 if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { 930 ret = bch2_fs_quota_transfer(c, src_inode, 931 dst_dir->ei_qid, 932 1 << QTYP_PRJ, 933 KEY_TYPE_QUOTA_PREALLOC); 934 if (ret) 935 goto err; 936 } 937 938 if (mode == BCH_RENAME_EXCHANGE && 939 inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { 940 ret = bch2_fs_quota_transfer(c, dst_inode, 941 src_dir->ei_qid, 942 1 << QTYP_PRJ, 943 KEY_TYPE_QUOTA_PREALLOC); 944 if (ret) 945 goto err; 946 } 947 retry: 948 bch2_trans_begin(trans); 949 950 ret = bch2_rename_trans(trans, 951 inode_inum(src_dir), &src_dir_u, 952 inode_inum(dst_dir), &dst_dir_u, 953 &src_inode_u, 954 &dst_inode_u, 955 &src_dentry->d_name, 956 &dst_dentry->d_name, 957 mode); 958 if (unlikely(ret)) 959 goto err_tx_restart; 960 961 if (whiteout) { 962 whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); 963 ret = PTR_ERR_OR_ZERO(whiteout_inode_u); 964 if (unlikely(ret)) 965 goto err_tx_restart; 966 bch2_inode_init_early(c, whiteout_inode_u); 967 968 ret = bch2_create_trans(trans, 969 inode_inum(src_dir), &src_dir_u, 970 whiteout_inode_u, 971 &src_dentry->d_name, 972 from_kuid(i_user_ns(&src_dir->v), current_fsuid()), 973 from_kgid(i_user_ns(&src_dir->v), current_fsgid()), 974 S_IFCHR|WHITEOUT_MODE, 0, 975 NULL, NULL, (subvol_inum) { 0 }, 0) ?: 976 bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, 977 KEY_TYPE_QUOTA_PREALLOC); 978 if (unlikely(ret)) 979 goto err_tx_restart; 980 } 981 982 ret = bch2_trans_commit(trans, NULL, NULL, 0); 983 if (unlikely(ret)) { 984 err_tx_restart: 985 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 986 goto retry; 987 goto err; 988 } 989 990 BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); 991 BUG_ON(dst_inode && 992 dst_inode->v.i_ino != dst_inode_u.bi_inum); 993 994 bch2_inode_update_after_write(trans, src_dir, &src_dir_u, 995 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 996 997 if (src_dir != dst_dir) 998 bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, 999 ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); 1000 1001 bch2_inode_update_after_write(trans, src_inode, &src_inode_u, 1002 ATTR_CTIME); 1003 1004 if (dst_inode) 1005 bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, 1006 ATTR_CTIME); 1007 err: 1008 bch2_trans_put(trans); 1009 1010 bch2_fs_quota_transfer(c, src_inode, 1011 bch_qid(&src_inode->ei_inode), 1012 1 << QTYP_PRJ, 1013 KEY_TYPE_QUOTA_NOCHECK); 1014 if (dst_inode) 1015 bch2_fs_quota_transfer(c, dst_inode, 1016 bch_qid(&dst_inode->ei_inode), 1017 1 << QTYP_PRJ, 1018 KEY_TYPE_QUOTA_NOCHECK); 1019 1020 bch2_unlock_inodes(INODE_UPDATE_LOCK, 1021 src_dir, 1022 dst_dir, 1023 src_inode, 1024 dst_inode); 1025 1026 return bch2_err_class(ret); 1027 } 1028 1029 static void bch2_setattr_copy(struct mnt_idmap *idmap, 1030 struct bch_inode_info *inode, 1031 struct bch_inode_unpacked *bi, 1032 struct iattr *attr) 1033 { 1034 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1035 unsigned int ia_valid = attr->ia_valid; 1036 kuid_t kuid; 1037 kgid_t kgid; 1038 1039 if (ia_valid & ATTR_UID) { 1040 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 1041 bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); 1042 } 1043 if (ia_valid & ATTR_GID) { 1044 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 1045 bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); 1046 } 1047 1048 if (ia_valid & ATTR_SIZE) 1049 bi->bi_size = attr->ia_size; 1050 1051 if (ia_valid & ATTR_ATIME) 1052 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); 1053 if (ia_valid & ATTR_MTIME) 1054 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); 1055 if (ia_valid & ATTR_CTIME) 1056 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); 1057 1058 if (ia_valid & ATTR_MODE) { 1059 umode_t mode = attr->ia_mode; 1060 kgid_t gid = ia_valid & ATTR_GID 1061 ? kgid 1062 : inode->v.i_gid; 1063 1064 if (!in_group_or_capable(idmap, &inode->v, 1065 make_vfsgid(idmap, i_user_ns(&inode->v), gid))) 1066 mode &= ~S_ISGID; 1067 bi->bi_mode = mode; 1068 } 1069 } 1070 1071 int bch2_setattr_nonsize(struct mnt_idmap *idmap, 1072 struct bch_inode_info *inode, 1073 struct iattr *attr) 1074 { 1075 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1076 struct bch_qid qid; 1077 struct btree_trans *trans; 1078 struct btree_iter inode_iter = {}; 1079 struct bch_inode_unpacked inode_u; 1080 struct posix_acl *acl = NULL; 1081 kuid_t kuid; 1082 kgid_t kgid; 1083 int ret; 1084 1085 mutex_lock(&inode->ei_update_lock); 1086 1087 qid = inode->ei_qid; 1088 1089 if (attr->ia_valid & ATTR_UID) { 1090 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 1091 qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); 1092 } 1093 1094 if (attr->ia_valid & ATTR_GID) { 1095 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 1096 qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); 1097 } 1098 1099 ret = bch2_fs_quota_transfer(c, inode, qid, ~0, 1100 KEY_TYPE_QUOTA_PREALLOC); 1101 if (ret) 1102 goto err; 1103 1104 trans = bch2_trans_get(c); 1105 retry: 1106 bch2_trans_begin(trans); 1107 kfree(acl); 1108 acl = NULL; 1109 1110 ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), 1111 BTREE_ITER_intent); 1112 if (ret) 1113 goto btree_err; 1114 1115 bch2_setattr_copy(idmap, inode, &inode_u, attr); 1116 1117 if (attr->ia_valid & ATTR_MODE) { 1118 ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, 1119 inode_u.bi_mode, &acl); 1120 if (ret) 1121 goto btree_err; 1122 } 1123 1124 ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: 1125 bch2_trans_commit(trans, NULL, NULL, 1126 BCH_TRANS_COMMIT_no_enospc); 1127 btree_err: 1128 bch2_trans_iter_exit(trans, &inode_iter); 1129 1130 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1131 goto retry; 1132 if (unlikely(ret)) 1133 goto err_trans; 1134 1135 bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); 1136 1137 if (acl) 1138 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 1139 err_trans: 1140 bch2_trans_put(trans); 1141 err: 1142 mutex_unlock(&inode->ei_update_lock); 1143 1144 return bch2_err_class(ret); 1145 } 1146 1147 static int bch2_getattr(struct mnt_idmap *idmap, 1148 const struct path *path, struct kstat *stat, 1149 u32 request_mask, unsigned query_flags) 1150 { 1151 struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); 1152 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1153 vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); 1154 vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); 1155 1156 stat->dev = inode->v.i_sb->s_dev; 1157 stat->ino = inode->v.i_ino; 1158 stat->mode = inode->v.i_mode; 1159 stat->nlink = inode->v.i_nlink; 1160 stat->uid = vfsuid_into_kuid(vfsuid); 1161 stat->gid = vfsgid_into_kgid(vfsgid); 1162 stat->rdev = inode->v.i_rdev; 1163 stat->size = i_size_read(&inode->v); 1164 stat->atime = inode_get_atime(&inode->v); 1165 stat->mtime = inode_get_mtime(&inode->v); 1166 stat->ctime = inode_get_ctime(&inode->v); 1167 stat->blksize = block_bytes(c); 1168 stat->blocks = inode->v.i_blocks; 1169 1170 stat->subvol = inode->ei_inum.subvol; 1171 stat->result_mask |= STATX_SUBVOL; 1172 1173 if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { 1174 stat->result_mask |= STATX_DIOALIGN; 1175 /* 1176 * this is incorrect; we should be tracking this in superblock, 1177 * and checking the alignment of open devices 1178 */ 1179 stat->dio_mem_align = SECTOR_SIZE; 1180 stat->dio_offset_align = block_bytes(c); 1181 } 1182 1183 if (request_mask & STATX_BTIME) { 1184 stat->result_mask |= STATX_BTIME; 1185 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); 1186 } 1187 1188 if (inode->ei_inode.bi_flags & BCH_INODE_immutable) 1189 stat->attributes |= STATX_ATTR_IMMUTABLE; 1190 stat->attributes_mask |= STATX_ATTR_IMMUTABLE; 1191 1192 if (inode->ei_inode.bi_flags & BCH_INODE_append) 1193 stat->attributes |= STATX_ATTR_APPEND; 1194 stat->attributes_mask |= STATX_ATTR_APPEND; 1195 1196 if (inode->ei_inode.bi_flags & BCH_INODE_nodump) 1197 stat->attributes |= STATX_ATTR_NODUMP; 1198 stat->attributes_mask |= STATX_ATTR_NODUMP; 1199 1200 return 0; 1201 } 1202 1203 static int bch2_setattr(struct mnt_idmap *idmap, 1204 struct dentry *dentry, struct iattr *iattr) 1205 { 1206 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 1207 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1208 int ret; 1209 1210 lockdep_assert_held(&inode->v.i_rwsem); 1211 1212 ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 1213 setattr_prepare(idmap, dentry, iattr); 1214 if (ret) 1215 return ret; 1216 1217 return iattr->ia_valid & ATTR_SIZE 1218 ? bchfs_truncate(idmap, inode, iattr) 1219 : bch2_setattr_nonsize(idmap, inode, iattr); 1220 } 1221 1222 static int bch2_tmpfile(struct mnt_idmap *idmap, 1223 struct inode *vdir, struct file *file, umode_t mode) 1224 { 1225 struct bch_inode_info *inode = 1226 __bch2_create(idmap, to_bch_ei(vdir), 1227 file->f_path.dentry, mode, 0, 1228 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 1229 1230 if (IS_ERR(inode)) 1231 return bch2_err_class(PTR_ERR(inode)); 1232 1233 d_mark_tmpfile(file, &inode->v); 1234 d_instantiate(file->f_path.dentry, &inode->v); 1235 return finish_open_simple(file, 0); 1236 } 1237 1238 static int bch2_fill_extent(struct bch_fs *c, 1239 struct fiemap_extent_info *info, 1240 struct bkey_s_c k, unsigned flags) 1241 { 1242 if (bkey_extent_is_direct_data(k.k)) { 1243 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1244 const union bch_extent_entry *entry; 1245 struct extent_ptr_decoded p; 1246 int ret; 1247 1248 if (k.k->type == KEY_TYPE_reflink_v) 1249 flags |= FIEMAP_EXTENT_SHARED; 1250 1251 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1252 int flags2 = 0; 1253 u64 offset = p.ptr.offset; 1254 1255 if (p.ptr.unwritten) 1256 flags2 |= FIEMAP_EXTENT_UNWRITTEN; 1257 1258 if (p.crc.compression_type) 1259 flags2 |= FIEMAP_EXTENT_ENCODED; 1260 else 1261 offset += p.crc.offset; 1262 1263 if ((offset & (block_sectors(c) - 1)) || 1264 (k.k->size & (block_sectors(c) - 1))) 1265 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; 1266 1267 ret = fiemap_fill_next_extent(info, 1268 bkey_start_offset(k.k) << 9, 1269 offset << 9, 1270 k.k->size << 9, flags|flags2); 1271 if (ret) 1272 return ret; 1273 } 1274 1275 return 0; 1276 } else if (bkey_extent_is_inline_data(k.k)) { 1277 return fiemap_fill_next_extent(info, 1278 bkey_start_offset(k.k) << 9, 1279 0, k.k->size << 9, 1280 flags| 1281 FIEMAP_EXTENT_DATA_INLINE); 1282 } else if (k.k->type == KEY_TYPE_reservation) { 1283 return fiemap_fill_next_extent(info, 1284 bkey_start_offset(k.k) << 9, 1285 0, k.k->size << 9, 1286 flags| 1287 FIEMAP_EXTENT_DELALLOC| 1288 FIEMAP_EXTENT_UNWRITTEN); 1289 } else { 1290 BUG(); 1291 } 1292 } 1293 1294 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, 1295 u64 start, u64 len) 1296 { 1297 struct bch_fs *c = vinode->i_sb->s_fs_info; 1298 struct bch_inode_info *ei = to_bch_ei(vinode); 1299 struct btree_trans *trans; 1300 struct btree_iter iter; 1301 struct bkey_s_c k; 1302 struct bkey_buf cur, prev; 1303 bool have_extent = false; 1304 int ret = 0; 1305 1306 ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); 1307 if (ret) 1308 return ret; 1309 1310 struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); 1311 if (start + len < start) 1312 return -EINVAL; 1313 1314 start >>= 9; 1315 1316 bch2_bkey_buf_init(&cur); 1317 bch2_bkey_buf_init(&prev); 1318 trans = bch2_trans_get(c); 1319 1320 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1321 POS(ei->v.i_ino, start), 0); 1322 1323 while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 1324 enum btree_id data_btree = BTREE_ID_extents; 1325 1326 bch2_trans_begin(trans); 1327 1328 u32 snapshot; 1329 ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); 1330 if (ret) 1331 continue; 1332 1333 bch2_btree_iter_set_snapshot(trans, &iter, snapshot); 1334 1335 k = bch2_btree_iter_peek_max(trans, &iter, end); 1336 ret = bkey_err(k); 1337 if (ret) 1338 continue; 1339 1340 if (!k.k) 1341 break; 1342 1343 if (!bkey_extent_is_data(k.k) && 1344 k.k->type != KEY_TYPE_reservation) { 1345 bch2_btree_iter_advance(trans, &iter); 1346 continue; 1347 } 1348 1349 s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); 1350 unsigned sectors = k.k->size - offset_into_extent; 1351 1352 bch2_bkey_buf_reassemble(&cur, c, k); 1353 1354 ret = bch2_read_indirect_extent(trans, &data_btree, 1355 &offset_into_extent, &cur); 1356 if (ret) 1357 continue; 1358 1359 k = bkey_i_to_s_c(cur.k); 1360 bch2_bkey_buf_realloc(&prev, c, k.k->u64s); 1361 1362 sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); 1363 1364 bch2_cut_front(POS(k.k->p.inode, 1365 bkey_start_offset(k.k) + 1366 offset_into_extent), 1367 cur.k); 1368 bch2_key_resize(&cur.k->k, sectors); 1369 cur.k->k.p = iter.pos; 1370 cur.k->k.p.offset += cur.k->k.size; 1371 1372 if (have_extent) { 1373 bch2_trans_unlock(trans); 1374 ret = bch2_fill_extent(c, info, 1375 bkey_i_to_s_c(prev.k), 0); 1376 if (ret) 1377 break; 1378 } 1379 1380 bkey_copy(prev.k, cur.k); 1381 have_extent = true; 1382 1383 bch2_btree_iter_set_pos(trans, &iter, 1384 POS(iter.pos.inode, iter.pos.offset + sectors)); 1385 } 1386 bch2_trans_iter_exit(trans, &iter); 1387 1388 if (!ret && have_extent) { 1389 bch2_trans_unlock(trans); 1390 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 1391 FIEMAP_EXTENT_LAST); 1392 } 1393 1394 bch2_trans_put(trans); 1395 bch2_bkey_buf_exit(&cur, c); 1396 bch2_bkey_buf_exit(&prev, c); 1397 return ret < 0 ? ret : 0; 1398 } 1399 1400 static const struct vm_operations_struct bch_vm_ops = { 1401 .fault = bch2_page_fault, 1402 .map_pages = filemap_map_pages, 1403 .page_mkwrite = bch2_page_mkwrite, 1404 }; 1405 1406 static int bch2_mmap(struct file *file, struct vm_area_struct *vma) 1407 { 1408 file_accessed(file); 1409 1410 vma->vm_ops = &bch_vm_ops; 1411 return 0; 1412 } 1413 1414 /* Directories: */ 1415 1416 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) 1417 { 1418 return generic_file_llseek_size(file, offset, whence, 1419 S64_MAX, S64_MAX); 1420 } 1421 1422 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) 1423 { 1424 struct bch_inode_info *inode = file_bch_inode(file); 1425 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1426 1427 if (!dir_emit_dots(file, ctx)) 1428 return 0; 1429 1430 int ret = bch2_readdir(c, inode_inum(inode), ctx); 1431 1432 bch_err_fn(c, ret); 1433 return bch2_err_class(ret); 1434 } 1435 1436 static int bch2_open(struct inode *vinode, struct file *file) 1437 { 1438 if (file->f_flags & (O_WRONLY|O_RDWR)) { 1439 struct bch_inode_info *inode = to_bch_ei(vinode); 1440 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1441 1442 int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); 1443 if (ret) 1444 return ret; 1445 } 1446 1447 file->f_mode |= FMODE_CAN_ODIRECT; 1448 1449 return generic_file_open(vinode, file); 1450 } 1451 1452 static const struct file_operations bch_file_operations = { 1453 .open = bch2_open, 1454 .llseek = bch2_llseek, 1455 .read_iter = bch2_read_iter, 1456 .write_iter = bch2_write_iter, 1457 .mmap = bch2_mmap, 1458 .get_unmapped_area = thp_get_unmapped_area, 1459 .fsync = bch2_fsync, 1460 .splice_read = filemap_splice_read, 1461 .splice_write = iter_file_splice_write, 1462 .fallocate = bch2_fallocate_dispatch, 1463 .unlocked_ioctl = bch2_fs_file_ioctl, 1464 #ifdef CONFIG_COMPAT 1465 .compat_ioctl = bch2_compat_fs_ioctl, 1466 #endif 1467 .remap_file_range = bch2_remap_file_range, 1468 }; 1469 1470 static const struct inode_operations bch_file_inode_operations = { 1471 .getattr = bch2_getattr, 1472 .setattr = bch2_setattr, 1473 .fiemap = bch2_fiemap, 1474 .listxattr = bch2_xattr_list, 1475 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1476 .get_inode_acl = bch2_get_acl, 1477 .set_acl = bch2_set_acl, 1478 #endif 1479 }; 1480 1481 static const struct inode_operations bch_dir_inode_operations = { 1482 .lookup = bch2_lookup, 1483 .create = bch2_create, 1484 .link = bch2_link, 1485 .unlink = bch2_unlink, 1486 .symlink = bch2_symlink, 1487 .mkdir = bch2_mkdir, 1488 .rmdir = bch2_unlink, 1489 .mknod = bch2_mknod, 1490 .rename = bch2_rename2, 1491 .getattr = bch2_getattr, 1492 .setattr = bch2_setattr, 1493 .tmpfile = bch2_tmpfile, 1494 .listxattr = bch2_xattr_list, 1495 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1496 .get_inode_acl = bch2_get_acl, 1497 .set_acl = bch2_set_acl, 1498 #endif 1499 }; 1500 1501 static const struct file_operations bch_dir_file_operations = { 1502 .llseek = bch2_dir_llseek, 1503 .read = generic_read_dir, 1504 .iterate_shared = bch2_vfs_readdir, 1505 .fsync = bch2_fsync, 1506 .unlocked_ioctl = bch2_fs_file_ioctl, 1507 #ifdef CONFIG_COMPAT 1508 .compat_ioctl = bch2_compat_fs_ioctl, 1509 #endif 1510 }; 1511 1512 static const struct inode_operations bch_symlink_inode_operations = { 1513 .get_link = page_get_link, 1514 .getattr = bch2_getattr, 1515 .setattr = bch2_setattr, 1516 .listxattr = bch2_xattr_list, 1517 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1518 .get_inode_acl = bch2_get_acl, 1519 .set_acl = bch2_set_acl, 1520 #endif 1521 }; 1522 1523 static const struct inode_operations bch_special_inode_operations = { 1524 .getattr = bch2_getattr, 1525 .setattr = bch2_setattr, 1526 .listxattr = bch2_xattr_list, 1527 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1528 .get_inode_acl = bch2_get_acl, 1529 .set_acl = bch2_set_acl, 1530 #endif 1531 }; 1532 1533 static const struct address_space_operations bch_address_space_operations = { 1534 .read_folio = bch2_read_folio, 1535 .writepages = bch2_writepages, 1536 .readahead = bch2_readahead, 1537 .dirty_folio = filemap_dirty_folio, 1538 .write_begin = bch2_write_begin, 1539 .write_end = bch2_write_end, 1540 .invalidate_folio = bch2_invalidate_folio, 1541 .release_folio = bch2_release_folio, 1542 #ifdef CONFIG_MIGRATION 1543 .migrate_folio = filemap_migrate_folio, 1544 #endif 1545 .error_remove_folio = generic_error_remove_folio, 1546 }; 1547 1548 struct bcachefs_fid { 1549 u64 inum; 1550 u32 subvol; 1551 u32 gen; 1552 } __packed; 1553 1554 struct bcachefs_fid_with_parent { 1555 struct bcachefs_fid fid; 1556 struct bcachefs_fid dir; 1557 } __packed; 1558 1559 static int bcachefs_fid_valid(int fh_len, int fh_type) 1560 { 1561 switch (fh_type) { 1562 case FILEID_BCACHEFS_WITHOUT_PARENT: 1563 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); 1564 case FILEID_BCACHEFS_WITH_PARENT: 1565 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); 1566 default: 1567 return false; 1568 } 1569 } 1570 1571 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) 1572 { 1573 return (struct bcachefs_fid) { 1574 .inum = inode->ei_inum.inum, 1575 .subvol = inode->ei_inum.subvol, 1576 .gen = inode->ei_inode.bi_generation, 1577 }; 1578 } 1579 1580 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, 1581 struct inode *vdir) 1582 { 1583 struct bch_inode_info *inode = to_bch_ei(vinode); 1584 struct bch_inode_info *dir = to_bch_ei(vdir); 1585 int min_len; 1586 1587 if (!S_ISDIR(inode->v.i_mode) && dir) { 1588 struct bcachefs_fid_with_parent *fid = (void *) fh; 1589 1590 min_len = sizeof(*fid) / sizeof(u32); 1591 if (*len < min_len) { 1592 *len = min_len; 1593 return FILEID_INVALID; 1594 } 1595 1596 fid->fid = bch2_inode_to_fid(inode); 1597 fid->dir = bch2_inode_to_fid(dir); 1598 1599 *len = min_len; 1600 return FILEID_BCACHEFS_WITH_PARENT; 1601 } else { 1602 struct bcachefs_fid *fid = (void *) fh; 1603 1604 min_len = sizeof(*fid) / sizeof(u32); 1605 if (*len < min_len) { 1606 *len = min_len; 1607 return FILEID_INVALID; 1608 } 1609 *fid = bch2_inode_to_fid(inode); 1610 1611 *len = min_len; 1612 return FILEID_BCACHEFS_WITHOUT_PARENT; 1613 } 1614 } 1615 1616 static struct inode *bch2_nfs_get_inode(struct super_block *sb, 1617 struct bcachefs_fid fid) 1618 { 1619 struct bch_fs *c = sb->s_fs_info; 1620 struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { 1621 .subvol = fid.subvol, 1622 .inum = fid.inum, 1623 }); 1624 if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { 1625 iput(vinode); 1626 vinode = ERR_PTR(-ESTALE); 1627 } 1628 return vinode; 1629 } 1630 1631 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, 1632 int fh_len, int fh_type) 1633 { 1634 struct bcachefs_fid *fid = (void *) _fid; 1635 1636 if (!bcachefs_fid_valid(fh_len, fh_type)) 1637 return NULL; 1638 1639 return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); 1640 } 1641 1642 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, 1643 int fh_len, int fh_type) 1644 { 1645 struct bcachefs_fid_with_parent *fid = (void *) _fid; 1646 1647 if (!bcachefs_fid_valid(fh_len, fh_type) || 1648 fh_type != FILEID_BCACHEFS_WITH_PARENT) 1649 return NULL; 1650 1651 return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); 1652 } 1653 1654 static struct dentry *bch2_get_parent(struct dentry *child) 1655 { 1656 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1657 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1658 subvol_inum parent_inum = { 1659 .subvol = inode->ei_inode.bi_parent_subvol ?: 1660 inode->ei_inum.subvol, 1661 .inum = inode->ei_inode.bi_dir, 1662 }; 1663 1664 return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); 1665 } 1666 1667 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) 1668 { 1669 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1670 struct bch_inode_info *dir = to_bch_ei(parent->d_inode); 1671 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1672 struct btree_trans *trans; 1673 struct btree_iter iter1; 1674 struct btree_iter iter2; 1675 struct bkey_s_c k; 1676 struct bkey_s_c_dirent d; 1677 struct bch_inode_unpacked inode_u; 1678 subvol_inum target; 1679 u32 snapshot; 1680 struct qstr dirent_name; 1681 unsigned name_len = 0; 1682 int ret; 1683 1684 if (!S_ISDIR(dir->v.i_mode)) 1685 return -EINVAL; 1686 1687 trans = bch2_trans_get(c); 1688 1689 bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, 1690 POS(dir->ei_inode.bi_inum, 0), 0); 1691 bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, 1692 POS(dir->ei_inode.bi_inum, 0), 0); 1693 retry: 1694 bch2_trans_begin(trans); 1695 1696 ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); 1697 if (ret) 1698 goto err; 1699 1700 bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); 1701 bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); 1702 1703 ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); 1704 if (ret) 1705 goto err; 1706 1707 if (inode_u.bi_dir == dir->ei_inode.bi_inum) { 1708 bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); 1709 1710 k = bch2_btree_iter_peek_slot(trans, &iter1); 1711 ret = bkey_err(k); 1712 if (ret) 1713 goto err; 1714 1715 if (k.k->type != KEY_TYPE_dirent) { 1716 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1717 goto err; 1718 } 1719 1720 d = bkey_s_c_to_dirent(k); 1721 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1722 if (ret > 0) 1723 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1724 if (ret) 1725 goto err; 1726 1727 if (subvol_inum_eq(target, inode->ei_inum)) 1728 goto found; 1729 } else { 1730 /* 1731 * File with multiple hardlinks and our backref is to the wrong 1732 * directory - linear search: 1733 */ 1734 for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { 1735 if (k.k->p.inode > dir->ei_inode.bi_inum) 1736 break; 1737 1738 if (k.k->type != KEY_TYPE_dirent) 1739 continue; 1740 1741 d = bkey_s_c_to_dirent(k); 1742 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1743 if (ret < 0) 1744 break; 1745 if (ret) 1746 continue; 1747 1748 if (subvol_inum_eq(target, inode->ei_inum)) 1749 goto found; 1750 } 1751 } 1752 1753 ret = -ENOENT; 1754 goto err; 1755 found: 1756 dirent_name = bch2_dirent_get_name(d); 1757 1758 name_len = min_t(unsigned, dirent_name.len, NAME_MAX); 1759 memcpy(name, dirent_name.name, name_len); 1760 name[name_len] = '\0'; 1761 err: 1762 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1763 goto retry; 1764 1765 bch2_trans_iter_exit(trans, &iter1); 1766 bch2_trans_iter_exit(trans, &iter2); 1767 bch2_trans_put(trans); 1768 1769 return ret; 1770 } 1771 1772 static const struct export_operations bch_export_ops = { 1773 .encode_fh = bch2_encode_fh, 1774 .fh_to_dentry = bch2_fh_to_dentry, 1775 .fh_to_parent = bch2_fh_to_parent, 1776 .get_parent = bch2_get_parent, 1777 .get_name = bch2_get_name, 1778 }; 1779 1780 static void bch2_vfs_inode_init(struct btree_trans *trans, 1781 subvol_inum inum, 1782 struct bch_inode_info *inode, 1783 struct bch_inode_unpacked *bi, 1784 struct bch_subvolume *subvol) 1785 { 1786 inode->v.i_ino = inum.inum; 1787 inode->ei_inum = inum; 1788 inode->ei_inode.bi_inum = inum.inum; 1789 bch2_inode_update_after_write(trans, inode, bi, ~0); 1790 1791 inode->v.i_blocks = bi->bi_sectors; 1792 inode->v.i_rdev = bi->bi_dev; 1793 inode->v.i_generation = bi->bi_generation; 1794 inode->v.i_size = bi->bi_size; 1795 1796 inode->ei_flags = 0; 1797 inode->ei_quota_reserved = 0; 1798 inode->ei_qid = bch_qid(bi); 1799 1800 if (BCH_SUBVOLUME_SNAP(subvol)) 1801 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); 1802 1803 inode->v.i_mapping->a_ops = &bch_address_space_operations; 1804 1805 switch (inode->v.i_mode & S_IFMT) { 1806 case S_IFREG: 1807 inode->v.i_op = &bch_file_inode_operations; 1808 inode->v.i_fop = &bch_file_operations; 1809 break; 1810 case S_IFDIR: 1811 inode->v.i_op = &bch_dir_inode_operations; 1812 inode->v.i_fop = &bch_dir_file_operations; 1813 break; 1814 case S_IFLNK: 1815 inode_nohighmem(&inode->v); 1816 inode->v.i_op = &bch_symlink_inode_operations; 1817 break; 1818 default: 1819 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); 1820 inode->v.i_op = &bch_special_inode_operations; 1821 break; 1822 } 1823 1824 mapping_set_folio_min_order(inode->v.i_mapping, 1825 get_order(trans->c->opts.block_size)); 1826 } 1827 1828 static void bch2_free_inode(struct inode *vinode) 1829 { 1830 kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); 1831 } 1832 1833 static int inode_update_times_fn(struct btree_trans *trans, 1834 struct bch_inode_info *inode, 1835 struct bch_inode_unpacked *bi, 1836 void *p) 1837 { 1838 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1839 1840 bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); 1841 bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); 1842 bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); 1843 1844 return 0; 1845 } 1846 1847 static int bch2_vfs_write_inode(struct inode *vinode, 1848 struct writeback_control *wbc) 1849 { 1850 struct bch_fs *c = vinode->i_sb->s_fs_info; 1851 struct bch_inode_info *inode = to_bch_ei(vinode); 1852 int ret; 1853 1854 mutex_lock(&inode->ei_update_lock); 1855 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 1856 ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); 1857 mutex_unlock(&inode->ei_update_lock); 1858 1859 return bch2_err_class(ret); 1860 } 1861 1862 static void bch2_evict_inode(struct inode *vinode) 1863 { 1864 struct bch_fs *c = vinode->i_sb->s_fs_info; 1865 struct bch_inode_info *inode = to_bch_ei(vinode); 1866 bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); 1867 1868 /* 1869 * evict() has waited for outstanding writeback, we'll do no more IO 1870 * through this inode: it's safe to remove from VFS inode hashtable here 1871 * 1872 * Do that now so that other threads aren't blocked from pulling it back 1873 * in, there's no reason for them to be: 1874 */ 1875 if (!delete) 1876 bch2_inode_hash_remove(c, inode); 1877 1878 truncate_inode_pages_final(&inode->v.i_data); 1879 1880 clear_inode(&inode->v); 1881 1882 BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); 1883 1884 if (delete) { 1885 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), 1886 KEY_TYPE_QUOTA_WARN); 1887 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, 1888 KEY_TYPE_QUOTA_WARN); 1889 bch2_inode_rm(c, inode_inum(inode)); 1890 1891 /* 1892 * If we are deleting, we need it present in the vfs hash table 1893 * so that fsck can check if unlinked inodes are still open: 1894 */ 1895 bch2_inode_hash_remove(c, inode); 1896 } 1897 1898 mutex_lock(&c->vfs_inodes_lock); 1899 list_del_init(&inode->ei_vfs_inode_list); 1900 mutex_unlock(&c->vfs_inodes_lock); 1901 } 1902 1903 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) 1904 { 1905 struct bch_inode_info *inode; 1906 DARRAY(struct bch_inode_info *) grabbed; 1907 bool clean_pass = false, this_pass_clean; 1908 1909 /* 1910 * Initially, we scan for inodes without I_DONTCACHE, then mark them to 1911 * be pruned with d_mark_dontcache(). 1912 * 1913 * Once we've had a clean pass where we didn't find any inodes without 1914 * I_DONTCACHE, we wait for them to be freed: 1915 */ 1916 1917 darray_init(&grabbed); 1918 darray_make_room(&grabbed, 1024); 1919 again: 1920 cond_resched(); 1921 this_pass_clean = true; 1922 1923 mutex_lock(&c->vfs_inodes_lock); 1924 list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { 1925 if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) 1926 continue; 1927 1928 if (!(inode->v.i_state & I_DONTCACHE) && 1929 !(inode->v.i_state & I_FREEING) && 1930 igrab(&inode->v)) { 1931 this_pass_clean = false; 1932 1933 if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { 1934 iput(&inode->v); 1935 break; 1936 } 1937 } else if (clean_pass && this_pass_clean) { 1938 struct wait_bit_queue_entry wqe; 1939 struct wait_queue_head *wq_head; 1940 1941 wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); 1942 prepare_to_wait_event(wq_head, &wqe.wq_entry, 1943 TASK_UNINTERRUPTIBLE); 1944 mutex_unlock(&c->vfs_inodes_lock); 1945 1946 schedule(); 1947 finish_wait(wq_head, &wqe.wq_entry); 1948 goto again; 1949 } 1950 } 1951 mutex_unlock(&c->vfs_inodes_lock); 1952 1953 darray_for_each(grabbed, i) { 1954 inode = *i; 1955 d_mark_dontcache(&inode->v); 1956 d_prune_aliases(&inode->v); 1957 iput(&inode->v); 1958 } 1959 grabbed.nr = 0; 1960 1961 if (!clean_pass || !this_pass_clean) { 1962 clean_pass = this_pass_clean; 1963 goto again; 1964 } 1965 1966 darray_exit(&grabbed); 1967 } 1968 1969 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) 1970 { 1971 struct super_block *sb = dentry->d_sb; 1972 struct bch_fs *c = sb->s_fs_info; 1973 struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); 1974 unsigned shift = sb->s_blocksize_bits - 9; 1975 /* 1976 * this assumes inodes take up 64 bytes, which is a decent average 1977 * number: 1978 */ 1979 u64 avail_inodes = ((usage.capacity - usage.used) << 3); 1980 1981 buf->f_type = BCACHEFS_STATFS_MAGIC; 1982 buf->f_bsize = sb->s_blocksize; 1983 buf->f_blocks = usage.capacity >> shift; 1984 buf->f_bfree = usage.free >> shift; 1985 buf->f_bavail = avail_factor(usage.free) >> shift; 1986 1987 buf->f_files = usage.nr_inodes + avail_inodes; 1988 buf->f_ffree = avail_inodes; 1989 1990 buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); 1991 buf->f_namelen = BCH_NAME_MAX; 1992 1993 return 0; 1994 } 1995 1996 static int bch2_sync_fs(struct super_block *sb, int wait) 1997 { 1998 struct bch_fs *c = sb->s_fs_info; 1999 int ret; 2000 2001 trace_bch2_sync_fs(sb, wait); 2002 2003 if (c->opts.journal_flush_disabled) 2004 return 0; 2005 2006 if (!wait) { 2007 bch2_journal_flush_async(&c->journal, NULL); 2008 return 0; 2009 } 2010 2011 ret = bch2_journal_flush(&c->journal); 2012 return bch2_err_class(ret); 2013 } 2014 2015 static struct bch_fs *bch2_path_to_fs(const char *path) 2016 { 2017 struct bch_fs *c; 2018 dev_t dev; 2019 int ret; 2020 2021 ret = lookup_bdev(path, &dev); 2022 if (ret) 2023 return ERR_PTR(ret); 2024 2025 c = bch2_dev_to_fs(dev); 2026 if (c) 2027 closure_put(&c->cl); 2028 return c ?: ERR_PTR(-ENOENT); 2029 } 2030 2031 static int bch2_show_devname(struct seq_file *seq, struct dentry *root) 2032 { 2033 struct bch_fs *c = root->d_sb->s_fs_info; 2034 bool first = true; 2035 2036 for_each_online_member(c, ca) { 2037 if (!first) 2038 seq_putc(seq, ':'); 2039 first = false; 2040 seq_puts(seq, ca->disk_sb.sb_name); 2041 } 2042 2043 return 0; 2044 } 2045 2046 static int bch2_show_options(struct seq_file *seq, struct dentry *root) 2047 { 2048 struct bch_fs *c = root->d_sb->s_fs_info; 2049 struct printbuf buf = PRINTBUF; 2050 2051 bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, 2052 OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); 2053 printbuf_nul_terminate(&buf); 2054 seq_printf(seq, ",%s", buf.buf); 2055 2056 int ret = buf.allocation_failure ? -ENOMEM : 0; 2057 printbuf_exit(&buf); 2058 return ret; 2059 } 2060 2061 static void bch2_put_super(struct super_block *sb) 2062 { 2063 struct bch_fs *c = sb->s_fs_info; 2064 2065 __bch2_fs_stop(c); 2066 } 2067 2068 /* 2069 * bcachefs doesn't currently integrate intwrite freeze protection but the 2070 * internal write references serve the same purpose. Therefore reuse the 2071 * read-only transition code to perform the quiesce. The caveat is that we don't 2072 * currently have the ability to block tasks that want a write reference while 2073 * the superblock is frozen. This is fine for now, but we should either add 2074 * blocking support or find a way to integrate sb_start_intwrite() and friends. 2075 */ 2076 static int bch2_freeze(struct super_block *sb) 2077 { 2078 struct bch_fs *c = sb->s_fs_info; 2079 2080 down_write(&c->state_lock); 2081 bch2_fs_read_only(c); 2082 up_write(&c->state_lock); 2083 return 0; 2084 } 2085 2086 static int bch2_unfreeze(struct super_block *sb) 2087 { 2088 struct bch_fs *c = sb->s_fs_info; 2089 int ret; 2090 2091 if (test_bit(BCH_FS_emergency_ro, &c->flags)) 2092 return 0; 2093 2094 down_write(&c->state_lock); 2095 ret = bch2_fs_read_write(c); 2096 up_write(&c->state_lock); 2097 return ret; 2098 } 2099 2100 static const struct super_operations bch_super_operations = { 2101 .alloc_inode = bch2_alloc_inode, 2102 .free_inode = bch2_free_inode, 2103 .write_inode = bch2_vfs_write_inode, 2104 .evict_inode = bch2_evict_inode, 2105 .sync_fs = bch2_sync_fs, 2106 .statfs = bch2_statfs, 2107 .show_devname = bch2_show_devname, 2108 .show_options = bch2_show_options, 2109 .put_super = bch2_put_super, 2110 .freeze_fs = bch2_freeze, 2111 .unfreeze_fs = bch2_unfreeze, 2112 }; 2113 2114 static int bch2_set_super(struct super_block *s, void *data) 2115 { 2116 s->s_fs_info = data; 2117 return 0; 2118 } 2119 2120 static int bch2_noset_super(struct super_block *s, void *data) 2121 { 2122 return -EBUSY; 2123 } 2124 2125 typedef DARRAY(struct bch_fs *) darray_fs; 2126 2127 static int bch2_test_super(struct super_block *s, void *data) 2128 { 2129 struct bch_fs *c = s->s_fs_info; 2130 darray_fs *d = data; 2131 2132 if (!c) 2133 return false; 2134 2135 darray_for_each(*d, i) 2136 if (c != *i) 2137 return false; 2138 return true; 2139 } 2140 2141 static int bch2_fs_get_tree(struct fs_context *fc) 2142 { 2143 struct bch_fs *c; 2144 struct super_block *sb; 2145 struct inode *vinode; 2146 struct bch2_opts_parse *opts_parse = fc->fs_private; 2147 struct bch_opts opts = opts_parse->opts; 2148 darray_str devs; 2149 darray_fs devs_to_fs = {}; 2150 int ret; 2151 2152 opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); 2153 opt_set(opts, nostart, true); 2154 2155 if (!fc->source || strlen(fc->source) == 0) 2156 return -EINVAL; 2157 2158 ret = bch2_split_devs(fc->source, &devs); 2159 if (ret) 2160 return ret; 2161 2162 darray_for_each(devs, i) { 2163 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); 2164 if (ret) 2165 goto err; 2166 } 2167 2168 sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); 2169 if (!IS_ERR(sb)) 2170 goto got_sb; 2171 2172 c = bch2_fs_open(devs.data, devs.nr, opts); 2173 ret = PTR_ERR_OR_ZERO(c); 2174 if (ret) 2175 goto err; 2176 2177 if (opt_defined(opts, discard)) 2178 set_bit(BCH_FS_discard_mount_opt_set, &c->flags); 2179 2180 /* Some options can't be parsed until after the fs is started: */ 2181 opts = bch2_opts_empty(); 2182 ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); 2183 if (ret) 2184 goto err_stop_fs; 2185 2186 bch2_opts_apply(&c->opts, opts); 2187 2188 /* 2189 * need to initialise sb and set c->vfs_sb _before_ starting fs, 2190 * for blk_holder_ops 2191 */ 2192 2193 sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); 2194 ret = PTR_ERR_OR_ZERO(sb); 2195 if (ret) 2196 goto err_stop_fs; 2197 got_sb: 2198 c = sb->s_fs_info; 2199 2200 if (sb->s_root) { 2201 if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { 2202 ret = -EBUSY; 2203 goto err_put_super; 2204 } 2205 goto out; 2206 } 2207 2208 sb->s_blocksize = block_bytes(c); 2209 sb->s_blocksize_bits = ilog2(block_bytes(c)); 2210 sb->s_maxbytes = MAX_LFS_FILESIZE; 2211 sb->s_op = &bch_super_operations; 2212 sb->s_export_op = &bch_export_ops; 2213 #ifdef CONFIG_BCACHEFS_QUOTA 2214 sb->s_qcop = &bch2_quotactl_operations; 2215 sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; 2216 #endif 2217 sb->s_xattr = bch2_xattr_handlers; 2218 sb->s_magic = BCACHEFS_STATFS_MAGIC; 2219 sb->s_time_gran = c->sb.nsec_per_time_unit; 2220 sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; 2221 sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); 2222 super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); 2223 super_set_sysfs_name_uuid(sb); 2224 sb->s_shrink->seeks = 0; 2225 c->vfs_sb = sb; 2226 strscpy(sb->s_id, c->name, sizeof(sb->s_id)); 2227 2228 ret = super_setup_bdi(sb); 2229 if (ret) 2230 goto err_put_super; 2231 2232 sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 2233 2234 for_each_online_member(c, ca) { 2235 struct block_device *bdev = ca->disk_sb.bdev; 2236 2237 /* XXX: create an anonymous device for multi device filesystems */ 2238 sb->s_bdev = bdev; 2239 sb->s_dev = bdev->bd_dev; 2240 percpu_ref_put(&ca->io_ref[READ]); 2241 break; 2242 } 2243 2244 c->dev = sb->s_dev; 2245 2246 #ifdef CONFIG_BCACHEFS_POSIX_ACL 2247 if (c->opts.acl) 2248 sb->s_flags |= SB_POSIXACL; 2249 #endif 2250 2251 sb->s_shrink->seeks = 0; 2252 2253 ret = bch2_fs_start(c); 2254 if (ret) 2255 goto err_put_super; 2256 2257 vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); 2258 ret = PTR_ERR_OR_ZERO(vinode); 2259 bch_err_msg(c, ret, "mounting: error getting root inode"); 2260 if (ret) 2261 goto err_put_super; 2262 2263 sb->s_root = d_make_root(vinode); 2264 if (!sb->s_root) { 2265 bch_err(c, "error mounting: error allocating root dentry"); 2266 ret = -ENOMEM; 2267 goto err_put_super; 2268 } 2269 2270 sb->s_flags |= SB_ACTIVE; 2271 out: 2272 fc->root = dget(sb->s_root); 2273 err: 2274 darray_exit(&devs_to_fs); 2275 bch2_darray_str_exit(&devs); 2276 if (ret) 2277 pr_err("error: %s", bch2_err_str(ret)); 2278 /* 2279 * On an inconsistency error in recovery we might see an -EROFS derived 2280 * errorcode (from the journal), but we don't want to return that to 2281 * userspace as that causes util-linux to retry the mount RO - which is 2282 * confusing: 2283 */ 2284 if (bch2_err_matches(ret, EROFS) && ret != -EROFS) 2285 ret = -EIO; 2286 return bch2_err_class(ret); 2287 2288 err_stop_fs: 2289 bch2_fs_stop(c); 2290 goto err; 2291 2292 err_put_super: 2293 if (!sb->s_root) 2294 __bch2_fs_stop(c); 2295 deactivate_locked_super(sb); 2296 goto err; 2297 } 2298 2299 static void bch2_kill_sb(struct super_block *sb) 2300 { 2301 struct bch_fs *c = sb->s_fs_info; 2302 2303 generic_shutdown_super(sb); 2304 bch2_fs_free(c); 2305 } 2306 2307 static void bch2_fs_context_free(struct fs_context *fc) 2308 { 2309 struct bch2_opts_parse *opts = fc->fs_private; 2310 2311 if (opts) { 2312 printbuf_exit(&opts->parse_later); 2313 kfree(opts); 2314 } 2315 } 2316 2317 static int bch2_fs_parse_param(struct fs_context *fc, 2318 struct fs_parameter *param) 2319 { 2320 /* 2321 * the "source" param, i.e., the name of the device(s) to mount, 2322 * is handled by the VFS layer. 2323 */ 2324 if (!strcmp(param->key, "source")) 2325 return -ENOPARAM; 2326 2327 struct bch2_opts_parse *opts = fc->fs_private; 2328 struct bch_fs *c = NULL; 2329 2330 /* for reconfigure, we already have a struct bch_fs */ 2331 if (fc->root) 2332 c = fc->root->d_sb->s_fs_info; 2333 2334 int ret = bch2_parse_one_mount_opt(c, &opts->opts, 2335 &opts->parse_later, param->key, 2336 param->string); 2337 if (ret) 2338 pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); 2339 2340 return bch2_err_class(ret); 2341 } 2342 2343 static int bch2_fs_reconfigure(struct fs_context *fc) 2344 { 2345 struct super_block *sb = fc->root->d_sb; 2346 struct bch2_opts_parse *opts = fc->fs_private; 2347 struct bch_fs *c = sb->s_fs_info; 2348 int ret = 0; 2349 2350 opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); 2351 2352 if (opts->opts.read_only != c->opts.read_only) { 2353 down_write(&c->state_lock); 2354 2355 if (opts->opts.read_only) { 2356 bch2_fs_read_only(c); 2357 2358 sb->s_flags |= SB_RDONLY; 2359 } else { 2360 ret = bch2_fs_read_write(c); 2361 if (ret) { 2362 bch_err(c, "error going rw: %i", ret); 2363 up_write(&c->state_lock); 2364 ret = -EINVAL; 2365 goto err; 2366 } 2367 2368 sb->s_flags &= ~SB_RDONLY; 2369 } 2370 2371 c->opts.read_only = opts->opts.read_only; 2372 2373 up_write(&c->state_lock); 2374 } 2375 2376 if (opt_defined(opts->opts, errors)) 2377 c->opts.errors = opts->opts.errors; 2378 err: 2379 return bch2_err_class(ret); 2380 } 2381 2382 static const struct fs_context_operations bch2_context_ops = { 2383 .free = bch2_fs_context_free, 2384 .parse_param = bch2_fs_parse_param, 2385 .get_tree = bch2_fs_get_tree, 2386 .reconfigure = bch2_fs_reconfigure, 2387 }; 2388 2389 static int bch2_init_fs_context(struct fs_context *fc) 2390 { 2391 struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); 2392 2393 if (!opts) 2394 return -ENOMEM; 2395 2396 opts->parse_later = PRINTBUF; 2397 2398 fc->ops = &bch2_context_ops; 2399 fc->fs_private = opts; 2400 2401 return 0; 2402 } 2403 2404 void bch2_fs_vfs_exit(struct bch_fs *c) 2405 { 2406 if (c->vfs_inodes_by_inum_table.ht.tbl) 2407 rhltable_destroy(&c->vfs_inodes_by_inum_table); 2408 if (c->vfs_inodes_table.tbl) 2409 rhashtable_destroy(&c->vfs_inodes_table); 2410 } 2411 2412 int bch2_fs_vfs_init(struct bch_fs *c) 2413 { 2414 return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: 2415 rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); 2416 } 2417 2418 static struct file_system_type bcache_fs_type = { 2419 .owner = THIS_MODULE, 2420 .name = "bcachefs", 2421 .init_fs_context = bch2_init_fs_context, 2422 .kill_sb = bch2_kill_sb, 2423 .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, 2424 }; 2425 2426 MODULE_ALIAS_FS("bcachefs"); 2427 2428 void bch2_vfs_exit(void) 2429 { 2430 unregister_filesystem(&bcache_fs_type); 2431 kmem_cache_destroy(bch2_inode_cache); 2432 } 2433 2434 int __init bch2_vfs_init(void) 2435 { 2436 int ret = -ENOMEM; 2437 2438 bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | 2439 SLAB_ACCOUNT); 2440 if (!bch2_inode_cache) 2441 goto err; 2442 2443 ret = register_filesystem(&bcache_fs_type); 2444 if (ret) 2445 goto err; 2446 2447 return 0; 2448 err: 2449 bch2_vfs_exit(); 2450 return ret; 2451 } 2452 2453 #endif /* NO_BCACHEFS_FS */ 2454