// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "acl.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "errcode.h"
#include "extents.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/siphash.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>

static struct kmem_cache *bch2_inode_cache;

static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
				struct bch_inode_info *,
				struct bch_inode_unpacked *,
				struct bch_subvolume *);

/* Set VFS inode flags from bcachefs inode: */
static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
{
	static const __maybe_unused unsigned bch_flags_to_vfs[] = {
		[__BCH_INODE_sync]	= S_SYNC,
		[__BCH_INODE_immutable]	= S_IMMUTABLE,
		[__BCH_INODE_append]	= S_APPEND,
		[__BCH_INODE_noatime]	= S_NOATIME,
	};

	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);

	if (bch2_inode_casefold(c, &inode->ei_inode))
		inode->v.i_flags |= S_CASEFOLD;
	else
		inode->v.i_flags &= ~S_CASEFOLD;
}

void bch2_inode_update_after_write(struct btree_trans *trans,
				   struct bch_inode_info *inode,
				   struct bch_inode_unpacked *bi,
				   unsigned fields)
{
	struct bch_fs *c = trans->c;

	BUG_ON(bi->bi_inum != inode->v.i_ino);

	bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));

	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
	i_uid_write(&inode->v, bi->bi_uid);
	i_gid_write(&inode->v, bi->bi_gid);
	inode->v.i_mode = bi->bi_mode;

	if (fields & ATTR_SIZE)
		i_size_write(&inode->v, bi->bi_size);

	if (fields & ATTR_ATIME)
		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
	if (fields & ATTR_MTIME)
		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
	if (fields & ATTR_CTIME)
		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

	inode->ei_inode = *bi;

	bch2_inode_flags_to_vfs(c, inode);
}
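/*
 * Illustrative note (added commentary, not from the original source): the
 * @fields argument above is a mask of ATTR_* bits naming which VFS-visible
 * fields the caller changed. For example, a rename updates the directory's
 * times and size with ATTR_MTIME|ATTR_CTIME|ATTR_SIZE, while a hardlink only
 * bumps ATTR_CTIME on the target inode. Passing ~0 (as bch2_vfs_inode_init()
 * does) syncs everything from the unpacked bcachefs inode into the VFS inode.
 */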
int __must_check bch2_write_inode(struct bch_fs *c,
				  struct bch_inode_info *inode,
				  inode_set_fn set,
				  void *p, unsigned fields)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = {};
	struct bch_inode_unpacked inode_u;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
	if (ret)
		goto err;

	struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);

	ret = (set ? set(trans, inode, &inode_u, p) : 0);
	if (ret)
		goto err;

	struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);

	if (memcmp(&old_r, &new_r, sizeof(new_r))) {
		ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
		if (ret)
			goto err;
	}

	ret =   bch2_inode_write(trans, &iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

	/*
	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
	 * this is important for inode updates via bchfs_write_index_update
	 */
	if (!ret)
		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
			     "%s: inode %llu:%llu not found when updating",
			     bch2_err_str(ret),
			     inode_inum(inode).subvol,
			     inode_inum(inode).inum);

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}

int bch2_fs_quota_transfer(struct bch_fs *c,
			   struct bch_inode_info *inode,
			   struct bch_qid new_qid,
			   unsigned qtypes,
			   enum quota_acct_mode mode)
{
	unsigned i;
	int ret;

	qtypes &= enabled_qtypes(c);

	for (i = 0; i < QTYP_NR; i++)
		if (new_qid.q[i] == inode->ei_qid.q[i])
			qtypes &= ~(1U << i);

	if (!qtypes)
		return 0;

	mutex_lock(&inode->ei_quota_lock);

	ret = bch2_quota_transfer(c, qtypes, new_qid,
				  inode->ei_qid,
				  inode->v.i_blocks +
				  inode->ei_quota_reserved,
				  mode);
	if (!ret)
		for (i = 0; i < QTYP_NR; i++)
			if (qtypes & (1 << i))
				inode->ei_qid.q[i] = new_qid.q[i];

	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}

static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
	return a.subvol == b.subvol && a.inum == b.inum;
}

static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
	const subvol_inum *inum = data;
	siphash_key_t k = { .key[0] = seed };

	return siphash_2u64(inum->subvol, inum->inum, &k);
}

static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
	const struct bch_inode_info *inode = data;

	return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
}

static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
				 const void *obj)
{
	const struct bch_inode_info *inode = obj;
	const subvol_inum *v = arg->key;

	return !subvol_inum_eq(inode->ei_inum, *v);
}

static const struct rhashtable_params bch2_vfs_inodes_params = {
	.head_offset		= offsetof(struct bch_inode_info, hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
	.key_len		= sizeof(subvol_inum),
	.hashfn			= bch2_vfs_inode_hash_fn,
	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
	.automatic_shrinking	= true,
};

static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
	.head_offset		= offsetof(struct bch_inode_info, by_inum_hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum.inum),
	.key_len		= sizeof(u64),
	.automatic_shrinking	= true,
};
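/*
 * Explanatory note (added commentary): VFS inodes are indexed twice. The
 * primary table hashes the full subvol_inum (subvolume, inode number) pair
 * via siphash, so ordinary lookups are per-subvolume; the secondary rhltable
 * hashes the inode number alone, which is what lets
 * bch2_inode_or_descendents_is_open() below find every subvolume in which a
 * given inode number is currently open.
 */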
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
	struct bch_fs *c = trans->c;
	struct rhltable *ht = &c->vfs_inodes_by_inum_table;
	u64 inum = p.offset;
	DARRAY(u32) subvols;
	int ret = 0;

	if (!test_bit(BCH_FS_started, &c->flags))
		return false;

	darray_init(&subvols);
restart_from_top:

	/*
	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
	 * subvolumes in which the given inode number is open.
	 *
	 * For this to work, we don't include the subvolume ID in the key that
	 * we hash - all inodes with the same inode number regardless of
	 * subvolume will hash to the same slot.
	 *
	 * This will be less than ideal if the same file is ever open
	 * simultaneously in many different snapshots:
	 */
	rcu_read_lock();
	struct rhash_lock_head __rcu *const *bkt;
	struct rhash_head *he;
	unsigned int hash;
	struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
restart:
	hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
	bkt = rht_bucket(tbl, hash);
	do {
		struct bch_inode_info *inode;

		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
			if (inode->ei_inum.inum == inum) {
				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
						      GFP_NOWAIT|__GFP_NOWARN);
				if (ret) {
					rcu_read_unlock();
					ret = darray_make_room(&subvols, 1);
					if (ret)
						goto err;
					subvols.nr = 0;
					goto restart_from_top;
				}
			}
		}
		/* An object might have been moved to a different hash chain,
		 * while we walk along it - better check and retry.
		 */
	} while (he != RHT_NULLS_MARKER(bkt));

	/* Ensure we see any new tables. */
	smp_rmb();

	tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
	if (unlikely(tbl))
		goto restart;
	rcu_read_unlock();

	darray_for_each(subvols, i) {
		u32 snap;
		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
		if (ret)
			goto err;

		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
		if (ret)
			break;
	}
err:
	darray_exit(&subvols);
	return ret;
}

static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}

static void __wait_on_freeing_inode(struct bch_fs *c,
				    struct bch_inode_info *inode,
				    subvol_inum inum)
{
	wait_queue_head_t *wq;
	struct wait_bit_queue_entry wait;

	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->v.i_lock);

	if (__bch2_inode_hash_find(c, inum) == inode)
		schedule_timeout(HZ * 10);
	finish_wait(wq, &wait.wq_entry);
}

static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
						   subvol_inum inum)
{
	struct bch_inode_info *inode;
repeat:
	inode = __bch2_inode_hash_find(c, inum);
	if (inode) {
		spin_lock(&inode->v.i_lock);
		if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
			spin_unlock(&inode->v.i_lock);
			return NULL;
		}
		if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
			if (!trans) {
				__wait_on_freeing_inode(c, inode, inum);
			} else {
				bch2_trans_unlock(trans);
				__wait_on_freeing_inode(c, inode, inum);
				int ret = bch2_trans_relock(trans);
				if (ret)
					return ERR_PTR(ret);
			}
			goto repeat;
		}
		__iget(&inode->v);
		spin_unlock(&inode->v.i_lock);
	}

	return inode;
}
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
	spin_lock(&inode->v.i_lock);
	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
	spin_unlock(&inode->v.i_lock);

	if (remove) {
		int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
					  &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
		BUG_ON(ret);

		ret = rhashtable_remove_fast(&c->vfs_inodes_table,
					     &inode->hash, bch2_vfs_inodes_params);
		BUG_ON(ret);
		inode->v.i_hash.pprev = NULL;
		/*
		 * This pairs with the bch2_inode_hash_find() ->
		 * __wait_on_freeing_inode() path
		 */
		inode_wake_up_bit(&inode->v, __I_NEW);
	}
}

static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
						     struct btree_trans *trans,
						     struct bch_inode_info *inode)
{
	struct bch_inode_info *old = inode;

	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
						  &inode->ei_inum,
						  &inode->hash,
						  bch2_vfs_inodes_params))) {
		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
		if (!old)
			goto retry;

		clear_bit(EI_INODE_HASHED, &inode->ei_flags);

		/*
		 * bcachefs doesn't use I_NEW; we have no use for it since we
		 * only insert fully created inodes in the inode hash table. But
		 * discard_new_inode() expects it to be set...
		 */
		inode->v.i_state |= I_NEW;
		/*
		 * We don't want bch2_evict_inode() to delete the inode on disk,
		 * we just raced and had another inode in cache. Normally new
		 * inodes don't have nlink == 0 - except tmpfiles do...
		 */
		set_nlink(&inode->v, 1);
		discard_new_inode(&inode->v);
		return old;
	} else {
		int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
					  &inode->by_inum_hash,
					  bch2_vfs_inodes_by_inum_params);
		BUG_ON(ret);

		inode_fake_hash(&inode->v);

		inode_sb_list_add(&inode->v);

		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		return inode;
	}
}

#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})

static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}

static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
	struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
						      bch2_inode_cache, gfp);
	if (!inode)
		return NULL;

	inode_init_once(&inode->v);
	mutex_init(&inode->ei_update_lock);
	two_state_lock_init(&inode->ei_pagecache_lock);
	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
	inode->ei_flags = 0;
	mutex_init(&inode->ei_quota_lock);
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
		kmem_cache_free(bch2_inode_cache, inode);
		return NULL;
	}

	return inode;
}
/*
 * Allocate a new inode, dropping/retaking btree locks if necessary:
 */
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
	struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);

	if (unlikely(!inode)) {
		int ret = drop_locks_do(trans,
				(inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
		if (ret && inode) {
			__destroy_inode(&inode->v);
			kmem_cache_free(bch2_inode_cache, inode);
		}
		if (ret)
			return ERR_PTR(ret);
	}

	return inode;
}

static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
							  subvol_inum inum,
							  struct bch_inode_unpacked *bi,
							  struct bch_subvolume *subvol)
{
	struct bch_inode_info *inode = bch2_new_inode(trans);
	if (IS_ERR(inode))
		return inode;

	bch2_vfs_inode_init(trans, inum, inode, bi, subvol);

	return bch2_inode_hash_insert(trans->c, trans, inode);
}

struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
	if (inode)
		return &inode->v;

	struct btree_trans *trans = bch2_trans_get(c);

	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;
	int ret = lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
}
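/*
 * Usage sketch (added commentary, hypothetical caller): the NFS export and
 * get_parent paths call this with only a subvol_inum in hand, e.g.:
 *
 *	struct inode *vinode = bch2_vfs_inode_get(c, inum);
 *	if (IS_ERR(vinode))
 *		return ERR_CAST(vinode);
 *
 * The fast path is a pure hash table lookup; only on a miss do we pay for a
 * btree transaction to read and unpack the on-disk inode.
 */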
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = __bch2_new_inode(c, GFP_NOFS);
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
	kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), kuid),
				  from_kgid(i_user_ns(&dir->v), kgid),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 *
	 * also, calling bch2_inode_hash_insert() without passing in the
	 * transaction object is sketchy - if we could ever end up in
	 * __wait_on_freeing_inode(), we'd risk deadlock.
	 *
	 * But that shouldn't be possible, since we still have the inode locked
	 * that we just created, and we _really_ can't take a transaction
	 * restart here.
	 */
	inode = bch2_inode_hash_insert(c, NULL, inode);
	bch2_trans_put(trans);
err:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}

/* methods */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	subvol_inum inum = {};
	struct printbuf buf = PRINTBUF;

	struct qstr lookup_name;
	int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
	if (ret)
		return ERR_PTR(ret);

	struct btree_iter dirent_iter = {};
	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
					     dir_hash_info, dir, &lookup_name, 0);
	ret = bkey_err(k);
	if (ret)
		return ERR_PTR(ret);

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	ret = bch2_dirent_read_target(trans, dir, d, &inum);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
	if (inode)
		goto out;

	/*
	 * Note: if check/repair needs it, we commit before
	 * bch2_inode_hash_init_insert(), as after that point we can't take a
	 * restart - not in the top level loop with a commit_do(), like we
	 * usually do:
	 */

	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;
	ret =   bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));

	/*
	 * don't remove it: check_inodes might find another inode that points
	 * back to this dirent
	 */
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n%s",
				(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
	if (ret)
		goto err;
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	return inode;
err:
	inode = ERR_PTR(ret);
	goto out;
}

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
				  unsigned int flags)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);

	struct bch_inode_info *inode;
	bch2_trans_do(c,
		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
							  &hash, &dentry->d_name)));
	if (IS_ERR(inode))
		inode = NULL;

#ifdef CONFIG_UNICODE
	if (!inode && IS_CASEFOLDED(vdir)) {
		/*
		 * Do not cache a negative dentry in casefolded directories
		 * as it would need to be invalidated in the following situation:
		 * - Lookup file "blAH" in a casefolded directory
		 * - Creation of file "BLAH" in a casefolded directory
		 * - Lookup file "blAH" in a casefolded directory
		 * which would fail if we had a negative dentry.
		 *
		 * We should come back to this when VFS has a method to handle
		 * this edgecase.
		 */
		return NULL;
	}
#endif

	return d_splice_alias(&inode->v, dentry);
}

static int bch2_mknod(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct bch_inode_info *inode =
		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
			      (subvol_inum) { 0 }, 0);

	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_instantiate(dentry, &inode->v);
	return 0;
}

static int bch2_create(struct mnt_idmap *idmap,
		       struct inode *vdir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}

static int __bch2_link(struct bch_fs *c,
		       struct bch_inode_info *inode,
		       struct bch_inode_info *dir,
		       struct dentry *dentry)
{
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	mutex_lock(&inode->ei_update_lock);
	struct btree_trans *trans = bch2_trans_get(c);

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_link_trans(trans,
					inode_inum(dir),   &dir_u,
					inode_inum(inode), &inode_u,
					&dentry->d_name));

	if (likely(!ret)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
	}

	bch2_trans_put(trans);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}
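/*
 * Explanatory note (added commentary): commit_do() above is the usual
 * bcachefs idiom - it wraps "begin transaction, run the btree update, commit"
 * in a loop that retries from the top whenever the commit returns a
 * transaction_restart error, which is why bch2_link_trans() must be safe to
 * re-run. Functions that open-code the pattern (see __bch2_create() and
 * bch2_rename2()) do the same thing with an explicit retry: label.
 */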
static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
		     struct dentry *dentry)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		__bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		return bch2_err_class(ret);

	ihold(&inode->v);
	d_instantiate(dentry, &inode->v);
	return 0;
}

int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

	struct btree_trans *trans = bch2_trans_get(c);

	ret = commit_do(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
			bch2_unlink_trans(trans,
					  inode_inum(dir), &dir_u,
					  &inode_u, &dentry->d_name,
					  deleting_snapshot));
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(trans, dir, &dir_u,
				      ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
	bch2_inode_update_after_write(trans, inode, &inode_u,
				      ATTR_MTIME);

	if (inode_u.bi_subvol) {
		/*
		 * Subvolume deletion is asynchronous, but we still want to tell
		 * the VFS that it's been deleted here:
		 */
		set_nlink(&inode->v, 0);
	}

	if (IS_CASEFOLDED(vdir))
		d_invalidate(dentry);
err:
	bch2_trans_put(trans);
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

	return ret;
}

static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_fs *c = dir->v.i_sb->s_fs_info;

	int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		__bch2_unlink(vdir, dentry, false);
	return bch2_err_class(ret);
}

static int bch2_symlink(struct mnt_idmap *idmap,
			struct inode *vdir, struct dentry *dentry,
			const char *symname)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
	int ret;

	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	inode_lock(&inode->v);
	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
	inode_unlock(&inode->v);

	if (unlikely(ret))
		goto err;

	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
	if (unlikely(ret))
		goto err;

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	iput(&inode->v);
	return bch2_err_class(ret);
}

static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
				 struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
}
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	bool whiteout = !!(flags & RENAME_WHITEOUT);
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
		return -EINVAL;

	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	trans = bch2_trans_get(c);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
	if (ret)
		goto err_tx_restart;

	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}
retry:
	bch2_trans_begin(trans);

	ret = bch2_rename_trans(trans,
				inode_inum(src_dir), &src_dir_u,
				inode_inum(dst_dir), &dst_dir_u,
				&src_inode_u,
				&dst_inode_u,
				&src_dentry->d_name,
				&dst_dentry->d_name,
				mode);
	if (unlikely(ret))
		goto err_tx_restart;

	if (whiteout) {
		whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
		ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
		if (unlikely(ret))
			goto err_tx_restart;
		bch2_inode_init_early(c, whiteout_inode_u);

		ret   = bch2_create_trans(trans,
					  inode_inum(src_dir), &src_dir_u,
					  whiteout_inode_u,
					  &src_dentry->d_name,
					  from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
					  from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
					  S_IFCHR|WHITEOUT_MODE, 0,
					  NULL, NULL, (subvol_inum) { 0 }, 0) ?:
			bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
					KEY_TYPE_QUOTA_PREALLOC);
		if (unlikely(ret))
			goto err_tx_restart;
	}

	ret = bch2_trans_commit(trans, NULL, NULL, 0);
	if (unlikely(ret)) {
err_tx_restart:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err;
	}

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);

	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}
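/*
 * Explanatory note (added commentary): for RENAME_WHITEOUT (used by
 * overlayfs) the rename above leaves a whiteout behind at the source name -
 * a character device created with S_IFCHR|WHITEOUT_MODE in the same btree
 * transaction, so the rename and the whiteout commit atomically or not at
 * all.
 */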
static void bch2_setattr_copy(struct mnt_idmap *idmap,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;
	kuid_t kuid;
	kgid_t kgid;

	if (ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
	}
	if (ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
	}

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;

	if (ia_valid & ATTR_ATIME)
		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
	if (ia_valid & ATTR_MTIME)
		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
	if (ia_valid & ATTR_CTIME)
		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		kgid_t gid = ia_valid & ATTR_GID
			? kgid
			: inode->v.i_gid;

		if (!in_group_or_capable(idmap, &inode->v,
			make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
}

int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = {};
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
	}

	if (attr->ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
	}

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
static int bch2_getattr(struct mnt_idmap *idmap,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned query_flags)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);

	stat->dev	= inode->v.i_sb->s_dev;
	stat->ino	= inode->v.i_ino;
	stat->mode	= inode->v.i_mode;
	stat->nlink	= inode->v.i_nlink;
	stat->uid	= vfsuid_into_kuid(vfsuid);
	stat->gid	= vfsgid_into_kgid(vfsgid);
	stat->rdev	= inode->v.i_rdev;
	stat->size	= i_size_read(&inode->v);
	stat->atime	= inode_get_atime(&inode->v);
	stat->mtime	= inode_get_mtime(&inode->v);
	stat->ctime	= inode_get_ctime(&inode->v);
	stat->blksize	= block_bytes(c);
	stat->blocks	= inode->v.i_blocks;

	stat->subvol	= inode->ei_inum.subvol;
	stat->result_mask |= STATX_SUBVOL;

	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
		stat->result_mask |= STATX_DIOALIGN;
		/*
		 * this is incorrect; we should be tracking this in superblock,
		 * and checking the alignment of open devices
		 */
		stat->dio_mem_align = SECTOR_SIZE;
		stat->dio_offset_align = block_bytes(c);
	}

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
	}

	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask |= STATX_ATTR_IMMUTABLE;

	if (inode->ei_inode.bi_flags & BCH_INODE_append)
		stat->attributes |= STATX_ATTR_APPEND;
	stat->attributes_mask |= STATX_ATTR_APPEND;

	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask |= STATX_ATTR_NODUMP;

	return 0;
}
static int bch2_setattr(struct mnt_idmap *idmap,
			struct dentry *dentry, struct iattr *iattr)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		setattr_prepare(idmap, dentry, iattr);
	if (ret)
		return ret;

	return iattr->ia_valid & ATTR_SIZE
		? bchfs_truncate(idmap, inode, iattr)
		: bch2_setattr_nonsize(idmap, inode, iattr);
}

static int bch2_tmpfile(struct mnt_idmap *idmap,
			struct inode *vdir, struct file *file, umode_t mode)
{
	struct bch_inode_info *inode =
		__bch2_create(idmap, to_bch_ei(vdir),
			      file->f_path.dentry, mode, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_mark_tmpfile(file, &inode->v);
	d_instantiate(file->f_path.dentry, &inode->v);
	return finish_open_simple(file, 0);
}

struct bch_fiemap_extent {
	struct bkey_buf	kbuf;
	unsigned	flags;
};

static int bch2_fill_extent(struct bch_fs *c,
			    struct fiemap_extent_info *info,
			    struct bch_fiemap_extent *fe)
{
	struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
	unsigned flags = fe->flags;

	BUG_ON(!k.k->size);

	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		int ret;

		if (k.k->type == KEY_TYPE_reflink_v)
			flags |= FIEMAP_EXTENT_SHARED;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			int flags2 = 0;
			u64 offset = p.ptr.offset;

			if (p.ptr.unwritten)
				flags2 |= FIEMAP_EXTENT_UNWRITTEN;

			if (p.crc.compression_type)
				flags2 |= FIEMAP_EXTENT_ENCODED;
			else
				offset += p.crc.offset;

			if ((offset & (block_sectors(c) - 1)) ||
			    (k.k->size & (block_sectors(c) - 1)))
				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

			ret = fiemap_fill_next_extent(info,
						      bkey_start_offset(k.k) << 9,
						      offset << 9,
						      k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		BUG();
	}
}

/*
 * Scan a range of an inode for data in pagecache.
 *
 * Intended to be retryable, so don't modify the output params until success is
 * imminent.
 */
static int
bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
			   bool nonblock)
{
	loff_t dstart, dend;

	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
	if (dstart < 0)
		return dstart;

	if (dstart == *end) {
		*start = dstart;
		return 0;
	}

	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
	if (dend < 0)
		return dend;

	/* race */
	BUG_ON(dstart == dend);

	*start = dstart;
	*end = dend;
	return 0;
}
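/*
 * Worked example (added commentary, hypothetical numbers): with dirty
 * pagecache covering bytes [4096, 8192) inside a queried hole of
 * [0, 1048576), bch2_fiemap_hole_pagecache() narrows *start/*end to
 * 4096/8192 and the caller fakes a delalloc extent over that range. If no
 * data is found, *start is set to *end and the hole is reported as-is.
 */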
/*
 * Scan a range of pagecache that corresponds to a file mapping hole in the
 * extent btree. If data is found, fake up an extent key so it looks like a
 * delalloc extent to the rest of the fiemap processing code.
 */
static int
bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
				  u64 start, u64 end, struct bch_fiemap_extent *cur)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_extent *delextent;
	struct bch_extent_ptr ptr = {};
	loff_t dstart = start << 9, dend = end << 9;
	int ret;

	/*
	 * We hold btree locks here so we cannot block on folio locks without
	 * dropping trans locks first. Run a nonblocking scan for the common
	 * case of no folios over holes and fall back on failure.
	 *
	 * Note that dropping locks like this is technically racy against
	 * writeback inserting to the extent tree, but a non-sync fiemap scan is
	 * fundamentally racy with writeback anyways. Therefore, just report the
	 * range as delalloc regardless of whether we have to cycle trans locks.
	 */
	ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
	if (ret == -EAGAIN)
		ret = drop_locks_do(trans,
			bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
	if (ret < 0)
		return ret;

	/*
	 * Create a fake extent key in the buffer. We have to add a dummy extent
	 * pointer for the fill code to add an extent entry. It's explicitly
	 * zeroed to reflect delayed allocation (i.e. phys offset 0).
	 */
	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
	delextent = bkey_extent_init(cur->kbuf.k);
	delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
	delextent->k.size = (dend - dstart) >> 9;
	bch2_bkey_append_ptr(&delextent->k_i, ptr);

	cur->flags = FIEMAP_EXTENT_DELALLOC;

	return 0;
}
static int bch2_next_fiemap_extent(struct btree_trans *trans,
				   struct bch_inode_info *inode,
				   u64 start, u64 end,
				   struct bch_fiemap_extent *cur)
{
	u32 snapshot;
	int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
	if (ret)
		return ret;

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inode->ei_inum.inum, start, snapshot), 0);

	struct bkey_s_c k =
		bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
	ret = bkey_err(k);
	if (ret)
		goto err;

	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
	if (ret)
		goto err;

	struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);

	/*
	 * Does the pagecache or the btree take precedence?
	 *
	 * It _should_ be the pagecache, so that we correctly report delalloc
	 * extents when dirty in the pagecache (we're COW, after all).
	 *
	 * But we'd have to add per-sector writeback tracking to
	 * bch_folio_state, otherwise we report delalloc extents for clean
	 * cached data in the pagecache.
	 *
	 * We should do this, but even then fiemap won't report stable mappings:
	 * on bcachefs data moves around in the background (copygc, rebalance)
	 * and we don't provide a way for userspace to lock that out.
	 */
	if (k.k &&
	    bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
		    pagecache_start)) {
		bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
		bch2_cut_front(iter.pos, cur->kbuf.k);
		bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
		cur->flags = 0;
	} else if (k.k) {
		bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
	}

	if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
		unsigned sectors = cur->kbuf.k->k.size;
		s64 offset_into_extent = 0;
		enum btree_id data_btree = BTREE_ID_extents;
		ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
						&cur->kbuf);
		if (ret)
			goto err;

		struct bkey_i *k = cur->kbuf.k;
		sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);

		bch2_cut_front(POS(k->k.p.inode,
				   bkey_start_offset(&k->k) + offset_into_extent),
			       k);
		bch2_key_resize(&k->k, sectors);
		k->k.p = iter.pos;
		k->k.p.offset += k->k.size;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
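/*
 * Explanatory note (added commentary): KEY_TYPE_reflink_p extents are
 * pointers into the reflink btree, so the code above follows the indirection
 * with bch2_read_indirect_extent() and then re-clips the resulting key back
 * into this file's offset space - fiemap reports logical file extents, and
 * bch2_fill_extent() tags them FIEMAP_EXTENT_SHARED when the underlying key
 * is a reflink_v.
 */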
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct bch_fiemap_extent cur, prev;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, 0);
	if (ret)
		return ret;

	if (start + len < start)
		return -EINVAL;

	start >>= 9;
	u64 end = (start + len) >> 9;

	bch2_bkey_buf_init(&cur.kbuf);
	bch2_bkey_buf_init(&prev.kbuf);
	bkey_init(&prev.kbuf.k->k);

	trans = bch2_trans_get(c);

	while (start < end) {
		ret = lockrestart_do(trans,
			bch2_next_fiemap_extent(trans, ei, start, end, &cur));
		if (ret)
			goto err;

		BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
		BUG_ON(cur.kbuf.k->k.p.offset > end);

		if (bkey_start_offset(&cur.kbuf.k->k) == end)
			break;

		start = cur.kbuf.k->k.p.offset;

		if (!bkey_deleted(&prev.kbuf.k->k)) {
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info, &prev);
			if (ret)
				goto err;
		}

		bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
		prev.flags = cur.flags;
	}

	if (!bkey_deleted(&prev.kbuf.k->k)) {
		bch2_trans_unlock(trans);
		prev.flags |= FIEMAP_EXTENT_LAST;
		ret = bch2_fill_extent(c, info, &prev);
	}
err:
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur.kbuf, c);
	bch2_bkey_buf_exit(&prev.kbuf, c);

	return bch2_err_class(ret < 0 ? ret : 0);
}

static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= bch2_page_mkwrite,
};

static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);

	vma->vm_ops = &bch_vm_ops;
	return 0;
}

/* Directories: */

static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}

static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	if (!dir_emit_dots(file, ctx))
		return 0;

	int ret = bch2_readdir(c, inode_inum(inode), ctx);

	bch_err_fn(c, ret);
	return bch2_err_class(ret);
}

static int bch2_open(struct inode *vinode, struct file *file)
{
	if (file->f_flags & (O_WRONLY|O_RDWR)) {
		struct bch_inode_info *inode = to_bch_ei(vinode);
		struct bch_fs *c = inode->v.i_sb->s_fs_info;

		int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
		if (ret)
			return ret;
	}

	file->f_mode |= FMODE_CAN_ODIRECT;

	return generic_file_open(vinode, file);
}

/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
	[__BCH_INODE_sync]	= FS_SYNC_FL,
	[__BCH_INODE_immutable]	= FS_IMMUTABLE_FL,
	[__BCH_INODE_append]	= FS_APPEND_FL,
	[__BCH_INODE_nodump]	= FS_NODUMP_FL,
	[__BCH_INODE_noatime]	= FS_NOATIME_FL,
};

/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const __maybe_unused unsigned bch_flags_to_xflags[] = {
	[__BCH_INODE_sync]	= FS_XFLAG_SYNC,
	[__BCH_INODE_immutable]	= FS_XFLAG_IMMUTABLE,
	[__BCH_INODE_append]	= FS_XFLAG_APPEND,
	[__BCH_INODE_nodump]	= FS_XFLAG_NODUMP,
	[__BCH_INODE_noatime]	= FS_XFLAG_NOATIME,
};

static int bch2_fileattr_get(struct dentry *dentry,
			     struct fileattr *fa)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));

	if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;

	if (bch2_inode_casefold(c, &inode->ei_inode))
		fa->flags |= FS_CASEFOLD_FL;

	fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
	return 0;
}

struct flags_set {
	unsigned	mask;
	unsigned	flags;
	unsigned	projid;
	bool		set_project;
	bool		set_casefold;
	bool		casefold;
};
static int fssetxattr_inode_update_fn(struct btree_trans *trans,
				      struct bch_inode_info *inode,
				      struct bch_inode_unpacked *bi,
				      void *p)
{
	struct bch_fs *c = trans->c;
	struct flags_set *s = p;

	/*
	 * We're relying on btree locking here for exclusion with other ioctl
	 * calls - use the flags in the btree (@bi), not inode->i_flags:
	 */
	if (!S_ISREG(bi->bi_mode) &&
	    !S_ISDIR(bi->bi_mode) &&
	    (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
		return -EINVAL;

	if (s->casefold != bch2_inode_casefold(c, bi)) {
#ifdef CONFIG_UNICODE
		int ret = 0;
		/* Not supported on individual files. */
		if (!S_ISDIR(bi->bi_mode))
			return -EOPNOTSUPP;

		/*
		 * Make sure the dir is empty, as otherwise we'd need to
		 * rehash everything and update the dirent keys.
		 */
		ret = bch2_empty_dir_trans(trans, inode_inum(inode));
		if (ret < 0)
			return ret;

		ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
		if (ret)
			return ret;

		bch2_check_set_feature(c, BCH_FEATURE_casefolding);

		bi->bi_casefold = s->casefold + 1;
		bi->bi_fields_set |= BIT(Inode_opt_casefold);
#else
		printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
		return -EOPNOTSUPP;
#endif
	}

	if (s->set_project) {
		bi->bi_project = s->projid;
		bi->bi_fields_set |= BIT(Inode_opt_project);
	}

	bi->bi_flags &= ~s->mask;
	bi->bi_flags |= s->flags;

	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
	return 0;
}
static int bch2_fileattr_set(struct mnt_idmap *idmap,
			     struct dentry *dentry,
			     struct fileattr *fa)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct flags_set s = {};
	int ret;

	if (fa->fsx_valid) {
		fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;

		s.mask = map_defined(bch_flags_to_xflags);
		s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
		if (fa->fsx_xflags)
			return -EOPNOTSUPP;

		if (fa->fsx_projid >= U32_MAX)
			return -EINVAL;

		/*
		 * inode fields accessible via the xattr interface are stored with a +1
		 * bias, so that 0 means unset:
		 */
		if ((inode->ei_inode.bi_project ||
		     fa->fsx_projid) &&
		    inode->ei_inode.bi_project != fa->fsx_projid + 1) {
			s.projid = fa->fsx_projid + 1;
			s.set_project = true;
		}
	}

	if (fa->flags_valid) {
		s.mask = map_defined(bch_flags_to_uflags);

		s.set_casefold = true;
		s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
		fa->flags &= ~FS_CASEFOLD_FL;

		s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
		if (fa->flags)
			return -EOPNOTSUPP;
	}

	mutex_lock(&inode->ei_update_lock);
	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		(s.set_project
		 ? bch2_set_projid(c, inode, fa->fsx_projid)
		 : 0) ?:
		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
				 ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}

static const struct file_operations bch_file_operations = {
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl	= bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};

static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
	.fileattr_get	= bch2_fileattr_get,
	.fileattr_set	= bch2_fileattr_set,
};

static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
	.fileattr_get	= bch2_fileattr_get,
	.fileattr_set	= bch2_fileattr_set,
};

static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl	= bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
	.fileattr_get	= bch2_fileattr_get,
	.fileattr_set	= bch2_fileattr_set,
};

static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
	.fileattr_get	= bch2_fileattr_get,
	.fileattr_set	= bch2_fileattr_set,
};

static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_folio = generic_error_remove_folio,
};
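/*
 * Explanatory note (added commentary): NFS file handle layout. fh_len is
 * counted in u32 words, so a bare bcachefs_fid (u64 inum + u32 subvol +
 * u32 gen = 16 bytes) encodes to 4 words and the with-parent variant to 8;
 * bcachefs_fid_valid() below checks exactly those lengths.
 */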
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;

struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;

static int bcachefs_fid_valid(int fh_len, int fh_type)
{
	switch (fh_type) {
	case FILEID_BCACHEFS_WITHOUT_PARENT:
		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
	case FILEID_BCACHEFS_WITH_PARENT:
		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
	default:
		return false;
	}
}

static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
	return (struct bcachefs_fid) {
		.inum	= inode->ei_inum.inum,
		.subvol	= inode->ei_inum.subvol,
		.gen	= inode->ei_inode.bi_generation,
	};
}

static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
			  struct inode *vdir)
{
	struct bch_inode_info *inode = to_bch_ei(vinode);
	struct bch_inode_info *dir = to_bch_ei(vdir);
	int min_len;

	if (!S_ISDIR(inode->v.i_mode) && dir) {
		struct bcachefs_fid_with_parent *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}

		fid->fid = bch2_inode_to_fid(inode);
		fid->dir = bch2_inode_to_fid(dir);

		*len = min_len;
		return FILEID_BCACHEFS_WITH_PARENT;
	} else {
		struct bcachefs_fid *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}
		*fid = bch2_inode_to_fid(inode);

		*len = min_len;
		return FILEID_BCACHEFS_WITHOUT_PARENT;
	}
}

static struct inode *bch2_nfs_get_inode(struct super_block *sb,
					struct bcachefs_fid fid)
{
	struct bch_fs *c = sb->s_fs_info;
	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
				    .subvol = fid.subvol,
				    .inum = fid.inum,
	});
	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
		iput(vinode);
		vinode = ERR_PTR(-ESTALE);
	}
	return vinode;
}

static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
					int fh_len, int fh_type)
{
	struct bcachefs_fid *fid = (void *) _fid;

	if (!bcachefs_fid_valid(fh_len, fh_type))
		return NULL;

	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
}

static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
					int fh_len, int fh_type)
{
	struct bcachefs_fid_with_parent *fid = (void *) _fid;

	if (!bcachefs_fid_valid(fh_len, fh_type) ||
	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
		return NULL;

	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
}

static struct dentry *bch2_get_parent(struct dentry *child)
{
	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum parent_inum = {
		.subvol = inode->ei_inode.bi_parent_subvol ?:
			inode->ei_inum.subvol,
		.inum = inode->ei_inode.bi_dir,
	};

	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}
static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
{
	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter1;
	struct btree_iter iter2;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	struct bch_inode_unpacked inode_u;
	subvol_inum target;
	u32 snapshot;
	struct qstr dirent_name;
	unsigned name_len = 0;
	int ret;

	if (!S_ISDIR(dir->v.i_mode))
		return -EINVAL;

	trans = bch2_trans_get(c);

	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
	bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);

	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
		bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));

		k = bch2_btree_iter_peek_slot(trans, &iter1);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_dirent) {
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
			goto err;
		}

		d = bkey_s_c_to_dirent(k);
		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
		if (ret > 0)
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
		if (ret)
			goto err;

		if (subvol_inum_eq(target, inode->ei_inum))
			goto found;
	} else {
		/*
		 * File with multiple hardlinks and our backref is to the wrong
		 * directory - linear search:
		 */
		for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
			if (k.k->p.inode > dir->ei_inode.bi_inum)
				break;

			if (k.k->type != KEY_TYPE_dirent)
				continue;

			d = bkey_s_c_to_dirent(k);
			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
			if (ret < 0)
				break;
			if (ret)
				continue;

			if (subvol_inum_eq(target, inode->ei_inum))
				goto found;
		}
	}

	ret = -ENOENT;
	goto err;
found:
	dirent_name = bch2_dirent_get_name(d);

	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
	memcpy(name, dirent_name.name, name_len);
	name[name_len] = '\0';
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_iter_exit(trans, &iter1);
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_put(trans);

	return ret;
}

static const struct export_operations bch_export_ops = {
	.encode_fh	= bch2_encode_fh,
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	.get_parent	= bch2_get_parent,
	.get_name	= bch2_get_name,
};
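/*
 * Fill out a freshly allocated VFS inode from its unpacked bcachefs inode:
 * copy the stat fields, pick i_op/i_fop/a_ops by file type, flag inodes that
 * live in snapshot subvolumes, and pin the pagecache to folios of at least
 * the filesystem block size - which is what lets block sizes larger than
 * PAGE_SIZE work (hence FS_LBS in bcache_fs_type below).
 */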
static void bch2_vfs_inode_init(struct btree_trans *trans,
				subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	inode->v.i_ino		= inum.inum;
	inode->ei_inum		= inum;
	inode->ei_inode.bi_inum	= inum.inum;
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);

	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}

	mapping_set_folio_min_order(inode->v.i_mapping,
				    get_order(trans->c->opts.block_size));
}

static void bch2_free_inode(struct inode *vinode)
{
	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}

static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}

static int bch2_vfs_write_inode(struct inode *vinode,
				struct writeback_control *wbc)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}

static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);

	/*
	 * evict() has waited for outstanding writeback, we'll do no more IO
	 * through this inode: it's safe to remove from VFS inode hashtable here
	 *
	 * Do that now so that other threads aren't blocked from pulling it back
	 * in, there's no reason for them to be:
	 */
	if (!delete)
		bch2_inode_hash_remove(c, inode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (delete) {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));

		/*
		 * While the delete was in flight the inode had to stay in the
		 * vfs hash table, so that fsck could check whether unlinked
		 * inodes are still open; only remove it now that we're done:
		 */
		bch2_inode_hash_remove(c, inode);
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}
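/*
 * Evict every cached VFS inode belonging to one of the subvolumes in @s -
 * presumably the subvolume/snapshot deletion path, which can't proceed while
 * inodes are still cached. The loop makes repeated passes: grab matching
 * inodes (the list is walked under vfs_inodes_lock, hence GFP_ATOMIC for the
 * darray push), mark them I_DONTCACHE and prune their dentries, then - once a
 * pass finds nothing new - wait on __I_NEW for the stragglers to be freed.
 */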
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			struct wait_bit_queue_entry wqe;
			struct wait_queue_head *wq_head;

			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
			prepare_to_wait_event(wq_head, &wqe.wq_entry,
					      TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq_head, &wqe.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}

static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}

static int bch2_sync_fs(struct super_block *sb, int wait)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	trace_bch2_sync_fs(sb, wait);

	if (c->opts.journal_flush_disabled)
		return 0;

	if (!wait) {
		bch2_journal_flush_async(&c->journal, NULL);
		return 0;
	}

	ret = bch2_journal_flush(&c->journal);
	return bch2_err_class(ret);
}

static struct bch_fs *bch2_path_to_fs(const char *path)
{
	struct bch_fs *c;
	dev_t dev;
	int ret;

	ret = lookup_bdev(path, &dev);
	if (ret)
		return ERR_PTR(ret);

	c = bch2_dev_to_fs(dev);
	if (c)
		closure_put(&c->cl);
	return c ?: ERR_PTR(-ENOENT);
}

static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	bool first = true;

	for_each_online_member(c, ca) {
		if (!first)
			seq_putc(seq, ':');
		first = false;
		seq_puts(seq, ca->disk_sb.sb_name);
	}

	return 0;
}
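/*
 * ->show_devname above joins all member devices with ':' - the same
 * "dev1:dev2" form that bch2_split_devs() parses back at mount time - while
 * ->show_options below emits only options with a mount-style representation
 * (OPT_MOUNT, excluding OPT_HIDDEN).
 */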
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	struct printbuf buf = PRINTBUF;

	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
	printbuf_nul_terminate(&buf);
	seq_printf(seq, ",%s", buf.buf);

	int ret = buf.allocation_failure ? -ENOMEM : 0;
	printbuf_exit(&buf);
	return ret;
}

static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}

/*
 * bcachefs doesn't currently integrate intwrite freeze protection but the
 * internal write references serve the same purpose. Therefore reuse the
 * read-only transition code to perform the quiesce. The caveat is that we don't
 * currently have the ability to block tasks that want a write reference while
 * the superblock is frozen. This is fine for now, but we should either add
 * blocking support or find a way to integrate sb_start_intwrite() and friends.
 */
static int bch2_freeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
	return 0;
}

static int bch2_unfreeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	if (test_bit(BCH_FS_emergency_ro, &c->flags))
		return 0;

	down_write(&c->state_lock);
	ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);
	return ret;
}

static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.free_inode	= bch2_free_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.put_super	= bch2_put_super,
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
};

static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}

static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}

typedef DARRAY(struct bch_fs *) darray_fs;

static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	darray_fs *d = data;

	if (!c)
		return false;

	darray_for_each(*d, i)
		if (c != *i)
			return false;
	return true;
}
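/*
 * Mount entry point. The source string names every member device of a
 * multi-device filesystem, colon-separated, mirroring ->show_devname:
 *
 *	mount -t bcachefs /dev/sda:/dev/sdb /mnt
 *
 * We first sget() with bch2_test_super to find an already-mounted superblock
 * matching all of the named devices; only when none exists do we open the
 * filesystem and allocate a fresh superblock via bch2_set_super.
 */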
static int bch2_fs_get_tree(struct fs_context *fc)
{
	struct bch_fs *c;
	struct super_block *sb;
	struct inode *vinode;
	struct bch2_opts_parse *opts_parse = fc->fs_private;
	struct bch_opts opts = opts_parse->opts;
	darray_str devs;
	darray_fs devs_to_fs = {};
	int ret;

	opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
	opt_set(opts, nostart, true);

	if (!fc->source || strlen(fc->source) == 0)
		return -EINVAL;

	ret = bch2_split_devs(fc->source, &devs);
	if (ret)
		return ret;

	darray_for_each(devs, i) {
		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
		if (ret)
			goto err;
	}

	sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs.data, devs.nr, opts);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	if (opt_defined(opts, discard))
		set_bit(BCH_FS_discard_mount_opt_set, &c->flags);

	/* Some options can't be parsed until after the fs is started: */
	opts = bch2_opts_empty();
	ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
	if (ret)
		goto err_stop_fs;

	bch2_opts_apply(&c->opts, opts);

	/*
	 * need to initialise sb and set c->vfs_sb _before_ starting fs,
	 * for blk_holder_ops
	 */
	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
	ret = PTR_ERR_OR_ZERO(sb);
	if (ret)
		goto err_stop_fs;
got_sb:
	c = sb->s_fs_info;

	if (sb->s_root) {
		if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
	super_set_sysfs_name_uuid(sb);
	sb->s_shrink->seeks	= 0;
	c->vfs_sb		= sb;
	strscpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages	= VM_READAHEAD_PAGES;

	for_each_online_member(c, ca) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref[READ]);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	sb->s_shrink->seeks = 0;

	ret = bch2_fs_start(c);
	if (ret)
		goto err_put_super;

#ifdef CONFIG_UNICODE
	sb->s_encoding = c->cf_encoding;
#endif
	generic_set_sb_d_ops(sb);

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	ret = PTR_ERR_OR_ZERO(vinode);
	bch_err_msg(c, ret, "mounting: error getting root inode");
	if (ret)
		goto err_put_super;

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	fc->root = dget(sb->s_root);
err:
	darray_exit(&devs_to_fs);
	bch2_darray_str_exit(&devs);
	if (ret)
		pr_err("error: %s", bch2_err_str(ret));
	/*
	 * On an inconsistency error in recovery we might see an -EROFS derived
	 * errorcode (from the journal), but we don't want to return that to
	 * userspace as that causes util-linux to retry the mount RO - which is
	 * confusing:
	 */
	if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
		ret = -EIO;
	return bch2_err_class(ret);

err_stop_fs:
	bch2_fs_stop(c);
	goto err;

err_put_super:
	if (!sb->s_root)
		__bch2_fs_stop(c);
	deactivate_locked_super(sb);
	goto err;
}
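/*
 * Teardown is split across two hooks: ->put_super (above) only quiesces the
 * filesystem via __bch2_fs_stop(); the bch_fs itself stays allocated until
 * ->kill_sb, after generic_shutdown_super() has torn down the VFS state that
 * still referenced it.
 */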
static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}

static void bch2_fs_context_free(struct fs_context *fc)
{
	struct bch2_opts_parse *opts = fc->fs_private;

	if (opts) {
		printbuf_exit(&opts->parse_later);
		kfree(opts);
	}
}

static int bch2_fs_parse_param(struct fs_context *fc,
			       struct fs_parameter *param)
{
	/*
	 * the "source" param, i.e., the name of the device(s) to mount,
	 * is handled by the VFS layer.
	 */
	if (!strcmp(param->key, "source"))
		return -ENOPARAM;

	struct bch2_opts_parse *opts = fc->fs_private;
	struct bch_fs *c = NULL;

	/* for reconfigure, we already have a struct bch_fs */
	if (fc->root)
		c = fc->root->d_sb->s_fs_info;

	int ret = bch2_parse_one_mount_opt(c, &opts->opts,
					   &opts->parse_later, param->key,
					   param->string);
	if (ret)
		pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));

	return bch2_err_class(ret);
}

static int bch2_fs_reconfigure(struct fs_context *fc)
{
	struct super_block *sb = fc->root->d_sb;
	struct bch2_opts_parse *opts = fc->fs_private;
	struct bch_fs *c = sb->s_fs_info;
	int ret = 0;

	opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);

	if (opts->opts.read_only != c->opts.read_only) {
		down_write(&c->state_lock);

		if (opts->opts.read_only) {
			bch2_fs_read_only(c);

			sb->s_flags |= SB_RDONLY;
		} else {
			ret = bch2_fs_read_write(c);
			if (ret) {
				bch_err(c, "error going rw: %i", ret);
				up_write(&c->state_lock);
				ret = -EINVAL;
				goto err;
			}

			sb->s_flags &= ~SB_RDONLY;
		}

		c->opts.read_only = opts->opts.read_only;

		up_write(&c->state_lock);
	}

	if (opt_defined(opts->opts, errors))
		c->opts.errors = opts->opts.errors;
err:
	return bch2_err_class(ret);
}

static const struct fs_context_operations bch2_context_ops = {
	.free		= bch2_fs_context_free,
	.parse_param	= bch2_fs_parse_param,
	.get_tree	= bch2_fs_get_tree,
	.reconfigure	= bch2_fs_reconfigure,
};

static int bch2_init_fs_context(struct fs_context *fc)
{
	struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);

	if (!opts)
		return -ENOMEM;

	opts->parse_later = PRINTBUF;

	fc->ops = &bch2_context_ops;
	fc->fs_private = opts;

	return 0;
}

void bch2_fs_vfs_exit(struct bch_fs *c)
{
	if (c->vfs_inodes_by_inum_table.ht.tbl)
		rhltable_destroy(&c->vfs_inodes_by_inum_table);
	if (c->vfs_inodes_table.tbl)
		rhashtable_destroy(&c->vfs_inodes_table);
}

int bch2_fs_vfs_init(struct bch_fs *c)
{
	return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
		rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
}

static struct file_system_type bcache_fs_type = {
	.owner			= THIS_MODULE,
	.name			= "bcachefs",
	.init_fs_context	= bch2_init_fs_context,
	.kill_sb		= bch2_kill_sb,
	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
};
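/*
 * Module init/exit below leans on standard kernel behaviour:
 * register_filesystem() is the final step of init, so a visible filesystem
 * type always has its inode cache ready; and the shared error/exit path is
 * safe because kmem_cache_destroy(NULL) is a no-op and unregistering a
 * filesystem type that was never registered fails harmlessly.
 */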
MODULE_ALIAS_FS("bcachefs");

void bch2_vfs_exit(void)
{
	unregister_filesystem(&bcache_fs_type);
	kmem_cache_destroy(bch2_inode_cache);
}

int __init bch2_vfs_init(void)
{
	int ret = -ENOMEM;

	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
				      SLAB_ACCOUNT);
	if (!bch2_inode_cache)
		goto err;

	ret = register_filesystem(&bcache_fs_type);
	if (ret)
		goto err;

	return 0;
err:
	bch2_vfs_exit();
	return ret;
}

#endif /* NO_BCACHEFS_FS */