1 // SPDX-License-Identifier: GPL-2.0 2 #ifndef NO_BCACHEFS_FS 3 4 #include "bcachefs.h" 5 #include "acl.h" 6 #include "bkey_buf.h" 7 #include "btree_update.h" 8 #include "buckets.h" 9 #include "chardev.h" 10 #include "dirent.h" 11 #include "errcode.h" 12 #include "extents.h" 13 #include "fs.h" 14 #include "fs-common.h" 15 #include "fs-io.h" 16 #include "fs-ioctl.h" 17 #include "fs-io-buffered.h" 18 #include "fs-io-direct.h" 19 #include "fs-io-pagecache.h" 20 #include "fsck.h" 21 #include "inode.h" 22 #include "io_read.h" 23 #include "journal.h" 24 #include "keylist.h" 25 #include "quota.h" 26 #include "snapshot.h" 27 #include "super.h" 28 #include "xattr.h" 29 #include "trace.h" 30 31 #include <linux/aio.h> 32 #include <linux/backing-dev.h> 33 #include <linux/exportfs.h> 34 #include <linux/fiemap.h> 35 #include <linux/fs_context.h> 36 #include <linux/module.h> 37 #include <linux/pagemap.h> 38 #include <linux/posix_acl.h> 39 #include <linux/random.h> 40 #include <linux/seq_file.h> 41 #include <linux/statfs.h> 42 #include <linux/string.h> 43 #include <linux/xattr.h> 44 45 static struct kmem_cache *bch2_inode_cache; 46 47 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, 48 struct bch_inode_info *, 49 struct bch_inode_unpacked *, 50 struct bch_subvolume *); 51 52 void bch2_inode_update_after_write(struct btree_trans *trans, 53 struct bch_inode_info *inode, 54 struct bch_inode_unpacked *bi, 55 unsigned fields) 56 { 57 struct bch_fs *c = trans->c; 58 59 BUG_ON(bi->bi_inum != inode->v.i_ino); 60 61 bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); 62 63 set_nlink(&inode->v, bch2_inode_nlink_get(bi)); 64 i_uid_write(&inode->v, bi->bi_uid); 65 i_gid_write(&inode->v, bi->bi_gid); 66 inode->v.i_mode = bi->bi_mode; 67 68 if (fields & ATTR_ATIME) 69 inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); 70 if (fields & ATTR_MTIME) 71 inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); 72 if (fields & ATTR_CTIME) 73 inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); 74 75 inode->ei_inode = *bi; 76 77 bch2_inode_flags_to_vfs(inode); 78 } 79 80 int __must_check bch2_write_inode(struct bch_fs *c, 81 struct bch_inode_info *inode, 82 inode_set_fn set, 83 void *p, unsigned fields) 84 { 85 struct btree_trans *trans = bch2_trans_get(c); 86 struct btree_iter iter = { NULL }; 87 struct bch_inode_unpacked inode_u; 88 int ret; 89 retry: 90 bch2_trans_begin(trans); 91 92 ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), 93 BTREE_ITER_intent) ?: 94 (set ? set(trans, inode, &inode_u, p) : 0) ?: 95 bch2_inode_write(trans, &iter, &inode_u) ?: 96 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 97 98 /* 99 * the btree node lock protects inode->ei_inode, not ei_update_lock; 100 * this is important for inode updates via bchfs_write_index_update 101 */ 102 if (!ret) 103 bch2_inode_update_after_write(trans, inode, &inode_u, fields); 104 105 bch2_trans_iter_exit(trans, &iter); 106 107 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 108 goto retry; 109 110 bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, 111 "%s: inode %llu:%llu not found when updating", 112 bch2_err_str(ret), 113 inode_inum(inode).subvol, 114 inode_inum(inode).inum); 115 116 bch2_trans_put(trans); 117 return ret < 0 ? ret : 0; 118 } 119 120 int bch2_fs_quota_transfer(struct bch_fs *c, 121 struct bch_inode_info *inode, 122 struct bch_qid new_qid, 123 unsigned qtypes, 124 enum quota_acct_mode mode) 125 { 126 unsigned i; 127 int ret; 128 129 qtypes &= enabled_qtypes(c); 130 131 for (i = 0; i < QTYP_NR; i++) 132 if (new_qid.q[i] == inode->ei_qid.q[i]) 133 qtypes &= ~(1U << i); 134 135 if (!qtypes) 136 return 0; 137 138 mutex_lock(&inode->ei_quota_lock); 139 140 ret = bch2_quota_transfer(c, qtypes, new_qid, 141 inode->ei_qid, 142 inode->v.i_blocks + 143 inode->ei_quota_reserved, 144 mode); 145 if (!ret) 146 for (i = 0; i < QTYP_NR; i++) 147 if (qtypes & (1 << i)) 148 inode->ei_qid.q[i] = new_qid.q[i]; 149 150 mutex_unlock(&inode->ei_quota_lock); 151 152 return ret; 153 } 154 155 static bool subvol_inum_eq(subvol_inum a, subvol_inum b) 156 { 157 return a.subvol == b.subvol && a.inum == b.inum; 158 } 159 160 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, 161 const void *obj) 162 { 163 const struct bch_inode_info *inode = obj; 164 const subvol_inum *v = arg->key; 165 166 return !subvol_inum_eq(inode->ei_inum, *v); 167 } 168 169 static const struct rhashtable_params bch2_vfs_inodes_params = { 170 .head_offset = offsetof(struct bch_inode_info, hash), 171 .key_offset = offsetof(struct bch_inode_info, ei_inum), 172 .key_len = sizeof(subvol_inum), 173 .obj_cmpfn = bch2_vfs_inode_cmp_fn, 174 .automatic_shrinking = true, 175 }; 176 177 static void __wait_on_freeing_inode(struct inode *inode) 178 { 179 wait_queue_head_t *wq; 180 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 181 wq = bit_waitqueue(&inode->i_state, __I_NEW); 182 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 183 spin_unlock(&inode->i_lock); 184 schedule(); 185 finish_wait(wq, &wait.wq_entry); 186 } 187 188 struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) 189 { 190 return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); 191 } 192 193 static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, 194 subvol_inum inum) 195 { 196 struct bch_inode_info *inode; 197 repeat: 198 inode = __bch2_inode_hash_find(c, inum); 199 if (inode) { 200 spin_lock(&inode->v.i_lock); 201 if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { 202 spin_unlock(&inode->v.i_lock); 203 return NULL; 204 } 205 if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { 206 if (!trans) { 207 __wait_on_freeing_inode(&inode->v); 208 } else { 209 bch2_trans_unlock(trans); 210 __wait_on_freeing_inode(&inode->v); 211 int ret = bch2_trans_relock(trans); 212 if (ret) 213 return ERR_PTR(ret); 214 } 215 goto repeat; 216 } 217 __iget(&inode->v); 218 spin_unlock(&inode->v.i_lock); 219 } 220 221 return inode; 222 } 223 224 static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) 225 { 226 spin_lock(&inode->v.i_lock); 227 bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); 228 spin_unlock(&inode->v.i_lock); 229 230 if (remove) { 231 int ret = rhashtable_remove_fast(&c->vfs_inodes_table, 232 &inode->hash, bch2_vfs_inodes_params); 233 BUG_ON(ret); 234 inode->v.i_hash.pprev = NULL; 235 } 236 } 237 238 static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, 239 struct btree_trans *trans, 240 struct bch_inode_info *inode) 241 { 242 struct bch_inode_info *old = inode; 243 244 set_bit(EI_INODE_HASHED, &inode->ei_flags); 245 retry: 246 if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, 247 &inode->hash, 248 bch2_vfs_inodes_params))) { 249 old = bch2_inode_hash_find(c, trans, inode->ei_inum); 250 if (!old) 251 goto retry; 252 253 clear_bit(EI_INODE_HASHED, &inode->ei_flags); 254 255 /* 256 * bcachefs doesn't use I_NEW; we have no use for it since we 257 * only insert fully created inodes in the inode hash table. But 258 * discard_new_inode() expects it to be set... 259 */ 260 inode->v.i_state |= I_NEW; 261 /* 262 * We don't want bch2_evict_inode() to delete the inode on disk, 263 * we just raced and had another inode in cache. Normally new 264 * inodes don't have nlink == 0 - except tmpfiles do... 265 */ 266 set_nlink(&inode->v, 1); 267 discard_new_inode(&inode->v); 268 return old; 269 } else { 270 inode_fake_hash(&inode->v); 271 272 inode_sb_list_add(&inode->v); 273 274 mutex_lock(&c->vfs_inodes_lock); 275 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); 276 mutex_unlock(&c->vfs_inodes_lock); 277 return inode; 278 } 279 } 280 281 #define memalloc_flags_do(_flags, _do) \ 282 ({ \ 283 unsigned _saved_flags = memalloc_flags_save(_flags); \ 284 typeof(_do) _ret = _do; \ 285 memalloc_noreclaim_restore(_saved_flags); \ 286 _ret; \ 287 }) 288 289 static struct inode *bch2_alloc_inode(struct super_block *sb) 290 { 291 BUG(); 292 } 293 294 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) 295 { 296 struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, 297 bch2_inode_cache, GFP_NOFS); 298 if (!inode) 299 return NULL; 300 301 inode_init_once(&inode->v); 302 mutex_init(&inode->ei_update_lock); 303 two_state_lock_init(&inode->ei_pagecache_lock); 304 INIT_LIST_HEAD(&inode->ei_vfs_inode_list); 305 inode->ei_flags = 0; 306 mutex_init(&inode->ei_quota_lock); 307 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); 308 309 if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { 310 kmem_cache_free(bch2_inode_cache, inode); 311 return NULL; 312 } 313 314 return inode; 315 } 316 317 /* 318 * Allocate a new inode, dropping/retaking btree locks if necessary: 319 */ 320 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) 321 { 322 struct bch_inode_info *inode = 323 memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, 324 __bch2_new_inode(trans->c)); 325 326 if (unlikely(!inode)) { 327 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); 328 if (ret && inode) { 329 __destroy_inode(&inode->v); 330 kmem_cache_free(bch2_inode_cache, inode); 331 } 332 if (ret) 333 return ERR_PTR(ret); 334 } 335 336 return inode; 337 } 338 339 static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, 340 subvol_inum inum, 341 struct bch_inode_unpacked *bi, 342 struct bch_subvolume *subvol) 343 { 344 struct bch_inode_info *inode = bch2_new_inode(trans); 345 if (IS_ERR(inode)) 346 return inode; 347 348 bch2_vfs_inode_init(trans, inum, inode, bi, subvol); 349 350 return bch2_inode_hash_insert(trans->c, trans, inode); 351 352 } 353 354 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) 355 { 356 struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); 357 if (inode) 358 return &inode->v; 359 360 struct btree_trans *trans = bch2_trans_get(c); 361 362 struct bch_inode_unpacked inode_u; 363 struct bch_subvolume subvol; 364 int ret = lockrestart_do(trans, 365 bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: 366 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: 367 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 368 bch2_trans_put(trans); 369 370 return ret ? ERR_PTR(ret) : &inode->v; 371 } 372 373 struct bch_inode_info * 374 __bch2_create(struct mnt_idmap *idmap, 375 struct bch_inode_info *dir, struct dentry *dentry, 376 umode_t mode, dev_t rdev, subvol_inum snapshot_src, 377 unsigned flags) 378 { 379 struct bch_fs *c = dir->v.i_sb->s_fs_info; 380 struct btree_trans *trans; 381 struct bch_inode_unpacked dir_u; 382 struct bch_inode_info *inode; 383 struct bch_inode_unpacked inode_u; 384 struct posix_acl *default_acl = NULL, *acl = NULL; 385 subvol_inum inum; 386 struct bch_subvolume subvol; 387 u64 journal_seq = 0; 388 kuid_t kuid; 389 kgid_t kgid; 390 int ret; 391 392 /* 393 * preallocate acls + vfs inode before btree transaction, so that 394 * nothing can fail after the transaction succeeds: 395 */ 396 #ifdef CONFIG_BCACHEFS_POSIX_ACL 397 ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); 398 if (ret) 399 return ERR_PTR(ret); 400 #endif 401 inode = __bch2_new_inode(c); 402 if (unlikely(!inode)) { 403 inode = ERR_PTR(-ENOMEM); 404 goto err; 405 } 406 407 bch2_inode_init_early(c, &inode_u); 408 409 if (!(flags & BCH_CREATE_TMPFILE)) 410 mutex_lock(&dir->ei_update_lock); 411 412 trans = bch2_trans_get(c); 413 retry: 414 bch2_trans_begin(trans); 415 416 kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); 417 kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); 418 ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: 419 bch2_create_trans(trans, 420 inode_inum(dir), &dir_u, &inode_u, 421 !(flags & BCH_CREATE_TMPFILE) 422 ? &dentry->d_name : NULL, 423 from_kuid(i_user_ns(&dir->v), kuid), 424 from_kgid(i_user_ns(&dir->v), kgid), 425 mode, rdev, 426 default_acl, acl, snapshot_src, flags) ?: 427 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, 428 KEY_TYPE_QUOTA_PREALLOC); 429 if (unlikely(ret)) 430 goto err_before_quota; 431 432 inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; 433 inum.inum = inode_u.bi_inum; 434 435 ret = bch2_subvolume_get(trans, inum.subvol, true, 436 BTREE_ITER_with_updates, &subvol) ?: 437 bch2_trans_commit(trans, NULL, &journal_seq, 0); 438 if (unlikely(ret)) { 439 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, 440 KEY_TYPE_QUOTA_WARN); 441 err_before_quota: 442 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 443 goto retry; 444 goto err_trans; 445 } 446 447 if (!(flags & BCH_CREATE_TMPFILE)) { 448 bch2_inode_update_after_write(trans, dir, &dir_u, 449 ATTR_MTIME|ATTR_CTIME); 450 mutex_unlock(&dir->ei_update_lock); 451 } 452 453 bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); 454 455 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 456 set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); 457 458 /* 459 * we must insert the new inode into the inode cache before calling 460 * bch2_trans_exit() and dropping locks, else we could race with another 461 * thread pulling the inode in and modifying it: 462 * 463 * also, calling bch2_inode_hash_insert() without passing in the 464 * transaction object is sketchy - if we could ever end up in 465 * __wait_on_freeing_inode(), we'd risk deadlock. 466 * 467 * But that shouldn't be possible, since we still have the inode locked 468 * that we just created, and we _really_ can't take a transaction 469 * restart here. 470 */ 471 inode = bch2_inode_hash_insert(c, NULL, inode); 472 bch2_trans_put(trans); 473 err: 474 posix_acl_release(default_acl); 475 posix_acl_release(acl); 476 return inode; 477 err_trans: 478 if (!(flags & BCH_CREATE_TMPFILE)) 479 mutex_unlock(&dir->ei_update_lock); 480 481 bch2_trans_put(trans); 482 make_bad_inode(&inode->v); 483 iput(&inode->v); 484 inode = ERR_PTR(ret); 485 goto err; 486 } 487 488 /* methods */ 489 490 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, 491 subvol_inum dir, struct bch_hash_info *dir_hash_info, 492 const struct qstr *name) 493 { 494 struct bch_fs *c = trans->c; 495 struct btree_iter dirent_iter = {}; 496 subvol_inum inum = {}; 497 struct printbuf buf = PRINTBUF; 498 499 struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, 500 dir_hash_info, dir, name, 0); 501 int ret = bkey_err(k); 502 if (ret) 503 return ERR_PTR(ret); 504 505 ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); 506 if (ret > 0) 507 ret = -ENOENT; 508 if (ret) 509 goto err; 510 511 struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); 512 if (inode) 513 goto out; 514 515 struct bch_subvolume subvol; 516 struct bch_inode_unpacked inode_u; 517 ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: 518 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: 519 PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); 520 521 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), 522 c, "dirent to missing inode:\n %s", 523 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 524 if (ret) 525 goto err; 526 527 /* regular files may have hardlinks: */ 528 if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && 529 !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), 530 c, 531 "dirent points to inode that does not point back:\n %s", 532 (bch2_bkey_val_to_text(&buf, c, k), 533 prt_printf(&buf, "\n "), 534 bch2_inode_unpacked_to_text(&buf, &inode_u), 535 buf.buf))) { 536 ret = -ENOENT; 537 goto err; 538 } 539 out: 540 bch2_trans_iter_exit(trans, &dirent_iter); 541 printbuf_exit(&buf); 542 return inode; 543 err: 544 inode = ERR_PTR(ret); 545 goto out; 546 } 547 548 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, 549 unsigned int flags) 550 { 551 struct bch_fs *c = vdir->i_sb->s_fs_info; 552 struct bch_inode_info *dir = to_bch_ei(vdir); 553 struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); 554 555 struct bch_inode_info *inode; 556 bch2_trans_do(c, NULL, NULL, 0, 557 PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), 558 &hash, &dentry->d_name))); 559 if (IS_ERR(inode)) 560 inode = NULL; 561 562 return d_splice_alias(&inode->v, dentry); 563 } 564 565 static int bch2_mknod(struct mnt_idmap *idmap, 566 struct inode *vdir, struct dentry *dentry, 567 umode_t mode, dev_t rdev) 568 { 569 struct bch_inode_info *inode = 570 __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 571 (subvol_inum) { 0 }, 0); 572 573 if (IS_ERR(inode)) 574 return bch2_err_class(PTR_ERR(inode)); 575 576 d_instantiate(dentry, &inode->v); 577 return 0; 578 } 579 580 static int bch2_create(struct mnt_idmap *idmap, 581 struct inode *vdir, struct dentry *dentry, 582 umode_t mode, bool excl) 583 { 584 return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); 585 } 586 587 static int __bch2_link(struct bch_fs *c, 588 struct bch_inode_info *inode, 589 struct bch_inode_info *dir, 590 struct dentry *dentry) 591 { 592 struct bch_inode_unpacked dir_u, inode_u; 593 int ret; 594 595 mutex_lock(&inode->ei_update_lock); 596 struct btree_trans *trans = bch2_trans_get(c); 597 598 ret = commit_do(trans, NULL, NULL, 0, 599 bch2_link_trans(trans, 600 inode_inum(dir), &dir_u, 601 inode_inum(inode), &inode_u, 602 &dentry->d_name)); 603 604 if (likely(!ret)) { 605 bch2_inode_update_after_write(trans, dir, &dir_u, 606 ATTR_MTIME|ATTR_CTIME); 607 bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); 608 } 609 610 bch2_trans_put(trans); 611 mutex_unlock(&inode->ei_update_lock); 612 return ret; 613 } 614 615 static int bch2_link(struct dentry *old_dentry, struct inode *vdir, 616 struct dentry *dentry) 617 { 618 struct bch_fs *c = vdir->i_sb->s_fs_info; 619 struct bch_inode_info *dir = to_bch_ei(vdir); 620 struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); 621 int ret; 622 623 lockdep_assert_held(&inode->v.i_rwsem); 624 625 ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 626 bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 627 __bch2_link(c, inode, dir, dentry); 628 if (unlikely(ret)) 629 return bch2_err_class(ret); 630 631 ihold(&inode->v); 632 d_instantiate(dentry, &inode->v); 633 return 0; 634 } 635 636 int __bch2_unlink(struct inode *vdir, struct dentry *dentry, 637 bool deleting_snapshot) 638 { 639 struct bch_fs *c = vdir->i_sb->s_fs_info; 640 struct bch_inode_info *dir = to_bch_ei(vdir); 641 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 642 struct bch_inode_unpacked dir_u, inode_u; 643 int ret; 644 645 bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); 646 647 struct btree_trans *trans = bch2_trans_get(c); 648 649 ret = commit_do(trans, NULL, NULL, 650 BCH_TRANS_COMMIT_no_enospc, 651 bch2_unlink_trans(trans, 652 inode_inum(dir), &dir_u, 653 &inode_u, &dentry->d_name, 654 deleting_snapshot)); 655 if (unlikely(ret)) 656 goto err; 657 658 bch2_inode_update_after_write(trans, dir, &dir_u, 659 ATTR_MTIME|ATTR_CTIME); 660 bch2_inode_update_after_write(trans, inode, &inode_u, 661 ATTR_MTIME); 662 663 if (inode_u.bi_subvol) { 664 /* 665 * Subvolume deletion is asynchronous, but we still want to tell 666 * the VFS that it's been deleted here: 667 */ 668 set_nlink(&inode->v, 0); 669 } 670 err: 671 bch2_trans_put(trans); 672 bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); 673 674 return ret; 675 } 676 677 static int bch2_unlink(struct inode *vdir, struct dentry *dentry) 678 { 679 struct bch_inode_info *dir= to_bch_ei(vdir); 680 struct bch_fs *c = dir->v.i_sb->s_fs_info; 681 682 int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: 683 __bch2_unlink(vdir, dentry, false); 684 return bch2_err_class(ret); 685 } 686 687 static int bch2_symlink(struct mnt_idmap *idmap, 688 struct inode *vdir, struct dentry *dentry, 689 const char *symname) 690 { 691 struct bch_fs *c = vdir->i_sb->s_fs_info; 692 struct bch_inode_info *dir = to_bch_ei(vdir), *inode; 693 int ret; 694 695 inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, 696 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 697 if (IS_ERR(inode)) 698 return bch2_err_class(PTR_ERR(inode)); 699 700 inode_lock(&inode->v); 701 ret = page_symlink(&inode->v, symname, strlen(symname) + 1); 702 inode_unlock(&inode->v); 703 704 if (unlikely(ret)) 705 goto err; 706 707 ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); 708 if (unlikely(ret)) 709 goto err; 710 711 ret = __bch2_link(c, inode, dir, dentry); 712 if (unlikely(ret)) 713 goto err; 714 715 d_instantiate(dentry, &inode->v); 716 return 0; 717 err: 718 iput(&inode->v); 719 return bch2_err_class(ret); 720 } 721 722 static int bch2_mkdir(struct mnt_idmap *idmap, 723 struct inode *vdir, struct dentry *dentry, umode_t mode) 724 { 725 return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); 726 } 727 728 static int bch2_rename2(struct mnt_idmap *idmap, 729 struct inode *src_vdir, struct dentry *src_dentry, 730 struct inode *dst_vdir, struct dentry *dst_dentry, 731 unsigned flags) 732 { 733 struct bch_fs *c = src_vdir->i_sb->s_fs_info; 734 struct bch_inode_info *src_dir = to_bch_ei(src_vdir); 735 struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); 736 struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); 737 struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); 738 struct bch_inode_unpacked dst_dir_u, src_dir_u; 739 struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; 740 struct btree_trans *trans; 741 enum bch_rename_mode mode = flags & RENAME_EXCHANGE 742 ? BCH_RENAME_EXCHANGE 743 : dst_dentry->d_inode 744 ? BCH_RENAME_OVERWRITE : BCH_RENAME; 745 bool whiteout = !!(flags & RENAME_WHITEOUT); 746 int ret; 747 748 if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) 749 return -EINVAL; 750 751 if (mode == BCH_RENAME_OVERWRITE) { 752 ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 753 0, LLONG_MAX); 754 if (ret) 755 return ret; 756 } 757 758 bch2_lock_inodes(INODE_UPDATE_LOCK, 759 src_dir, 760 dst_dir, 761 src_inode, 762 dst_inode); 763 764 trans = bch2_trans_get(c); 765 766 ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: 767 bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); 768 if (ret) 769 goto err; 770 771 if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { 772 ret = bch2_fs_quota_transfer(c, src_inode, 773 dst_dir->ei_qid, 774 1 << QTYP_PRJ, 775 KEY_TYPE_QUOTA_PREALLOC); 776 if (ret) 777 goto err; 778 } 779 780 if (mode == BCH_RENAME_EXCHANGE && 781 inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { 782 ret = bch2_fs_quota_transfer(c, dst_inode, 783 src_dir->ei_qid, 784 1 << QTYP_PRJ, 785 KEY_TYPE_QUOTA_PREALLOC); 786 if (ret) 787 goto err; 788 } 789 retry: 790 bch2_trans_begin(trans); 791 792 ret = bch2_rename_trans(trans, 793 inode_inum(src_dir), &src_dir_u, 794 inode_inum(dst_dir), &dst_dir_u, 795 &src_inode_u, 796 &dst_inode_u, 797 &src_dentry->d_name, 798 &dst_dentry->d_name, 799 mode); 800 if (unlikely(ret)) 801 goto err_tx_restart; 802 803 if (whiteout) { 804 whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); 805 ret = PTR_ERR_OR_ZERO(whiteout_inode_u); 806 if (unlikely(ret)) 807 goto err_tx_restart; 808 bch2_inode_init_early(c, whiteout_inode_u); 809 810 ret = bch2_create_trans(trans, 811 inode_inum(src_dir), &src_dir_u, 812 whiteout_inode_u, 813 &src_dentry->d_name, 814 from_kuid(i_user_ns(&src_dir->v), current_fsuid()), 815 from_kgid(i_user_ns(&src_dir->v), current_fsgid()), 816 S_IFCHR|WHITEOUT_MODE, 0, 817 NULL, NULL, (subvol_inum) { 0 }, 0) ?: 818 bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, 819 KEY_TYPE_QUOTA_PREALLOC); 820 if (unlikely(ret)) 821 goto err_tx_restart; 822 } 823 824 ret = bch2_trans_commit(trans, NULL, NULL, 0); 825 if (unlikely(ret)) { 826 err_tx_restart: 827 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 828 goto retry; 829 goto err; 830 } 831 832 BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); 833 BUG_ON(dst_inode && 834 dst_inode->v.i_ino != dst_inode_u.bi_inum); 835 836 bch2_inode_update_after_write(trans, src_dir, &src_dir_u, 837 ATTR_MTIME|ATTR_CTIME); 838 839 if (src_dir != dst_dir) 840 bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, 841 ATTR_MTIME|ATTR_CTIME); 842 843 bch2_inode_update_after_write(trans, src_inode, &src_inode_u, 844 ATTR_CTIME); 845 846 if (dst_inode) 847 bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, 848 ATTR_CTIME); 849 err: 850 bch2_trans_put(trans); 851 852 bch2_fs_quota_transfer(c, src_inode, 853 bch_qid(&src_inode->ei_inode), 854 1 << QTYP_PRJ, 855 KEY_TYPE_QUOTA_NOCHECK); 856 if (dst_inode) 857 bch2_fs_quota_transfer(c, dst_inode, 858 bch_qid(&dst_inode->ei_inode), 859 1 << QTYP_PRJ, 860 KEY_TYPE_QUOTA_NOCHECK); 861 862 bch2_unlock_inodes(INODE_UPDATE_LOCK, 863 src_dir, 864 dst_dir, 865 src_inode, 866 dst_inode); 867 868 return bch2_err_class(ret); 869 } 870 871 static void bch2_setattr_copy(struct mnt_idmap *idmap, 872 struct bch_inode_info *inode, 873 struct bch_inode_unpacked *bi, 874 struct iattr *attr) 875 { 876 struct bch_fs *c = inode->v.i_sb->s_fs_info; 877 unsigned int ia_valid = attr->ia_valid; 878 kuid_t kuid; 879 kgid_t kgid; 880 881 if (ia_valid & ATTR_UID) { 882 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 883 bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); 884 } 885 if (ia_valid & ATTR_GID) { 886 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 887 bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); 888 } 889 890 if (ia_valid & ATTR_SIZE) 891 bi->bi_size = attr->ia_size; 892 893 if (ia_valid & ATTR_ATIME) 894 bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); 895 if (ia_valid & ATTR_MTIME) 896 bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); 897 if (ia_valid & ATTR_CTIME) 898 bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); 899 900 if (ia_valid & ATTR_MODE) { 901 umode_t mode = attr->ia_mode; 902 kgid_t gid = ia_valid & ATTR_GID 903 ? kgid 904 : inode->v.i_gid; 905 906 if (!in_group_or_capable(idmap, &inode->v, 907 make_vfsgid(idmap, i_user_ns(&inode->v), gid))) 908 mode &= ~S_ISGID; 909 bi->bi_mode = mode; 910 } 911 } 912 913 int bch2_setattr_nonsize(struct mnt_idmap *idmap, 914 struct bch_inode_info *inode, 915 struct iattr *attr) 916 { 917 struct bch_fs *c = inode->v.i_sb->s_fs_info; 918 struct bch_qid qid; 919 struct btree_trans *trans; 920 struct btree_iter inode_iter = { NULL }; 921 struct bch_inode_unpacked inode_u; 922 struct posix_acl *acl = NULL; 923 kuid_t kuid; 924 kgid_t kgid; 925 int ret; 926 927 mutex_lock(&inode->ei_update_lock); 928 929 qid = inode->ei_qid; 930 931 if (attr->ia_valid & ATTR_UID) { 932 kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); 933 qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); 934 } 935 936 if (attr->ia_valid & ATTR_GID) { 937 kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); 938 qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); 939 } 940 941 ret = bch2_fs_quota_transfer(c, inode, qid, ~0, 942 KEY_TYPE_QUOTA_PREALLOC); 943 if (ret) 944 goto err; 945 946 trans = bch2_trans_get(c); 947 retry: 948 bch2_trans_begin(trans); 949 kfree(acl); 950 acl = NULL; 951 952 ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), 953 BTREE_ITER_intent); 954 if (ret) 955 goto btree_err; 956 957 bch2_setattr_copy(idmap, inode, &inode_u, attr); 958 959 if (attr->ia_valid & ATTR_MODE) { 960 ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, 961 inode_u.bi_mode, &acl); 962 if (ret) 963 goto btree_err; 964 } 965 966 ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: 967 bch2_trans_commit(trans, NULL, NULL, 968 BCH_TRANS_COMMIT_no_enospc); 969 btree_err: 970 bch2_trans_iter_exit(trans, &inode_iter); 971 972 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 973 goto retry; 974 if (unlikely(ret)) 975 goto err_trans; 976 977 bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); 978 979 if (acl) 980 set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); 981 err_trans: 982 bch2_trans_put(trans); 983 err: 984 mutex_unlock(&inode->ei_update_lock); 985 986 return bch2_err_class(ret); 987 } 988 989 static int bch2_getattr(struct mnt_idmap *idmap, 990 const struct path *path, struct kstat *stat, 991 u32 request_mask, unsigned query_flags) 992 { 993 struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); 994 struct bch_fs *c = inode->v.i_sb->s_fs_info; 995 vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); 996 vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); 997 998 stat->dev = inode->v.i_sb->s_dev; 999 stat->ino = inode->v.i_ino; 1000 stat->mode = inode->v.i_mode; 1001 stat->nlink = inode->v.i_nlink; 1002 stat->uid = vfsuid_into_kuid(vfsuid); 1003 stat->gid = vfsgid_into_kgid(vfsgid); 1004 stat->rdev = inode->v.i_rdev; 1005 stat->size = i_size_read(&inode->v); 1006 stat->atime = inode_get_atime(&inode->v); 1007 stat->mtime = inode_get_mtime(&inode->v); 1008 stat->ctime = inode_get_ctime(&inode->v); 1009 stat->blksize = block_bytes(c); 1010 stat->blocks = inode->v.i_blocks; 1011 1012 stat->subvol = inode->ei_inum.subvol; 1013 stat->result_mask |= STATX_SUBVOL; 1014 1015 if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { 1016 stat->result_mask |= STATX_DIOALIGN; 1017 /* 1018 * this is incorrect; we should be tracking this in superblock, 1019 * and checking the alignment of open devices 1020 */ 1021 stat->dio_mem_align = SECTOR_SIZE; 1022 stat->dio_offset_align = block_bytes(c); 1023 } 1024 1025 if (request_mask & STATX_BTIME) { 1026 stat->result_mask |= STATX_BTIME; 1027 stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); 1028 } 1029 1030 if (inode->ei_inode.bi_flags & BCH_INODE_immutable) 1031 stat->attributes |= STATX_ATTR_IMMUTABLE; 1032 stat->attributes_mask |= STATX_ATTR_IMMUTABLE; 1033 1034 if (inode->ei_inode.bi_flags & BCH_INODE_append) 1035 stat->attributes |= STATX_ATTR_APPEND; 1036 stat->attributes_mask |= STATX_ATTR_APPEND; 1037 1038 if (inode->ei_inode.bi_flags & BCH_INODE_nodump) 1039 stat->attributes |= STATX_ATTR_NODUMP; 1040 stat->attributes_mask |= STATX_ATTR_NODUMP; 1041 1042 return 0; 1043 } 1044 1045 static int bch2_setattr(struct mnt_idmap *idmap, 1046 struct dentry *dentry, struct iattr *iattr) 1047 { 1048 struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); 1049 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1050 int ret; 1051 1052 lockdep_assert_held(&inode->v.i_rwsem); 1053 1054 ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: 1055 setattr_prepare(idmap, dentry, iattr); 1056 if (ret) 1057 return ret; 1058 1059 return iattr->ia_valid & ATTR_SIZE 1060 ? bchfs_truncate(idmap, inode, iattr) 1061 : bch2_setattr_nonsize(idmap, inode, iattr); 1062 } 1063 1064 static int bch2_tmpfile(struct mnt_idmap *idmap, 1065 struct inode *vdir, struct file *file, umode_t mode) 1066 { 1067 struct bch_inode_info *inode = 1068 __bch2_create(idmap, to_bch_ei(vdir), 1069 file->f_path.dentry, mode, 0, 1070 (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); 1071 1072 if (IS_ERR(inode)) 1073 return bch2_err_class(PTR_ERR(inode)); 1074 1075 d_mark_tmpfile(file, &inode->v); 1076 d_instantiate(file->f_path.dentry, &inode->v); 1077 return finish_open_simple(file, 0); 1078 } 1079 1080 static int bch2_fill_extent(struct bch_fs *c, 1081 struct fiemap_extent_info *info, 1082 struct bkey_s_c k, unsigned flags) 1083 { 1084 if (bkey_extent_is_direct_data(k.k)) { 1085 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1086 const union bch_extent_entry *entry; 1087 struct extent_ptr_decoded p; 1088 int ret; 1089 1090 if (k.k->type == KEY_TYPE_reflink_v) 1091 flags |= FIEMAP_EXTENT_SHARED; 1092 1093 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 1094 int flags2 = 0; 1095 u64 offset = p.ptr.offset; 1096 1097 if (p.ptr.unwritten) 1098 flags2 |= FIEMAP_EXTENT_UNWRITTEN; 1099 1100 if (p.crc.compression_type) 1101 flags2 |= FIEMAP_EXTENT_ENCODED; 1102 else 1103 offset += p.crc.offset; 1104 1105 if ((offset & (block_sectors(c) - 1)) || 1106 (k.k->size & (block_sectors(c) - 1))) 1107 flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; 1108 1109 ret = fiemap_fill_next_extent(info, 1110 bkey_start_offset(k.k) << 9, 1111 offset << 9, 1112 k.k->size << 9, flags|flags2); 1113 if (ret) 1114 return ret; 1115 } 1116 1117 return 0; 1118 } else if (bkey_extent_is_inline_data(k.k)) { 1119 return fiemap_fill_next_extent(info, 1120 bkey_start_offset(k.k) << 9, 1121 0, k.k->size << 9, 1122 flags| 1123 FIEMAP_EXTENT_DATA_INLINE); 1124 } else if (k.k->type == KEY_TYPE_reservation) { 1125 return fiemap_fill_next_extent(info, 1126 bkey_start_offset(k.k) << 9, 1127 0, k.k->size << 9, 1128 flags| 1129 FIEMAP_EXTENT_DELALLOC| 1130 FIEMAP_EXTENT_UNWRITTEN); 1131 } else { 1132 BUG(); 1133 } 1134 } 1135 1136 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, 1137 u64 start, u64 len) 1138 { 1139 struct bch_fs *c = vinode->i_sb->s_fs_info; 1140 struct bch_inode_info *ei = to_bch_ei(vinode); 1141 struct btree_trans *trans; 1142 struct btree_iter iter; 1143 struct bkey_s_c k; 1144 struct bkey_buf cur, prev; 1145 unsigned offset_into_extent, sectors; 1146 bool have_extent = false; 1147 int ret = 0; 1148 1149 ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); 1150 if (ret) 1151 return ret; 1152 1153 struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); 1154 if (start + len < start) 1155 return -EINVAL; 1156 1157 start >>= 9; 1158 1159 bch2_bkey_buf_init(&cur); 1160 bch2_bkey_buf_init(&prev); 1161 trans = bch2_trans_get(c); 1162 1163 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1164 POS(ei->v.i_ino, start), 0); 1165 1166 while (true) { 1167 enum btree_id data_btree = BTREE_ID_extents; 1168 1169 bch2_trans_begin(trans); 1170 1171 u32 snapshot; 1172 ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); 1173 if (ret) 1174 goto err; 1175 1176 bch2_btree_iter_set_snapshot(&iter, snapshot); 1177 1178 k = bch2_btree_iter_peek_upto(&iter, end); 1179 ret = bkey_err(k); 1180 if (ret) 1181 goto err; 1182 1183 if (!k.k) 1184 break; 1185 1186 if (!bkey_extent_is_data(k.k) && 1187 k.k->type != KEY_TYPE_reservation) { 1188 bch2_btree_iter_advance(&iter); 1189 continue; 1190 } 1191 1192 offset_into_extent = iter.pos.offset - 1193 bkey_start_offset(k.k); 1194 sectors = k.k->size - offset_into_extent; 1195 1196 bch2_bkey_buf_reassemble(&cur, c, k); 1197 1198 ret = bch2_read_indirect_extent(trans, &data_btree, 1199 &offset_into_extent, &cur); 1200 if (ret) 1201 break; 1202 1203 k = bkey_i_to_s_c(cur.k); 1204 bch2_bkey_buf_realloc(&prev, c, k.k->u64s); 1205 1206 sectors = min(sectors, k.k->size - offset_into_extent); 1207 1208 bch2_cut_front(POS(k.k->p.inode, 1209 bkey_start_offset(k.k) + 1210 offset_into_extent), 1211 cur.k); 1212 bch2_key_resize(&cur.k->k, sectors); 1213 cur.k->k.p = iter.pos; 1214 cur.k->k.p.offset += cur.k->k.size; 1215 1216 if (have_extent) { 1217 bch2_trans_unlock(trans); 1218 ret = bch2_fill_extent(c, info, 1219 bkey_i_to_s_c(prev.k), 0); 1220 if (ret) 1221 break; 1222 } 1223 1224 bkey_copy(prev.k, cur.k); 1225 have_extent = true; 1226 1227 bch2_btree_iter_set_pos(&iter, 1228 POS(iter.pos.inode, iter.pos.offset + sectors)); 1229 err: 1230 if (ret && 1231 !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1232 break; 1233 } 1234 bch2_trans_iter_exit(trans, &iter); 1235 1236 if (!ret && have_extent) { 1237 bch2_trans_unlock(trans); 1238 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 1239 FIEMAP_EXTENT_LAST); 1240 } 1241 1242 bch2_trans_put(trans); 1243 bch2_bkey_buf_exit(&cur, c); 1244 bch2_bkey_buf_exit(&prev, c); 1245 return ret < 0 ? ret : 0; 1246 } 1247 1248 static const struct vm_operations_struct bch_vm_ops = { 1249 .fault = bch2_page_fault, 1250 .map_pages = filemap_map_pages, 1251 .page_mkwrite = bch2_page_mkwrite, 1252 }; 1253 1254 static int bch2_mmap(struct file *file, struct vm_area_struct *vma) 1255 { 1256 file_accessed(file); 1257 1258 vma->vm_ops = &bch_vm_ops; 1259 return 0; 1260 } 1261 1262 /* Directories: */ 1263 1264 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) 1265 { 1266 return generic_file_llseek_size(file, offset, whence, 1267 S64_MAX, S64_MAX); 1268 } 1269 1270 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) 1271 { 1272 struct bch_inode_info *inode = file_bch_inode(file); 1273 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1274 1275 if (!dir_emit_dots(file, ctx)) 1276 return 0; 1277 1278 int ret = bch2_readdir(c, inode_inum(inode), ctx); 1279 1280 bch_err_fn(c, ret); 1281 return bch2_err_class(ret); 1282 } 1283 1284 static int bch2_open(struct inode *vinode, struct file *file) 1285 { 1286 if (file->f_flags & (O_WRONLY|O_RDWR)) { 1287 struct bch_inode_info *inode = to_bch_ei(vinode); 1288 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1289 1290 int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); 1291 if (ret) 1292 return ret; 1293 } 1294 1295 file->f_mode |= FMODE_CAN_ODIRECT; 1296 1297 return generic_file_open(vinode, file); 1298 } 1299 1300 static const struct file_operations bch_file_operations = { 1301 .open = bch2_open, 1302 .llseek = bch2_llseek, 1303 .read_iter = bch2_read_iter, 1304 .write_iter = bch2_write_iter, 1305 .mmap = bch2_mmap, 1306 .get_unmapped_area = thp_get_unmapped_area, 1307 .fsync = bch2_fsync, 1308 .splice_read = filemap_splice_read, 1309 .splice_write = iter_file_splice_write, 1310 .fallocate = bch2_fallocate_dispatch, 1311 .unlocked_ioctl = bch2_fs_file_ioctl, 1312 #ifdef CONFIG_COMPAT 1313 .compat_ioctl = bch2_compat_fs_ioctl, 1314 #endif 1315 .remap_file_range = bch2_remap_file_range, 1316 }; 1317 1318 static const struct inode_operations bch_file_inode_operations = { 1319 .getattr = bch2_getattr, 1320 .setattr = bch2_setattr, 1321 .fiemap = bch2_fiemap, 1322 .listxattr = bch2_xattr_list, 1323 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1324 .get_inode_acl = bch2_get_acl, 1325 .set_acl = bch2_set_acl, 1326 #endif 1327 }; 1328 1329 static const struct inode_operations bch_dir_inode_operations = { 1330 .lookup = bch2_lookup, 1331 .create = bch2_create, 1332 .link = bch2_link, 1333 .unlink = bch2_unlink, 1334 .symlink = bch2_symlink, 1335 .mkdir = bch2_mkdir, 1336 .rmdir = bch2_unlink, 1337 .mknod = bch2_mknod, 1338 .rename = bch2_rename2, 1339 .getattr = bch2_getattr, 1340 .setattr = bch2_setattr, 1341 .tmpfile = bch2_tmpfile, 1342 .listxattr = bch2_xattr_list, 1343 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1344 .get_inode_acl = bch2_get_acl, 1345 .set_acl = bch2_set_acl, 1346 #endif 1347 }; 1348 1349 static const struct file_operations bch_dir_file_operations = { 1350 .llseek = bch2_dir_llseek, 1351 .read = generic_read_dir, 1352 .iterate_shared = bch2_vfs_readdir, 1353 .fsync = bch2_fsync, 1354 .unlocked_ioctl = bch2_fs_file_ioctl, 1355 #ifdef CONFIG_COMPAT 1356 .compat_ioctl = bch2_compat_fs_ioctl, 1357 #endif 1358 }; 1359 1360 static const struct inode_operations bch_symlink_inode_operations = { 1361 .get_link = page_get_link, 1362 .getattr = bch2_getattr, 1363 .setattr = bch2_setattr, 1364 .listxattr = bch2_xattr_list, 1365 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1366 .get_inode_acl = bch2_get_acl, 1367 .set_acl = bch2_set_acl, 1368 #endif 1369 }; 1370 1371 static const struct inode_operations bch_special_inode_operations = { 1372 .getattr = bch2_getattr, 1373 .setattr = bch2_setattr, 1374 .listxattr = bch2_xattr_list, 1375 #ifdef CONFIG_BCACHEFS_POSIX_ACL 1376 .get_inode_acl = bch2_get_acl, 1377 .set_acl = bch2_set_acl, 1378 #endif 1379 }; 1380 1381 static const struct address_space_operations bch_address_space_operations = { 1382 .read_folio = bch2_read_folio, 1383 .writepages = bch2_writepages, 1384 .readahead = bch2_readahead, 1385 .dirty_folio = filemap_dirty_folio, 1386 .write_begin = bch2_write_begin, 1387 .write_end = bch2_write_end, 1388 .invalidate_folio = bch2_invalidate_folio, 1389 .release_folio = bch2_release_folio, 1390 #ifdef CONFIG_MIGRATION 1391 .migrate_folio = filemap_migrate_folio, 1392 #endif 1393 .error_remove_folio = generic_error_remove_folio, 1394 }; 1395 1396 struct bcachefs_fid { 1397 u64 inum; 1398 u32 subvol; 1399 u32 gen; 1400 } __packed; 1401 1402 struct bcachefs_fid_with_parent { 1403 struct bcachefs_fid fid; 1404 struct bcachefs_fid dir; 1405 } __packed; 1406 1407 static int bcachefs_fid_valid(int fh_len, int fh_type) 1408 { 1409 switch (fh_type) { 1410 case FILEID_BCACHEFS_WITHOUT_PARENT: 1411 return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); 1412 case FILEID_BCACHEFS_WITH_PARENT: 1413 return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); 1414 default: 1415 return false; 1416 } 1417 } 1418 1419 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) 1420 { 1421 return (struct bcachefs_fid) { 1422 .inum = inode->ei_inum.inum, 1423 .subvol = inode->ei_inum.subvol, 1424 .gen = inode->ei_inode.bi_generation, 1425 }; 1426 } 1427 1428 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, 1429 struct inode *vdir) 1430 { 1431 struct bch_inode_info *inode = to_bch_ei(vinode); 1432 struct bch_inode_info *dir = to_bch_ei(vdir); 1433 int min_len; 1434 1435 if (!S_ISDIR(inode->v.i_mode) && dir) { 1436 struct bcachefs_fid_with_parent *fid = (void *) fh; 1437 1438 min_len = sizeof(*fid) / sizeof(u32); 1439 if (*len < min_len) { 1440 *len = min_len; 1441 return FILEID_INVALID; 1442 } 1443 1444 fid->fid = bch2_inode_to_fid(inode); 1445 fid->dir = bch2_inode_to_fid(dir); 1446 1447 *len = min_len; 1448 return FILEID_BCACHEFS_WITH_PARENT; 1449 } else { 1450 struct bcachefs_fid *fid = (void *) fh; 1451 1452 min_len = sizeof(*fid) / sizeof(u32); 1453 if (*len < min_len) { 1454 *len = min_len; 1455 return FILEID_INVALID; 1456 } 1457 *fid = bch2_inode_to_fid(inode); 1458 1459 *len = min_len; 1460 return FILEID_BCACHEFS_WITHOUT_PARENT; 1461 } 1462 } 1463 1464 static struct inode *bch2_nfs_get_inode(struct super_block *sb, 1465 struct bcachefs_fid fid) 1466 { 1467 struct bch_fs *c = sb->s_fs_info; 1468 struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { 1469 .subvol = fid.subvol, 1470 .inum = fid.inum, 1471 }); 1472 if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { 1473 iput(vinode); 1474 vinode = ERR_PTR(-ESTALE); 1475 } 1476 return vinode; 1477 } 1478 1479 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, 1480 int fh_len, int fh_type) 1481 { 1482 struct bcachefs_fid *fid = (void *) _fid; 1483 1484 if (!bcachefs_fid_valid(fh_len, fh_type)) 1485 return NULL; 1486 1487 return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); 1488 } 1489 1490 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, 1491 int fh_len, int fh_type) 1492 { 1493 struct bcachefs_fid_with_parent *fid = (void *) _fid; 1494 1495 if (!bcachefs_fid_valid(fh_len, fh_type) || 1496 fh_type != FILEID_BCACHEFS_WITH_PARENT) 1497 return NULL; 1498 1499 return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); 1500 } 1501 1502 static struct dentry *bch2_get_parent(struct dentry *child) 1503 { 1504 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1505 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1506 subvol_inum parent_inum = { 1507 .subvol = inode->ei_inode.bi_parent_subvol ?: 1508 inode->ei_inum.subvol, 1509 .inum = inode->ei_inode.bi_dir, 1510 }; 1511 1512 return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); 1513 } 1514 1515 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) 1516 { 1517 struct bch_inode_info *inode = to_bch_ei(child->d_inode); 1518 struct bch_inode_info *dir = to_bch_ei(parent->d_inode); 1519 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1520 struct btree_trans *trans; 1521 struct btree_iter iter1; 1522 struct btree_iter iter2; 1523 struct bkey_s_c k; 1524 struct bkey_s_c_dirent d; 1525 struct bch_inode_unpacked inode_u; 1526 subvol_inum target; 1527 u32 snapshot; 1528 struct qstr dirent_name; 1529 unsigned name_len = 0; 1530 int ret; 1531 1532 if (!S_ISDIR(dir->v.i_mode)) 1533 return -EINVAL; 1534 1535 trans = bch2_trans_get(c); 1536 1537 bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, 1538 POS(dir->ei_inode.bi_inum, 0), 0); 1539 bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, 1540 POS(dir->ei_inode.bi_inum, 0), 0); 1541 retry: 1542 bch2_trans_begin(trans); 1543 1544 ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); 1545 if (ret) 1546 goto err; 1547 1548 bch2_btree_iter_set_snapshot(&iter1, snapshot); 1549 bch2_btree_iter_set_snapshot(&iter2, snapshot); 1550 1551 ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); 1552 if (ret) 1553 goto err; 1554 1555 if (inode_u.bi_dir == dir->ei_inode.bi_inum) { 1556 bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); 1557 1558 k = bch2_btree_iter_peek_slot(&iter1); 1559 ret = bkey_err(k); 1560 if (ret) 1561 goto err; 1562 1563 if (k.k->type != KEY_TYPE_dirent) { 1564 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1565 goto err; 1566 } 1567 1568 d = bkey_s_c_to_dirent(k); 1569 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1570 if (ret > 0) 1571 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 1572 if (ret) 1573 goto err; 1574 1575 if (subvol_inum_eq(target, inode->ei_inum)) 1576 goto found; 1577 } else { 1578 /* 1579 * File with multiple hardlinks and our backref is to the wrong 1580 * directory - linear search: 1581 */ 1582 for_each_btree_key_continue_norestart(iter2, 0, k, ret) { 1583 if (k.k->p.inode > dir->ei_inode.bi_inum) 1584 break; 1585 1586 if (k.k->type != KEY_TYPE_dirent) 1587 continue; 1588 1589 d = bkey_s_c_to_dirent(k); 1590 ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); 1591 if (ret < 0) 1592 break; 1593 if (ret) 1594 continue; 1595 1596 if (subvol_inum_eq(target, inode->ei_inum)) 1597 goto found; 1598 } 1599 } 1600 1601 ret = -ENOENT; 1602 goto err; 1603 found: 1604 dirent_name = bch2_dirent_get_name(d); 1605 1606 name_len = min_t(unsigned, dirent_name.len, NAME_MAX); 1607 memcpy(name, dirent_name.name, name_len); 1608 name[name_len] = '\0'; 1609 err: 1610 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1611 goto retry; 1612 1613 bch2_trans_iter_exit(trans, &iter1); 1614 bch2_trans_iter_exit(trans, &iter2); 1615 bch2_trans_put(trans); 1616 1617 return ret; 1618 } 1619 1620 static const struct export_operations bch_export_ops = { 1621 .encode_fh = bch2_encode_fh, 1622 .fh_to_dentry = bch2_fh_to_dentry, 1623 .fh_to_parent = bch2_fh_to_parent, 1624 .get_parent = bch2_get_parent, 1625 .get_name = bch2_get_name, 1626 }; 1627 1628 static void bch2_vfs_inode_init(struct btree_trans *trans, 1629 subvol_inum inum, 1630 struct bch_inode_info *inode, 1631 struct bch_inode_unpacked *bi, 1632 struct bch_subvolume *subvol) 1633 { 1634 inode->v.i_ino = inum.inum; 1635 inode->ei_inum = inum; 1636 inode->ei_inode.bi_inum = inum.inum; 1637 bch2_inode_update_after_write(trans, inode, bi, ~0); 1638 1639 inode->v.i_blocks = bi->bi_sectors; 1640 inode->v.i_ino = bi->bi_inum; 1641 inode->v.i_rdev = bi->bi_dev; 1642 inode->v.i_generation = bi->bi_generation; 1643 inode->v.i_size = bi->bi_size; 1644 1645 inode->ei_flags = 0; 1646 inode->ei_quota_reserved = 0; 1647 inode->ei_qid = bch_qid(bi); 1648 1649 if (BCH_SUBVOLUME_SNAP(subvol)) 1650 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); 1651 1652 inode->v.i_mapping->a_ops = &bch_address_space_operations; 1653 1654 switch (inode->v.i_mode & S_IFMT) { 1655 case S_IFREG: 1656 inode->v.i_op = &bch_file_inode_operations; 1657 inode->v.i_fop = &bch_file_operations; 1658 break; 1659 case S_IFDIR: 1660 inode->v.i_op = &bch_dir_inode_operations; 1661 inode->v.i_fop = &bch_dir_file_operations; 1662 break; 1663 case S_IFLNK: 1664 inode_nohighmem(&inode->v); 1665 inode->v.i_op = &bch_symlink_inode_operations; 1666 break; 1667 default: 1668 init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); 1669 inode->v.i_op = &bch_special_inode_operations; 1670 break; 1671 } 1672 1673 mapping_set_large_folios(inode->v.i_mapping); 1674 } 1675 1676 static void bch2_free_inode(struct inode *vinode) 1677 { 1678 kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); 1679 } 1680 1681 static int inode_update_times_fn(struct btree_trans *trans, 1682 struct bch_inode_info *inode, 1683 struct bch_inode_unpacked *bi, 1684 void *p) 1685 { 1686 struct bch_fs *c = inode->v.i_sb->s_fs_info; 1687 1688 bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); 1689 bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); 1690 bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); 1691 1692 return 0; 1693 } 1694 1695 static int bch2_vfs_write_inode(struct inode *vinode, 1696 struct writeback_control *wbc) 1697 { 1698 struct bch_fs *c = vinode->i_sb->s_fs_info; 1699 struct bch_inode_info *inode = to_bch_ei(vinode); 1700 int ret; 1701 1702 mutex_lock(&inode->ei_update_lock); 1703 ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, 1704 ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); 1705 mutex_unlock(&inode->ei_update_lock); 1706 1707 return bch2_err_class(ret); 1708 } 1709 1710 static void bch2_evict_inode(struct inode *vinode) 1711 { 1712 struct bch_fs *c = vinode->i_sb->s_fs_info; 1713 struct bch_inode_info *inode = to_bch_ei(vinode); 1714 bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); 1715 1716 /* 1717 * evict() has waited for outstanding writeback, we'll do no more IO 1718 * through this inode: it's safe to remove from VFS inode hashtable here 1719 * 1720 * Do that now so that other threads aren't blocked from pulling it back 1721 * in, there's no reason for them to be: 1722 */ 1723 if (!delete) 1724 bch2_inode_hash_remove(c, inode); 1725 1726 truncate_inode_pages_final(&inode->v.i_data); 1727 1728 clear_inode(&inode->v); 1729 1730 BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); 1731 1732 if (delete) { 1733 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), 1734 KEY_TYPE_QUOTA_WARN); 1735 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, 1736 KEY_TYPE_QUOTA_WARN); 1737 bch2_inode_rm(c, inode_inum(inode)); 1738 1739 /* 1740 * If we are deleting, we need it present in the vfs hash table 1741 * so that fsck can check if unlinked inodes are still open: 1742 */ 1743 bch2_inode_hash_remove(c, inode); 1744 } 1745 1746 mutex_lock(&c->vfs_inodes_lock); 1747 list_del_init(&inode->ei_vfs_inode_list); 1748 mutex_unlock(&c->vfs_inodes_lock); 1749 } 1750 1751 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) 1752 { 1753 struct bch_inode_info *inode; 1754 DARRAY(struct bch_inode_info *) grabbed; 1755 bool clean_pass = false, this_pass_clean; 1756 1757 /* 1758 * Initially, we scan for inodes without I_DONTCACHE, then mark them to 1759 * be pruned with d_mark_dontcache(). 1760 * 1761 * Once we've had a clean pass where we didn't find any inodes without 1762 * I_DONTCACHE, we wait for them to be freed: 1763 */ 1764 1765 darray_init(&grabbed); 1766 darray_make_room(&grabbed, 1024); 1767 again: 1768 cond_resched(); 1769 this_pass_clean = true; 1770 1771 mutex_lock(&c->vfs_inodes_lock); 1772 list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { 1773 if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) 1774 continue; 1775 1776 if (!(inode->v.i_state & I_DONTCACHE) && 1777 !(inode->v.i_state & I_FREEING) && 1778 igrab(&inode->v)) { 1779 this_pass_clean = false; 1780 1781 if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { 1782 iput(&inode->v); 1783 break; 1784 } 1785 } else if (clean_pass && this_pass_clean) { 1786 struct wait_bit_queue_entry wqe; 1787 struct wait_queue_head *wq_head; 1788 1789 wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); 1790 prepare_to_wait_event(wq_head, &wqe.wq_entry, 1791 TASK_UNINTERRUPTIBLE); 1792 mutex_unlock(&c->vfs_inodes_lock); 1793 1794 schedule(); 1795 finish_wait(wq_head, &wqe.wq_entry); 1796 goto again; 1797 } 1798 } 1799 mutex_unlock(&c->vfs_inodes_lock); 1800 1801 darray_for_each(grabbed, i) { 1802 inode = *i; 1803 d_mark_dontcache(&inode->v); 1804 d_prune_aliases(&inode->v); 1805 iput(&inode->v); 1806 } 1807 grabbed.nr = 0; 1808 1809 if (!clean_pass || !this_pass_clean) { 1810 clean_pass = this_pass_clean; 1811 goto again; 1812 } 1813 1814 darray_exit(&grabbed); 1815 } 1816 1817 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) 1818 { 1819 struct super_block *sb = dentry->d_sb; 1820 struct bch_fs *c = sb->s_fs_info; 1821 struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); 1822 unsigned shift = sb->s_blocksize_bits - 9; 1823 /* 1824 * this assumes inodes take up 64 bytes, which is a decent average 1825 * number: 1826 */ 1827 u64 avail_inodes = ((usage.capacity - usage.used) << 3); 1828 1829 buf->f_type = BCACHEFS_STATFS_MAGIC; 1830 buf->f_bsize = sb->s_blocksize; 1831 buf->f_blocks = usage.capacity >> shift; 1832 buf->f_bfree = usage.free >> shift; 1833 buf->f_bavail = avail_factor(usage.free) >> shift; 1834 1835 buf->f_files = usage.nr_inodes + avail_inodes; 1836 buf->f_ffree = avail_inodes; 1837 1838 buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); 1839 buf->f_namelen = BCH_NAME_MAX; 1840 1841 return 0; 1842 } 1843 1844 static int bch2_sync_fs(struct super_block *sb, int wait) 1845 { 1846 struct bch_fs *c = sb->s_fs_info; 1847 int ret; 1848 1849 trace_bch2_sync_fs(sb, wait); 1850 1851 if (c->opts.journal_flush_disabled) 1852 return 0; 1853 1854 if (!wait) { 1855 bch2_journal_flush_async(&c->journal, NULL); 1856 return 0; 1857 } 1858 1859 ret = bch2_journal_flush(&c->journal); 1860 return bch2_err_class(ret); 1861 } 1862 1863 static struct bch_fs *bch2_path_to_fs(const char *path) 1864 { 1865 struct bch_fs *c; 1866 dev_t dev; 1867 int ret; 1868 1869 ret = lookup_bdev(path, &dev); 1870 if (ret) 1871 return ERR_PTR(ret); 1872 1873 c = bch2_dev_to_fs(dev); 1874 if (c) 1875 closure_put(&c->cl); 1876 return c ?: ERR_PTR(-ENOENT); 1877 } 1878 1879 static int bch2_remount(struct super_block *sb, int *flags, 1880 struct bch_opts opts) 1881 { 1882 struct bch_fs *c = sb->s_fs_info; 1883 int ret = 0; 1884 1885 opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); 1886 1887 if (opts.read_only != c->opts.read_only) { 1888 down_write(&c->state_lock); 1889 1890 if (opts.read_only) { 1891 bch2_fs_read_only(c); 1892 1893 sb->s_flags |= SB_RDONLY; 1894 } else { 1895 ret = bch2_fs_read_write(c); 1896 if (ret) { 1897 bch_err(c, "error going rw: %i", ret); 1898 up_write(&c->state_lock); 1899 ret = -EINVAL; 1900 goto err; 1901 } 1902 1903 sb->s_flags &= ~SB_RDONLY; 1904 } 1905 1906 c->opts.read_only = opts.read_only; 1907 1908 up_write(&c->state_lock); 1909 } 1910 1911 if (opt_defined(opts, errors)) 1912 c->opts.errors = opts.errors; 1913 err: 1914 return bch2_err_class(ret); 1915 } 1916 1917 static int bch2_show_devname(struct seq_file *seq, struct dentry *root) 1918 { 1919 struct bch_fs *c = root->d_sb->s_fs_info; 1920 bool first = true; 1921 1922 for_each_online_member(c, ca) { 1923 if (!first) 1924 seq_putc(seq, ':'); 1925 first = false; 1926 seq_puts(seq, ca->disk_sb.sb_name); 1927 } 1928 1929 return 0; 1930 } 1931 1932 static int bch2_show_options(struct seq_file *seq, struct dentry *root) 1933 { 1934 struct bch_fs *c = root->d_sb->s_fs_info; 1935 struct printbuf buf = PRINTBUF; 1936 1937 bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, 1938 OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); 1939 printbuf_nul_terminate(&buf); 1940 seq_puts(seq, buf.buf); 1941 1942 int ret = buf.allocation_failure ? -ENOMEM : 0; 1943 printbuf_exit(&buf); 1944 return ret; 1945 } 1946 1947 static void bch2_put_super(struct super_block *sb) 1948 { 1949 struct bch_fs *c = sb->s_fs_info; 1950 1951 __bch2_fs_stop(c); 1952 } 1953 1954 /* 1955 * bcachefs doesn't currently integrate intwrite freeze protection but the 1956 * internal write references serve the same purpose. Therefore reuse the 1957 * read-only transition code to perform the quiesce. The caveat is that we don't 1958 * currently have the ability to block tasks that want a write reference while 1959 * the superblock is frozen. This is fine for now, but we should either add 1960 * blocking support or find a way to integrate sb_start_intwrite() and friends. 1961 */ 1962 static int bch2_freeze(struct super_block *sb) 1963 { 1964 struct bch_fs *c = sb->s_fs_info; 1965 1966 down_write(&c->state_lock); 1967 bch2_fs_read_only(c); 1968 up_write(&c->state_lock); 1969 return 0; 1970 } 1971 1972 static int bch2_unfreeze(struct super_block *sb) 1973 { 1974 struct bch_fs *c = sb->s_fs_info; 1975 int ret; 1976 1977 if (test_bit(BCH_FS_emergency_ro, &c->flags)) 1978 return 0; 1979 1980 down_write(&c->state_lock); 1981 ret = bch2_fs_read_write(c); 1982 up_write(&c->state_lock); 1983 return ret; 1984 } 1985 1986 static const struct super_operations bch_super_operations = { 1987 .alloc_inode = bch2_alloc_inode, 1988 .free_inode = bch2_free_inode, 1989 .write_inode = bch2_vfs_write_inode, 1990 .evict_inode = bch2_evict_inode, 1991 .sync_fs = bch2_sync_fs, 1992 .statfs = bch2_statfs, 1993 .show_devname = bch2_show_devname, 1994 .show_options = bch2_show_options, 1995 .put_super = bch2_put_super, 1996 .freeze_fs = bch2_freeze, 1997 .unfreeze_fs = bch2_unfreeze, 1998 }; 1999 2000 static int bch2_set_super(struct super_block *s, void *data) 2001 { 2002 s->s_fs_info = data; 2003 return 0; 2004 } 2005 2006 static int bch2_noset_super(struct super_block *s, void *data) 2007 { 2008 return -EBUSY; 2009 } 2010 2011 typedef DARRAY(struct bch_fs *) darray_fs; 2012 2013 static int bch2_test_super(struct super_block *s, void *data) 2014 { 2015 struct bch_fs *c = s->s_fs_info; 2016 darray_fs *d = data; 2017 2018 if (!c) 2019 return false; 2020 2021 darray_for_each(*d, i) 2022 if (c != *i) 2023 return false; 2024 return true; 2025 } 2026 2027 static int bch2_fs_get_tree(struct fs_context *fc) 2028 { 2029 struct bch_fs *c; 2030 struct super_block *sb; 2031 struct inode *vinode; 2032 struct bch2_opts_parse *opts_parse = fc->fs_private; 2033 struct bch_opts opts = opts_parse->opts; 2034 darray_str devs; 2035 darray_fs devs_to_fs = {}; 2036 int ret; 2037 2038 opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); 2039 opt_set(opts, nostart, true); 2040 2041 if (!fc->source || strlen(fc->source) == 0) 2042 return -EINVAL; 2043 2044 ret = bch2_split_devs(fc->source, &devs); 2045 if (ret) 2046 return ret; 2047 2048 darray_for_each(devs, i) { 2049 ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); 2050 if (ret) 2051 goto err; 2052 } 2053 2054 sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); 2055 if (!IS_ERR(sb)) 2056 goto got_sb; 2057 2058 c = bch2_fs_open(devs.data, devs.nr, opts); 2059 ret = PTR_ERR_OR_ZERO(c); 2060 if (ret) 2061 goto err; 2062 2063 /* Some options can't be parsed until after the fs is started: */ 2064 opts = bch2_opts_empty(); 2065 ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); 2066 if (ret) 2067 goto err_stop_fs; 2068 2069 bch2_opts_apply(&c->opts, opts); 2070 2071 ret = bch2_fs_start(c); 2072 if (ret) 2073 goto err_stop_fs; 2074 2075 sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); 2076 ret = PTR_ERR_OR_ZERO(sb); 2077 if (ret) 2078 goto err_stop_fs; 2079 got_sb: 2080 c = sb->s_fs_info; 2081 2082 if (sb->s_root) { 2083 if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { 2084 ret = -EBUSY; 2085 goto err_put_super; 2086 } 2087 goto out; 2088 } 2089 2090 sb->s_blocksize = block_bytes(c); 2091 sb->s_blocksize_bits = ilog2(block_bytes(c)); 2092 sb->s_maxbytes = MAX_LFS_FILESIZE; 2093 sb->s_op = &bch_super_operations; 2094 sb->s_export_op = &bch_export_ops; 2095 #ifdef CONFIG_BCACHEFS_QUOTA 2096 sb->s_qcop = &bch2_quotactl_operations; 2097 sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; 2098 #endif 2099 sb->s_xattr = bch2_xattr_handlers; 2100 sb->s_magic = BCACHEFS_STATFS_MAGIC; 2101 sb->s_time_gran = c->sb.nsec_per_time_unit; 2102 sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; 2103 sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); 2104 sb->s_uuid = c->sb.user_uuid; 2105 sb->s_shrink->seeks = 0; 2106 c->vfs_sb = sb; 2107 strscpy(sb->s_id, c->name, sizeof(sb->s_id)); 2108 2109 ret = super_setup_bdi(sb); 2110 if (ret) 2111 goto err_put_super; 2112 2113 sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 2114 2115 for_each_online_member(c, ca) { 2116 struct block_device *bdev = ca->disk_sb.bdev; 2117 2118 /* XXX: create an anonymous device for multi device filesystems */ 2119 sb->s_bdev = bdev; 2120 sb->s_dev = bdev->bd_dev; 2121 percpu_ref_put(&ca->io_ref); 2122 break; 2123 } 2124 2125 c->dev = sb->s_dev; 2126 2127 #ifdef CONFIG_BCACHEFS_POSIX_ACL 2128 if (c->opts.acl) 2129 sb->s_flags |= SB_POSIXACL; 2130 #endif 2131 2132 sb->s_shrink->seeks = 0; 2133 2134 vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); 2135 ret = PTR_ERR_OR_ZERO(vinode); 2136 bch_err_msg(c, ret, "mounting: error getting root inode"); 2137 if (ret) 2138 goto err_put_super; 2139 2140 sb->s_root = d_make_root(vinode); 2141 if (!sb->s_root) { 2142 bch_err(c, "error mounting: error allocating root dentry"); 2143 ret = -ENOMEM; 2144 goto err_put_super; 2145 } 2146 2147 sb->s_flags |= SB_ACTIVE; 2148 out: 2149 fc->root = dget(sb->s_root); 2150 err: 2151 darray_exit(&devs_to_fs); 2152 bch2_darray_str_exit(&devs); 2153 if (ret) 2154 pr_err("error: %s", bch2_err_str(ret)); 2155 /* 2156 * On an inconsistency error in recovery we might see an -EROFS derived 2157 * errorcode (from the journal), but we don't want to return that to 2158 * userspace as that causes util-linux to retry the mount RO - which is 2159 * confusing: 2160 */ 2161 if (bch2_err_matches(ret, EROFS) && ret != -EROFS) 2162 ret = -EIO; 2163 return bch2_err_class(ret); 2164 2165 err_stop_fs: 2166 bch2_fs_stop(c); 2167 goto err; 2168 2169 err_put_super: 2170 __bch2_fs_stop(c); 2171 deactivate_locked_super(sb); 2172 goto err; 2173 } 2174 2175 static void bch2_kill_sb(struct super_block *sb) 2176 { 2177 struct bch_fs *c = sb->s_fs_info; 2178 2179 generic_shutdown_super(sb); 2180 bch2_fs_free(c); 2181 } 2182 2183 static void bch2_fs_context_free(struct fs_context *fc) 2184 { 2185 struct bch2_opts_parse *opts = fc->fs_private; 2186 2187 if (opts) { 2188 printbuf_exit(&opts->parse_later); 2189 kfree(opts); 2190 } 2191 } 2192 2193 static int bch2_fs_parse_param(struct fs_context *fc, 2194 struct fs_parameter *param) 2195 { 2196 /* 2197 * the "source" param, i.e., the name of the device(s) to mount, 2198 * is handled by the VFS layer. 2199 */ 2200 if (!strcmp(param->key, "source")) 2201 return -ENOPARAM; 2202 2203 struct bch2_opts_parse *opts = fc->fs_private; 2204 struct bch_fs *c = NULL; 2205 2206 /* for reconfigure, we already have a struct bch_fs */ 2207 if (fc->root) 2208 c = fc->root->d_sb->s_fs_info; 2209 2210 int ret = bch2_parse_one_mount_opt(c, &opts->opts, 2211 &opts->parse_later, param->key, 2212 param->string); 2213 2214 return bch2_err_class(ret); 2215 } 2216 2217 static int bch2_fs_reconfigure(struct fs_context *fc) 2218 { 2219 struct super_block *sb = fc->root->d_sb; 2220 struct bch2_opts_parse *opts = fc->fs_private; 2221 2222 return bch2_remount(sb, &fc->sb_flags, opts->opts); 2223 } 2224 2225 static const struct fs_context_operations bch2_context_ops = { 2226 .free = bch2_fs_context_free, 2227 .parse_param = bch2_fs_parse_param, 2228 .get_tree = bch2_fs_get_tree, 2229 .reconfigure = bch2_fs_reconfigure, 2230 }; 2231 2232 static int bch2_init_fs_context(struct fs_context *fc) 2233 { 2234 struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); 2235 2236 if (!opts) 2237 return -ENOMEM; 2238 2239 opts->parse_later = PRINTBUF; 2240 2241 fc->ops = &bch2_context_ops; 2242 fc->fs_private = opts; 2243 2244 return 0; 2245 } 2246 2247 void bch2_fs_vfs_exit(struct bch_fs *c) 2248 { 2249 if (c->vfs_inodes_table.tbl) 2250 rhashtable_destroy(&c->vfs_inodes_table); 2251 } 2252 2253 int bch2_fs_vfs_init(struct bch_fs *c) 2254 { 2255 return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); 2256 } 2257 2258 static struct file_system_type bcache_fs_type = { 2259 .owner = THIS_MODULE, 2260 .name = "bcachefs", 2261 .init_fs_context = bch2_init_fs_context, 2262 .kill_sb = bch2_kill_sb, 2263 .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 2264 }; 2265 2266 MODULE_ALIAS_FS("bcachefs"); 2267 2268 void bch2_vfs_exit(void) 2269 { 2270 unregister_filesystem(&bcache_fs_type); 2271 kmem_cache_destroy(bch2_inode_cache); 2272 } 2273 2274 int __init bch2_vfs_init(void) 2275 { 2276 int ret = -ENOMEM; 2277 2278 bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | 2279 SLAB_ACCOUNT); 2280 if (!bch2_inode_cache) 2281 goto err; 2282 2283 ret = register_filesystem(&bcache_fs_type); 2284 if (ret) 2285 goto err; 2286 2287 return 0; 2288 err: 2289 bch2_vfs_exit(); 2290 return ret; 2291 } 2292 2293 #endif /* NO_BCACHEFS_FS */ 2294