// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "acl.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "errcode.h"
#include "extents.h"
#include "fs.h"
#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>

static struct kmem_cache *bch2_inode_cache;

static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
				struct bch_inode_info *,
				struct bch_inode_unpacked *,
				struct bch_subvolume *);

void bch2_inode_update_after_write(struct btree_trans *trans,
				   struct bch_inode_info *inode,
				   struct bch_inode_unpacked *bi,
				   unsigned fields)
{
	struct bch_fs *c = trans->c;

	BUG_ON(bi->bi_inum != inode->v.i_ino);

	bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));

	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
	i_uid_write(&inode->v, bi->bi_uid);
	i_gid_write(&inode->v, bi->bi_gid);
	inode->v.i_mode	= bi->bi_mode;

	if (fields & ATTR_ATIME)
		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
	if (fields & ATTR_MTIME)
		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
	if (fields & ATTR_CTIME)
		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

	inode->ei_inode = *bi;

	bch2_inode_flags_to_vfs(inode);
}

int __must_check bch2_write_inode(struct bch_fs *c,
				  struct bch_inode_info *inode,
				  inode_set_fn set,
				  void *p, unsigned fields)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
				BTREE_ITER_intent) ?:
		(set ? set(trans, inode, &inode_u, p) : 0) ?:
		bch2_inode_write(trans, &iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

	/*
	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
	 * this is important for inode updates via bchfs_write_index_update
	 */
	if (!ret)
		bch2_inode_update_after_write(trans, inode, &inode_u, fields);

	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
			     "%s: inode %llu:%llu not found when updating",
			     bch2_err_str(ret),
			     inode_inum(inode).subvol,
			     inode_inum(inode).inum);

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}
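
/*
 * Transfer this inode's quota usage to a new set of qids (called on chown and
 * project id changes): qtypes is a mask of quota types to transfer; types not
 * enabled in the superblock, or whose qid is unchanged, are skipped.
 */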
int bch2_fs_quota_transfer(struct bch_fs *c,
			   struct bch_inode_info *inode,
			   struct bch_qid new_qid,
			   unsigned qtypes,
			   enum quota_acct_mode mode)
{
	unsigned i;
	int ret;

	qtypes &= enabled_qtypes(c);

	for (i = 0; i < QTYP_NR; i++)
		if (new_qid.q[i] == inode->ei_qid.q[i])
			qtypes &= ~(1U << i);

	if (!qtypes)
		return 0;

	mutex_lock(&inode->ei_quota_lock);

	ret = bch2_quota_transfer(c, qtypes, new_qid,
				  inode->ei_qid,
				  inode->v.i_blocks +
				  inode->ei_quota_reserved,
				  mode);
	if (!ret)
		for (i = 0; i < QTYP_NR; i++)
			if (qtypes & (1 << i))
				inode->ei_qid.q[i] = new_qid.q[i];

	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}

static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
	return a.subvol == b.subvol && a.inum == b.inum;
}

static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
				 const void *obj)
{
	const struct bch_inode_info *inode = obj;
	const subvol_inum *v = arg->key;

	return !subvol_inum_eq(inode->ei_inum, *v);
}

static const struct rhashtable_params bch2_vfs_inodes_params = {
	.head_offset		= offsetof(struct bch_inode_info, hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
	.key_len		= sizeof(subvol_inum),
	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
	.automatic_shrinking	= true,
};

struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}

static void __wait_on_freeing_inode(struct bch_fs *c,
				    struct bch_inode_info *inode,
				    subvol_inum inum)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->v.i_lock);

	if (__bch2_inode_hash_find(c, inum) == inode)
		schedule_timeout(HZ * 10);
	finish_wait(wq, &wait.wq_entry);
}

static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
						   subvol_inum inum)
{
	struct bch_inode_info *inode;
repeat:
	inode = __bch2_inode_hash_find(c, inum);
	if (inode) {
		spin_lock(&inode->v.i_lock);
		if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
			spin_unlock(&inode->v.i_lock);
			return NULL;
		}
		if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
			if (!trans) {
				__wait_on_freeing_inode(c, inode, inum);
			} else {
				bch2_trans_unlock(trans);
				__wait_on_freeing_inode(c, inode, inum);
				int ret = bch2_trans_relock(trans);
				if (ret)
					return ERR_PTR(ret);
			}
			goto repeat;
		}
		__iget(&inode->v);
		spin_unlock(&inode->v.i_lock);
	}

	return inode;
}

static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
	spin_lock(&inode->v.i_lock);
	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
	spin_unlock(&inode->v.i_lock);

	if (remove) {
		int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
						 &inode->hash, bch2_vfs_inodes_params);
		BUG_ON(ret);
		inode->v.i_hash.pprev = NULL;
		/*
		 * This pairs with the bch2_inode_hash_find() ->
		 * __wait_on_freeing_inode() path
		 */
		inode_wake_up_bit(&inode->v, __I_NEW);
	}
}
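
/*
 * Insert a newly-created inode into the hash table; if we raced with another
 * thread inserting the same subvol_inum, drop ours and return the existing
 * inode instead:
 */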
static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
						     struct btree_trans *trans,
						     struct bch_inode_info *inode)
{
	struct bch_inode_info *old = inode;

	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
						   &inode->hash,
						   bch2_vfs_inodes_params))) {
		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
		if (!old)
			goto retry;

		clear_bit(EI_INODE_HASHED, &inode->ei_flags);

		/*
		 * bcachefs doesn't use I_NEW; we have no use for it since we
		 * only insert fully created inodes in the inode hash table. But
		 * discard_new_inode() expects it to be set...
		 */
		inode->v.i_state |= I_NEW;
		/*
		 * We don't want bch2_evict_inode() to delete the inode on disk,
		 * we just raced and had another inode in cache. Normally new
		 * inodes don't have nlink == 0 - except tmpfiles do...
		 */
		set_nlink(&inode->v, 1);
		discard_new_inode(&inode->v);
		return old;
	} else {
		inode_fake_hash(&inode->v);

		inode_sb_list_add(&inode->v);

		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		return inode;
	}
}

#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})
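
/*
 * bcachefs never allocates inodes via the VFS .alloc_inode hook - they're
 * created by __bch2_new_inode() below and inserted fully initialized - so
 * this should be unreachable:
 */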
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}

static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
	struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
						      bch2_inode_cache, gfp);
	if (!inode)
		return NULL;

	inode_init_once(&inode->v);
	mutex_init(&inode->ei_update_lock);
	two_state_lock_init(&inode->ei_pagecache_lock);
	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
	inode->ei_flags = 0;
	mutex_init(&inode->ei_quota_lock);
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
		kmem_cache_free(bch2_inode_cache, inode);
		return NULL;
	}

	return inode;
}

/*
 * Allocate a new inode, dropping/retaking btree locks if necessary:
 */
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
	struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);

	if (unlikely(!inode)) {
		int ret = drop_locks_do(trans,
				(inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
		if (ret && inode) {
			__destroy_inode(&inode->v);
			kmem_cache_free(bch2_inode_cache, inode);
		}
		if (ret)
			return ERR_PTR(ret);
	}

	return inode;
}

static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
							  subvol_inum inum,
							  struct bch_inode_unpacked *bi,
							  struct bch_subvolume *subvol)
{
	struct bch_inode_info *inode = bch2_new_inode(trans);
	if (IS_ERR(inode))
		return inode;

	bch2_vfs_inode_init(trans, inum, inode, bi, subvol);

	return bch2_inode_hash_insert(trans->c, trans, inode);
}

struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
	if (inode)
		return &inode->v;

	struct btree_trans *trans = bch2_trans_get(c);

	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;
	int ret = lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
}
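
/*
 * Common creation path for create/mknod/mkdir/symlink/tmpfile: ACLs and the
 * VFS inode are preallocated before the btree transaction, so nothing can
 * fail after the transaction commits. For tmpfiles no dirent is created and
 * the parent directory isn't modified, so its ei_update_lock isn't taken.
 */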
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = __bch2_new_inode(c, GFP_NOFS);
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
	kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), kuid),
				  from_kgid(i_user_ns(&dir->v), kgid),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true,
				   BTREE_ITER_with_updates, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 *
	 * also, calling bch2_inode_hash_insert() without passing in the
	 * transaction object is sketchy - if we could ever end up in
	 * __wait_on_freeing_inode(), we'd risk deadlock.
	 *
	 * But that shouldn't be possible, since we still have the inode locked
	 * that we just created, and we _really_ can't take a transaction
	 * restart here.
	 */
	inode = bch2_inode_hash_insert(c, NULL, inode);
	bch2_trans_put(trans);
err:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}

/* methods */
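
/*
 * Core of lookup: find the dirent, then find or create the in-memory inode it
 * points to, cross-checking the inode's backpointer where one is expected:
 */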
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	struct btree_iter dirent_iter = {};
	subvol_inum inum = {};
	struct printbuf buf = PRINTBUF;

	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
					     dir_hash_info, dir, name, 0);
	int ret = bkey_err(k);
	if (ret)
		return ERR_PTR(ret);

	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
	if (inode)
		goto out;

	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;
	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
	if (ret)
		goto err;

	/* regular files may have hardlinks: */
	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
				    c,
				    "dirent points to inode that does not point back:\n  %s",
				    (bch2_bkey_val_to_text(&buf, c, k),
				     prt_printf(&buf, "\n  "),
				     bch2_inode_unpacked_to_text(&buf, &inode_u),
				     buf.buf))) {
		ret = -ENOENT;
		goto err;
	}
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	return inode;
err:
	inode = ERR_PTR(ret);
	goto out;
}

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
				  unsigned int flags)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);

	struct bch_inode_info *inode;
	bch2_trans_do(c, NULL, NULL, 0,
		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
							  &hash, &dentry->d_name)));
	if (IS_ERR(inode))
		inode = NULL;

	return d_splice_alias(&inode->v, dentry);
}

static int bch2_mknod(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry,
		      umode_t mode, dev_t rdev)
{
	struct bch_inode_info *inode =
		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
			      (subvol_inum) { 0 }, 0);

	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_instantiate(dentry, &inode->v);
	return 0;
}

static int bch2_create(struct mnt_idmap *idmap,
		       struct inode *vdir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}

static int __bch2_link(struct bch_fs *c,
		       struct bch_inode_info *inode,
		       struct bch_inode_info *dir,
		       struct dentry *dentry)
{
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	mutex_lock(&inode->ei_update_lock);
	struct btree_trans *trans = bch2_trans_get(c);

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_link_trans(trans,
					inode_inum(dir), &dir_u,
					inode_inum(inode), &inode_u,
					&dentry->d_name));

	if (likely(!ret)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
	}

	bch2_trans_put(trans);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}

static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
		     struct dentry *dentry)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		__bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		return bch2_err_class(ret);

	ihold(&inode->v);
	d_instantiate(dentry, &inode->v);
	return 0;
}
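
/*
 * Also used for subvolume deletion - deleting_snapshot is passed through to
 * bch2_unlink_trans(). Subvolume deletion itself is asynchronous; here we
 * only tell the VFS the inode is gone by zeroing nlink:
 */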
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

	struct btree_trans *trans = bch2_trans_get(c);

	ret = commit_do(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
			bch2_unlink_trans(trans,
					  inode_inum(dir), &dir_u,
					  &inode_u, &dentry->d_name,
					  deleting_snapshot));
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(trans, dir, &dir_u,
				      ATTR_MTIME|ATTR_CTIME);
	bch2_inode_update_after_write(trans, inode, &inode_u,
				      ATTR_MTIME);

	if (inode_u.bi_subvol) {
		/*
		 * Subvolume deletion is asynchronous, but we still want to tell
		 * the VFS that it's been deleted here:
		 */
		set_nlink(&inode->v, 0);
	}
err:
	bch2_trans_put(trans);
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);

	return ret;
}

static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_fs *c = dir->v.i_sb->s_fs_info;

	int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		__bch2_unlink(vdir, dentry, false);
	return bch2_err_class(ret);
}

static int bch2_symlink(struct mnt_idmap *idmap,
			struct inode *vdir, struct dentry *dentry,
			const char *symname)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
	int ret;

	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	inode_lock(&inode->v);
	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
	inode_unlock(&inode->v);

	if (unlikely(ret))
		goto err;

	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
	if (unlikely(ret))
		goto err;

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	iput(&inode->v);
	return bch2_err_class(ret);
}

static int bch2_mkdir(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}
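
/*
 * rename: handles plain renames plus RENAME_EXCHANGE and RENAME_WHITEOUT;
 * project quota is transferred up front when an inode is moving to a
 * directory with a different project id:
 */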
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	bool whiteout = !!(flags & RENAME_WHITEOUT);
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
		return -EINVAL;

	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	trans = bch2_trans_get(c);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
	if (ret)
		goto err;

	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}
retry:
	bch2_trans_begin(trans);

	ret = bch2_rename_trans(trans,
				inode_inum(src_dir), &src_dir_u,
				inode_inum(dst_dir), &dst_dir_u,
				&src_inode_u,
				&dst_inode_u,
				&src_dentry->d_name,
				&dst_dentry->d_name,
				mode);
	if (unlikely(ret))
		goto err_tx_restart;

	if (whiteout) {
		whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
		ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
		if (unlikely(ret))
			goto err_tx_restart;
		bch2_inode_init_early(c, whiteout_inode_u);

		ret   = bch2_create_trans(trans,
					  inode_inum(src_dir), &src_dir_u,
					  whiteout_inode_u,
					  &src_dentry->d_name,
					  from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
					  from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
					  S_IFCHR|WHITEOUT_MODE, 0,
					  NULL, NULL, (subvol_inum) { 0 }, 0) ?:
			bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
					KEY_TYPE_QUOTA_PREALLOC);
		if (unlikely(ret))
			goto err_tx_restart;
	}

	ret = bch2_trans_commit(trans, NULL, NULL, 0);
	if (unlikely(ret)) {
err_tx_restart:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err;
	}

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);

	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}
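
/*
 * Copy attributes from a struct iattr into the unpacked bcachefs inode,
 * clearing the SGID bit when the updater isn't in the owning group, mirroring
 * what the generic setattr_copy() does:
 */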
static void bch2_setattr_copy(struct mnt_idmap *idmap,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;
	kuid_t kuid;
	kgid_t kgid;

	if (ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
	}
	if (ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
	}

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;

	if (ia_valid & ATTR_ATIME)
		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
	if (ia_valid & ATTR_MTIME)
		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
	if (ia_valid & ATTR_CTIME)
		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		kgid_t gid = ia_valid & ATTR_GID
			? kgid
			: inode->v.i_gid;

		if (!in_group_or_capable(idmap, &inode->v,
					 make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
}

int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
	}

	if (attr->ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
	}

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}

static int bch2_getattr(struct mnt_idmap *idmap,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned query_flags)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);

	stat->dev	= inode->v.i_sb->s_dev;
	stat->ino	= inode->v.i_ino;
	stat->mode	= inode->v.i_mode;
	stat->nlink	= inode->v.i_nlink;
	stat->uid	= vfsuid_into_kuid(vfsuid);
	stat->gid	= vfsgid_into_kgid(vfsgid);
	stat->rdev	= inode->v.i_rdev;
	stat->size	= i_size_read(&inode->v);
	stat->atime	= inode_get_atime(&inode->v);
	stat->mtime	= inode_get_mtime(&inode->v);
	stat->ctime	= inode_get_ctime(&inode->v);
	stat->blksize	= block_bytes(c);
	stat->blocks	= inode->v.i_blocks;

	stat->subvol	= inode->ei_inum.subvol;
	stat->result_mask |= STATX_SUBVOL;

	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
		stat->result_mask |= STATX_DIOALIGN;
		/*
		 * this is incorrect; we should be tracking this in superblock,
		 * and checking the alignment of open devices
		 */
		stat->dio_mem_align = SECTOR_SIZE;
		stat->dio_offset_align = block_bytes(c);
	}

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
	}

	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;

	if (inode->ei_inode.bi_flags & BCH_INODE_append)
		stat->attributes |= STATX_ATTR_APPEND;
	stat->attributes_mask	 |= STATX_ATTR_APPEND;

	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask	 |= STATX_ATTR_NODUMP;

	return 0;
}

static int bch2_setattr(struct mnt_idmap *idmap,
			struct dentry *dentry, struct iattr *iattr)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		setattr_prepare(idmap, dentry, iattr);
	if (ret)
		return ret;

	return iattr->ia_valid & ATTR_SIZE
		? bchfs_truncate(idmap, inode, iattr)
		: bch2_setattr_nonsize(idmap, inode, iattr);
}

static int bch2_tmpfile(struct mnt_idmap *idmap,
			struct inode *vdir, struct file *file, umode_t mode)
{
	struct bch_inode_info *inode =
		__bch2_create(idmap, to_bch_ei(vdir),
			      file->f_path.dentry, mode, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_mark_tmpfile(file, &inode->v);
	d_instantiate(file->f_path.dentry, &inode->v);
	return finish_open_simple(file, 0);
}
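
/*
 * Emit fiemap extents for a single btree key: one fiemap extent per pointer
 * (replica) for ordinary extents, or a single entry for inline data and
 * reservations:
 */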
static int bch2_fill_extent(struct bch_fs *c,
			    struct fiemap_extent_info *info,
			    struct bkey_s_c k, unsigned flags)
{
	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		int ret;

		if (k.k->type == KEY_TYPE_reflink_v)
			flags |= FIEMAP_EXTENT_SHARED;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			int flags2 = 0;
			u64 offset = p.ptr.offset;

			if (p.ptr.unwritten)
				flags2 |= FIEMAP_EXTENT_UNWRITTEN;

			if (p.crc.compression_type)
				flags2 |= FIEMAP_EXTENT_ENCODED;
			else
				offset += p.crc.offset;

			if ((offset & (block_sectors(c) - 1)) ||
			    (k.k->size & (block_sectors(c) - 1)))
				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

			ret = fiemap_fill_next_extent(info,
						      bkey_start_offset(k.k) << 9,
						      offset << 9,
						      k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		BUG();
	}
}
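
/*
 * Walk the extents btree, following indirect (reflink) extents; extents are
 * reported one behind, so that the final one can be flagged
 * FIEMAP_EXTENT_LAST:
 */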
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	if (start + len < start)
		return -EINVAL;

	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(ei->v.i_ino, start), 0);

	while (true) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k)
			break;

		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
						&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		if (have_extent) {
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					       bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret < 0 ? ret : 0;
}

static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= bch2_page_mkwrite,
};

static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);

	vma->vm_ops = &bch_vm_ops;
	return 0;
}

/* Directories: */

static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}

static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	if (!dir_emit_dots(file, ctx))
		return 0;

	int ret = bch2_readdir(c, inode_inum(inode), ctx);

	bch_err_fn(c, ret);
	return bch2_err_class(ret);
}

static int bch2_open(struct inode *vinode, struct file *file)
{
	if (file->f_flags & (O_WRONLY|O_RDWR)) {
		struct bch_inode_info *inode = to_bch_ei(vinode);
		struct bch_fs *c = inode->v.i_sb->s_fs_info;

		int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
		if (ret)
			return ret;
	}

	file->f_mode |= FMODE_CAN_ODIRECT;

	return generic_file_open(vinode, file);
}

static const struct file_operations bch_file_operations = {
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};

static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};

static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

static const struct address_space_operations bch_address_space_operations = {
	.read_folio		= bch2_read_folio,
	.writepages		= bch2_writepages,
	.readahead		= bch2_readahead,
	.dirty_folio		= filemap_dirty_folio,
	.write_begin		= bch2_write_begin,
	.write_end		= bch2_write_end,
	.invalidate_folio	= bch2_invalidate_folio,
	.release_folio		= bch2_release_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio		= filemap_migrate_folio,
#endif
	.error_remove_folio	= generic_error_remove_folio,
};
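
/*
 * NFS export support: a bcachefs file handle identifies an inode by
 * subvolume, inode number and generation:
 */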
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;

struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;

static int bcachefs_fid_valid(int fh_len, int fh_type)
{
	switch (fh_type) {
	case FILEID_BCACHEFS_WITHOUT_PARENT:
		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
	case FILEID_BCACHEFS_WITH_PARENT:
		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
	default:
		return false;
	}
}

static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
	return (struct bcachefs_fid) {
		.inum	= inode->ei_inum.inum,
		.subvol	= inode->ei_inum.subvol,
		.gen	= inode->ei_inode.bi_generation,
	};
}

static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
			  struct inode *vdir)
{
	struct bch_inode_info *inode	= to_bch_ei(vinode);
	struct bch_inode_info *dir	= to_bch_ei(vdir);
	int min_len;

	if (!S_ISDIR(inode->v.i_mode) && dir) {
		struct bcachefs_fid_with_parent *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}

		fid->fid = bch2_inode_to_fid(inode);
		fid->dir = bch2_inode_to_fid(dir);

		*len = min_len;
		return FILEID_BCACHEFS_WITH_PARENT;
	} else {
		struct bcachefs_fid *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}
		*fid = bch2_inode_to_fid(inode);

		*len = min_len;
		return FILEID_BCACHEFS_WITHOUT_PARENT;
	}
}

static struct inode *bch2_nfs_get_inode(struct super_block *sb,
					struct bcachefs_fid fid)
{
	struct bch_fs *c = sb->s_fs_info;
	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
				    .subvol = fid.subvol,
				    .inum   = fid.inum,
	});
	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
		iput(vinode);
		vinode = ERR_PTR(-ESTALE);
	}
	return vinode;
}

static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
					int fh_len, int fh_type)
{
	struct bcachefs_fid *fid = (void *) _fid;

	if (!bcachefs_fid_valid(fh_len, fh_type))
		return NULL;

	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
}

static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
					int fh_len, int fh_type)
{
	struct bcachefs_fid_with_parent *fid = (void *) _fid;

	if (!bcachefs_fid_valid(fh_len, fh_type) ||
	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
		return NULL;

	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
}
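
/*
 * The parent is found via the inode backpointer (bi_parent_subvol/bi_dir);
 * for hardlinked files the backpointer may reference a different directory,
 * in which case bch2_get_name() below falls back to a linear scan:
 */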
static struct dentry *bch2_get_parent(struct dentry *child)
{
	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum parent_inum = {
		.subvol = inode->ei_inode.bi_parent_subvol ?:
			inode->ei_inum.subvol,
		.inum = inode->ei_inode.bi_dir,
	};

	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}

static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
{
	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter1;
	struct btree_iter iter2;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	struct bch_inode_unpacked inode_u;
	subvol_inum target;
	u32 snapshot;
	struct qstr dirent_name;
	unsigned name_len = 0;
	int ret;

	if (!S_ISDIR(dir->v.i_mode))
		return -EINVAL;

	trans = bch2_trans_get(c);

	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&iter1, snapshot);
	bch2_btree_iter_set_snapshot(&iter2, snapshot);

	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));

		k = bch2_btree_iter_peek_slot(&iter1);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_dirent) {
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
			goto err;
		}

		d = bkey_s_c_to_dirent(k);
		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
		if (ret > 0)
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
		if (ret)
			goto err;

		if (subvol_inum_eq(target, inode->ei_inum))
			goto found;
	} else {
		/*
		 * File with multiple hardlinks and our backref is to the wrong
		 * directory - linear search:
		 */
		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
			if (k.k->p.inode > dir->ei_inode.bi_inum)
				break;

			if (k.k->type != KEY_TYPE_dirent)
				continue;

			d = bkey_s_c_to_dirent(k);
			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
			if (ret < 0)
				break;
			if (ret)
				continue;

			if (subvol_inum_eq(target, inode->ei_inum))
				goto found;
		}
	}

	ret = -ENOENT;
	goto err;
found:
	dirent_name = bch2_dirent_get_name(d);

	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
	memcpy(name, dirent_name.name, name_len);
	name[name_len] = '\0';
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_iter_exit(trans, &iter1);
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_put(trans);

	return ret;
}

static const struct export_operations bch_export_ops = {
	.encode_fh	= bch2_encode_fh,
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	.get_parent	= bch2_get_parent,
	.get_name	= bch2_get_name,
};
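
/*
 * Initialize a freshly allocated VFS inode from the unpacked on-disk inode;
 * this also selects i_op/i_fop/a_ops based on the file type:
 */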
static void bch2_vfs_inode_init(struct btree_trans *trans,
				subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	inode->v.i_ino		= inum.inum;
	inode->ei_inum		= inum;
	inode->ei_inode.bi_inum	= inum.inum;
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);

	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}

	mapping_set_large_folios(inode->v.i_mapping);
}

static void bch2_free_inode(struct inode *vinode)
{
	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}

static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}

static int bch2_vfs_write_inode(struct inode *vinode,
				struct writeback_control *wbc)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
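
/*
 * evict: called when i_count hits zero and the inode is being dropped; if
 * nlink is zero this is also where the inode is deleted on disk:
 */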
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);

	/*
	 * evict() has waited for outstanding writeback, we'll do no more IO
	 * through this inode: it's safe to remove from VFS inode hashtable here
	 *
	 * Do that now so that other threads aren't blocked from pulling it back
	 * in, there's no reason for them to be:
	 */
	if (!delete)
		bch2_inode_hash_remove(c, inode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (delete) {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));

		/*
		 * If we are deleting, we need it present in the vfs hash table
		 * so that fsck can check if unlinked inodes are still open:
		 */
		bch2_inode_hash_remove(c, inode);
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}

void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			struct wait_bit_queue_entry wqe;
			struct wait_queue_head *wq_head;

			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
			prepare_to_wait_event(wq_head, &wqe.wq_entry,
					      TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq_head, &wqe.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}
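
/*
 * statfs: block counts come from the in-memory usage counters; inode counts
 * are synthesized from free space, since bcachefs has no preallocated inode
 * table:
 */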
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}

static int bch2_sync_fs(struct super_block *sb, int wait)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	trace_bch2_sync_fs(sb, wait);

	if (c->opts.journal_flush_disabled)
		return 0;

	if (!wait) {
		bch2_journal_flush_async(&c->journal, NULL);
		return 0;
	}

	ret = bch2_journal_flush(&c->journal);
	return bch2_err_class(ret);
}

static struct bch_fs *bch2_path_to_fs(const char *path)
{
	struct bch_fs *c;
	dev_t dev;
	int ret;

	ret = lookup_bdev(path, &dev);
	if (ret)
		return ERR_PTR(ret);

	c = bch2_dev_to_fs(dev);
	if (c)
		closure_put(&c->cl);
	return c ?: ERR_PTR(-ENOENT);
}
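
/*
 * remount: the only options this handler applies are the read-only flag and
 * errors=; transitions between ro and rw go through the usual
 * bch2_fs_read_only()/bch2_fs_read_write() paths:
 */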
static int bch2_remount(struct super_block *sb, int *flags,
			struct bch_opts opts)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret = 0;

	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

	if (opts.read_only != c->opts.read_only) {
		down_write(&c->state_lock);

		if (opts.read_only) {
			bch2_fs_read_only(c);

			sb->s_flags |= SB_RDONLY;
		} else {
			ret = bch2_fs_read_write(c);
			if (ret) {
				bch_err(c, "error going rw: %i", ret);
				up_write(&c->state_lock);
				ret = -EINVAL;
				goto err;
			}

			sb->s_flags &= ~SB_RDONLY;
		}

		c->opts.read_only = opts.read_only;

		up_write(&c->state_lock);
	}

	if (opt_defined(opts, errors))
		c->opts.errors = opts.errors;
err:
	return bch2_err_class(ret);
}

static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	bool first = true;

	for_each_online_member(c, ca) {
		if (!first)
			seq_putc(seq, ':');
		first = false;
		seq_puts(seq, ca->disk_sb.sb_name);
	}

	return 0;
}

static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	struct printbuf buf = PRINTBUF;

	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
	printbuf_nul_terminate(&buf);
	seq_puts(seq, buf.buf);

	int ret = buf.allocation_failure ? -ENOMEM : 0;
	printbuf_exit(&buf);
	return ret;
}

static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}

/*
 * bcachefs doesn't currently integrate intwrite freeze protection but the
 * internal write references serve the same purpose. Therefore reuse the
 * read-only transition code to perform the quiesce. The caveat is that we
 * don't currently have the ability to block tasks that want a write reference
 * while the superblock is frozen. This is fine for now, but we should either
 * add blocking support or find a way to integrate sb_start_intwrite() and
 * friends.
 */
static int bch2_freeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
	return 0;
}

static int bch2_unfreeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	if (test_bit(BCH_FS_emergency_ro, &c->flags))
		return 0;

	down_write(&c->state_lock);
	ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);
	return ret;
}

static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.free_inode	= bch2_free_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.put_super	= bch2_put_super,
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
};

static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}

static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}

typedef DARRAY(struct bch_fs *) darray_fs;

static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	darray_fs *d = data;

	if (!c)
		return false;

	darray_for_each(*d, i)
		if (c != *i)
			return false;
	return true;
}
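
/*
 * mount: first look for an existing super_block for this set of devices; if
 * there isn't one, open the filesystem, apply the remaining mount options,
 * start it, and set up a new super_block:
 */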
static int bch2_fs_get_tree(struct fs_context *fc)
{
	struct bch_fs *c;
	struct super_block *sb;
	struct inode *vinode;
	struct bch2_opts_parse *opts_parse = fc->fs_private;
	struct bch_opts opts = opts_parse->opts;
	darray_str devs;
	darray_fs devs_to_fs = {};
	int ret;

	opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
	opt_set(opts, nostart, true);

	if (!fc->source || strlen(fc->source) == 0)
		return -EINVAL;

	ret = bch2_split_devs(fc->source, &devs);
	if (ret)
		return ret;

	darray_for_each(devs, i) {
		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
		if (ret)
			goto err;
	}

	sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs.data, devs.nr, opts);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	/* Some options can't be parsed until after the fs is started: */
	opts = bch2_opts_empty();
	ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
	if (ret)
		goto err_stop_fs;

	bch2_opts_apply(&c->opts, opts);

	ret = bch2_fs_start(c);
	if (ret)
		goto err_stop_fs;

	sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
	ret = PTR_ERR_OR_ZERO(sb);
	if (ret)
		goto err_stop_fs;
got_sb:
	c = sb->s_fs_info;

	if (sb->s_root) {
		if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	sb->s_uuid		= c->sb.user_uuid;
	sb->s_shrink->seeks	= 0;
	c->vfs_sb		= sb;
	strscpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;

	for_each_online_member(c, ca) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags |= SB_POSIXACL;
#endif

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	ret = PTR_ERR_OR_ZERO(vinode);
	bch_err_msg(c, ret, "mounting: error getting root inode");
	if (ret)
		goto err_put_super;

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	fc->root = dget(sb->s_root);
err:
	darray_exit(&devs_to_fs);
	bch2_darray_str_exit(&devs);
	if (ret)
		pr_err("error: %s", bch2_err_str(ret));
	/*
	 * On an inconsistency error in recovery we might see an -EROFS derived
	 * errorcode (from the journal), but we don't want to return that to
	 * userspace as that causes util-linux to retry the mount RO - which is
	 * confusing:
	 */
	if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
		ret = -EIO;
	return bch2_err_class(ret);

err_stop_fs:
	bch2_fs_stop(c);
	goto err;

err_put_super:
	__bch2_fs_stop(c);
	deactivate_locked_super(sb);
	goto err;
}

static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}
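/*
 * fc->fs_private is a bch2_opts_parse, allocated in bch2_init_fs_context():
 * it carries the options parsed so far, plus a printbuf of options that can
 * only be parsed against an existing filesystem instance - those are
 * replayed in bch2_fs_get_tree() after bch2_fs_open().
 */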
2206 */ 2207 if (!strcmp(param->key, "source")) 2208 return -ENOPARAM; 2209 2210 struct bch2_opts_parse *opts = fc->fs_private; 2211 struct bch_fs *c = NULL; 2212 2213 /* for reconfigure, we already have a struct bch_fs */ 2214 if (fc->root) 2215 c = fc->root->d_sb->s_fs_info; 2216 2217 int ret = bch2_parse_one_mount_opt(c, &opts->opts, 2218 &opts->parse_later, param->key, 2219 param->string); 2220 2221 return bch2_err_class(ret); 2222 } 2223 2224 static int bch2_fs_reconfigure(struct fs_context *fc) 2225 { 2226 struct super_block *sb = fc->root->d_sb; 2227 struct bch2_opts_parse *opts = fc->fs_private; 2228 2229 return bch2_remount(sb, &fc->sb_flags, opts->opts); 2230 } 2231 2232 static const struct fs_context_operations bch2_context_ops = { 2233 .free = bch2_fs_context_free, 2234 .parse_param = bch2_fs_parse_param, 2235 .get_tree = bch2_fs_get_tree, 2236 .reconfigure = bch2_fs_reconfigure, 2237 }; 2238 2239 static int bch2_init_fs_context(struct fs_context *fc) 2240 { 2241 struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); 2242 2243 if (!opts) 2244 return -ENOMEM; 2245 2246 opts->parse_later = PRINTBUF; 2247 2248 fc->ops = &bch2_context_ops; 2249 fc->fs_private = opts; 2250 2251 return 0; 2252 } 2253 2254 void bch2_fs_vfs_exit(struct bch_fs *c) 2255 { 2256 if (c->vfs_inodes_table.tbl) 2257 rhashtable_destroy(&c->vfs_inodes_table); 2258 } 2259 2260 int bch2_fs_vfs_init(struct bch_fs *c) 2261 { 2262 return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); 2263 } 2264 2265 static struct file_system_type bcache_fs_type = { 2266 .owner = THIS_MODULE, 2267 .name = "bcachefs", 2268 .init_fs_context = bch2_init_fs_context, 2269 .kill_sb = bch2_kill_sb, 2270 .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 2271 }; 2272 2273 MODULE_ALIAS_FS("bcachefs"); 2274 2275 void bch2_vfs_exit(void) 2276 { 2277 unregister_filesystem(&bcache_fs_type); 2278 kmem_cache_destroy(bch2_inode_cache); 2279 } 2280 2281 int __init bch2_vfs_init(void) 2282 { 2283 int ret = -ENOMEM; 2284 2285 bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | 2286 SLAB_ACCOUNT); 2287 if (!bch2_inode_cache) 2288 goto err; 2289 2290 ret = register_filesystem(&bcache_fs_type); 2291 if (ret) 2292 goto err; 2293 2294 return 0; 2295 err: 2296 bch2_vfs_exit(); 2297 return ret; 2298 } 2299 2300 #endif /* NO_BCACHEFS_FS */ 2301