xref: /linux/fs/bcachefs/fs.c (revision b58b13f156c00c2457035b7071eaaac105fe6836)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
/*
 * Slab cache pointer; presumably backs struct bch_inode_info allocations.
 * NOTE(review): creation/teardown of the cache is not visible in this part
 * of the file -- confirm where it is set up.
 */
43 static struct kmem_cache *bch2_inode_cache;
44 
/*
 * Forward declaration: bch2_vfs_inode_init() is called from
 * bch2_vfs_inode_get() and __bch2_create() below.
 */
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46 				struct bch_inode_info *,
47 				struct bch_inode_unpacked *,
48 				struct bch_subvolume *);
49 
50 void bch2_inode_update_after_write(struct btree_trans *trans,
51 				   struct bch_inode_info *inode,
52 				   struct bch_inode_unpacked *bi,
53 				   unsigned fields)
54 {
55 	struct bch_fs *c = trans->c;
56 
57 	BUG_ON(bi->bi_inum != inode->v.i_ino);
58 
59 	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
60 			       POS(0, bi->bi_inum),
61 			       c->opts.inodes_use_key_cache);
62 
63 	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64 	i_uid_write(&inode->v, bi->bi_uid);
65 	i_gid_write(&inode->v, bi->bi_gid);
66 	inode->v.i_mode	= bi->bi_mode;
67 
68 	if (fields & ATTR_ATIME)
69 		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70 	if (fields & ATTR_MTIME)
71 		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72 	if (fields & ATTR_CTIME)
73 		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74 
75 	inode->ei_inode		= *bi;
76 
77 	bch2_inode_flags_to_vfs(inode);
78 }
79 
80 int __must_check bch2_write_inode(struct bch_fs *c,
81 				  struct bch_inode_info *inode,
82 				  inode_set_fn set,
83 				  void *p, unsigned fields)
84 {
85 	struct btree_trans *trans = bch2_trans_get(c);
86 	struct btree_iter iter = { NULL };
87 	struct bch_inode_unpacked inode_u;
88 	int ret;
89 retry:
90 	bch2_trans_begin(trans);
91 
92 	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93 				BTREE_ITER_INTENT) ?:
94 		(set ? set(trans, inode, &inode_u, p) : 0) ?:
95 		bch2_inode_write(trans, &iter, &inode_u) ?:
96 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97 
98 	/*
99 	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100 	 * this is important for inode updates via bchfs_write_index_update
101 	 */
102 	if (!ret)
103 		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104 
105 	bch2_trans_iter_exit(trans, &iter);
106 
107 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108 		goto retry;
109 
110 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111 			     "inode %u:%llu not found when updating",
112 			     inode_inum(inode).subvol,
113 			     inode_inum(inode).inum);
114 
115 	bch2_trans_put(trans);
116 	return ret < 0 ? ret : 0;
117 }
118 
119 int bch2_fs_quota_transfer(struct bch_fs *c,
120 			   struct bch_inode_info *inode,
121 			   struct bch_qid new_qid,
122 			   unsigned qtypes,
123 			   enum quota_acct_mode mode)
124 {
125 	unsigned i;
126 	int ret;
127 
128 	qtypes &= enabled_qtypes(c);
129 
130 	for (i = 0; i < QTYP_NR; i++)
131 		if (new_qid.q[i] == inode->ei_qid.q[i])
132 			qtypes &= ~(1U << i);
133 
134 	if (!qtypes)
135 		return 0;
136 
137 	mutex_lock(&inode->ei_quota_lock);
138 
139 	ret = bch2_quota_transfer(c, qtypes, new_qid,
140 				  inode->ei_qid,
141 				  inode->v.i_blocks +
142 				  inode->ei_quota_reserved,
143 				  mode);
144 	if (!ret)
145 		for (i = 0; i < QTYP_NR; i++)
146 			if (qtypes & (1 << i))
147 				inode->ei_qid.q[i] = new_qid.q[i];
148 
149 	mutex_unlock(&inode->ei_quota_lock);
150 
151 	return ret;
152 }
153 
154 static int bch2_iget5_test(struct inode *vinode, void *p)
155 {
156 	struct bch_inode_info *inode = to_bch_ei(vinode);
157 	subvol_inum *inum = p;
158 
159 	return inode->ei_subvol == inum->subvol &&
160 		inode->ei_inode.bi_inum == inum->inum;
161 }
162 
163 static int bch2_iget5_set(struct inode *vinode, void *p)
164 {
165 	struct bch_inode_info *inode = to_bch_ei(vinode);
166 	subvol_inum *inum = p;
167 
168 	inode->v.i_ino		= inum->inum;
169 	inode->ei_subvol	= inum->subvol;
170 	inode->ei_inode.bi_inum	= inum->inum;
171 	return 0;
172 }
173 
174 static unsigned bch2_inode_hash(subvol_inum inum)
175 {
176 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
177 }
178 
179 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
180 {
181 	struct bch_inode_unpacked inode_u;
182 	struct bch_inode_info *inode;
183 	struct btree_trans *trans;
184 	struct bch_subvolume subvol;
185 	int ret;
186 
187 	inode = to_bch_ei(iget5_locked(c->vfs_sb,
188 				       bch2_inode_hash(inum),
189 				       bch2_iget5_test,
190 				       bch2_iget5_set,
191 				       &inum));
192 	if (unlikely(!inode))
193 		return ERR_PTR(-ENOMEM);
194 	if (!(inode->v.i_state & I_NEW))
195 		return &inode->v;
196 
197 	trans = bch2_trans_get(c);
198 	ret = lockrestart_do(trans,
199 		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
200 		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
201 
202 	if (!ret)
203 		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
204 	bch2_trans_put(trans);
205 
206 	if (ret) {
207 		iget_failed(&inode->v);
208 		return ERR_PTR(bch2_err_class(ret));
209 	}
210 
211 	mutex_lock(&c->vfs_inodes_lock);
212 	list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
213 	mutex_unlock(&c->vfs_inodes_lock);
214 
215 	unlock_new_inode(&inode->v);
216 
217 	return &inode->v;
218 }
219 
/*
 * Create a new inode under @dir and return its in-memory inode.
 *
 * ACLs and the VFS inode are allocated up front, then bch2_create_trans()
 * plus inode-quota accounting run in a btree transaction that is retried on
 * transaction restart.  After the commit the VFS inode is initialised and
 * inserted into the inode cache.
 *
 * With BCH_CREATE_TMPFILE in @flags no name is passed to
 * bch2_create_trans(), dir->ei_update_lock is not taken and @dir's VFS
 * inode is not refreshed.
 *
 * Returns the new inode (or the already-cached one if another thread won
 * the insertion race), or an ERR_PTR() on failure.
 */
220 struct bch_inode_info *
221 __bch2_create(struct mnt_idmap *idmap,
222 	      struct bch_inode_info *dir, struct dentry *dentry,
223 	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
224 	      unsigned flags)
225 {
226 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
227 	struct btree_trans *trans;
228 	struct bch_inode_unpacked dir_u;
229 	struct bch_inode_info *inode, *old;
230 	struct bch_inode_unpacked inode_u;
231 	struct posix_acl *default_acl = NULL, *acl = NULL;
232 	subvol_inum inum;
233 	struct bch_subvolume subvol;
234 	u64 journal_seq = 0;
235 	int ret;
236 
237 	/*
238 	 * preallocate acls + vfs inode before btree transaction, so that
239 	 * nothing can fail after the transaction succeeds:
240 	 */
241 #ifdef CONFIG_BCACHEFS_POSIX_ACL
242 	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
243 	if (ret)
244 		return ERR_PTR(ret);
245 #endif
246 	inode = to_bch_ei(new_inode(c->vfs_sb));
247 	if (unlikely(!inode)) {
248 		inode = ERR_PTR(-ENOMEM);
249 		goto err;
250 	}
251 
252 	bch2_inode_init_early(c, &inode_u);
253 
254 	if (!(flags & BCH_CREATE_TMPFILE))
255 		mutex_lock(&dir->ei_update_lock);
256 
257 	trans = bch2_trans_get(c);
258 retry:
259 	bch2_trans_begin(trans);
260 
	/* refuse on a read-only subvolume, create the inode, charge one inode of quota */
261 	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
262 		bch2_create_trans(trans,
263 				  inode_inum(dir), &dir_u, &inode_u,
264 				  !(flags & BCH_CREATE_TMPFILE)
265 				  ? &dentry->d_name : NULL,
266 				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
267 				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
268 				  mode, rdev,
269 				  default_acl, acl, snapshot_src, flags) ?:
270 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
271 				KEY_TYPE_QUOTA_PREALLOC);
272 	if (unlikely(ret))
273 		goto err_before_quota;
274 
	/* the inode's own subvolume if bi_subvol is set, otherwise the parent directory's */
275 	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
276 	inum.inum = inode_u.bi_inum;
277 
278 	ret   = bch2_subvolume_get(trans, inum.subvol, true,
279 				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
280 		bch2_trans_commit(trans, NULL, &journal_seq, 0);
281 	if (unlikely(ret)) {
		/* undo the quota charge taken above before retrying or bailing out */
282 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
283 				KEY_TYPE_QUOTA_WARN);
284 err_before_quota:
285 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
286 			goto retry;
287 		goto err_trans;
288 	}
289 
290 	if (!(flags & BCH_CREATE_TMPFILE)) {
291 		bch2_inode_update_after_write(trans, dir, &dir_u,
292 					      ATTR_MTIME|ATTR_CTIME);
293 		mutex_unlock(&dir->ei_update_lock);
294 	}
295 
296 	bch2_iget5_set(&inode->v, &inum);
297 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
298 
299 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
300 	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
301 
302 	/*
303 	 * we must insert the new inode into the inode cache before calling
304 	 * bch2_trans_exit() and dropping locks, else we could race with another
305 	 * thread pulling the inode in and modifying it:
306 	 */
307 
308 	inode->v.i_state |= I_CREATING;
309 
310 	old = to_bch_ei(inode_insert5(&inode->v,
311 				      bch2_inode_hash(inum),
312 				      bch2_iget5_test,
313 				      bch2_iget5_set,
314 				      &inum));
315 	BUG_ON(!old);
316 
317 	if (unlikely(old != inode)) {
318 		/*
319 		 * We raced, another process pulled the new inode into cache
320 		 * before us:
321 		 */
322 		make_bad_inode(&inode->v);
323 		iput(&inode->v);
324 
325 		inode = old;
326 	} else {
327 		mutex_lock(&c->vfs_inodes_lock);
328 		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
329 		mutex_unlock(&c->vfs_inodes_lock);
330 		/*
331 		 * we really don't want insert_inode_locked2() to be setting
332 		 * I_NEW...
333 		 */
334 		unlock_new_inode(&inode->v);
335 	}
336 
337 	bch2_trans_put(trans);
	/* common exit, reached on success too: drop the local ACL references */
338 err:
339 	posix_acl_release(default_acl);
340 	posix_acl_release(acl);
341 	return inode;
	/* transaction failed: unlock the directory, discard the preallocated inode */
342 err_trans:
343 	if (!(flags & BCH_CREATE_TMPFILE))
344 		mutex_unlock(&dir->ei_update_lock);
345 
346 	bch2_trans_put(trans);
347 	make_bad_inode(&inode->v);
348 	iput(&inode->v);
349 	inode = ERR_PTR(ret);
350 	goto err;
351 }
352 
353 /* methods */
354 
355 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
356 				  unsigned int flags)
357 {
358 	struct bch_fs *c = vdir->i_sb->s_fs_info;
359 	struct bch_inode_info *dir = to_bch_ei(vdir);
360 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
361 	struct inode *vinode = NULL;
362 	subvol_inum inum = { .subvol = 1 };
363 	int ret;
364 
365 	ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
366 				 &dentry->d_name, &inum);
367 
368 	if (!ret)
369 		vinode = bch2_vfs_inode_get(c, inum);
370 
371 	return d_splice_alias(vinode, dentry);
372 }
373 
374 static int bch2_mknod(struct mnt_idmap *idmap,
375 		      struct inode *vdir, struct dentry *dentry,
376 		      umode_t mode, dev_t rdev)
377 {
378 	struct bch_inode_info *inode =
379 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
380 			      (subvol_inum) { 0 }, 0);
381 
382 	if (IS_ERR(inode))
383 		return bch2_err_class(PTR_ERR(inode));
384 
385 	d_instantiate(dentry, &inode->v);
386 	return 0;
387 }
388 
389 static int bch2_create(struct mnt_idmap *idmap,
390 		       struct inode *vdir, struct dentry *dentry,
391 		       umode_t mode, bool excl)
392 {
393 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
394 }
395 
396 static int __bch2_link(struct bch_fs *c,
397 		       struct bch_inode_info *inode,
398 		       struct bch_inode_info *dir,
399 		       struct dentry *dentry)
400 {
401 	struct btree_trans *trans = bch2_trans_get(c);
402 	struct bch_inode_unpacked dir_u, inode_u;
403 	int ret;
404 
405 	mutex_lock(&inode->ei_update_lock);
406 
407 	ret = commit_do(trans, NULL, NULL, 0,
408 			bch2_link_trans(trans,
409 					inode_inum(dir),   &dir_u,
410 					inode_inum(inode), &inode_u,
411 					&dentry->d_name));
412 
413 	if (likely(!ret)) {
414 		bch2_inode_update_after_write(trans, dir, &dir_u,
415 					      ATTR_MTIME|ATTR_CTIME);
416 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
417 	}
418 
419 	bch2_trans_put(trans);
420 	mutex_unlock(&inode->ei_update_lock);
421 	return ret;
422 }
423 
424 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
425 		     struct dentry *dentry)
426 {
427 	struct bch_fs *c = vdir->i_sb->s_fs_info;
428 	struct bch_inode_info *dir = to_bch_ei(vdir);
429 	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
430 	int ret;
431 
432 	lockdep_assert_held(&inode->v.i_rwsem);
433 
434 	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
435 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
436 		__bch2_link(c, inode, dir, dentry);
437 	if (unlikely(ret))
438 		return ret;
439 
440 	ihold(&inode->v);
441 	d_instantiate(dentry, &inode->v);
442 	return 0;
443 }
444 
445 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
446 		  bool deleting_snapshot)
447 {
448 	struct bch_fs *c = vdir->i_sb->s_fs_info;
449 	struct bch_inode_info *dir = to_bch_ei(vdir);
450 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
451 	struct bch_inode_unpacked dir_u, inode_u;
452 	struct btree_trans *trans = bch2_trans_get(c);
453 	int ret;
454 
455 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
456 
457 	ret = commit_do(trans, NULL, NULL,
458 			BCH_TRANS_COMMIT_no_enospc,
459 		bch2_unlink_trans(trans,
460 				  inode_inum(dir), &dir_u,
461 				  &inode_u, &dentry->d_name,
462 				  deleting_snapshot));
463 	if (unlikely(ret))
464 		goto err;
465 
466 	bch2_inode_update_after_write(trans, dir, &dir_u,
467 				      ATTR_MTIME|ATTR_CTIME);
468 	bch2_inode_update_after_write(trans, inode, &inode_u,
469 				      ATTR_MTIME);
470 
471 	if (inode_u.bi_subvol) {
472 		/*
473 		 * Subvolume deletion is asynchronous, but we still want to tell
474 		 * the VFS that it's been deleted here:
475 		 */
476 		set_nlink(&inode->v, 0);
477 	}
478 err:
479 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
480 	bch2_trans_put(trans);
481 
482 	return ret;
483 }
484 
485 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
486 {
487 	struct bch_inode_info *dir= to_bch_ei(vdir);
488 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
489 
490 	return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
491 		__bch2_unlink(vdir, dentry, false);
492 }
493 
494 static int bch2_symlink(struct mnt_idmap *idmap,
495 			struct inode *vdir, struct dentry *dentry,
496 			const char *symname)
497 {
498 	struct bch_fs *c = vdir->i_sb->s_fs_info;
499 	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
500 	int ret;
501 
502 	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
503 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
504 	if (IS_ERR(inode))
505 		return bch2_err_class(PTR_ERR(inode));
506 
507 	inode_lock(&inode->v);
508 	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
509 	inode_unlock(&inode->v);
510 
511 	if (unlikely(ret))
512 		goto err;
513 
514 	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
515 	if (unlikely(ret))
516 		goto err;
517 
518 	ret = __bch2_link(c, inode, dir, dentry);
519 	if (unlikely(ret))
520 		goto err;
521 
522 	d_instantiate(dentry, &inode->v);
523 	return 0;
524 err:
525 	iput(&inode->v);
526 	return ret;
527 }
528 
529 static int bch2_mkdir(struct mnt_idmap *idmap,
530 		      struct inode *vdir, struct dentry *dentry, umode_t mode)
531 {
532 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
533 }
534 
535 static int bch2_rename2(struct mnt_idmap *idmap,
536 			struct inode *src_vdir, struct dentry *src_dentry,
537 			struct inode *dst_vdir, struct dentry *dst_dentry,
538 			unsigned flags)
539 {
540 	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
541 	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
542 	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
543 	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
544 	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
545 	struct bch_inode_unpacked dst_dir_u, src_dir_u;
546 	struct bch_inode_unpacked src_inode_u, dst_inode_u;
547 	struct btree_trans *trans;
548 	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
549 		? BCH_RENAME_EXCHANGE
550 		: dst_dentry->d_inode
551 		? BCH_RENAME_OVERWRITE : BCH_RENAME;
552 	int ret;
553 
554 	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
555 		return -EINVAL;
556 
557 	if (mode == BCH_RENAME_OVERWRITE) {
558 		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
559 						   0, LLONG_MAX);
560 		if (ret)
561 			return ret;
562 	}
563 
564 	trans = bch2_trans_get(c);
565 
566 	bch2_lock_inodes(INODE_UPDATE_LOCK,
567 			 src_dir,
568 			 dst_dir,
569 			 src_inode,
570 			 dst_inode);
571 
572 	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
573 		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
574 	if (ret)
575 		goto err;
576 
577 	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
578 		ret = bch2_fs_quota_transfer(c, src_inode,
579 					     dst_dir->ei_qid,
580 					     1 << QTYP_PRJ,
581 					     KEY_TYPE_QUOTA_PREALLOC);
582 		if (ret)
583 			goto err;
584 	}
585 
586 	if (mode == BCH_RENAME_EXCHANGE &&
587 	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
588 		ret = bch2_fs_quota_transfer(c, dst_inode,
589 					     src_dir->ei_qid,
590 					     1 << QTYP_PRJ,
591 					     KEY_TYPE_QUOTA_PREALLOC);
592 		if (ret)
593 			goto err;
594 	}
595 
596 	ret = commit_do(trans, NULL, NULL, 0,
597 			bch2_rename_trans(trans,
598 					  inode_inum(src_dir), &src_dir_u,
599 					  inode_inum(dst_dir), &dst_dir_u,
600 					  &src_inode_u,
601 					  &dst_inode_u,
602 					  &src_dentry->d_name,
603 					  &dst_dentry->d_name,
604 					  mode));
605 	if (unlikely(ret))
606 		goto err;
607 
608 	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
609 	BUG_ON(dst_inode &&
610 	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
611 
612 	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
613 				      ATTR_MTIME|ATTR_CTIME);
614 
615 	if (src_dir != dst_dir)
616 		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
617 					      ATTR_MTIME|ATTR_CTIME);
618 
619 	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
620 				      ATTR_CTIME);
621 
622 	if (dst_inode)
623 		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
624 					      ATTR_CTIME);
625 err:
626 	bch2_trans_put(trans);
627 
628 	bch2_fs_quota_transfer(c, src_inode,
629 			       bch_qid(&src_inode->ei_inode),
630 			       1 << QTYP_PRJ,
631 			       KEY_TYPE_QUOTA_NOCHECK);
632 	if (dst_inode)
633 		bch2_fs_quota_transfer(c, dst_inode,
634 				       bch_qid(&dst_inode->ei_inode),
635 				       1 << QTYP_PRJ,
636 				       KEY_TYPE_QUOTA_NOCHECK);
637 
638 	bch2_unlock_inodes(INODE_UPDATE_LOCK,
639 			   src_dir,
640 			   dst_dir,
641 			   src_inode,
642 			   dst_inode);
643 
644 	return ret;
645 }
646 
647 static void bch2_setattr_copy(struct mnt_idmap *idmap,
648 			      struct bch_inode_info *inode,
649 			      struct bch_inode_unpacked *bi,
650 			      struct iattr *attr)
651 {
652 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
653 	unsigned int ia_valid = attr->ia_valid;
654 
655 	if (ia_valid & ATTR_UID)
656 		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
657 	if (ia_valid & ATTR_GID)
658 		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
659 
660 	if (ia_valid & ATTR_SIZE)
661 		bi->bi_size = attr->ia_size;
662 
663 	if (ia_valid & ATTR_ATIME)
664 		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
665 	if (ia_valid & ATTR_MTIME)
666 		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
667 	if (ia_valid & ATTR_CTIME)
668 		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
669 
670 	if (ia_valid & ATTR_MODE) {
671 		umode_t mode = attr->ia_mode;
672 		kgid_t gid = ia_valid & ATTR_GID
673 			? attr->ia_gid
674 			: inode->v.i_gid;
675 
676 		if (!in_group_p(gid) &&
677 		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
678 			mode &= ~S_ISGID;
679 		bi->bi_mode = mode;
680 	}
681 }
682 
683 int bch2_setattr_nonsize(struct mnt_idmap *idmap,
684 			 struct bch_inode_info *inode,
685 			 struct iattr *attr)
686 {
687 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
688 	struct bch_qid qid;
689 	struct btree_trans *trans;
690 	struct btree_iter inode_iter = { NULL };
691 	struct bch_inode_unpacked inode_u;
692 	struct posix_acl *acl = NULL;
693 	int ret;
694 
695 	mutex_lock(&inode->ei_update_lock);
696 
697 	qid = inode->ei_qid;
698 
699 	if (attr->ia_valid & ATTR_UID)
700 		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
701 
702 	if (attr->ia_valid & ATTR_GID)
703 		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
704 
705 	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
706 				     KEY_TYPE_QUOTA_PREALLOC);
707 	if (ret)
708 		goto err;
709 
710 	trans = bch2_trans_get(c);
711 retry:
712 	bch2_trans_begin(trans);
713 	kfree(acl);
714 	acl = NULL;
715 
716 	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
717 			      BTREE_ITER_INTENT);
718 	if (ret)
719 		goto btree_err;
720 
721 	bch2_setattr_copy(idmap, inode, &inode_u, attr);
722 
723 	if (attr->ia_valid & ATTR_MODE) {
724 		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
725 				     inode_u.bi_mode, &acl);
726 		if (ret)
727 			goto btree_err;
728 	}
729 
730 	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
731 		bch2_trans_commit(trans, NULL, NULL,
732 				  BCH_TRANS_COMMIT_no_enospc);
733 btree_err:
734 	bch2_trans_iter_exit(trans, &inode_iter);
735 
736 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
737 		goto retry;
738 	if (unlikely(ret))
739 		goto err_trans;
740 
741 	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
742 
743 	if (acl)
744 		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
745 err_trans:
746 	bch2_trans_put(trans);
747 err:
748 	mutex_unlock(&inode->ei_update_lock);
749 
750 	return bch2_err_class(ret);
751 }
752 
753 static int bch2_getattr(struct mnt_idmap *idmap,
754 			const struct path *path, struct kstat *stat,
755 			u32 request_mask, unsigned query_flags)
756 {
757 	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
758 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
759 
760 	stat->dev	= inode->v.i_sb->s_dev;
761 	stat->ino	= inode->v.i_ino;
762 	stat->mode	= inode->v.i_mode;
763 	stat->nlink	= inode->v.i_nlink;
764 	stat->uid	= inode->v.i_uid;
765 	stat->gid	= inode->v.i_gid;
766 	stat->rdev	= inode->v.i_rdev;
767 	stat->size	= i_size_read(&inode->v);
768 	stat->atime	= inode_get_atime(&inode->v);
769 	stat->mtime	= inode_get_mtime(&inode->v);
770 	stat->ctime	= inode_get_ctime(&inode->v);
771 	stat->blksize	= block_bytes(c);
772 	stat->blocks	= inode->v.i_blocks;
773 
774 	if (request_mask & STATX_BTIME) {
775 		stat->result_mask |= STATX_BTIME;
776 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
777 	}
778 
779 	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
780 		stat->attributes |= STATX_ATTR_IMMUTABLE;
781 	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
782 
783 	if (inode->ei_inode.bi_flags & BCH_INODE_append)
784 		stat->attributes |= STATX_ATTR_APPEND;
785 	stat->attributes_mask	 |= STATX_ATTR_APPEND;
786 
787 	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
788 		stat->attributes |= STATX_ATTR_NODUMP;
789 	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
790 
791 	return 0;
792 }
793 
794 static int bch2_setattr(struct mnt_idmap *idmap,
795 			struct dentry *dentry, struct iattr *iattr)
796 {
797 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
798 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
799 	int ret;
800 
801 	lockdep_assert_held(&inode->v.i_rwsem);
802 
803 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
804 		setattr_prepare(idmap, dentry, iattr);
805 	if (ret)
806 		return ret;
807 
808 	return iattr->ia_valid & ATTR_SIZE
809 		? bchfs_truncate(idmap, inode, iattr)
810 		: bch2_setattr_nonsize(idmap, inode, iattr);
811 }
812 
813 static int bch2_tmpfile(struct mnt_idmap *idmap,
814 			struct inode *vdir, struct file *file, umode_t mode)
815 {
816 	struct bch_inode_info *inode =
817 		__bch2_create(idmap, to_bch_ei(vdir),
818 			      file->f_path.dentry, mode, 0,
819 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
820 
821 	if (IS_ERR(inode))
822 		return bch2_err_class(PTR_ERR(inode));
823 
824 	d_mark_tmpfile(file, &inode->v);
825 	d_instantiate(file->f_path.dentry, &inode->v);
826 	return finish_open_simple(file, 0);
827 }
828 
829 static int bch2_fill_extent(struct bch_fs *c,
830 			    struct fiemap_extent_info *info,
831 			    struct bkey_s_c k, unsigned flags)
832 {
833 	if (bkey_extent_is_direct_data(k.k)) {
834 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
835 		const union bch_extent_entry *entry;
836 		struct extent_ptr_decoded p;
837 		int ret;
838 
839 		if (k.k->type == KEY_TYPE_reflink_v)
840 			flags |= FIEMAP_EXTENT_SHARED;
841 
842 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
843 			int flags2 = 0;
844 			u64 offset = p.ptr.offset;
845 
846 			if (p.ptr.unwritten)
847 				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
848 
849 			if (p.crc.compression_type)
850 				flags2 |= FIEMAP_EXTENT_ENCODED;
851 			else
852 				offset += p.crc.offset;
853 
854 			if ((offset & (block_sectors(c) - 1)) ||
855 			    (k.k->size & (block_sectors(c) - 1)))
856 				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
857 
858 			ret = fiemap_fill_next_extent(info,
859 						bkey_start_offset(k.k) << 9,
860 						offset << 9,
861 						k.k->size << 9, flags|flags2);
862 			if (ret)
863 				return ret;
864 		}
865 
866 		return 0;
867 	} else if (bkey_extent_is_inline_data(k.k)) {
868 		return fiemap_fill_next_extent(info,
869 					       bkey_start_offset(k.k) << 9,
870 					       0, k.k->size << 9,
871 					       flags|
872 					       FIEMAP_EXTENT_DATA_INLINE);
873 	} else if (k.k->type == KEY_TYPE_reservation) {
874 		return fiemap_fill_next_extent(info,
875 					       bkey_start_offset(k.k) << 9,
876 					       0, k.k->size << 9,
877 					       flags|
878 					       FIEMAP_EXTENT_DELALLOC|
879 					       FIEMAP_EXTENT_UNWRITTEN);
880 	} else {
881 		BUG();
882 	}
883 }
884 
885 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
886 		       u64 start, u64 len)
887 {
888 	struct bch_fs *c = vinode->i_sb->s_fs_info;
889 	struct bch_inode_info *ei = to_bch_ei(vinode);
890 	struct btree_trans *trans;
891 	struct btree_iter iter;
892 	struct bkey_s_c k;
893 	struct bkey_buf cur, prev;
894 	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
895 	unsigned offset_into_extent, sectors;
896 	bool have_extent = false;
897 	u32 snapshot;
898 	int ret = 0;
899 
900 	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
901 	if (ret)
902 		return ret;
903 
904 	if (start + len < start)
905 		return -EINVAL;
906 
907 	start >>= 9;
908 
909 	bch2_bkey_buf_init(&cur);
910 	bch2_bkey_buf_init(&prev);
911 	trans = bch2_trans_get(c);
912 retry:
913 	bch2_trans_begin(trans);
914 
915 	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
916 	if (ret)
917 		goto err;
918 
919 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
920 			     SPOS(ei->v.i_ino, start, snapshot), 0);
921 
922 	while (!(ret = btree_trans_too_many_iters(trans)) &&
923 	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
924 	       !(ret = bkey_err(k))) {
925 		enum btree_id data_btree = BTREE_ID_extents;
926 
927 		if (!bkey_extent_is_data(k.k) &&
928 		    k.k->type != KEY_TYPE_reservation) {
929 			bch2_btree_iter_advance(&iter);
930 			continue;
931 		}
932 
933 		offset_into_extent	= iter.pos.offset -
934 			bkey_start_offset(k.k);
935 		sectors			= k.k->size - offset_into_extent;
936 
937 		bch2_bkey_buf_reassemble(&cur, c, k);
938 
939 		ret = bch2_read_indirect_extent(trans, &data_btree,
940 					&offset_into_extent, &cur);
941 		if (ret)
942 			break;
943 
944 		k = bkey_i_to_s_c(cur.k);
945 		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
946 
947 		sectors = min(sectors, k.k->size - offset_into_extent);
948 
949 		bch2_cut_front(POS(k.k->p.inode,
950 				   bkey_start_offset(k.k) +
951 				   offset_into_extent),
952 			       cur.k);
953 		bch2_key_resize(&cur.k->k, sectors);
954 		cur.k->k.p = iter.pos;
955 		cur.k->k.p.offset += cur.k->k.size;
956 
957 		if (have_extent) {
958 			bch2_trans_unlock(trans);
959 			ret = bch2_fill_extent(c, info,
960 					bkey_i_to_s_c(prev.k), 0);
961 			if (ret)
962 				break;
963 		}
964 
965 		bkey_copy(prev.k, cur.k);
966 		have_extent = true;
967 
968 		bch2_btree_iter_set_pos(&iter,
969 			POS(iter.pos.inode, iter.pos.offset + sectors));
970 	}
971 	start = iter.pos.offset;
972 	bch2_trans_iter_exit(trans, &iter);
973 err:
974 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
975 		goto retry;
976 
977 	if (!ret && have_extent) {
978 		bch2_trans_unlock(trans);
979 		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
980 				       FIEMAP_EXTENT_LAST);
981 	}
982 
983 	bch2_trans_put(trans);
984 	bch2_bkey_buf_exit(&cur, c);
985 	bch2_bkey_buf_exit(&prev, c);
986 	return ret < 0 ? ret : 0;
987 }
988 
/*
 * VM operations installed on mappings by bch2_mmap(): faults and
 * write-faults go to the bcachefs handlers, map_pages uses the generic
 * filemap helper.
 */
989 static const struct vm_operations_struct bch_vm_ops = {
990 	.fault		= bch2_page_fault,
991 	.map_pages	= filemap_map_pages,
992 	.page_mkwrite   = bch2_page_mkwrite,
993 };
994 
995 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
996 {
997 	file_accessed(file);
998 
999 	vma->vm_ops = &bch_vm_ops;
1000 	return 0;
1001 }
1002 
1003 /* Directories: */
1004 
1005 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1006 {
1007 	return generic_file_llseek_size(file, offset, whence,
1008 					S64_MAX, S64_MAX);
1009 }
1010 
1011 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1012 {
1013 	struct bch_inode_info *inode = file_bch_inode(file);
1014 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1015 
1016 	if (!dir_emit_dots(file, ctx))
1017 		return 0;
1018 
1019 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1020 
1021 	bch_err_fn(c, ret);
1022 	return bch2_err_class(ret);
1023 }
1024 
1025 static int bch2_open(struct inode *vinode, struct file *file)
1026 {
1027 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1028 		struct bch_inode_info *inode = to_bch_ei(vinode);
1029 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1030 
1031 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1032 		if (ret)
1033 			return ret;
1034 	}
1035 
1036 	return generic_file_open(vinode, file);
1037 }
1038 
/*
 * File operations table for regular files (it carries read/write/mmap
 * hooks).  NOTE(review): where this table is installed is not visible in
 * this part of the file.  compat_ioctl is only present with CONFIG_COMPAT.
 */
1039 static const struct file_operations bch_file_operations = {
1040 	.open		= bch2_open,
1041 	.llseek		= bch2_llseek,
1042 	.read_iter	= bch2_read_iter,
1043 	.write_iter	= bch2_write_iter,
1044 	.mmap		= bch2_mmap,
1045 	.fsync		= bch2_fsync,
1046 	.splice_read	= filemap_splice_read,
1047 	.splice_write	= iter_file_splice_write,
1048 	.fallocate	= bch2_fallocate_dispatch,
1049 	.unlocked_ioctl = bch2_fs_file_ioctl,
1050 #ifdef CONFIG_COMPAT
1051 	.compat_ioctl	= bch2_compat_fs_ioctl,
1052 #endif
1053 	.remap_file_range = bch2_remap_file_range,
1054 };
1055 
/*
 * Inode operations table that adds fiemap to the common
 * getattr/setattr/listxattr set; presumably used for regular files (the
 * place where it is installed is not visible here).  The ACL hooks are
 * only built with CONFIG_BCACHEFS_POSIX_ACL.
 */
1056 static const struct inode_operations bch_file_inode_operations = {
1057 	.getattr	= bch2_getattr,
1058 	.setattr	= bch2_setattr,
1059 	.fiemap		= bch2_fiemap,
1060 	.listxattr	= bch2_xattr_list,
1061 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1062 	.get_acl	= bch2_get_acl,
1063 	.set_acl	= bch2_set_acl,
1064 #endif
1065 };
1066 
/*
 * Inode operations table for directories: namespace operations (lookup,
 * create, link, unlink, symlink, mkdir, mknod, rename, tmpfile) plus the
 * common attribute hooks.  Note that .rmdir shares bch2_unlink with
 * .unlink.  The ACL hooks are only built with CONFIG_BCACHEFS_POSIX_ACL.
 */
1067 static const struct inode_operations bch_dir_inode_operations = {
1068 	.lookup		= bch2_lookup,
1069 	.create		= bch2_create,
1070 	.link		= bch2_link,
1071 	.unlink		= bch2_unlink,
1072 	.symlink	= bch2_symlink,
1073 	.mkdir		= bch2_mkdir,
1074 	.rmdir		= bch2_unlink,
1075 	.mknod		= bch2_mknod,
1076 	.rename		= bch2_rename2,
1077 	.getattr	= bch2_getattr,
1078 	.setattr	= bch2_setattr,
1079 	.tmpfile	= bch2_tmpfile,
1080 	.listxattr	= bch2_xattr_list,
1081 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1082 	.get_acl	= bch2_get_acl,
1083 	.set_acl	= bch2_set_acl,
1084 #endif
1085 };
1086 
1087 static const struct file_operations bch_dir_file_operations = {
1088 	.llseek		= bch2_dir_llseek,
1089 	.read		= generic_read_dir,
1090 	.iterate_shared	= bch2_vfs_readdir,
1091 	.fsync		= bch2_fsync,
1092 	.unlocked_ioctl = bch2_fs_file_ioctl,
1093 #ifdef CONFIG_COMPAT
1094 	.compat_ioctl	= bch2_compat_fs_ioctl,
1095 #endif
1096 };
1097 
1098 static const struct inode_operations bch_symlink_inode_operations = {
1099 	.get_link	= page_get_link,
1100 	.getattr	= bch2_getattr,
1101 	.setattr	= bch2_setattr,
1102 	.listxattr	= bch2_xattr_list,
1103 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1104 	.get_acl	= bch2_get_acl,
1105 	.set_acl	= bch2_set_acl,
1106 #endif
1107 };
1108 
1109 static const struct inode_operations bch_special_inode_operations = {
1110 	.getattr	= bch2_getattr,
1111 	.setattr	= bch2_setattr,
1112 	.listxattr	= bch2_xattr_list,
1113 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1114 	.get_acl	= bch2_get_acl,
1115 	.set_acl	= bch2_set_acl,
1116 #endif
1117 };
1118 
1119 static const struct address_space_operations bch_address_space_operations = {
1120 	.read_folio	= bch2_read_folio,
1121 	.writepages	= bch2_writepages,
1122 	.readahead	= bch2_readahead,
1123 	.dirty_folio	= filemap_dirty_folio,
1124 	.write_begin	= bch2_write_begin,
1125 	.write_end	= bch2_write_end,
1126 	.invalidate_folio = bch2_invalidate_folio,
1127 	.release_folio	= bch2_release_folio,
1128 	.direct_IO	= noop_direct_IO,
1129 #ifdef CONFIG_MIGRATION
1130 	.migrate_folio	= filemap_migrate_folio,
1131 #endif
1132 	.error_remove_folio = generic_error_remove_folio,
1133 };
1134 
/*
 * File handle payload identifying a single inode. bch2_inode_to_fid() fills it
 * from the inode number, the subvolume the inode was looked up in, and the
 * inode generation; bch2_nfs_get_inode() uses inum/subvol for the lookup and
 * rejects the handle with -ESTALE when gen no longer matches.
 *
 * bcachefs_fid_valid() measures handles in u32 units, so the packed size must
 * stay a multiple of sizeof(u32).
 */
1135 struct bcachefs_fid {
1136 	u64		inum;
1137 	u32		subvol;
1138 	u32		gen;
1139 } __packed;
1140 
/*
 * File handle payload carrying both the inode (fid) and a directory containing
 * it (dir). bch2_encode_fh() emits this form for non-directories when a parent
 * inode is supplied; bch2_fh_to_parent() decodes the dir member.
 */
1141 struct bcachefs_fid_with_parent {
1142 	struct bcachefs_fid	fid;
1143 	struct bcachefs_fid	dir;
1144 } __packed;
1145 
1146 static int bcachefs_fid_valid(int fh_len, int fh_type)
1147 {
1148 	switch (fh_type) {
1149 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1150 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1151 	case FILEID_BCACHEFS_WITH_PARENT:
1152 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1153 	default:
1154 		return false;
1155 	}
1156 }
1157 
1158 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1159 {
1160 	return (struct bcachefs_fid) {
1161 		.inum	= inode->ei_inode.bi_inum,
1162 		.subvol	= inode->ei_subvol,
1163 		.gen	= inode->ei_inode.bi_generation,
1164 	};
1165 }
1166 
1167 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1168 			  struct inode *vdir)
1169 {
1170 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1171 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1172 	int min_len;
1173 
1174 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1175 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1176 
1177 		min_len = sizeof(*fid) / sizeof(u32);
1178 		if (*len < min_len) {
1179 			*len = min_len;
1180 			return FILEID_INVALID;
1181 		}
1182 
1183 		fid->fid = bch2_inode_to_fid(inode);
1184 		fid->dir = bch2_inode_to_fid(dir);
1185 
1186 		*len = min_len;
1187 		return FILEID_BCACHEFS_WITH_PARENT;
1188 	} else {
1189 		struct bcachefs_fid *fid = (void *) fh;
1190 
1191 		min_len = sizeof(*fid) / sizeof(u32);
1192 		if (*len < min_len) {
1193 			*len = min_len;
1194 			return FILEID_INVALID;
1195 		}
1196 		*fid = bch2_inode_to_fid(inode);
1197 
1198 		*len = min_len;
1199 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1200 	}
1201 }
1202 
1203 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1204 					struct bcachefs_fid fid)
1205 {
1206 	struct bch_fs *c = sb->s_fs_info;
1207 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1208 				    .subvol = fid.subvol,
1209 				    .inum = fid.inum,
1210 	});
1211 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1212 		iput(vinode);
1213 		vinode = ERR_PTR(-ESTALE);
1214 	}
1215 	return vinode;
1216 }
1217 
1218 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1219 		int fh_len, int fh_type)
1220 {
1221 	struct bcachefs_fid *fid = (void *) _fid;
1222 
1223 	if (!bcachefs_fid_valid(fh_len, fh_type))
1224 		return NULL;
1225 
1226 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1227 }
1228 
1229 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1230 		int fh_len, int fh_type)
1231 {
1232 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1233 
1234 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1235 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1236 		return NULL;
1237 
1238 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1239 }
1240 
1241 static struct dentry *bch2_get_parent(struct dentry *child)
1242 {
1243 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1244 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1245 	subvol_inum parent_inum = {
1246 		.subvol = inode->ei_inode.bi_parent_subvol ?:
1247 			inode->ei_subvol,
1248 		.inum = inode->ei_inode.bi_dir,
1249 	};
1250 
1251 	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1252 }
1253 
1254 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1255 {
1256 	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
1257 	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
1258 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1259 	struct btree_trans *trans;
1260 	struct btree_iter iter1;
1261 	struct btree_iter iter2;
1262 	struct bkey_s_c k;
1263 	struct bkey_s_c_dirent d;
1264 	struct bch_inode_unpacked inode_u;
1265 	subvol_inum target;
1266 	u32 snapshot;
1267 	struct qstr dirent_name;
1268 	unsigned name_len = 0;
1269 	int ret;
1270 
1271 	if (!S_ISDIR(dir->v.i_mode))
1272 		return -EINVAL;
1273 
1274 	trans = bch2_trans_get(c);
1275 
1276 	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1277 			     POS(dir->ei_inode.bi_inum, 0), 0);
1278 	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1279 			     POS(dir->ei_inode.bi_inum, 0), 0);
1280 retry:
1281 	bch2_trans_begin(trans);
1282 
1283 	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
1284 	if (ret)
1285 		goto err;
1286 
1287 	bch2_btree_iter_set_snapshot(&iter1, snapshot);
1288 	bch2_btree_iter_set_snapshot(&iter2, snapshot);
1289 
1290 	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1291 	if (ret)
1292 		goto err;
1293 
1294 	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1295 		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1296 
1297 		k = bch2_btree_iter_peek_slot(&iter1);
1298 		ret = bkey_err(k);
1299 		if (ret)
1300 			goto err;
1301 
1302 		if (k.k->type != KEY_TYPE_dirent) {
1303 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1304 			goto err;
1305 		}
1306 
1307 		d = bkey_s_c_to_dirent(k);
1308 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1309 		if (ret > 0)
1310 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1311 		if (ret)
1312 			goto err;
1313 
1314 		if (target.subvol	== inode->ei_subvol &&
1315 		    target.inum		== inode->ei_inode.bi_inum)
1316 			goto found;
1317 	} else {
1318 		/*
1319 		 * File with multiple hardlinks and our backref is to the wrong
1320 		 * directory - linear search:
1321 		 */
1322 		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1323 			if (k.k->p.inode > dir->ei_inode.bi_inum)
1324 				break;
1325 
1326 			if (k.k->type != KEY_TYPE_dirent)
1327 				continue;
1328 
1329 			d = bkey_s_c_to_dirent(k);
1330 			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1331 			if (ret < 0)
1332 				break;
1333 			if (ret)
1334 				continue;
1335 
1336 			if (target.subvol	== inode->ei_subvol &&
1337 			    target.inum		== inode->ei_inode.bi_inum)
1338 				goto found;
1339 		}
1340 	}
1341 
1342 	ret = -ENOENT;
1343 	goto err;
1344 found:
1345 	dirent_name = bch2_dirent_get_name(d);
1346 
1347 	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1348 	memcpy(name, dirent_name.name, name_len);
1349 	name[name_len] = '\0';
1350 err:
1351 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1352 		goto retry;
1353 
1354 	bch2_trans_iter_exit(trans, &iter1);
1355 	bch2_trans_iter_exit(trans, &iter2);
1356 	bch2_trans_put(trans);
1357 
1358 	return ret;
1359 }
1360 
1361 static const struct export_operations bch_export_ops = {
1362 	.encode_fh	= bch2_encode_fh,
1363 	.fh_to_dentry	= bch2_fh_to_dentry,
1364 	.fh_to_parent	= bch2_fh_to_parent,
1365 	.get_parent	= bch2_get_parent,
1366 	.get_name	= bch2_get_name,
1367 };
1368 
1369 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1370 				struct bch_inode_info *inode,
1371 				struct bch_inode_unpacked *bi,
1372 				struct bch_subvolume *subvol)
1373 {
1374 	bch2_inode_update_after_write(trans, inode, bi, ~0);
1375 
1376 	if (BCH_SUBVOLUME_SNAP(subvol))
1377 		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1378 	else
1379 		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1380 
1381 	inode->v.i_blocks	= bi->bi_sectors;
1382 	inode->v.i_ino		= bi->bi_inum;
1383 	inode->v.i_rdev		= bi->bi_dev;
1384 	inode->v.i_generation	= bi->bi_generation;
1385 	inode->v.i_size		= bi->bi_size;
1386 
1387 	inode->ei_flags		= 0;
1388 	inode->ei_quota_reserved = 0;
1389 	inode->ei_qid		= bch_qid(bi);
1390 	inode->ei_subvol	= inum.subvol;
1391 
1392 	inode->v.i_mapping->a_ops = &bch_address_space_operations;
1393 
1394 	switch (inode->v.i_mode & S_IFMT) {
1395 	case S_IFREG:
1396 		inode->v.i_op	= &bch_file_inode_operations;
1397 		inode->v.i_fop	= &bch_file_operations;
1398 		break;
1399 	case S_IFDIR:
1400 		inode->v.i_op	= &bch_dir_inode_operations;
1401 		inode->v.i_fop	= &bch_dir_file_operations;
1402 		break;
1403 	case S_IFLNK:
1404 		inode_nohighmem(&inode->v);
1405 		inode->v.i_op	= &bch_symlink_inode_operations;
1406 		break;
1407 	default:
1408 		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1409 		inode->v.i_op	= &bch_special_inode_operations;
1410 		break;
1411 	}
1412 
1413 	mapping_set_large_folios(inode->v.i_mapping);
1414 }
1415 
1416 static struct inode *bch2_alloc_inode(struct super_block *sb)
1417 {
1418 	struct bch_inode_info *inode;
1419 
1420 	inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1421 	if (!inode)
1422 		return NULL;
1423 
1424 	inode_init_once(&inode->v);
1425 	mutex_init(&inode->ei_update_lock);
1426 	two_state_lock_init(&inode->ei_pagecache_lock);
1427 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
1428 	mutex_init(&inode->ei_quota_lock);
1429 
1430 	return &inode->v;
1431 }
1432 
1433 static void bch2_i_callback(struct rcu_head *head)
1434 {
1435 	struct inode *vinode = container_of(head, struct inode, i_rcu);
1436 	struct bch_inode_info *inode = to_bch_ei(vinode);
1437 
1438 	kmem_cache_free(bch2_inode_cache, inode);
1439 }
1440 
1441 static void bch2_destroy_inode(struct inode *vinode)
1442 {
1443 	call_rcu(&vinode->i_rcu, bch2_i_callback);
1444 }
1445 
1446 static int inode_update_times_fn(struct btree_trans *trans,
1447 				 struct bch_inode_info *inode,
1448 				 struct bch_inode_unpacked *bi,
1449 				 void *p)
1450 {
1451 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1452 
1453 	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1454 	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1455 	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1456 
1457 	return 0;
1458 }
1459 
1460 static int bch2_vfs_write_inode(struct inode *vinode,
1461 				struct writeback_control *wbc)
1462 {
1463 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1464 	struct bch_inode_info *inode = to_bch_ei(vinode);
1465 	int ret;
1466 
1467 	mutex_lock(&inode->ei_update_lock);
1468 	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1469 			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1470 	mutex_unlock(&inode->ei_update_lock);
1471 
1472 	return bch2_err_class(ret);
1473 }
1474 
1475 static void bch2_evict_inode(struct inode *vinode)
1476 {
1477 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1478 	struct bch_inode_info *inode = to_bch_ei(vinode);
1479 
1480 	truncate_inode_pages_final(&inode->v.i_data);
1481 
1482 	clear_inode(&inode->v);
1483 
1484 	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1485 
1486 	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1487 		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1488 				KEY_TYPE_QUOTA_WARN);
1489 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1490 				KEY_TYPE_QUOTA_WARN);
1491 		bch2_inode_rm(c, inode_inum(inode));
1492 	}
1493 
1494 	mutex_lock(&c->vfs_inodes_lock);
1495 	list_del_init(&inode->ei_vfs_inode_list);
1496 	mutex_unlock(&c->vfs_inodes_lock);
1497 }
1498 
/*
 * Push every cached VFS inode whose ei_subvol appears in @s out of the inode
 * cache, and do not return until two consecutive scans of c->vfs_inodes_list
 * found nothing left to grab.
 *
 * Each pass takes references (igrab) on matching inodes that are neither
 * I_DONTCACHE nor I_FREEING, collects them in a darray, and after dropping
 * vfs_inodes_lock marks them dontcache, prunes their dentry aliases and puts
 * the references.
 *
 * NOTE(review): @s is typed as a snapshot ID list but is matched against
 * inode->ei_subvol - presumably callers pass subvolume IDs; confirm.
 */
1499 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1500 {
1501 	struct bch_inode_info *inode;
1502 	DARRAY(struct bch_inode_info *) grabbed;
1503 	bool clean_pass = false, this_pass_clean;
1504 
1505 	/*
1506 	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1507 	 * be pruned with d_mark_dontcache().
1508 	 *
1509 	 * Once we've had a clean pass where we didn't find any inodes without
1510 	 * I_DONTCACHE, we wait for them to be freed:
1511 	 */
1512 
1513 	darray_init(&grabbed);
	/*
	 * Result deliberately not checked: a failed preallocation only means
	 * the pushes below may fail, which is handled there.
	 */
1514 	darray_make_room(&grabbed, 1024);
1515 again:
1516 	cond_resched();
1517 	this_pass_clean = true;
1518 
1519 	mutex_lock(&c->vfs_inodes_lock);
1520 	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1521 		if (!snapshot_list_has_id(s, inode->ei_subvol))
1522 			continue;
1523 
1524 		if (!(inode->v.i_state & I_DONTCACHE) &&
1525 		    !(inode->v.i_state & I_FREEING) &&
1526 		    igrab(&inode->v)) {
1527 			this_pass_clean = false;
1528 
			/*
			 * Non-sleeping allocation while holding
			 * vfs_inodes_lock. On failure drop the reference just
			 * taken and end this pass early; this_pass_clean is
			 * already false, so another pass will follow.
			 */
1529 			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1530 				iput(&inode->v);
1531 				break;
1532 			}
1533 		} else if (clean_pass && this_pass_clean) {
			/*
			 * The previous pass was clean and so far this one is
			 * too, yet a matching inode is still on the list:
			 * sleep on the inode's __I_NEW bit waitqueue with the
			 * lock dropped, then rescan from the start.
			 */
1534 			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
1535 			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
1536 
1537 			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1538 			mutex_unlock(&c->vfs_inodes_lock);
1539 
1540 			schedule();
1541 			finish_wait(wq, &wait.wq_entry);
1542 			goto again;
1543 		}
1544 	}
1545 	mutex_unlock(&c->vfs_inodes_lock);
1546 
	/* With the lock dropped, mark and release everything grabbed above. */
1547 	darray_for_each(grabbed, i) {
1548 		inode = *i;
1549 		d_mark_dontcache(&inode->v);
1550 		d_prune_aliases(&inode->v);
1551 		iput(&inode->v);
1552 	}
1553 	grabbed.nr = 0;
1554 
	/* Finish only after two clean passes in a row. */
1555 	if (!clean_pass || !this_pass_clean) {
1556 		clean_pass = this_pass_clean;
1557 		goto again;
1558 	}
1559 
1560 	darray_exit(&grabbed);
1561 }
1562 
1563 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1564 {
1565 	struct super_block *sb = dentry->d_sb;
1566 	struct bch_fs *c = sb->s_fs_info;
1567 	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1568 	unsigned shift = sb->s_blocksize_bits - 9;
1569 	/*
1570 	 * this assumes inodes take up 64 bytes, which is a decent average
1571 	 * number:
1572 	 */
1573 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1574 	u64 fsid;
1575 
1576 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
1577 	buf->f_bsize	= sb->s_blocksize;
1578 	buf->f_blocks	= usage.capacity >> shift;
1579 	buf->f_bfree	= usage.free >> shift;
1580 	buf->f_bavail	= avail_factor(usage.free) >> shift;
1581 
1582 	buf->f_files	= usage.nr_inodes + avail_inodes;
1583 	buf->f_ffree	= avail_inodes;
1584 
1585 	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
1586 	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
1587 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1588 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1589 	buf->f_namelen	= BCH_NAME_MAX;
1590 
1591 	return 0;
1592 }
1593 
1594 static int bch2_sync_fs(struct super_block *sb, int wait)
1595 {
1596 	struct bch_fs *c = sb->s_fs_info;
1597 	int ret;
1598 
1599 	if (c->opts.journal_flush_disabled)
1600 		return 0;
1601 
1602 	if (!wait) {
1603 		bch2_journal_flush_async(&c->journal, NULL);
1604 		return 0;
1605 	}
1606 
1607 	ret = bch2_journal_flush(&c->journal);
1608 	return bch2_err_class(ret);
1609 }
1610 
1611 static struct bch_fs *bch2_path_to_fs(const char *path)
1612 {
1613 	struct bch_fs *c;
1614 	dev_t dev;
1615 	int ret;
1616 
1617 	ret = lookup_bdev(path, &dev);
1618 	if (ret)
1619 		return ERR_PTR(ret);
1620 
1621 	c = bch2_dev_to_fs(dev);
1622 	if (c)
1623 		closure_put(&c->cl);
1624 	return c ?: ERR_PTR(-ENOENT);
1625 }
1626 
1627 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1628 {
1629 	struct bch_fs *c = sb->s_fs_info;
1630 	struct bch_opts opts = bch2_opts_empty();
1631 	int ret;
1632 
1633 	ret = bch2_parse_mount_opts(c, &opts, data);
1634 	if (ret)
1635 		goto err;
1636 
1637 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1638 
1639 	if (opts.read_only != c->opts.read_only) {
1640 		down_write(&c->state_lock);
1641 
1642 		if (opts.read_only) {
1643 			bch2_fs_read_only(c);
1644 
1645 			sb->s_flags |= SB_RDONLY;
1646 		} else {
1647 			ret = bch2_fs_read_write(c);
1648 			if (ret) {
1649 				bch_err(c, "error going rw: %i", ret);
1650 				up_write(&c->state_lock);
1651 				ret = -EINVAL;
1652 				goto err;
1653 			}
1654 
1655 			sb->s_flags &= ~SB_RDONLY;
1656 		}
1657 
1658 		c->opts.read_only = opts.read_only;
1659 
1660 		up_write(&c->state_lock);
1661 	}
1662 
1663 	if (opt_defined(opts, errors))
1664 		c->opts.errors = opts.errors;
1665 err:
1666 	return bch2_err_class(ret);
1667 }
1668 
1669 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1670 {
1671 	struct bch_fs *c = root->d_sb->s_fs_info;
1672 	bool first = true;
1673 
1674 	for_each_online_member(c, ca) {
1675 		if (!first)
1676 			seq_putc(seq, ':');
1677 		first = false;
1678 		seq_puts(seq, ca->disk_sb.sb_name);
1679 	}
1680 
1681 	return 0;
1682 }
1683 
1684 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1685 {
1686 	struct bch_fs *c = root->d_sb->s_fs_info;
1687 	enum bch_opt_id i;
1688 	struct printbuf buf = PRINTBUF;
1689 	int ret = 0;
1690 
1691 	for (i = 0; i < bch2_opts_nr; i++) {
1692 		const struct bch_option *opt = &bch2_opt_table[i];
1693 		u64 v = bch2_opt_get_by_id(&c->opts, i);
1694 
1695 		if (!(opt->flags & OPT_MOUNT))
1696 			continue;
1697 
1698 		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1699 			continue;
1700 
1701 		printbuf_reset(&buf);
1702 		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
1703 				 OPT_SHOW_MOUNT_STYLE);
1704 		seq_putc(seq, ',');
1705 		seq_puts(seq, buf.buf);
1706 	}
1707 
1708 	if (buf.allocation_failure)
1709 		ret = -ENOMEM;
1710 	printbuf_exit(&buf);
1711 	return ret;
1712 }
1713 
1714 static void bch2_put_super(struct super_block *sb)
1715 {
1716 	struct bch_fs *c = sb->s_fs_info;
1717 
1718 	__bch2_fs_stop(c);
1719 }
1720 
1721 /*
1722  * bcachefs doesn't currently integrate intwrite freeze protection but the
1723  * internal write references serve the same purpose. Therefore reuse the
1724  * read-only transition code to perform the quiesce. The caveat is that we don't
1725  * currently have the ability to block tasks that want a write reference while
1726  * the superblock is frozen. This is fine for now, but we should either add
1727  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1728  */
1729 static int bch2_freeze(struct super_block *sb)
1730 {
1731 	struct bch_fs *c = sb->s_fs_info;
1732 
1733 	down_write(&c->state_lock);
1734 	bch2_fs_read_only(c);
1735 	up_write(&c->state_lock);
1736 	return 0;
1737 }
1738 
1739 static int bch2_unfreeze(struct super_block *sb)
1740 {
1741 	struct bch_fs *c = sb->s_fs_info;
1742 	int ret;
1743 
1744 	if (test_bit(BCH_FS_emergency_ro, &c->flags))
1745 		return 0;
1746 
1747 	down_write(&c->state_lock);
1748 	ret = bch2_fs_read_write(c);
1749 	up_write(&c->state_lock);
1750 	return ret;
1751 }
1752 
1753 static const struct super_operations bch_super_operations = {
1754 	.alloc_inode	= bch2_alloc_inode,
1755 	.destroy_inode	= bch2_destroy_inode,
1756 	.write_inode	= bch2_vfs_write_inode,
1757 	.evict_inode	= bch2_evict_inode,
1758 	.sync_fs	= bch2_sync_fs,
1759 	.statfs		= bch2_statfs,
1760 	.show_devname	= bch2_show_devname,
1761 	.show_options	= bch2_show_options,
1762 	.remount_fs	= bch2_remount,
1763 	.put_super	= bch2_put_super,
1764 	.freeze_fs	= bch2_freeze,
1765 	.unfreeze_fs	= bch2_unfreeze,
1766 };
1767 
1768 static int bch2_set_super(struct super_block *s, void *data)
1769 {
1770 	s->s_fs_info = data;
1771 	return 0;
1772 }
1773 
1774 static int bch2_noset_super(struct super_block *s, void *data)
1775 {
1776 	return -EBUSY;
1777 }
1778 
1779 typedef DARRAY(struct bch_fs *) darray_fs;
1780 
1781 static int bch2_test_super(struct super_block *s, void *data)
1782 {
1783 	struct bch_fs *c = s->s_fs_info;
1784 	darray_fs *d = data;
1785 
1786 	if (!c)
1787 		return false;
1788 
1789 	darray_for_each(*d, i)
1790 		if (c != *i)
1791 			return false;
1792 	return true;
1793 }
1794 
/*
 * file_system_type ->mount: mount the filesystem made up of the devices named
 * in @dev_name (split by bch2_split_devs()).
 *
 * Outline:
 *  1. Parse the mount options in @data (once without a filesystem, and again
 *     after the filesystem has been opened).
 *  2. Map each device path to a filesystem with bch2_path_to_fs() and ask
 *     sget() for an existing superblock that all of them belong to.
 *  3. If there is none, open the filesystem and create a superblock for it.
 *  4. A superblock that already has a root dentry is reused as is, provided
 *     its read-only state matches @flags (-EBUSY otherwise).
 *  5. Otherwise fill in the superblock, set up the BDI, fetch the root inode
 *     and create the root dentry.
 *
 * Returns a referenced root dentry, or an ERR_PTR whose error has been mapped
 * through bch2_err_class() where the code was produced by bcachefs.
 */
1795 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1796 				 int flags, const char *dev_name, void *data)
1797 {
1798 	struct bch_fs *c;
1799 	struct super_block *sb;
1800 	struct inode *vinode;
1801 	struct bch_opts opts = bch2_opts_empty();
1802 	int ret;
1803 
1804 	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1805 
1806 	ret = bch2_parse_mount_opts(NULL, &opts, data);
1807 	if (ret)
1808 		return ERR_PTR(ret);
1809 
1810 	if (!dev_name || strlen(dev_name) == 0)
1811 		return ERR_PTR(-EINVAL);
1812 
1813 	darray_str devs;
1814 	ret = bch2_split_devs(dev_name, &devs);
1815 	if (ret)
1816 		return ERR_PTR(ret);
1817 
	/*
	 * One entry per device path; entries may be ERR_PTRs from
	 * bch2_path_to_fs(), which then simply never compare equal to a
	 * superblock's filesystem in bch2_test_super().
	 */
1818 	darray_fs devs_to_fs = {};
1819 	darray_for_each(devs, i) {
1820 		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
1821 		if (ret) {
1822 			sb = ERR_PTR(ret);
1823 			goto got_sb;
1824 		}
1825 	}
1826 
	/*
	 * Look for an existing superblock only: bch2_noset_super() returns
	 * -EBUSY, so this call does not set up a new one.
	 */
1827 	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
1828 	if (!IS_ERR(sb))
1829 		goto got_sb;
1830 
1831 	c = bch2_fs_open(devs.data, devs.nr, opts);
1832 	if (IS_ERR(c)) {
1833 		sb = ERR_CAST(c);
1834 		goto got_sb;
1835 	}
1836 
1837 	/* Some options can't be parsed until after the fs is started: */
1838 	ret = bch2_parse_mount_opts(c, &opts, data);
1839 	if (ret) {
1840 		bch2_fs_stop(c);
1841 		sb = ERR_PTR(ret);
1842 		goto got_sb;
1843 	}
1844 
1845 	bch2_opts_apply(&c->opts, opts);
1846 
	/* No test callback: always create a new superblock for c. */
1847 	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1848 	if (IS_ERR(sb))
1849 		bch2_fs_stop(c);
1850 got_sb:
	/* The temporary device lists are not needed past this point. */
1851 	darray_exit(&devs_to_fs);
1852 	bch2_darray_str_exit(&devs);
1853 
1854 	if (IS_ERR(sb)) {
1855 		ret = PTR_ERR(sb);
1856 		ret = bch2_err_class(ret);
1857 		return ERR_PTR(ret);
1858 	}
1859 
1860 	c = sb->s_fs_info;
1861 
	/*
	 * A root dentry already exists, so this superblock has been set up
	 * before: reuse it unless the requested read-only state differs.
	 */
1862 	if (sb->s_root) {
1863 		if ((flags ^ sb->s_flags) & SB_RDONLY) {
1864 			ret = -EBUSY;
1865 			goto err_put_super;
1866 		}
1867 		goto out;
1868 	}
1869 
1870 	sb->s_blocksize		= block_bytes(c);
1871 	sb->s_blocksize_bits	= ilog2(block_bytes(c));
1872 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
1873 	sb->s_op		= &bch_super_operations;
1874 	sb->s_export_op		= &bch_export_ops;
1875 #ifdef CONFIG_BCACHEFS_QUOTA
1876 	sb->s_qcop		= &bch2_quotactl_operations;
1877 	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1878 #endif
1879 	sb->s_xattr		= bch2_xattr_handlers;
1880 	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
1881 	sb->s_time_gran		= c->sb.nsec_per_time_unit;
1882 	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1883 	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
1884 	c->vfs_sb		= sb;
1885 	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
1886 
1887 	ret = super_setup_bdi(sb);
1888 	if (ret)
1889 		goto err_put_super;
1890 
1891 	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
1892 
	/*
	 * Only the first online member is used (note the break); the explicit
	 * percpu_ref_put() releases ca->io_ref before leaving the loop early.
	 */
1893 	for_each_online_member(c, ca) {
1894 		struct block_device *bdev = ca->disk_sb.bdev;
1895 
1896 		/* XXX: create an anonymous device for multi device filesystems */
1897 		sb->s_bdev	= bdev;
1898 		sb->s_dev	= bdev->bd_dev;
1899 		percpu_ref_put(&ca->io_ref);
1900 		break;
1901 	}
1902 
1903 	c->dev = sb->s_dev;
1904 
1905 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1906 	if (c->opts.acl)
1907 		sb->s_flags	|= SB_POSIXACL;
1908 #endif
1909 
1910 	sb->s_shrink->seeks = 0;
1911 
1912 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
1913 	ret = PTR_ERR_OR_ZERO(vinode);
1914 	bch_err_msg(c, ret, "mounting: error getting root inode");
1915 	if (ret)
1916 		goto err_put_super;
1917 
1918 	sb->s_root = d_make_root(vinode);
1919 	if (!sb->s_root) {
1920 		bch_err(c, "error mounting: error allocating root dentry");
1921 		ret = -ENOMEM;
1922 		goto err_put_super;
1923 	}
1924 
1925 	sb->s_flags |= SB_ACTIVE;
1926 out:
1927 	return dget(sb->s_root);
1928 
1929 err_put_super:
1930 	deactivate_locked_super(sb);
1931 	return ERR_PTR(bch2_err_class(ret));
1932 }
1933 
1934 static void bch2_kill_sb(struct super_block *sb)
1935 {
1936 	struct bch_fs *c = sb->s_fs_info;
1937 
1938 	generic_shutdown_super(sb);
1939 	bch2_fs_free(c);
1940 }
1941 
1942 static struct file_system_type bcache_fs_type = {
1943 	.owner		= THIS_MODULE,
1944 	.name		= "bcachefs",
1945 	.mount		= bch2_mount,
1946 	.kill_sb	= bch2_kill_sb,
1947 	.fs_flags	= FS_REQUIRES_DEV,
1948 };
1949 
1950 MODULE_ALIAS_FS("bcachefs");
1951 
1952 void bch2_vfs_exit(void)
1953 {
1954 	unregister_filesystem(&bcache_fs_type);
1955 	kmem_cache_destroy(bch2_inode_cache);
1956 }
1957 
1958 int __init bch2_vfs_init(void)
1959 {
1960 	int ret = -ENOMEM;
1961 
1962 	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
1963 	if (!bch2_inode_cache)
1964 		goto err;
1965 
1966 	ret = register_filesystem(&bcache_fs_type);
1967 	if (ret)
1968 		goto err;
1969 
1970 	return 0;
1971 err:
1972 	bch2_vfs_exit();
1973 	return ret;
1974 }
1975 
1976 #endif /* NO_BCACHEFS_FS */
1977