xref: /linux/fs/bcachefs/fs.c (revision 36f353a1ebf88280f58d1ebfe2731251d9159456)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
43 static struct kmem_cache *bch2_inode_cache;
44 
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46 				struct bch_inode_info *,
47 				struct bch_inode_unpacked *,
48 				struct bch_subvolume *);
49 
50 void bch2_inode_update_after_write(struct btree_trans *trans,
51 				   struct bch_inode_info *inode,
52 				   struct bch_inode_unpacked *bi,
53 				   unsigned fields)
54 {
55 	struct bch_fs *c = trans->c;
56 
57 	BUG_ON(bi->bi_inum != inode->v.i_ino);
58 
59 	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
60 			       POS(0, bi->bi_inum),
61 			       c->opts.inodes_use_key_cache);
62 
63 	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64 	i_uid_write(&inode->v, bi->bi_uid);
65 	i_gid_write(&inode->v, bi->bi_gid);
66 	inode->v.i_mode	= bi->bi_mode;
67 
68 	if (fields & ATTR_ATIME)
69 		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70 	if (fields & ATTR_MTIME)
71 		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72 	if (fields & ATTR_CTIME)
73 		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74 
75 	inode->ei_inode		= *bi;
76 
77 	bch2_inode_flags_to_vfs(inode);
78 }
79 
80 int __must_check bch2_write_inode(struct bch_fs *c,
81 				  struct bch_inode_info *inode,
82 				  inode_set_fn set,
83 				  void *p, unsigned fields)
84 {
85 	struct btree_trans *trans = bch2_trans_get(c);
86 	struct btree_iter iter = { NULL };
87 	struct bch_inode_unpacked inode_u;
88 	int ret;
89 retry:
90 	bch2_trans_begin(trans);
91 
92 	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93 				BTREE_ITER_INTENT) ?:
94 		(set ? set(trans, inode, &inode_u, p) : 0) ?:
95 		bch2_inode_write(trans, &iter, &inode_u) ?:
96 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97 
98 	/*
99 	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100 	 * this is important for inode updates via bchfs_write_index_update
101 	 */
102 	if (!ret)
103 		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104 
105 	bch2_trans_iter_exit(trans, &iter);
106 
107 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108 		goto retry;
109 
110 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111 			     "%s: inode %u:%llu not found when updating",
112 			     bch2_err_str(ret),
113 			     inode_inum(inode).subvol,
114 			     inode_inum(inode).inum);
115 
116 	bch2_trans_put(trans);
117 	return ret < 0 ? ret : 0;
118 }
119 
120 int bch2_fs_quota_transfer(struct bch_fs *c,
121 			   struct bch_inode_info *inode,
122 			   struct bch_qid new_qid,
123 			   unsigned qtypes,
124 			   enum quota_acct_mode mode)
125 {
126 	unsigned i;
127 	int ret;
128 
129 	qtypes &= enabled_qtypes(c);
130 
131 	for (i = 0; i < QTYP_NR; i++)
132 		if (new_qid.q[i] == inode->ei_qid.q[i])
133 			qtypes &= ~(1U << i);
134 
135 	if (!qtypes)
136 		return 0;
137 
138 	mutex_lock(&inode->ei_quota_lock);
139 
140 	ret = bch2_quota_transfer(c, qtypes, new_qid,
141 				  inode->ei_qid,
142 				  inode->v.i_blocks +
143 				  inode->ei_quota_reserved,
144 				  mode);
145 	if (!ret)
146 		for (i = 0; i < QTYP_NR; i++)
147 			if (qtypes & (1 << i))
148 				inode->ei_qid.q[i] = new_qid.q[i];
149 
150 	mutex_unlock(&inode->ei_quota_lock);
151 
152 	return ret;
153 }
154 
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157 	struct bch_inode_info *inode = to_bch_ei(vinode);
158 	subvol_inum *inum = p;
159 
160 	return inode->ei_subvol == inum->subvol &&
161 		inode->ei_inode.bi_inum == inum->inum;
162 }
163 
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166 	struct bch_inode_info *inode = to_bch_ei(vinode);
167 	subvol_inum *inum = p;
168 
169 	inode->v.i_ino		= inum->inum;
170 	inode->ei_subvol	= inum->subvol;
171 	inode->ei_inode.bi_inum	= inum->inum;
172 	return 0;
173 }
174 
175 static unsigned bch2_inode_hash(subvol_inum inum)
176 {
177 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
178 }
179 
180 static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
181 {
182 	subvol_inum inum = inode_inum(inode);
183 	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
184 				      bch2_inode_hash(inum),
185 				      bch2_iget5_test,
186 				      bch2_iget5_set,
187 				      &inum));
188 	BUG_ON(!old);
189 
190 	if (unlikely(old != inode)) {
191 		discard_new_inode(&inode->v);
192 		inode = old;
193 	} else {
194 		mutex_lock(&c->vfs_inodes_lock);
195 		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
196 		mutex_unlock(&c->vfs_inodes_lock);
197 		/*
198 		 * we really don't want insert_inode_locked2() to be setting
199 		 * I_NEW...
200 		 */
201 		unlock_new_inode(&inode->v);
202 	}
203 
204 	return inode;
205 }
206 
/*
 * Evaluate @_do with the task allocation flags @_flags applied, then undo the
 * change with memalloc_flags_restore(), the counterpart of the
 * memalloc_flags_save() call that applied them. Evaluates to the value of
 * @_do.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_flags_restore(_saved_flags);					\
	_ret;									\
})
214 
215 /*
216  * Allocate a new inode, dropping/retaking btree locks if necessary:
217  */
218 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
219 {
220 	struct bch_fs *c = trans->c;
221 
222 	struct bch_inode_info *inode =
223 		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
224 				  to_bch_ei(new_inode(c->vfs_sb)));
225 
226 	if (unlikely(!inode)) {
227 		int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
228 		if (ret && inode)
229 			discard_new_inode(&inode->v);
230 		if (ret)
231 			return ERR_PTR(ret);
232 	}
233 
234 	return inode;
235 }
236 
237 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
238 {
239 	struct bch_inode_info *inode =
240 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
241 					  bch2_inode_hash(inum),
242 					  bch2_iget5_test,
243 					  &inum));
244 	if (inode)
245 		return &inode->v;
246 
247 	struct btree_trans *trans = bch2_trans_get(c);
248 
249 	struct bch_inode_unpacked inode_u;
250 	struct bch_subvolume subvol;
251 	int ret = lockrestart_do(trans,
252 		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
253 		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
254 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
255 	if (!ret) {
256 		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
257 		inode = bch2_inode_insert(c, inode);
258 	}
259 	bch2_trans_put(trans);
260 
261 	return ret ? ERR_PTR(ret) : &inode->v;
262 }
263 
264 struct bch_inode_info *
265 __bch2_create(struct mnt_idmap *idmap,
266 	      struct bch_inode_info *dir, struct dentry *dentry,
267 	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
268 	      unsigned flags)
269 {
270 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
271 	struct btree_trans *trans;
272 	struct bch_inode_unpacked dir_u;
273 	struct bch_inode_info *inode;
274 	struct bch_inode_unpacked inode_u;
275 	struct posix_acl *default_acl = NULL, *acl = NULL;
276 	subvol_inum inum;
277 	struct bch_subvolume subvol;
278 	u64 journal_seq = 0;
279 	int ret;
280 
281 	/*
282 	 * preallocate acls + vfs inode before btree transaction, so that
283 	 * nothing can fail after the transaction succeeds:
284 	 */
285 #ifdef CONFIG_BCACHEFS_POSIX_ACL
286 	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
287 	if (ret)
288 		return ERR_PTR(ret);
289 #endif
290 	inode = to_bch_ei(new_inode(c->vfs_sb));
291 	if (unlikely(!inode)) {
292 		inode = ERR_PTR(-ENOMEM);
293 		goto err;
294 	}
295 
296 	bch2_inode_init_early(c, &inode_u);
297 
298 	if (!(flags & BCH_CREATE_TMPFILE))
299 		mutex_lock(&dir->ei_update_lock);
300 
301 	trans = bch2_trans_get(c);
302 retry:
303 	bch2_trans_begin(trans);
304 
305 	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
306 		bch2_create_trans(trans,
307 				  inode_inum(dir), &dir_u, &inode_u,
308 				  !(flags & BCH_CREATE_TMPFILE)
309 				  ? &dentry->d_name : NULL,
310 				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
311 				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
312 				  mode, rdev,
313 				  default_acl, acl, snapshot_src, flags) ?:
314 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
315 				KEY_TYPE_QUOTA_PREALLOC);
316 	if (unlikely(ret))
317 		goto err_before_quota;
318 
319 	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
320 	inum.inum = inode_u.bi_inum;
321 
322 	ret   = bch2_subvolume_get(trans, inum.subvol, true,
323 				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
324 		bch2_trans_commit(trans, NULL, &journal_seq, 0);
325 	if (unlikely(ret)) {
326 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
327 				KEY_TYPE_QUOTA_WARN);
328 err_before_quota:
329 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
330 			goto retry;
331 		goto err_trans;
332 	}
333 
334 	if (!(flags & BCH_CREATE_TMPFILE)) {
335 		bch2_inode_update_after_write(trans, dir, &dir_u,
336 					      ATTR_MTIME|ATTR_CTIME);
337 		mutex_unlock(&dir->ei_update_lock);
338 	}
339 
340 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
341 
342 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
343 	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
344 
345 	/*
346 	 * we must insert the new inode into the inode cache before calling
347 	 * bch2_trans_exit() and dropping locks, else we could race with another
348 	 * thread pulling the inode in and modifying it:
349 	 */
350 	inode = bch2_inode_insert(c, inode);
351 	bch2_trans_put(trans);
352 err:
353 	posix_acl_release(default_acl);
354 	posix_acl_release(acl);
355 	return inode;
356 err_trans:
357 	if (!(flags & BCH_CREATE_TMPFILE))
358 		mutex_unlock(&dir->ei_update_lock);
359 
360 	bch2_trans_put(trans);
361 	make_bad_inode(&inode->v);
362 	iput(&inode->v);
363 	inode = ERR_PTR(ret);
364 	goto err;
365 }
366 
367 /* methods */
368 
369 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
370 			subvol_inum dir, struct bch_hash_info *dir_hash_info,
371 			const struct qstr *name)
372 {
373 	struct bch_fs *c = trans->c;
374 	struct btree_iter dirent_iter = {};
375 	subvol_inum inum = {};
376 
377 	int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
378 				   dir_hash_info, dir, name, 0);
379 	if (ret)
380 		return ERR_PTR(ret);
381 
382 	struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
383 	ret = bkey_err(k);
384 	if (ret)
385 		goto err;
386 
387 	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
388 	if (ret > 0)
389 		ret = -ENOENT;
390 	if (ret)
391 		goto err;
392 
393 	struct bch_inode_info *inode =
394 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
395 					  bch2_inode_hash(inum),
396 					  bch2_iget5_test,
397 					  &inum));
398 	if (inode)
399 		goto out;
400 
401 	struct bch_subvolume subvol;
402 	struct bch_inode_unpacked inode_u;
403 	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
404 		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
405 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
406 	if (bch2_err_matches(ret, ENOENT)) {
407 		struct printbuf buf = PRINTBUF;
408 
409 		bch2_bkey_val_to_text(&buf, c, k);
410 		bch_err(c, "%s points to missing inode", buf.buf);
411 		printbuf_exit(&buf);
412 	}
413 	if (ret)
414 		goto err;
415 
416 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
417 	inode = bch2_inode_insert(c, inode);
418 out:
419 	bch2_trans_iter_exit(trans, &dirent_iter);
420 	return inode;
421 err:
422 	inode = ERR_PTR(ret);
423 	goto out;
424 }
425 
426 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
427 				  unsigned int flags)
428 {
429 	struct bch_fs *c = vdir->i_sb->s_fs_info;
430 	struct bch_inode_info *dir = to_bch_ei(vdir);
431 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
432 
433 	struct bch_inode_info *inode;
434 	bch2_trans_do(c, NULL, NULL, 0,
435 		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
436 							  &hash, &dentry->d_name)));
437 	if (IS_ERR(inode))
438 		inode = NULL;
439 
440 	return d_splice_alias(&inode->v, dentry);
441 }
442 
443 static int bch2_mknod(struct mnt_idmap *idmap,
444 		      struct inode *vdir, struct dentry *dentry,
445 		      umode_t mode, dev_t rdev)
446 {
447 	struct bch_inode_info *inode =
448 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
449 			      (subvol_inum) { 0 }, 0);
450 
451 	if (IS_ERR(inode))
452 		return bch2_err_class(PTR_ERR(inode));
453 
454 	d_instantiate(dentry, &inode->v);
455 	return 0;
456 }
457 
458 static int bch2_create(struct mnt_idmap *idmap,
459 		       struct inode *vdir, struct dentry *dentry,
460 		       umode_t mode, bool excl)
461 {
462 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
463 }
464 
465 static int __bch2_link(struct bch_fs *c,
466 		       struct bch_inode_info *inode,
467 		       struct bch_inode_info *dir,
468 		       struct dentry *dentry)
469 {
470 	struct btree_trans *trans = bch2_trans_get(c);
471 	struct bch_inode_unpacked dir_u, inode_u;
472 	int ret;
473 
474 	mutex_lock(&inode->ei_update_lock);
475 
476 	ret = commit_do(trans, NULL, NULL, 0,
477 			bch2_link_trans(trans,
478 					inode_inum(dir),   &dir_u,
479 					inode_inum(inode), &inode_u,
480 					&dentry->d_name));
481 
482 	if (likely(!ret)) {
483 		bch2_inode_update_after_write(trans, dir, &dir_u,
484 					      ATTR_MTIME|ATTR_CTIME);
485 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
486 	}
487 
488 	bch2_trans_put(trans);
489 	mutex_unlock(&inode->ei_update_lock);
490 	return ret;
491 }
492 
493 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
494 		     struct dentry *dentry)
495 {
496 	struct bch_fs *c = vdir->i_sb->s_fs_info;
497 	struct bch_inode_info *dir = to_bch_ei(vdir);
498 	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
499 	int ret;
500 
501 	lockdep_assert_held(&inode->v.i_rwsem);
502 
503 	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
504 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
505 		__bch2_link(c, inode, dir, dentry);
506 	if (unlikely(ret))
507 		return bch2_err_class(ret);
508 
509 	ihold(&inode->v);
510 	d_instantiate(dentry, &inode->v);
511 	return 0;
512 }
513 
514 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
515 		  bool deleting_snapshot)
516 {
517 	struct bch_fs *c = vdir->i_sb->s_fs_info;
518 	struct bch_inode_info *dir = to_bch_ei(vdir);
519 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
520 	struct bch_inode_unpacked dir_u, inode_u;
521 	struct btree_trans *trans = bch2_trans_get(c);
522 	int ret;
523 
524 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
525 
526 	ret = commit_do(trans, NULL, NULL,
527 			BCH_TRANS_COMMIT_no_enospc,
528 		bch2_unlink_trans(trans,
529 				  inode_inum(dir), &dir_u,
530 				  &inode_u, &dentry->d_name,
531 				  deleting_snapshot));
532 	if (unlikely(ret))
533 		goto err;
534 
535 	bch2_inode_update_after_write(trans, dir, &dir_u,
536 				      ATTR_MTIME|ATTR_CTIME);
537 	bch2_inode_update_after_write(trans, inode, &inode_u,
538 				      ATTR_MTIME);
539 
540 	if (inode_u.bi_subvol) {
541 		/*
542 		 * Subvolume deletion is asynchronous, but we still want to tell
543 		 * the VFS that it's been deleted here:
544 		 */
545 		set_nlink(&inode->v, 0);
546 	}
547 err:
548 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
549 	bch2_trans_put(trans);
550 
551 	return ret;
552 }
553 
554 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
555 {
556 	struct bch_inode_info *dir= to_bch_ei(vdir);
557 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
558 
559 	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
560 		__bch2_unlink(vdir, dentry, false);
561 	return bch2_err_class(ret);
562 }
563 
564 static int bch2_symlink(struct mnt_idmap *idmap,
565 			struct inode *vdir, struct dentry *dentry,
566 			const char *symname)
567 {
568 	struct bch_fs *c = vdir->i_sb->s_fs_info;
569 	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
570 	int ret;
571 
572 	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
573 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
574 	if (IS_ERR(inode))
575 		return bch2_err_class(PTR_ERR(inode));
576 
577 	inode_lock(&inode->v);
578 	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
579 	inode_unlock(&inode->v);
580 
581 	if (unlikely(ret))
582 		goto err;
583 
584 	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
585 	if (unlikely(ret))
586 		goto err;
587 
588 	ret = __bch2_link(c, inode, dir, dentry);
589 	if (unlikely(ret))
590 		goto err;
591 
592 	d_instantiate(dentry, &inode->v);
593 	return 0;
594 err:
595 	iput(&inode->v);
596 	return bch2_err_class(ret);
597 }
598 
599 static int bch2_mkdir(struct mnt_idmap *idmap,
600 		      struct inode *vdir, struct dentry *dentry, umode_t mode)
601 {
602 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
603 }
604 
605 static int bch2_rename2(struct mnt_idmap *idmap,
606 			struct inode *src_vdir, struct dentry *src_dentry,
607 			struct inode *dst_vdir, struct dentry *dst_dentry,
608 			unsigned flags)
609 {
610 	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
611 	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
612 	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
613 	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
614 	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
615 	struct bch_inode_unpacked dst_dir_u, src_dir_u;
616 	struct bch_inode_unpacked src_inode_u, dst_inode_u;
617 	struct btree_trans *trans;
618 	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
619 		? BCH_RENAME_EXCHANGE
620 		: dst_dentry->d_inode
621 		? BCH_RENAME_OVERWRITE : BCH_RENAME;
622 	int ret;
623 
624 	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
625 		return -EINVAL;
626 
627 	if (mode == BCH_RENAME_OVERWRITE) {
628 		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
629 						   0, LLONG_MAX);
630 		if (ret)
631 			return ret;
632 	}
633 
634 	trans = bch2_trans_get(c);
635 
636 	bch2_lock_inodes(INODE_UPDATE_LOCK,
637 			 src_dir,
638 			 dst_dir,
639 			 src_inode,
640 			 dst_inode);
641 
642 	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
643 		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
644 	if (ret)
645 		goto err;
646 
647 	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
648 		ret = bch2_fs_quota_transfer(c, src_inode,
649 					     dst_dir->ei_qid,
650 					     1 << QTYP_PRJ,
651 					     KEY_TYPE_QUOTA_PREALLOC);
652 		if (ret)
653 			goto err;
654 	}
655 
656 	if (mode == BCH_RENAME_EXCHANGE &&
657 	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
658 		ret = bch2_fs_quota_transfer(c, dst_inode,
659 					     src_dir->ei_qid,
660 					     1 << QTYP_PRJ,
661 					     KEY_TYPE_QUOTA_PREALLOC);
662 		if (ret)
663 			goto err;
664 	}
665 
666 	ret = commit_do(trans, NULL, NULL, 0,
667 			bch2_rename_trans(trans,
668 					  inode_inum(src_dir), &src_dir_u,
669 					  inode_inum(dst_dir), &dst_dir_u,
670 					  &src_inode_u,
671 					  &dst_inode_u,
672 					  &src_dentry->d_name,
673 					  &dst_dentry->d_name,
674 					  mode));
675 	if (unlikely(ret))
676 		goto err;
677 
678 	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
679 	BUG_ON(dst_inode &&
680 	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
681 
682 	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
683 				      ATTR_MTIME|ATTR_CTIME);
684 
685 	if (src_dir != dst_dir)
686 		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
687 					      ATTR_MTIME|ATTR_CTIME);
688 
689 	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
690 				      ATTR_CTIME);
691 
692 	if (dst_inode)
693 		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
694 					      ATTR_CTIME);
695 err:
696 	bch2_trans_put(trans);
697 
698 	bch2_fs_quota_transfer(c, src_inode,
699 			       bch_qid(&src_inode->ei_inode),
700 			       1 << QTYP_PRJ,
701 			       KEY_TYPE_QUOTA_NOCHECK);
702 	if (dst_inode)
703 		bch2_fs_quota_transfer(c, dst_inode,
704 				       bch_qid(&dst_inode->ei_inode),
705 				       1 << QTYP_PRJ,
706 				       KEY_TYPE_QUOTA_NOCHECK);
707 
708 	bch2_unlock_inodes(INODE_UPDATE_LOCK,
709 			   src_dir,
710 			   dst_dir,
711 			   src_inode,
712 			   dst_inode);
713 
714 	return bch2_err_class(ret);
715 }
716 
717 static void bch2_setattr_copy(struct mnt_idmap *idmap,
718 			      struct bch_inode_info *inode,
719 			      struct bch_inode_unpacked *bi,
720 			      struct iattr *attr)
721 {
722 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
723 	unsigned int ia_valid = attr->ia_valid;
724 
725 	if (ia_valid & ATTR_UID)
726 		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
727 	if (ia_valid & ATTR_GID)
728 		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
729 
730 	if (ia_valid & ATTR_SIZE)
731 		bi->bi_size = attr->ia_size;
732 
733 	if (ia_valid & ATTR_ATIME)
734 		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
735 	if (ia_valid & ATTR_MTIME)
736 		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
737 	if (ia_valid & ATTR_CTIME)
738 		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
739 
740 	if (ia_valid & ATTR_MODE) {
741 		umode_t mode = attr->ia_mode;
742 		kgid_t gid = ia_valid & ATTR_GID
743 			? attr->ia_gid
744 			: inode->v.i_gid;
745 
746 		if (!in_group_p(gid) &&
747 		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
748 			mode &= ~S_ISGID;
749 		bi->bi_mode = mode;
750 	}
751 }
752 
753 int bch2_setattr_nonsize(struct mnt_idmap *idmap,
754 			 struct bch_inode_info *inode,
755 			 struct iattr *attr)
756 {
757 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
758 	struct bch_qid qid;
759 	struct btree_trans *trans;
760 	struct btree_iter inode_iter = { NULL };
761 	struct bch_inode_unpacked inode_u;
762 	struct posix_acl *acl = NULL;
763 	int ret;
764 
765 	mutex_lock(&inode->ei_update_lock);
766 
767 	qid = inode->ei_qid;
768 
769 	if (attr->ia_valid & ATTR_UID)
770 		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
771 
772 	if (attr->ia_valid & ATTR_GID)
773 		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
774 
775 	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
776 				     KEY_TYPE_QUOTA_PREALLOC);
777 	if (ret)
778 		goto err;
779 
780 	trans = bch2_trans_get(c);
781 retry:
782 	bch2_trans_begin(trans);
783 	kfree(acl);
784 	acl = NULL;
785 
786 	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
787 			      BTREE_ITER_INTENT);
788 	if (ret)
789 		goto btree_err;
790 
791 	bch2_setattr_copy(idmap, inode, &inode_u, attr);
792 
793 	if (attr->ia_valid & ATTR_MODE) {
794 		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
795 				     inode_u.bi_mode, &acl);
796 		if (ret)
797 			goto btree_err;
798 	}
799 
800 	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
801 		bch2_trans_commit(trans, NULL, NULL,
802 				  BCH_TRANS_COMMIT_no_enospc);
803 btree_err:
804 	bch2_trans_iter_exit(trans, &inode_iter);
805 
806 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
807 		goto retry;
808 	if (unlikely(ret))
809 		goto err_trans;
810 
811 	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
812 
813 	if (acl)
814 		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
815 err_trans:
816 	bch2_trans_put(trans);
817 err:
818 	mutex_unlock(&inode->ei_update_lock);
819 
820 	return bch2_err_class(ret);
821 }
822 
823 static int bch2_getattr(struct mnt_idmap *idmap,
824 			const struct path *path, struct kstat *stat,
825 			u32 request_mask, unsigned query_flags)
826 {
827 	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
828 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
829 
830 	stat->dev	= inode->v.i_sb->s_dev;
831 	stat->ino	= inode->v.i_ino;
832 	stat->mode	= inode->v.i_mode;
833 	stat->nlink	= inode->v.i_nlink;
834 	stat->uid	= inode->v.i_uid;
835 	stat->gid	= inode->v.i_gid;
836 	stat->rdev	= inode->v.i_rdev;
837 	stat->size	= i_size_read(&inode->v);
838 	stat->atime	= inode_get_atime(&inode->v);
839 	stat->mtime	= inode_get_mtime(&inode->v);
840 	stat->ctime	= inode_get_ctime(&inode->v);
841 	stat->blksize	= block_bytes(c);
842 	stat->blocks	= inode->v.i_blocks;
843 
844 	if (request_mask & STATX_BTIME) {
845 		stat->result_mask |= STATX_BTIME;
846 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
847 	}
848 
849 	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
850 		stat->attributes |= STATX_ATTR_IMMUTABLE;
851 	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
852 
853 	if (inode->ei_inode.bi_flags & BCH_INODE_append)
854 		stat->attributes |= STATX_ATTR_APPEND;
855 	stat->attributes_mask	 |= STATX_ATTR_APPEND;
856 
857 	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
858 		stat->attributes |= STATX_ATTR_NODUMP;
859 	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
860 
861 	return 0;
862 }
863 
864 static int bch2_setattr(struct mnt_idmap *idmap,
865 			struct dentry *dentry, struct iattr *iattr)
866 {
867 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
868 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
869 	int ret;
870 
871 	lockdep_assert_held(&inode->v.i_rwsem);
872 
873 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
874 		setattr_prepare(idmap, dentry, iattr);
875 	if (ret)
876 		return ret;
877 
878 	return iattr->ia_valid & ATTR_SIZE
879 		? bchfs_truncate(idmap, inode, iattr)
880 		: bch2_setattr_nonsize(idmap, inode, iattr);
881 }
882 
883 static int bch2_tmpfile(struct mnt_idmap *idmap,
884 			struct inode *vdir, struct file *file, umode_t mode)
885 {
886 	struct bch_inode_info *inode =
887 		__bch2_create(idmap, to_bch_ei(vdir),
888 			      file->f_path.dentry, mode, 0,
889 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
890 
891 	if (IS_ERR(inode))
892 		return bch2_err_class(PTR_ERR(inode));
893 
894 	d_mark_tmpfile(file, &inode->v);
895 	d_instantiate(file->f_path.dentry, &inode->v);
896 	return finish_open_simple(file, 0);
897 }
898 
899 static int bch2_fill_extent(struct bch_fs *c,
900 			    struct fiemap_extent_info *info,
901 			    struct bkey_s_c k, unsigned flags)
902 {
903 	if (bkey_extent_is_direct_data(k.k)) {
904 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
905 		const union bch_extent_entry *entry;
906 		struct extent_ptr_decoded p;
907 		int ret;
908 
909 		if (k.k->type == KEY_TYPE_reflink_v)
910 			flags |= FIEMAP_EXTENT_SHARED;
911 
912 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
913 			int flags2 = 0;
914 			u64 offset = p.ptr.offset;
915 
916 			if (p.ptr.unwritten)
917 				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
918 
919 			if (p.crc.compression_type)
920 				flags2 |= FIEMAP_EXTENT_ENCODED;
921 			else
922 				offset += p.crc.offset;
923 
924 			if ((offset & (block_sectors(c) - 1)) ||
925 			    (k.k->size & (block_sectors(c) - 1)))
926 				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
927 
928 			ret = fiemap_fill_next_extent(info,
929 						bkey_start_offset(k.k) << 9,
930 						offset << 9,
931 						k.k->size << 9, flags|flags2);
932 			if (ret)
933 				return ret;
934 		}
935 
936 		return 0;
937 	} else if (bkey_extent_is_inline_data(k.k)) {
938 		return fiemap_fill_next_extent(info,
939 					       bkey_start_offset(k.k) << 9,
940 					       0, k.k->size << 9,
941 					       flags|
942 					       FIEMAP_EXTENT_DATA_INLINE);
943 	} else if (k.k->type == KEY_TYPE_reservation) {
944 		return fiemap_fill_next_extent(info,
945 					       bkey_start_offset(k.k) << 9,
946 					       0, k.k->size << 9,
947 					       flags|
948 					       FIEMAP_EXTENT_DELALLOC|
949 					       FIEMAP_EXTENT_UNWRITTEN);
950 	} else {
951 		BUG();
952 	}
953 }
954 
955 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
956 		       u64 start, u64 len)
957 {
958 	struct bch_fs *c = vinode->i_sb->s_fs_info;
959 	struct bch_inode_info *ei = to_bch_ei(vinode);
960 	struct btree_trans *trans;
961 	struct btree_iter iter;
962 	struct bkey_s_c k;
963 	struct bkey_buf cur, prev;
964 	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
965 	unsigned offset_into_extent, sectors;
966 	bool have_extent = false;
967 	u32 snapshot;
968 	int ret = 0;
969 
970 	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
971 	if (ret)
972 		return ret;
973 
974 	if (start + len < start)
975 		return -EINVAL;
976 
977 	start >>= 9;
978 
979 	bch2_bkey_buf_init(&cur);
980 	bch2_bkey_buf_init(&prev);
981 	trans = bch2_trans_get(c);
982 retry:
983 	bch2_trans_begin(trans);
984 
985 	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
986 	if (ret)
987 		goto err;
988 
989 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
990 			     SPOS(ei->v.i_ino, start, snapshot), 0);
991 
992 	while (!(ret = btree_trans_too_many_iters(trans)) &&
993 	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
994 	       !(ret = bkey_err(k))) {
995 		enum btree_id data_btree = BTREE_ID_extents;
996 
997 		if (!bkey_extent_is_data(k.k) &&
998 		    k.k->type != KEY_TYPE_reservation) {
999 			bch2_btree_iter_advance(&iter);
1000 			continue;
1001 		}
1002 
1003 		offset_into_extent	= iter.pos.offset -
1004 			bkey_start_offset(k.k);
1005 		sectors			= k.k->size - offset_into_extent;
1006 
1007 		bch2_bkey_buf_reassemble(&cur, c, k);
1008 
1009 		ret = bch2_read_indirect_extent(trans, &data_btree,
1010 					&offset_into_extent, &cur);
1011 		if (ret)
1012 			break;
1013 
1014 		k = bkey_i_to_s_c(cur.k);
1015 		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
1016 
1017 		sectors = min(sectors, k.k->size - offset_into_extent);
1018 
1019 		bch2_cut_front(POS(k.k->p.inode,
1020 				   bkey_start_offset(k.k) +
1021 				   offset_into_extent),
1022 			       cur.k);
1023 		bch2_key_resize(&cur.k->k, sectors);
1024 		cur.k->k.p = iter.pos;
1025 		cur.k->k.p.offset += cur.k->k.size;
1026 
1027 		if (have_extent) {
1028 			bch2_trans_unlock(trans);
1029 			ret = bch2_fill_extent(c, info,
1030 					bkey_i_to_s_c(prev.k), 0);
1031 			if (ret)
1032 				break;
1033 		}
1034 
1035 		bkey_copy(prev.k, cur.k);
1036 		have_extent = true;
1037 
1038 		bch2_btree_iter_set_pos(&iter,
1039 			POS(iter.pos.inode, iter.pos.offset + sectors));
1040 	}
1041 	start = iter.pos.offset;
1042 	bch2_trans_iter_exit(trans, &iter);
1043 err:
1044 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1045 		goto retry;
1046 
1047 	if (!ret && have_extent) {
1048 		bch2_trans_unlock(trans);
1049 		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
1050 				       FIEMAP_EXTENT_LAST);
1051 	}
1052 
1053 	bch2_trans_put(trans);
1054 	bch2_bkey_buf_exit(&cur, c);
1055 	bch2_bkey_buf_exit(&prev, c);
1056 	return ret < 0 ? ret : 0;
1057 }
1058 
1059 static const struct vm_operations_struct bch_vm_ops = {
1060 	.fault		= bch2_page_fault,
1061 	.map_pages	= filemap_map_pages,
1062 	.page_mkwrite   = bch2_page_mkwrite,
1063 };
1064 
1065 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1066 {
1067 	file_accessed(file);
1068 
1069 	vma->vm_ops = &bch_vm_ops;
1070 	return 0;
1071 }
1072 
1073 /* Directories: */
1074 
1075 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1076 {
1077 	return generic_file_llseek_size(file, offset, whence,
1078 					S64_MAX, S64_MAX);
1079 }
1080 
1081 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1082 {
1083 	struct bch_inode_info *inode = file_bch_inode(file);
1084 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1085 
1086 	if (!dir_emit_dots(file, ctx))
1087 		return 0;
1088 
1089 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1090 
1091 	bch_err_fn(c, ret);
1092 	return bch2_err_class(ret);
1093 }
1094 
1095 static int bch2_open(struct inode *vinode, struct file *file)
1096 {
1097 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1098 		struct bch_inode_info *inode = to_bch_ei(vinode);
1099 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1100 
1101 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1102 		if (ret)
1103 			return ret;
1104 	}
1105 
1106 	return generic_file_open(vinode, file);
1107 }
1108 
1109 static const struct file_operations bch_file_operations = {
1110 	.open		= bch2_open,
1111 	.llseek		= bch2_llseek,
1112 	.read_iter	= bch2_read_iter,
1113 	.write_iter	= bch2_write_iter,
1114 	.mmap		= bch2_mmap,
1115 	.fsync		= bch2_fsync,
1116 	.splice_read	= filemap_splice_read,
1117 	.splice_write	= iter_file_splice_write,
1118 	.fallocate	= bch2_fallocate_dispatch,
1119 	.unlocked_ioctl = bch2_fs_file_ioctl,
1120 #ifdef CONFIG_COMPAT
1121 	.compat_ioctl	= bch2_compat_fs_ioctl,
1122 #endif
1123 	.remap_file_range = bch2_remap_file_range,
1124 };
1125 
1126 static const struct inode_operations bch_file_inode_operations = {
1127 	.getattr	= bch2_getattr,
1128 	.setattr	= bch2_setattr,
1129 	.fiemap		= bch2_fiemap,
1130 	.listxattr	= bch2_xattr_list,
1131 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1132 	.get_acl	= bch2_get_acl,
1133 	.set_acl	= bch2_set_acl,
1134 #endif
1135 };
1136 
1137 static const struct inode_operations bch_dir_inode_operations = {
1138 	.lookup		= bch2_lookup,
1139 	.create		= bch2_create,
1140 	.link		= bch2_link,
1141 	.unlink		= bch2_unlink,
1142 	.symlink	= bch2_symlink,
1143 	.mkdir		= bch2_mkdir,
1144 	.rmdir		= bch2_unlink,
1145 	.mknod		= bch2_mknod,
1146 	.rename		= bch2_rename2,
1147 	.getattr	= bch2_getattr,
1148 	.setattr	= bch2_setattr,
1149 	.tmpfile	= bch2_tmpfile,
1150 	.listxattr	= bch2_xattr_list,
1151 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1152 	.get_acl	= bch2_get_acl,
1153 	.set_acl	= bch2_set_acl,
1154 #endif
1155 };
1156 
1157 static const struct file_operations bch_dir_file_operations = {
1158 	.llseek		= bch2_dir_llseek,
1159 	.read		= generic_read_dir,
1160 	.iterate_shared	= bch2_vfs_readdir,
1161 	.fsync		= bch2_fsync,
1162 	.unlocked_ioctl = bch2_fs_file_ioctl,
1163 #ifdef CONFIG_COMPAT
1164 	.compat_ioctl	= bch2_compat_fs_ioctl,
1165 #endif
1166 };
1167 
1168 static const struct inode_operations bch_symlink_inode_operations = {
1169 	.get_link	= page_get_link,
1170 	.getattr	= bch2_getattr,
1171 	.setattr	= bch2_setattr,
1172 	.listxattr	= bch2_xattr_list,
1173 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1174 	.get_acl	= bch2_get_acl,
1175 	.set_acl	= bch2_set_acl,
1176 #endif
1177 };
1178 
1179 static const struct inode_operations bch_special_inode_operations = {
1180 	.getattr	= bch2_getattr,
1181 	.setattr	= bch2_setattr,
1182 	.listxattr	= bch2_xattr_list,
1183 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1184 	.get_acl	= bch2_get_acl,
1185 	.set_acl	= bch2_set_acl,
1186 #endif
1187 };
1188 
1189 static const struct address_space_operations bch_address_space_operations = {
1190 	.read_folio	= bch2_read_folio,
1191 	.writepages	= bch2_writepages,
1192 	.readahead	= bch2_readahead,
1193 	.dirty_folio	= filemap_dirty_folio,
1194 	.write_begin	= bch2_write_begin,
1195 	.write_end	= bch2_write_end,
1196 	.invalidate_folio = bch2_invalidate_folio,
1197 	.release_folio	= bch2_release_folio,
1198 	.direct_IO	= noop_direct_IO,
1199 #ifdef CONFIG_MIGRATION
1200 	.migrate_folio	= filemap_migrate_folio,
1201 #endif
1202 	.error_remove_folio = generic_error_remove_folio,
1203 };
1204 
/*
 * File handle payload identifying a single inode. bch2_inode_to_fid() fills
 * it from an in-memory inode; bch2_encode_fh() writes it into the caller's
 * u32 buffer, and bcachefs_fid_valid() measures it in u32 words.
 */
struct bcachefs_fid {
	u64		inum;	/* inode number (bi_inum) */
	u32		subvol;	/* subvolume the inode was accessed through */
	u32		gen;	/* inode generation (bi_generation) */
} __packed;
1210 
/*
 * File handle payload for FILEID_BCACHEFS_WITH_PARENT handles: the inode
 * itself followed by the directory passed to bch2_encode_fh().
 */
struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;	/* the inode the handle refers to */
	struct bcachefs_fid	dir;	/* its parent directory */
} __packed;
1215 
1216 static int bcachefs_fid_valid(int fh_len, int fh_type)
1217 {
1218 	switch (fh_type) {
1219 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1220 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1221 	case FILEID_BCACHEFS_WITH_PARENT:
1222 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1223 	default:
1224 		return false;
1225 	}
1226 }
1227 
1228 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1229 {
1230 	return (struct bcachefs_fid) {
1231 		.inum	= inode->ei_inode.bi_inum,
1232 		.subvol	= inode->ei_subvol,
1233 		.gen	= inode->ei_inode.bi_generation,
1234 	};
1235 }
1236 
1237 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1238 			  struct inode *vdir)
1239 {
1240 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1241 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1242 	int min_len;
1243 
1244 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1245 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1246 
1247 		min_len = sizeof(*fid) / sizeof(u32);
1248 		if (*len < min_len) {
1249 			*len = min_len;
1250 			return FILEID_INVALID;
1251 		}
1252 
1253 		fid->fid = bch2_inode_to_fid(inode);
1254 		fid->dir = bch2_inode_to_fid(dir);
1255 
1256 		*len = min_len;
1257 		return FILEID_BCACHEFS_WITH_PARENT;
1258 	} else {
1259 		struct bcachefs_fid *fid = (void *) fh;
1260 
1261 		min_len = sizeof(*fid) / sizeof(u32);
1262 		if (*len < min_len) {
1263 			*len = min_len;
1264 			return FILEID_INVALID;
1265 		}
1266 		*fid = bch2_inode_to_fid(inode);
1267 
1268 		*len = min_len;
1269 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1270 	}
1271 }
1272 
1273 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1274 					struct bcachefs_fid fid)
1275 {
1276 	struct bch_fs *c = sb->s_fs_info;
1277 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1278 				    .subvol = fid.subvol,
1279 				    .inum = fid.inum,
1280 	});
1281 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1282 		iput(vinode);
1283 		vinode = ERR_PTR(-ESTALE);
1284 	}
1285 	return vinode;
1286 }
1287 
1288 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1289 		int fh_len, int fh_type)
1290 {
1291 	struct bcachefs_fid *fid = (void *) _fid;
1292 
1293 	if (!bcachefs_fid_valid(fh_len, fh_type))
1294 		return NULL;
1295 
1296 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1297 }
1298 
1299 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1300 		int fh_len, int fh_type)
1301 {
1302 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1303 
1304 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1305 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1306 		return NULL;
1307 
1308 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1309 }
1310 
1311 static struct dentry *bch2_get_parent(struct dentry *child)
1312 {
1313 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1314 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1315 	subvol_inum parent_inum = {
1316 		.subvol = inode->ei_inode.bi_parent_subvol ?:
1317 			inode->ei_subvol,
1318 		.inum = inode->ei_inode.bi_dir,
1319 	};
1320 
1321 	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1322 }
1323 
1324 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1325 {
1326 	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
1327 	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
1328 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1329 	struct btree_trans *trans;
1330 	struct btree_iter iter1;
1331 	struct btree_iter iter2;
1332 	struct bkey_s_c k;
1333 	struct bkey_s_c_dirent d;
1334 	struct bch_inode_unpacked inode_u;
1335 	subvol_inum target;
1336 	u32 snapshot;
1337 	struct qstr dirent_name;
1338 	unsigned name_len = 0;
1339 	int ret;
1340 
1341 	if (!S_ISDIR(dir->v.i_mode))
1342 		return -EINVAL;
1343 
1344 	trans = bch2_trans_get(c);
1345 
1346 	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1347 			     POS(dir->ei_inode.bi_inum, 0), 0);
1348 	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1349 			     POS(dir->ei_inode.bi_inum, 0), 0);
1350 retry:
1351 	bch2_trans_begin(trans);
1352 
1353 	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
1354 	if (ret)
1355 		goto err;
1356 
1357 	bch2_btree_iter_set_snapshot(&iter1, snapshot);
1358 	bch2_btree_iter_set_snapshot(&iter2, snapshot);
1359 
1360 	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1361 	if (ret)
1362 		goto err;
1363 
1364 	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1365 		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1366 
1367 		k = bch2_btree_iter_peek_slot(&iter1);
1368 		ret = bkey_err(k);
1369 		if (ret)
1370 			goto err;
1371 
1372 		if (k.k->type != KEY_TYPE_dirent) {
1373 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1374 			goto err;
1375 		}
1376 
1377 		d = bkey_s_c_to_dirent(k);
1378 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1379 		if (ret > 0)
1380 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1381 		if (ret)
1382 			goto err;
1383 
1384 		if (target.subvol	== inode->ei_subvol &&
1385 		    target.inum		== inode->ei_inode.bi_inum)
1386 			goto found;
1387 	} else {
1388 		/*
1389 		 * File with multiple hardlinks and our backref is to the wrong
1390 		 * directory - linear search:
1391 		 */
1392 		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1393 			if (k.k->p.inode > dir->ei_inode.bi_inum)
1394 				break;
1395 
1396 			if (k.k->type != KEY_TYPE_dirent)
1397 				continue;
1398 
1399 			d = bkey_s_c_to_dirent(k);
1400 			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1401 			if (ret < 0)
1402 				break;
1403 			if (ret)
1404 				continue;
1405 
1406 			if (target.subvol	== inode->ei_subvol &&
1407 			    target.inum		== inode->ei_inode.bi_inum)
1408 				goto found;
1409 		}
1410 	}
1411 
1412 	ret = -ENOENT;
1413 	goto err;
1414 found:
1415 	dirent_name = bch2_dirent_get_name(d);
1416 
1417 	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1418 	memcpy(name, dirent_name.name, name_len);
1419 	name[name_len] = '\0';
1420 err:
1421 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1422 		goto retry;
1423 
1424 	bch2_trans_iter_exit(trans, &iter1);
1425 	bch2_trans_iter_exit(trans, &iter2);
1426 	bch2_trans_put(trans);
1427 
1428 	return ret;
1429 }
1430 
1431 static const struct export_operations bch_export_ops = {
1432 	.encode_fh	= bch2_encode_fh,
1433 	.fh_to_dentry	= bch2_fh_to_dentry,
1434 	.fh_to_parent	= bch2_fh_to_parent,
1435 	.get_parent	= bch2_get_parent,
1436 	.get_name	= bch2_get_name,
1437 };
1438 
1439 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1440 				struct bch_inode_info *inode,
1441 				struct bch_inode_unpacked *bi,
1442 				struct bch_subvolume *subvol)
1443 {
1444 	bch2_iget5_set(&inode->v, &inum);
1445 	bch2_inode_update_after_write(trans, inode, bi, ~0);
1446 
1447 	if (BCH_SUBVOLUME_SNAP(subvol))
1448 		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1449 	else
1450 		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1451 
1452 	inode->v.i_blocks	= bi->bi_sectors;
1453 	inode->v.i_ino		= bi->bi_inum;
1454 	inode->v.i_rdev		= bi->bi_dev;
1455 	inode->v.i_generation	= bi->bi_generation;
1456 	inode->v.i_size		= bi->bi_size;
1457 
1458 	inode->ei_flags		= 0;
1459 	inode->ei_quota_reserved = 0;
1460 	inode->ei_qid		= bch_qid(bi);
1461 	inode->ei_subvol	= inum.subvol;
1462 
1463 	inode->v.i_mapping->a_ops = &bch_address_space_operations;
1464 
1465 	switch (inode->v.i_mode & S_IFMT) {
1466 	case S_IFREG:
1467 		inode->v.i_op	= &bch_file_inode_operations;
1468 		inode->v.i_fop	= &bch_file_operations;
1469 		break;
1470 	case S_IFDIR:
1471 		inode->v.i_op	= &bch_dir_inode_operations;
1472 		inode->v.i_fop	= &bch_dir_file_operations;
1473 		break;
1474 	case S_IFLNK:
1475 		inode_nohighmem(&inode->v);
1476 		inode->v.i_op	= &bch_symlink_inode_operations;
1477 		break;
1478 	default:
1479 		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1480 		inode->v.i_op	= &bch_special_inode_operations;
1481 		break;
1482 	}
1483 
1484 	mapping_set_large_folios(inode->v.i_mapping);
1485 }
1486 
1487 static struct inode *bch2_alloc_inode(struct super_block *sb)
1488 {
1489 	struct bch_inode_info *inode;
1490 
1491 	inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1492 	if (!inode)
1493 		return NULL;
1494 
1495 	inode_init_once(&inode->v);
1496 	mutex_init(&inode->ei_update_lock);
1497 	two_state_lock_init(&inode->ei_pagecache_lock);
1498 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
1499 	mutex_init(&inode->ei_quota_lock);
1500 
1501 	return &inode->v;
1502 }
1503 
1504 static void bch2_i_callback(struct rcu_head *head)
1505 {
1506 	struct inode *vinode = container_of(head, struct inode, i_rcu);
1507 	struct bch_inode_info *inode = to_bch_ei(vinode);
1508 
1509 	kmem_cache_free(bch2_inode_cache, inode);
1510 }
1511 
1512 static void bch2_destroy_inode(struct inode *vinode)
1513 {
1514 	call_rcu(&vinode->i_rcu, bch2_i_callback);
1515 }
1516 
1517 static int inode_update_times_fn(struct btree_trans *trans,
1518 				 struct bch_inode_info *inode,
1519 				 struct bch_inode_unpacked *bi,
1520 				 void *p)
1521 {
1522 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1523 
1524 	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1525 	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1526 	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1527 
1528 	return 0;
1529 }
1530 
1531 static int bch2_vfs_write_inode(struct inode *vinode,
1532 				struct writeback_control *wbc)
1533 {
1534 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1535 	struct bch_inode_info *inode = to_bch_ei(vinode);
1536 	int ret;
1537 
1538 	mutex_lock(&inode->ei_update_lock);
1539 	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1540 			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1541 	mutex_unlock(&inode->ei_update_lock);
1542 
1543 	return bch2_err_class(ret);
1544 }
1545 
1546 static void bch2_evict_inode(struct inode *vinode)
1547 {
1548 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1549 	struct bch_inode_info *inode = to_bch_ei(vinode);
1550 
1551 	truncate_inode_pages_final(&inode->v.i_data);
1552 
1553 	clear_inode(&inode->v);
1554 
1555 	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1556 
1557 	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1558 		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1559 				KEY_TYPE_QUOTA_WARN);
1560 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1561 				KEY_TYPE_QUOTA_WARN);
1562 		bch2_inode_rm(c, inode_inum(inode));
1563 	}
1564 
1565 	mutex_lock(&c->vfs_inodes_lock);
1566 	list_del_init(&inode->ei_vfs_inode_list);
1567 	mutex_unlock(&c->vfs_inodes_lock);
1568 }
1569 
/*
 * Push every cached VFS inode whose subvolume id is in @s out of the inode
 * cache, returning only after a pass over c->vfs_inodes_list found nothing
 * left to mark or wait for.
 *
 * The list is walked repeatedly under c->vfs_inodes_lock; inodes that need
 * work are pinned with igrab() and processed after the lock is dropped.
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	/* Preallocate so that pushes under the mutex rarely need to allocate */
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			/*
			 * Out of room in @grabbed: drop the reference just
			 * taken and process what we have; this inode will be
			 * found again on a later pass.
			 */
			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			/*
			 * Previous pass was clean and so is this one so far,
			 * yet a matching inode is still on the list: sleep on
			 * its __I_NEW bit waitqueue, then rescan from the
			 * start since the list may have changed.
			 */
			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);

			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq, &wait.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	/* Outside the lock: mark each pinned inode, prune its dentries, unpin */
	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	/* Finish only after two consecutive clean passes */
	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}
1633 
1634 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1635 {
1636 	struct super_block *sb = dentry->d_sb;
1637 	struct bch_fs *c = sb->s_fs_info;
1638 	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1639 	unsigned shift = sb->s_blocksize_bits - 9;
1640 	/*
1641 	 * this assumes inodes take up 64 bytes, which is a decent average
1642 	 * number:
1643 	 */
1644 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1645 
1646 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
1647 	buf->f_bsize	= sb->s_blocksize;
1648 	buf->f_blocks	= usage.capacity >> shift;
1649 	buf->f_bfree	= usage.free >> shift;
1650 	buf->f_bavail	= avail_factor(usage.free) >> shift;
1651 
1652 	buf->f_files	= usage.nr_inodes + avail_inodes;
1653 	buf->f_ffree	= avail_inodes;
1654 
1655 	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
1656 	buf->f_namelen	= BCH_NAME_MAX;
1657 
1658 	return 0;
1659 }
1660 
1661 static int bch2_sync_fs(struct super_block *sb, int wait)
1662 {
1663 	struct bch_fs *c = sb->s_fs_info;
1664 	int ret;
1665 
1666 	if (c->opts.journal_flush_disabled)
1667 		return 0;
1668 
1669 	if (!wait) {
1670 		bch2_journal_flush_async(&c->journal, NULL);
1671 		return 0;
1672 	}
1673 
1674 	ret = bch2_journal_flush(&c->journal);
1675 	return bch2_err_class(ret);
1676 }
1677 
1678 static struct bch_fs *bch2_path_to_fs(const char *path)
1679 {
1680 	struct bch_fs *c;
1681 	dev_t dev;
1682 	int ret;
1683 
1684 	ret = lookup_bdev(path, &dev);
1685 	if (ret)
1686 		return ERR_PTR(ret);
1687 
1688 	c = bch2_dev_to_fs(dev);
1689 	if (c)
1690 		closure_put(&c->cl);
1691 	return c ?: ERR_PTR(-ENOENT);
1692 }
1693 
1694 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1695 {
1696 	struct bch_fs *c = sb->s_fs_info;
1697 	struct bch_opts opts = bch2_opts_empty();
1698 	int ret;
1699 
1700 	ret = bch2_parse_mount_opts(c, &opts, data);
1701 	if (ret)
1702 		goto err;
1703 
1704 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1705 
1706 	if (opts.read_only != c->opts.read_only) {
1707 		down_write(&c->state_lock);
1708 
1709 		if (opts.read_only) {
1710 			bch2_fs_read_only(c);
1711 
1712 			sb->s_flags |= SB_RDONLY;
1713 		} else {
1714 			ret = bch2_fs_read_write(c);
1715 			if (ret) {
1716 				bch_err(c, "error going rw: %i", ret);
1717 				up_write(&c->state_lock);
1718 				ret = -EINVAL;
1719 				goto err;
1720 			}
1721 
1722 			sb->s_flags &= ~SB_RDONLY;
1723 		}
1724 
1725 		c->opts.read_only = opts.read_only;
1726 
1727 		up_write(&c->state_lock);
1728 	}
1729 
1730 	if (opt_defined(opts, errors))
1731 		c->opts.errors = opts.errors;
1732 err:
1733 	return bch2_err_class(ret);
1734 }
1735 
1736 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1737 {
1738 	struct bch_fs *c = root->d_sb->s_fs_info;
1739 	bool first = true;
1740 
1741 	for_each_online_member(c, ca) {
1742 		if (!first)
1743 			seq_putc(seq, ':');
1744 		first = false;
1745 		seq_puts(seq, ca->disk_sb.sb_name);
1746 	}
1747 
1748 	return 0;
1749 }
1750 
1751 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1752 {
1753 	struct bch_fs *c = root->d_sb->s_fs_info;
1754 	enum bch_opt_id i;
1755 	struct printbuf buf = PRINTBUF;
1756 	int ret = 0;
1757 
1758 	for (i = 0; i < bch2_opts_nr; i++) {
1759 		const struct bch_option *opt = &bch2_opt_table[i];
1760 		u64 v = bch2_opt_get_by_id(&c->opts, i);
1761 
1762 		if (!(opt->flags & OPT_MOUNT))
1763 			continue;
1764 
1765 		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1766 			continue;
1767 
1768 		printbuf_reset(&buf);
1769 		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
1770 				 OPT_SHOW_MOUNT_STYLE);
1771 		seq_putc(seq, ',');
1772 		seq_puts(seq, buf.buf);
1773 	}
1774 
1775 	if (buf.allocation_failure)
1776 		ret = -ENOMEM;
1777 	printbuf_exit(&buf);
1778 	return ret;
1779 }
1780 
1781 static void bch2_put_super(struct super_block *sb)
1782 {
1783 	struct bch_fs *c = sb->s_fs_info;
1784 
1785 	__bch2_fs_stop(c);
1786 }
1787 
1788 /*
1789  * bcachefs doesn't currently integrate intwrite freeze protection but the
1790  * internal write references serve the same purpose. Therefore reuse the
1791  * read-only transition code to perform the quiesce. The caveat is that we don't
1792  * currently have the ability to block tasks that want a write reference while
1793  * the superblock is frozen. This is fine for now, but we should either add
1794  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1795  */
1796 static int bch2_freeze(struct super_block *sb)
1797 {
1798 	struct bch_fs *c = sb->s_fs_info;
1799 
1800 	down_write(&c->state_lock);
1801 	bch2_fs_read_only(c);
1802 	up_write(&c->state_lock);
1803 	return 0;
1804 }
1805 
1806 static int bch2_unfreeze(struct super_block *sb)
1807 {
1808 	struct bch_fs *c = sb->s_fs_info;
1809 	int ret;
1810 
1811 	if (test_bit(BCH_FS_emergency_ro, &c->flags))
1812 		return 0;
1813 
1814 	down_write(&c->state_lock);
1815 	ret = bch2_fs_read_write(c);
1816 	up_write(&c->state_lock);
1817 	return ret;
1818 }
1819 
1820 static const struct super_operations bch_super_operations = {
1821 	.alloc_inode	= bch2_alloc_inode,
1822 	.destroy_inode	= bch2_destroy_inode,
1823 	.write_inode	= bch2_vfs_write_inode,
1824 	.evict_inode	= bch2_evict_inode,
1825 	.sync_fs	= bch2_sync_fs,
1826 	.statfs		= bch2_statfs,
1827 	.show_devname	= bch2_show_devname,
1828 	.show_options	= bch2_show_options,
1829 	.remount_fs	= bch2_remount,
1830 	.put_super	= bch2_put_super,
1831 	.freeze_fs	= bch2_freeze,
1832 	.unfreeze_fs	= bch2_unfreeze,
1833 };
1834 
1835 static int bch2_set_super(struct super_block *s, void *data)
1836 {
1837 	s->s_fs_info = data;
1838 	return 0;
1839 }
1840 
1841 static int bch2_noset_super(struct super_block *s, void *data)
1842 {
1843 	return -EBUSY;
1844 }
1845 
/* Dynamic array of filesystem pointers, one per device path given to mount */
typedef DARRAY(struct bch_fs *) darray_fs;
1847 
1848 static int bch2_test_super(struct super_block *s, void *data)
1849 {
1850 	struct bch_fs *c = s->s_fs_info;
1851 	darray_fs *d = data;
1852 
1853 	if (!c)
1854 		return false;
1855 
1856 	darray_for_each(*d, i)
1857 		if (c != *i)
1858 			return false;
1859 	return true;
1860 }
1861 
/*
 * ->mount: mount the filesystem made up of the devices named in @dev_name.
 *
 * @fs_type:	filesystem type being mounted
 * @flags:	SB_* mount flags; SB_RDONLY selects a read-only mount
 * @dev_name:	device path list, split by bch2_split_devs()
 * @data:	mount option string
 *
 * If a superblock already exists for these devices it is reused (failing with
 * -EBUSY if its read-only state differs from the request); otherwise the
 * filesystem is opened, a new superblock is created and the root dentry is
 * set up.
 *
 * Returns a reference to the root dentry, or an ERR_PTR on failure.
 */
static struct dentry *bch2_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	struct bch_fs *c;
	struct super_block *sb;
	struct inode *vinode;
	struct bch_opts opts = bch2_opts_empty();
	int ret;

	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

	/* First parse, without a filesystem: bch2_parse_mount_opts() is called again below */
	ret = bch2_parse_mount_opts(NULL, &opts, data);
	if (ret) {
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	if (!dev_name || strlen(dev_name) == 0)
		return ERR_PTR(-EINVAL);

	darray_str devs;
	ret = bch2_split_devs(dev_name, &devs);
	if (ret)
		return ERR_PTR(ret);

	/* Resolve each device path to the filesystem (or ERR_PTR) that owns it */
	darray_fs devs_to_fs = {};
	darray_for_each(devs, i) {
		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
		if (ret) {
			sb = ERR_PTR(ret);
			goto got_sb;
		}
	}

	/*
	 * Look for an existing superblock whose filesystem owns all of the
	 * devices. bch2_noset_super() always fails, so this call is only
	 * expected to succeed by matching an existing superblock.
	 */
	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs.data, devs.nr, opts);
	if (IS_ERR(c)) {
		sb = ERR_CAST(c);
		goto got_sb;
	}

	/* Some options can't be parsed until after the fs is started: */
	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret) {
		bch2_fs_stop(c);
		sb = ERR_PTR(ret);
		goto got_sb;
	}

	bch2_opts_apply(&c->opts, opts);

	/* No test callback: bch2_set_super() attaches @c to the superblock */
	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
	if (IS_ERR(sb))
		bch2_fs_stop(c);
got_sb:
	/* Reached on success and failure alike; sb may be an ERR_PTR */
	darray_exit(&devs_to_fs);
	bch2_darray_str_exit(&devs);

	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	c = sb->s_fs_info;

	/* Already has a root: reuse it, provided the ro/rw state matches */
	if (sb->s_root) {
		if ((flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	/* Fresh superblock: fill in the VFS-visible parameters */
	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	sb->s_uuid		= c->sb.user_uuid;
	c->vfs_sb		= sb;
	strscpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;

	/* Only the first online member is used: the loop breaks after one pass */
	for_each_online_member(c, ca) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		/* Breaking out early, so drop the member's io_ref by hand */
		percpu_ref_put(&ca->io_ref);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	sb->s_shrink->seeks = 0;

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	ret = PTR_ERR_OR_ZERO(vinode);
	bch_err_msg(c, ret, "mounting: error getting root inode");
	if (ret)
		goto err_put_super;

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	return dget(sb->s_root);

err_put_super:
	deactivate_locked_super(sb);
	return ERR_PTR(bch2_err_class(ret));
}
2003 
2004 static void bch2_kill_sb(struct super_block *sb)
2005 {
2006 	struct bch_fs *c = sb->s_fs_info;
2007 
2008 	generic_shutdown_super(sb);
2009 	bch2_fs_free(c);
2010 }
2011 
2012 static struct file_system_type bcache_fs_type = {
2013 	.owner		= THIS_MODULE,
2014 	.name		= "bcachefs",
2015 	.mount		= bch2_mount,
2016 	.kill_sb	= bch2_kill_sb,
2017 	.fs_flags	= FS_REQUIRES_DEV,
2018 };
2019 
2020 MODULE_ALIAS_FS("bcachefs");
2021 
2022 void bch2_vfs_exit(void)
2023 {
2024 	unregister_filesystem(&bcache_fs_type);
2025 	kmem_cache_destroy(bch2_inode_cache);
2026 }
2027 
2028 int __init bch2_vfs_init(void)
2029 {
2030 	int ret = -ENOMEM;
2031 
2032 	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
2033 	if (!bch2_inode_cache)
2034 		goto err;
2035 
2036 	ret = register_filesystem(&bcache_fs_type);
2037 	if (ret)
2038 		goto err;
2039 
2040 	return 0;
2041 err:
2042 	bch2_vfs_exit();
2043 	return ret;
2044 }
2045 
2046 #endif /* NO_BCACHEFS_FS */
2047