xref: /linux/fs/bcachefs/fs.c (revision 45d8b572fac3aa8b49d53c946b3685eaf78a2824)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
43 static struct kmem_cache *bch2_inode_cache;
44 
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46 				struct bch_inode_info *,
47 				struct bch_inode_unpacked *,
48 				struct bch_subvolume *);
49 
/*
 * Copy fields from a freshly written btree inode (@bi) back into the VFS
 * inode, after a successful btree update.
 *
 * @trans:  transaction that performed the update; must still hold the btree
 *          lock covering the inode's key (asserted below) — that lock, not
 *          ei_update_lock, is what protects inode->ei_inode here
 * @inode:  VFS inode to refresh
 * @bi:     unpacked btree inode that was just written
 * @fields: ATTR_* mask selecting which timestamps to copy to the VFS inode
 */
void bch2_inode_update_after_write(struct btree_trans *trans,
				   struct bch_inode_info *inode,
				   struct bch_inode_unpacked *bi,
				   unsigned fields)
{
	struct bch_fs *c = trans->c;

	BUG_ON(bi->bi_inum != inode->v.i_ino);

	/* verify we still hold the lock that serializes ei_inode updates: */
	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
			       POS(0, bi->bi_inum),
			       c->opts.inodes_use_key_cache);

	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
	i_uid_write(&inode->v, bi->bi_uid);
	i_gid_write(&inode->v, bi->bi_gid);
	inode->v.i_mode	= bi->bi_mode;

	if (fields & ATTR_ATIME)
		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
	if (fields & ATTR_MTIME)
		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
	if (fields & ATTR_CTIME)
		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

	inode->ei_inode		= *bi;

	bch2_inode_flags_to_vfs(inode);
}
79 
/*
 * Update an inode in the btree, retrying on transaction restart.
 *
 * @set is an optional callback applied to the unpacked inode inside the
 * transaction (may be run multiple times on restart, so it must be
 * idempotent); @fields is the ATTR_* mask passed through to
 * bch2_inode_update_after_write() on success.
 *
 * Returns 0 or a negative bch2 error code.
 */
int __must_check bch2_write_inode(struct bch_fs *c,
				  struct bch_inode_info *inode,
				  inode_set_fn set,
				  void *p, unsigned fields)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
				BTREE_ITER_INTENT) ?:
		(set ? set(trans, inode, &inode_u, p) : 0) ?:
		bch2_inode_write(trans, &iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

	/*
	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
	 * this is important for inode updates via bchfs_write_index_update
	 */
	if (!ret)
		bch2_inode_update_after_write(trans, inode, &inode_u, fields);

	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/* the inode vanishing under us is a filesystem-level inconsistency: */
	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
			     "%s: inode %u:%llu not found when updating",
			     bch2_err_str(ret),
			     inode_inum(inode).subvol,
			     inode_inum(inode).inum);

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}
119 
/*
 * Transfer an inode's quota usage (blocks + reservations) to new quota IDs.
 *
 * Only the quota types that are both enabled on the filesystem and actually
 * changing are transferred; on success the inode's cached ei_qid is updated
 * for those types under ei_quota_lock.
 *
 * @mode controls enforcement (e.g. KEY_TYPE_QUOTA_PREALLOC fails if the
 * destination would exceed its limit, NOCHECK never fails).
 */
int bch2_fs_quota_transfer(struct bch_fs *c,
			   struct bch_inode_info *inode,
			   struct bch_qid new_qid,
			   unsigned qtypes,
			   enum quota_acct_mode mode)
{
	unsigned i;
	int ret;

	qtypes &= enabled_qtypes(c);

	/* skip quota types whose id isn't actually changing: */
	for (i = 0; i < QTYP_NR; i++)
		if (new_qid.q[i] == inode->ei_qid.q[i])
			qtypes &= ~(1U << i);

	if (!qtypes)
		return 0;

	mutex_lock(&inode->ei_quota_lock);

	ret = bch2_quota_transfer(c, qtypes, new_qid,
				  inode->ei_qid,
				  inode->v.i_blocks +
				  inode->ei_quota_reserved,
				  mode);
	if (!ret)
		for (i = 0; i < QTYP_NR; i++)
			if (qtypes & (1 << i))
				inode->ei_qid.q[i] = new_qid.q[i];

	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}
154 
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157 	struct bch_inode_info *inode = to_bch_ei(vinode);
158 	subvol_inum *inum = p;
159 
160 	return inode->ei_subvol == inum->subvol &&
161 		inode->ei_inode.bi_inum == inum->inum;
162 }
163 
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166 	struct bch_inode_info *inode = to_bch_ei(vinode);
167 	subvol_inum *inum = p;
168 
169 	inode->v.i_ino		= inum->inum;
170 	inode->ei_subvol	= inum->subvol;
171 	inode->ei_inode.bi_inum	= inum->inum;
172 	return 0;
173 }
174 
175 static unsigned bch2_inode_hash(subvol_inum inum)
176 {
177 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
178 }
179 
/*
 * Insert a newly initialized inode into the VFS inode hash.
 *
 * If another thread raced us and already inserted an inode for the same
 * (subvolume, inum), free ours and return the existing one; otherwise add
 * the inode to the filesystem's vfs_inodes_list and clear I_NEW.
 *
 * Always returns a usable inode (BUGs on allocation failure inside
 * inode_insert5(), which shouldn't happen for an already-allocated inode).
 */
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
	subvol_inum inum = inode_inum(inode);
	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
				      bch2_inode_hash(inum),
				      bch2_iget5_test,
				      bch2_iget5_set,
				      &inum));
	BUG_ON(!old);

	if (unlikely(old != inode)) {
		/* raced: another thread inserted first; discard ours */
		__destroy_inode(&inode->v);
		kmem_cache_free(bch2_inode_cache, inode);
		inode = old;
	} else {
		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		/*
		 * we really don't want insert_inode_locked2() to be setting
		 * I_NEW...
		 */
		unlock_new_inode(&inode->v);
	}

	return inode;
}
207 
/*
 * Evaluate @_do with the given PF_MEMALLOC_* flags set on the current task,
 * restoring the previous flags afterwards.
 *
 * Pair memalloc_flags_save() with memalloc_flags_restore(): the noreclaim
 * variant only restores PF_MEMALLOC, so it could leave flags such as
 * PF_MEMALLOC_NORECLAIM/PF_MEMALLOC_NOWARN stuck on the task.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_flags_restore(_saved_flags);					\
	_ret;									\
})
215 
216 /*
217  * Allocate a new inode, dropping/retaking btree locks if necessary:
218  */
/*
 * Allocate a new inode, dropping/retaking btree locks if necessary:
 *
 * First try a non-blocking, non-reclaiming allocation while btree locks are
 * held (reclaim could recurse into the filesystem and deadlock); if that
 * fails, drop the transaction's locks and retry with a normal allocation.
 * Returns the inode or an ERR_PTR.
 */
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_info *inode =
		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
				  to_bch_ei(new_inode(c->vfs_sb)));

	if (unlikely(!inode)) {
		/* drop_locks_do() may fail with a restart error even if the
		 * allocation itself succeeded; free the inode in that case */
		int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
		if (ret && inode) {
			__destroy_inode(&inode->v);
			kmem_cache_free(bch2_inode_cache, inode);
		}
		if (ret)
			return ERR_PTR(ret);
	}

	return inode;
}
239 
/*
 * Look up (or create) the VFS inode for a (subvolume, inum) pair.
 *
 * Fast path: find it in the inode hash without blocking on I_NEW. Slow path:
 * read the inode and its subvolume from the btree, allocate and initialize a
 * VFS inode, and insert it into the hash (handling insertion races).
 * Returns the inode or an ERR_PTR.
 */
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	if (inode)
		return &inode->v;

	struct btree_trans *trans = bch2_trans_get(c);

	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;
	int ret = lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
	if (!ret) {
		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
		inode = bch2_inode_insert(c, inode);
	}
	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
}
266 
/*
 * Common create path for mknod/mkdir/symlink/tmpfile/subvolume creation.
 *
 * @snapshot_src: nonzero to create a snapshot of an existing subvolume
 * @flags:        BCH_CREATE_* (TMPFILE creates are not linked into @dir and
 *                skip dir locking/update)
 *
 * Returns the new inode (already in the inode cache) or an ERR_PTR.
 */
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = to_bch_ei(new_inode(c->vfs_sb));
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	/* serialize against other updates of the directory inode: */
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	/* a new subvolume root lives in its own subvolume: */
	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true,
				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		/* undo the quota preallocation taken above: */
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 */
	inode = bch2_inode_insert(c, inode);
	bch2_trans_put(trans);
err:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}
369 
370 /* methods */
371 
/*
 * Transactional directory-entry lookup: resolve @name in @dir to a VFS
 * inode, reading the dirent target and pulling the inode in from the btree
 * if it isn't already cached.
 *
 * Returns the inode or an ERR_PTR (-ENOENT if the name doesn't exist or the
 * dirent has no target in this snapshot).
 */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	struct btree_iter dirent_iter = {};
	subvol_inum inum = {};

	int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
				   dir_hash_info, dir, name, 0);
	if (ret)
		return ERR_PTR(ret);

	struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	/* > 0 means the dirent's target isn't visible in this snapshot: */
	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	/* fast path: inode already in the VFS inode cache */
	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	if (inode)
		goto out;

	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;
	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
	if (bch2_err_matches(ret, ENOENT)) {
		/* dangling dirent — report the inconsistency: */
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		bch_err(c, "%s points to missing inode", buf.buf);
		printbuf_exit(&buf);
	}
	if (ret)
		goto err;

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
	inode = bch2_inode_insert(c, inode);
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	return inode;
err:
	inode = ERR_PTR(ret);
	goto out;
}
428 
429 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
430 				  unsigned int flags)
431 {
432 	struct bch_fs *c = vdir->i_sb->s_fs_info;
433 	struct bch_inode_info *dir = to_bch_ei(vdir);
434 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
435 
436 	struct bch_inode_info *inode;
437 	bch2_trans_do(c, NULL, NULL, 0,
438 		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
439 							  &hash, &dentry->d_name)));
440 	if (IS_ERR(inode))
441 		inode = NULL;
442 
443 	return d_splice_alias(&inode->v, dentry);
444 }
445 
446 static int bch2_mknod(struct mnt_idmap *idmap,
447 		      struct inode *vdir, struct dentry *dentry,
448 		      umode_t mode, dev_t rdev)
449 {
450 	struct bch_inode_info *inode =
451 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
452 			      (subvol_inum) { 0 }, 0);
453 
454 	if (IS_ERR(inode))
455 		return bch2_err_class(PTR_ERR(inode));
456 
457 	d_instantiate(dentry, &inode->v);
458 	return 0;
459 }
460 
/* ->create(): regular-file creation is just mknod with S_IFREG */
static int bch2_create(struct mnt_idmap *idmap,
		       struct inode *vdir, struct dentry *dentry,
		       umode_t mode, bool excl)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
}
467 
/*
 * Core hard-link helper: add a dirent for @inode in @dir and bump its link
 * count, in one transaction; on success refresh both VFS inodes.
 *
 * Takes inode->ei_update_lock; returns 0 or a negative bch2 error code.
 */
static int __bch2_link(struct bch_fs *c,
		       struct bch_inode_info *inode,
		       struct bch_inode_info *dir,
		       struct dentry *dentry)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_link_trans(trans,
					inode_inum(dir),   &dir_u,
					inode_inum(inode), &inode_u,
					&dentry->d_name));

	if (likely(!ret)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
	}

	bch2_trans_put(trans);
	mutex_unlock(&inode->ei_update_lock);
	return ret;
}
495 
/*
 * ->link(): hard-link old_dentry's inode into @vdir, refusing if either the
 * target directory's or the inode's subvolume is read-only.
 */
static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
		     struct dentry *dentry)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
	int ret;

	lockdep_assert_held(&inode->v.i_rwsem);

	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
		__bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		return bch2_err_class(ret);

	/* take the extra reference the new dentry will hold: */
	ihold(&inode->v);
	d_instantiate(dentry, &inode->v);
	return 0;
}
516 
/*
 * Core unlink helper, shared with rmdir and subvolume deletion.
 *
 * @deleting_snapshot: passed through to bch2_unlink_trans() to control how a
 * subvolume root is handled. Locks both the directory and the victim inode
 * for update.
 */
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;

	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

	ret = commit_do(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		bch2_unlink_trans(trans,
				  inode_inum(dir), &dir_u,
				  &inode_u, &dentry->d_name,
				  deleting_snapshot));
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(trans, dir, &dir_u,
				      ATTR_MTIME|ATTR_CTIME);
	bch2_inode_update_after_write(trans, inode, &inode_u,
				      ATTR_MTIME);

	if (inode_u.bi_subvol) {
		/*
		 * Subvolume deletion is asynchronous, but we still want to tell
		 * the VFS that it's been deleted here:
		 */
		set_nlink(&inode->v, 0);
	}
err:
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
	bch2_trans_put(trans);

	return ret;
}
556 
557 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
558 {
559 	struct bch_inode_info *dir= to_bch_ei(vdir);
560 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
561 
562 	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
563 		__bch2_unlink(vdir, dentry, false);
564 	return bch2_err_class(ret);
565 }
566 
/*
 * ->symlink(): create the symlink inode as a tmpfile, write the target
 * string through the page cache, flush it, then link the inode into the
 * directory — so a crash can't leave a linked symlink with no contents.
 */
static int bch2_symlink(struct mnt_idmap *idmap,
			struct inode *vdir, struct dentry *dentry,
			const char *symname)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
	int ret;

	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	inode_lock(&inode->v);
	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
	inode_unlock(&inode->v);

	if (unlikely(ret))
		goto err;

	/* make sure the symlink body is on disk before it becomes visible: */
	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
	if (unlikely(ret))
		goto err;

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	iput(&inode->v);
	return bch2_err_class(ret);
}
601 
/* ->mkdir(): directory creation is just mknod with S_IFDIR */
static int bch2_mkdir(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}
607 
/*
 * ->rename2(): rename/exchange between two directories.
 *
 * Supports RENAME_NOREPLACE and RENAME_EXCHANGE. Locks all (up to four)
 * involved inodes, transfers project quota if the move crosses project
 * boundaries, performs the rename in one transaction, then refreshes the
 * VFS inodes. On failure, the unconditional quota transfers after the err
 * label (NOCHECK mode, back to each inode's current project) undo any
 * speculative transfer done before the transaction.
 */
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
		return -EINVAL;

	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	trans = bch2_trans_get(c);

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
	if (ret)
		goto err;

	/* moving into a directory with a different project id? */
	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_rename_trans(trans,
					  inode_inum(src_dir), &src_dir_u,
					  inode_inum(dst_dir), &dst_dir_u,
					  &src_inode_u,
					  &dst_inode_u,
					  &src_dentry->d_name,
					  &dst_dentry->d_name,
					  mode));
	if (unlikely(ret))
		goto err;

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);

	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	/* settle quota to whatever project the inodes actually ended up in: */
	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}
719 
/*
 * Copy the attributes selected by attr->ia_valid into the unpacked btree
 * inode @bi. Mirrors setattr_copy(), but writes to the btree representation
 * instead of the VFS inode; includes the setgid-stripping rule for chmod by
 * a non-member without CAP_FSETID.
 */
static void bch2_setattr_copy(struct mnt_idmap *idmap,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & ATTR_UID)
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
	if (ia_valid & ATTR_GID)
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;

	if (ia_valid & ATTR_ATIME)
		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
	if (ia_valid & ATTR_MTIME)
		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
	if (ia_valid & ATTR_CTIME)
		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		/* use the gid being set in this same call, if any: */
		kgid_t gid = ia_valid & ATTR_GID
			? attr->ia_gid
			: inode->v.i_gid;

		/* strip setgid unless caller is in the group or privileged: */
		if (!in_group_p(gid) &&
		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
}
755 
/*
 * Apply all non-ATTR_SIZE attribute changes: transfer quota on uid/gid
 * change, then update the btree inode (and the ACL on chmod) in a retryable
 * transaction, and finally refresh the VFS inode and cached ACL.
 *
 * Returns a standard (class-converted) errno.
 */
int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID)
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);

	if (attr->ia_valid & ATTR_GID)
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

	/* move usage to the new uid/gid before committing the change: */
	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	/* a previous iteration may have allocated an ACL; start fresh */
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_INTENT);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
825 
/*
 * ->getattr(): fill @stat from the cached VFS inode, plus bcachefs-specific
 * extras: birth time (STATX_BTIME) and the immutable/append/nodump
 * attribute flags from the btree inode.
 */
static int bch2_getattr(struct mnt_idmap *idmap,
			const struct path *path, struct kstat *stat,
			u32 request_mask, unsigned query_flags)
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	stat->dev	= inode->v.i_sb->s_dev;
	stat->ino	= inode->v.i_ino;
	stat->mode	= inode->v.i_mode;
	stat->nlink	= inode->v.i_nlink;
	stat->uid	= inode->v.i_uid;
	stat->gid	= inode->v.i_gid;
	stat->rdev	= inode->v.i_rdev;
	stat->size	= i_size_read(&inode->v);
	stat->atime	= inode_get_atime(&inode->v);
	stat->mtime	= inode_get_mtime(&inode->v);
	stat->ctime	= inode_get_ctime(&inode->v);
	stat->blksize	= block_bytes(c);
	stat->blocks	= inode->v.i_blocks;

	if (request_mask & STATX_BTIME) {
		stat->result_mask |= STATX_BTIME;
		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
	}

	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;

	if (inode->ei_inode.bi_flags & BCH_INODE_append)
		stat->attributes |= STATX_ATTR_APPEND;
	stat->attributes_mask	 |= STATX_ATTR_APPEND;

	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
		stat->attributes |= STATX_ATTR_NODUMP;
	stat->attributes_mask	 |= STATX_ATTR_NODUMP;

	return 0;
}
866 
867 static int bch2_setattr(struct mnt_idmap *idmap,
868 			struct dentry *dentry, struct iattr *iattr)
869 {
870 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
871 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
872 	int ret;
873 
874 	lockdep_assert_held(&inode->v.i_rwsem);
875 
876 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
877 		setattr_prepare(idmap, dentry, iattr);
878 	if (ret)
879 		return ret;
880 
881 	return iattr->ia_valid & ATTR_SIZE
882 		? bchfs_truncate(idmap, inode, iattr)
883 		: bch2_setattr_nonsize(idmap, inode, iattr);
884 }
885 
/*
 * ->tmpfile(): create an unlinked inode (no dirent) and attach it to the
 * open file's dentry.
 */
static int bch2_tmpfile(struct mnt_idmap *idmap,
			struct inode *vdir, struct file *file, umode_t mode)
{
	struct bch_inode_info *inode =
		__bch2_create(idmap, to_bch_ei(vdir),
			      file->f_path.dentry, mode, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);

	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	d_mark_tmpfile(file, &inode->v);
	d_instantiate(file->f_path.dentry, &inode->v);
	return finish_open_simple(file, 0);
}
901 
/*
 * Report one extent key to fiemap: emit one fiemap extent per pointer for
 * direct data, or a single entry for inline data / reservations. Sector
 * quantities are shifted by 9 to produce byte offsets/lengths.
 * Returns 0, a negative error, or 1 once the user buffer is full
 * (fiemap_fill_next_extent()'s convention).
 */
static int bch2_fill_extent(struct bch_fs *c,
			    struct fiemap_extent_info *info,
			    struct bkey_s_c k, unsigned flags)
{
	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		int ret;

		/* reflinked data is shared by definition: */
		if (k.k->type == KEY_TYPE_reflink_v)
			flags |= FIEMAP_EXTENT_SHARED;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			int flags2 = 0;
			u64 offset = p.ptr.offset;

			if (p.ptr.unwritten)
				flags2 |= FIEMAP_EXTENT_UNWRITTEN;

			if (p.crc.compression_type)
				flags2 |= FIEMAP_EXTENT_ENCODED;
			else
				offset += p.crc.offset;

			/* flag extents not aligned to the fs block size: */
			if ((offset & (block_sectors(c) - 1)) ||
			    (k.k->size & (block_sectors(c) - 1)))
				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

			ret = fiemap_fill_next_extent(info,
						bkey_start_offset(k.k) << 9,
						offset << 9,
						k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		/* callers only pass data/reservation keys — anything else
		 * indicates a logic error: */
		BUG();
	}
}
957 
/*
 * ->fiemap(): walk the extents btree for [start, start+len) and report each
 * extent to userspace.
 *
 * Extents are buffered one behind (prev/cur) so the final extent can be
 * reported with FIEMAP_EXTENT_LAST. Reflink pointers are resolved to the
 * indirect extent they point at. The transaction is unlocked around
 * fiemap_fill_next_extent() since that copies to userspace and may fault.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	u32 snapshot;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	/* reject byte ranges that wrap around: */
	if (start + len < start)
		return -EINVAL;

	/* work in 512-byte sectors from here on: */
	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(ei->v.i_ino, start, snapshot), 0);

	while (!(ret = btree_trans_too_many_iters(trans)) &&
	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
	       !(ret = bkey_err(k))) {
		enum btree_id data_btree = BTREE_ID_extents;

		/* holes and non-data keys aren't reported: */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		/* resolve reflink pointers to the indirect extent: */
		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* trim the copy to the portion we're actually reporting: */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		if (have_extent) {
			/* unlock before copying out to userspace: */
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
	}
	/* remember progress so a restart resumes where we left off: */
	start = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret < 0 ? ret : 0;
}
1061 
/* VM operations for mmap'd bcachefs files: faults go through bcachefs' folio code */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = bch2_page_mkwrite,
};
1067 
1068 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1069 {
1070 	file_accessed(file);
1071 
1072 	vma->vm_ops = &bch_vm_ops;
1073 	return 0;
1074 }
1075 
1076 /* Directories: */
1077 
/* Directory llseek: allow seeking over the full s64 offset range */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}
1083 
1084 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1085 {
1086 	struct bch_inode_info *inode = file_bch_inode(file);
1087 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1088 
1089 	if (!dir_emit_dots(file, ctx))
1090 		return 0;
1091 
1092 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1093 
1094 	bch_err_fn(c, ret);
1095 	return bch2_err_class(ret);
1096 }
1097 
1098 static int bch2_open(struct inode *vinode, struct file *file)
1099 {
1100 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1101 		struct bch_inode_info *inode = to_bch_ei(vinode);
1102 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1103 
1104 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1105 		if (ret)
1106 			return ret;
1107 	}
1108 
1109 	return generic_file_open(vinode, file);
1110 }
1111 
/* File operations for regular files */
static const struct file_operations bch_file_operations = {
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};
1128 
/* Inode operations for regular files */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1139 
/* Inode operations for directories (note: rmdir shares the unlink implementation) */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1159 
/* File operations for directories */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};
1170 
/* Inode operations for symlinks */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1181 
/* Inode operations for special inodes (device nodes, fifos, sockets) */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1191 
/* Page cache operations */
static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
	.direct_IO	= noop_direct_IO,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_folio = generic_error_remove_folio,
};
1207 
/*
 * NFS file handle: a bcachefs inode is identified by (subvolume, inode
 * number); the generation number lets stale handles be detected.
 */
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;
1213 
/* File handle variant that also records the parent directory */
struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;
1218 
1219 static int bcachefs_fid_valid(int fh_len, int fh_type)
1220 {
1221 	switch (fh_type) {
1222 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1223 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1224 	case FILEID_BCACHEFS_WITH_PARENT:
1225 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1226 	default:
1227 		return false;
1228 	}
1229 }
1230 
/* Build an NFS fid from an in-memory inode */
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
	return (struct bcachefs_fid) {
		.inum	= inode->ei_inode.bi_inum,
		.subvol	= inode->ei_subvol,
		.gen	= inode->ei_inode.bi_generation,
	};
}
1239 
1240 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1241 			  struct inode *vdir)
1242 {
1243 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1244 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1245 	int min_len;
1246 
1247 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1248 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1249 
1250 		min_len = sizeof(*fid) / sizeof(u32);
1251 		if (*len < min_len) {
1252 			*len = min_len;
1253 			return FILEID_INVALID;
1254 		}
1255 
1256 		fid->fid = bch2_inode_to_fid(inode);
1257 		fid->dir = bch2_inode_to_fid(dir);
1258 
1259 		*len = min_len;
1260 		return FILEID_BCACHEFS_WITH_PARENT;
1261 	} else {
1262 		struct bcachefs_fid *fid = (void *) fh;
1263 
1264 		min_len = sizeof(*fid) / sizeof(u32);
1265 		if (*len < min_len) {
1266 			*len = min_len;
1267 			return FILEID_INVALID;
1268 		}
1269 		*fid = bch2_inode_to_fid(inode);
1270 
1271 		*len = min_len;
1272 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1273 	}
1274 }
1275 
1276 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1277 					struct bcachefs_fid fid)
1278 {
1279 	struct bch_fs *c = sb->s_fs_info;
1280 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1281 				    .subvol = fid.subvol,
1282 				    .inum = fid.inum,
1283 	});
1284 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1285 		iput(vinode);
1286 		vinode = ERR_PTR(-ESTALE);
1287 	}
1288 	return vinode;
1289 }
1290 
1291 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1292 		int fh_len, int fh_type)
1293 {
1294 	struct bcachefs_fid *fid = (void *) _fid;
1295 
1296 	if (!bcachefs_fid_valid(fh_len, fh_type))
1297 		return NULL;
1298 
1299 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1300 }
1301 
1302 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1303 		int fh_len, int fh_type)
1304 {
1305 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1306 
1307 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1308 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1309 		return NULL;
1310 
1311 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1312 }
1313 
1314 static struct dentry *bch2_get_parent(struct dentry *child)
1315 {
1316 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1317 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1318 	subvol_inum parent_inum = {
1319 		.subvol = inode->ei_inode.bi_parent_subvol ?:
1320 			inode->ei_subvol,
1321 		.inum = inode->ei_inode.bi_dir,
1322 	};
1323 
1324 	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1325 }
1326 
/*
 * exportfs ->get_name: find the filename under @parent that refers to
 * @child by searching the dirents btree.
 *
 * Fast path: the inode's backref (bi_dir, bi_dir_offset) points into
 * @parent, so the dirent can be looked up directly.  Otherwise (hardlinked
 * file whose backref points at a different directory) fall back to a
 * linear scan of @parent's dirents.
 */
static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
{
	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter1;
	struct btree_iter iter2;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	struct bch_inode_unpacked inode_u;
	subvol_inum target;
	u32 snapshot;
	struct qstr dirent_name;
	unsigned name_len = 0;
	int ret;

	if (!S_ISDIR(dir->v.i_mode))
		return -EINVAL;

	trans = bch2_trans_get(c);

	/* iter1: direct backref lookup; iter2: linear-scan fallback: */
	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&iter1, snapshot);
	bch2_btree_iter_set_snapshot(&iter2, snapshot);

	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
		/* Backref points into @parent - look the dirent up directly: */
		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));

		k = bch2_btree_iter_peek_slot(&iter1);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_dirent) {
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
			goto err;
		}

		d = bkey_s_c_to_dirent(k);
		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
		if (ret > 0)
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
		if (ret)
			goto err;

		/* Verify the dirent really points back at @child: */
		if (target.subvol	== inode->ei_subvol &&
		    target.inum		== inode->ei_inode.bi_inum)
			goto found;
	} else {
		/*
		 * File with multiple hardlinks and our backref is to the wrong
		 * directory - linear search:
		 */
		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
			if (k.k->p.inode > dir->ei_inode.bi_inum)
				break;

			if (k.k->type != KEY_TYPE_dirent)
				continue;

			d = bkey_s_c_to_dirent(k);
			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
			if (ret < 0)
				break;
			if (ret)
				continue;

			if (target.subvol	== inode->ei_subvol &&
			    target.inum		== inode->ei_inode.bi_inum)
				goto found;
		}
	}

	ret = -ENOENT;
	goto err;
found:
	dirent_name = bch2_dirent_get_name(d);

	/* Copy out at most NAME_MAX bytes, always NUL terminated: */
	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
	memcpy(name, dirent_name.name, name_len);
	name[name_len] = '\0';
err:
	/* Transaction restarts rerun the whole lookup from the top: */
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_iter_exit(trans, &iter1);
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_put(trans);

	return ret;
}
1433 
/* NFS export operations */
static const struct export_operations bch_export_ops = {
	.encode_fh	= bch2_encode_fh,
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	.get_parent	= bch2_get_parent,
	.get_name	= bch2_get_name,
};
1441 
/*
 * Initialize a freshly created VFS inode from an unpacked bcachefs inode:
 * copy fields into the VFS inode, record the owning subvolume, and select
 * the inode/file/address_space operations by file type.
 */
static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	bch2_iget5_set(&inode->v, &inum);
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	/* Track whether this inode belongs to a snapshot subvolume: */
	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
	else
		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);
	inode->ei_subvol	= inum.subvol;

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	/* Per-type operations tables: */
	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		/* Device nodes, fifos, sockets: */
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}

	mapping_set_large_folios(inode->v.i_mapping);
}
1489 
1490 static struct inode *bch2_alloc_inode(struct super_block *sb)
1491 {
1492 	struct bch_inode_info *inode;
1493 
1494 	inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1495 	if (!inode)
1496 		return NULL;
1497 
1498 	inode_init_once(&inode->v);
1499 	mutex_init(&inode->ei_update_lock);
1500 	two_state_lock_init(&inode->ei_pagecache_lock);
1501 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
1502 	mutex_init(&inode->ei_quota_lock);
1503 
1504 	return &inode->v;
1505 }
1506 
1507 static void bch2_i_callback(struct rcu_head *head)
1508 {
1509 	struct inode *vinode = container_of(head, struct inode, i_rcu);
1510 	struct bch_inode_info *inode = to_bch_ei(vinode);
1511 
1512 	kmem_cache_free(bch2_inode_cache, inode);
1513 }
1514 
/* ->destroy_inode: defer freeing to RCU (see bch2_i_callback) */
static void bch2_destroy_inode(struct inode *vinode)
{
	call_rcu(&vinode->i_rcu, bch2_i_callback);
}
1519 
/*
 * bch2_write_inode() callback: copy the VFS inode's timestamps into the
 * unpacked bcachefs inode that's about to be written.
 */
static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}
1533 
1534 static int bch2_vfs_write_inode(struct inode *vinode,
1535 				struct writeback_control *wbc)
1536 {
1537 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1538 	struct bch_inode_info *inode = to_bch_ei(vinode);
1539 	int ret;
1540 
1541 	mutex_lock(&inode->ei_update_lock);
1542 	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1543 			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1544 	mutex_unlock(&inode->ei_update_lock);
1545 
1546 	return bch2_err_class(ret);
1547 }
1548 
/*
 * ->evict_inode: the VFS is dropping its last reference.  Truncate the
 * pagecache; if the link count hit zero, release quota and delete the
 * on-disk inode; finally unhook from the per-fs inode list.
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	/* Any quota reservation should have been released by now: */
	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
		/* Unlinked: drop space and inode quota, remove from disk: */
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}
1572 
/*
 * Evict all cached VFS inodes belonging to the subvolumes in @s:
 * repeatedly scan the per-fs inode list, grabbing matching inodes so they
 * can be pruned outside vfs_inodes_lock, until a pass finds nothing left
 * to do.
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			/* GFP_ATOMIC|__GFP_NOWARN: we're holding vfs_inodes_lock */
			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);

			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq, &wait.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	/* Outside the lock: mark dontcache, prune dentries, drop our refs: */
	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}
1636 
/* ->statfs: report filesystem-wide usage */
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	/* usage is in 512-byte sectors (2^9); shift converts to fs blocks: */
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}
1663 
1664 static int bch2_sync_fs(struct super_block *sb, int wait)
1665 {
1666 	struct bch_fs *c = sb->s_fs_info;
1667 	int ret;
1668 
1669 	if (c->opts.journal_flush_disabled)
1670 		return 0;
1671 
1672 	if (!wait) {
1673 		bch2_journal_flush_async(&c->journal, NULL);
1674 		return 0;
1675 	}
1676 
1677 	ret = bch2_journal_flush(&c->journal);
1678 	return bch2_err_class(ret);
1679 }
1680 
1681 static struct bch_fs *bch2_path_to_fs(const char *path)
1682 {
1683 	struct bch_fs *c;
1684 	dev_t dev;
1685 	int ret;
1686 
1687 	ret = lookup_bdev(path, &dev);
1688 	if (ret)
1689 		return ERR_PTR(ret);
1690 
1691 	c = bch2_dev_to_fs(dev);
1692 	if (c)
1693 		closure_put(&c->cl);
1694 	return c ?: ERR_PTR(-ENOENT);
1695 }
1696 
1697 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1698 {
1699 	struct bch_fs *c = sb->s_fs_info;
1700 	struct bch_opts opts = bch2_opts_empty();
1701 	int ret;
1702 
1703 	ret = bch2_parse_mount_opts(c, &opts, data);
1704 	if (ret)
1705 		goto err;
1706 
1707 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1708 
1709 	if (opts.read_only != c->opts.read_only) {
1710 		down_write(&c->state_lock);
1711 
1712 		if (opts.read_only) {
1713 			bch2_fs_read_only(c);
1714 
1715 			sb->s_flags |= SB_RDONLY;
1716 		} else {
1717 			ret = bch2_fs_read_write(c);
1718 			if (ret) {
1719 				bch_err(c, "error going rw: %i", ret);
1720 				up_write(&c->state_lock);
1721 				ret = -EINVAL;
1722 				goto err;
1723 			}
1724 
1725 			sb->s_flags &= ~SB_RDONLY;
1726 		}
1727 
1728 		c->opts.read_only = opts.read_only;
1729 
1730 		up_write(&c->state_lock);
1731 	}
1732 
1733 	if (opt_defined(opts, errors))
1734 		c->opts.errors = opts.errors;
1735 err:
1736 	return bch2_err_class(ret);
1737 }
1738 
/* ->show_devname: print member devices as a colon-separated list */
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	bool first = true;

	for_each_online_member(c, ca) {
		if (!first)
			seq_putc(seq, ':');
		first = false;
		seq_puts(seq, ca->disk_sb.sb_name);
	}

	return 0;
}
1753 
/*
 * ->show_options: emit each mount option whose value differs from the
 * default, as ",name=value".
 */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	enum bch_opt_id i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		/* only options settable at mount time: */
		if (!(opt->flags & OPT_MOUNT))
			continue;

		/* skip options still at their default: */
		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		printbuf_reset(&buf);
		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
				 OPT_SHOW_MOUNT_STYLE);
		seq_putc(seq, ',');
		seq_puts(seq, buf.buf);
	}

	/* report printbuf allocation failure as ENOMEM: */
	if (buf.allocation_failure)
		ret = -ENOMEM;
	printbuf_exit(&buf);
	return ret;
}
1783 
/* ->put_super: begin shutting the fs down; final free happens in ->kill_sb */
static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}
1790 
1791 /*
1792  * bcachefs doesn't currently integrate intwrite freeze protection but the
1793  * internal write references serve the same purpose. Therefore reuse the
1794  * read-only transition code to perform the quiesce. The caveat is that we don't
1795  * currently have the ability to block tasks that want a write reference while
1796  * the superblock is frozen. This is fine for now, but we should either add
1797  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1798  */
static int bch2_freeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	/* Quiesce by transitioning read-only (see comment above): */
	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
	return 0;
}
1808 
/* ->unfreeze_fs: undo bch2_freeze() by going back read-write */
static int bch2_unfreeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	/* Don't go read-write again if we went emergency read-only: */
	if (test_bit(BCH_FS_emergency_ro, &c->flags))
		return 0;

	down_write(&c->state_lock);
	ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);
	return ret;
}
1822 
/* Superblock operations */
static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.destroy_inode	= bch2_destroy_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.remount_fs	= bch2_remount,
	.put_super	= bch2_put_super,
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
};
1837 
/* sget() set callback: stash the bch_fs in the new superblock */
static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}
1843 
/* sget() set callback that refuses to create a new superblock */
static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}
1848 
typedef DARRAY(struct bch_fs *) darray_fs;

/*
 * sget() match callback: an existing superblock matches only if it already
 * points at the same bch_fs as every device path we were handed.
 */
static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	darray_fs *d = data;

	if (!c)
		return false;

	darray_for_each(*d, i)
		if (c != *i)
			return false;
	return true;
}
1864 
/*
 * ->mount: open (or find already-mounted) a bcachefs filesystem.
 * @dev_name may name several devices (split by bch2_split_devs()) for
 * multi-device filesystems.
 */
static struct dentry *bch2_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	struct bch_fs *c;
	struct super_block *sb;
	struct inode *vinode;
	struct bch_opts opts = bch2_opts_empty();
	int ret;

	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

	ret = bch2_parse_mount_opts(NULL, &opts, data);
	if (ret) {
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	if (!dev_name || strlen(dev_name) == 0)
		return ERR_PTR(-EINVAL);

	darray_str devs;
	ret = bch2_split_devs(dev_name, &devs);
	if (ret)
		return ERR_PTR(ret);

	/* Map each device path to its (possibly already open) bch_fs: */
	darray_fs devs_to_fs = {};
	darray_for_each(devs, i) {
		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
		if (ret) {
			sb = ERR_PTR(ret);
			goto got_sb;
		}
	}

	/* Look for an existing superblock for this filesystem first: */
	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	c = bch2_fs_open(devs.data, devs.nr, opts);
	if (IS_ERR(c)) {
		sb = ERR_CAST(c);
		goto got_sb;
	}

	/* Some options can't be parsed until after the fs is started: */
	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret) {
		bch2_fs_stop(c);
		sb = ERR_PTR(ret);
		goto got_sb;
	}

	bch2_opts_apply(&c->opts, opts);

	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
	if (IS_ERR(sb))
		bch2_fs_stop(c);
got_sb:
	darray_exit(&devs_to_fs);
	bch2_darray_str_exit(&devs);

	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	c = sb->s_fs_info;

	if (sb->s_root) {
		/* Already mounted: flags must agree on read-only vs read-write: */
		if ((flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	/* Fresh superblock - fill it in: */
	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	sb->s_uuid		= c->sb.user_uuid;
	c->vfs_sb		= sb;
	strscpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;

	for_each_online_member(c, ca) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	sb->s_shrink->seeks = 0;

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	ret = PTR_ERR_OR_ZERO(vinode);
	bch_err_msg(c, ret, "mounting: error getting root inode");
	if (ret)
		goto err_put_super;

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	return dget(sb->s_root);

err_put_super:
	__bch2_fs_stop(c);
	deactivate_locked_super(sb);
	return ERR_PTR(bch2_err_class(ret));
}
2007 
/* ->kill_sb: tear down the VFS side, then free the bch_fs itself */
static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}
2015 
/* Filesystem type registration; bcachefs mounts require block device(s) */
static struct file_system_type bcache_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bcachefs",
	.mount		= bch2_mount,
	.kill_sb	= bch2_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};

MODULE_ALIAS_FS("bcachefs");
2025 
2026 void bch2_vfs_exit(void)
2027 {
2028 	unregister_filesystem(&bcache_fs_type);
2029 	kmem_cache_destroy(bch2_inode_cache);
2030 }
2031 
/* Module init: create the inode cache and register the filesystem type */
int __init bch2_vfs_init(void)
{
	int ret = -ENOMEM;

	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_inode_cache)
		goto err;

	ret = register_filesystem(&bcache_fs_type);
	if (ret)
		goto err;

	return 0;
err:
	/*
	 * NOTE(review): relies on unregister_filesystem()/kmem_cache_destroy()
	 * tolerating partially-completed init — confirm this stays true.
	 */
	bch2_vfs_exit();
	return ret;
}
2049 
2050 #endif /* NO_BCACHEFS_FS */
2051