xref: /linux/fs/bcachefs/fs.c (revision 8f5b5f78113e881cb8570c961b0dc42b218a1b9e)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
43 static struct kmem_cache *bch2_inode_cache;
44 
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46 				struct bch_inode_info *,
47 				struct bch_inode_unpacked *,
48 				struct bch_subvolume *);
49 
/*
 * Propagate fields from an updated on-disk inode (@bi) into the cached VFS
 * inode.
 *
 * The btree node lock for the inode's key — not ei_update_lock — is what
 * protects inode->ei_inode here; the bch2_assert_pos_locked() call below
 * verifies we hold it.
 *
 * @fields: mask of ATTR_* bits selecting which timestamps to copy.
 */
void bch2_inode_update_after_write(struct btree_trans *trans,
				   struct bch_inode_info *inode,
				   struct bch_inode_unpacked *bi,
				   unsigned fields)
{
	struct bch_fs *c = trans->c;

	BUG_ON(bi->bi_inum != inode->v.i_ino);

	/* caller must hold the btree position lock guarding this inode: */
	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
			       POS(0, bi->bi_inum),
			       c->opts.inodes_use_key_cache);

	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
	i_uid_write(&inode->v, bi->bi_uid);
	i_gid_write(&inode->v, bi->bi_gid);
	inode->v.i_mode	= bi->bi_mode;

	if (fields & ATTR_ATIME)
		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
	if (fields & ATTR_MTIME)
		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
	if (fields & ATTR_CTIME)
		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));

	inode->ei_inode		= *bi;

	bch2_inode_flags_to_vfs(inode);
}
79 
/*
 * Update an inode's on-disk state via callback @set, inside a btree
 * transaction that is retried on transaction restart, then propagate the
 * committed result back into the cached VFS inode.
 *
 * @set may be NULL, in which case the inode is rewritten unchanged.
 * @p:      opaque argument passed through to @set.
 * @fields: ATTR_* mask forwarded to bch2_inode_update_after_write().
 *
 * Returns 0 on success or a negative bch2 error code.
 */
int __must_check bch2_write_inode(struct bch_fs *c,
				  struct bch_inode_info *inode,
				  inode_set_fn set,
				  void *p, unsigned fields)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bch_inode_unpacked inode_u;
	int ret;
retry:
	bch2_trans_begin(trans);

	/* peek -> modify -> write -> commit, all short-circuiting on error: */
	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
				BTREE_ITER_INTENT) ?:
		(set ? set(trans, inode, &inode_u, p) : 0) ?:
		bch2_inode_write(trans, &iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

	/*
	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
	 * this is important for inode updates via bchfs_write_index_update
	 */
	if (!ret)
		bch2_inode_update_after_write(trans, inode, &inode_u, fields);

	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/* an inode open in the VFS should always exist on disk: */
	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
			     "%s: inode %u:%llu not found when updating",
			     bch2_err_str(ret),
			     inode_inum(inode).subvol,
			     inode_inum(inode).inum);

	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}
119 
120 int bch2_fs_quota_transfer(struct bch_fs *c,
121 			   struct bch_inode_info *inode,
122 			   struct bch_qid new_qid,
123 			   unsigned qtypes,
124 			   enum quota_acct_mode mode)
125 {
126 	unsigned i;
127 	int ret;
128 
129 	qtypes &= enabled_qtypes(c);
130 
131 	for (i = 0; i < QTYP_NR; i++)
132 		if (new_qid.q[i] == inode->ei_qid.q[i])
133 			qtypes &= ~(1U << i);
134 
135 	if (!qtypes)
136 		return 0;
137 
138 	mutex_lock(&inode->ei_quota_lock);
139 
140 	ret = bch2_quota_transfer(c, qtypes, new_qid,
141 				  inode->ei_qid,
142 				  inode->v.i_blocks +
143 				  inode->ei_quota_reserved,
144 				  mode);
145 	if (!ret)
146 		for (i = 0; i < QTYP_NR; i++)
147 			if (qtypes & (1 << i))
148 				inode->ei_qid.q[i] = new_qid.q[i];
149 
150 	mutex_unlock(&inode->ei_quota_lock);
151 
152 	return ret;
153 }
154 
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157 	struct bch_inode_info *inode = to_bch_ei(vinode);
158 	subvol_inum *inum = p;
159 
160 	return inode->ei_subvol == inum->subvol &&
161 		inode->ei_inode.bi_inum == inum->inum;
162 }
163 
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166 	struct bch_inode_info *inode = to_bch_ei(vinode);
167 	subvol_inum *inum = p;
168 
169 	inode->v.i_ino		= inum->inum;
170 	inode->ei_subvol	= inum->subvol;
171 	inode->ei_inode.bi_inum	= inum->inum;
172 	return 0;
173 }
174 
/*
 * Hash a (subvolume, inum) pair for the VFS inode hash table; inum is split
 * into its high and low 32 bits so all 64 bits contribute to the hash.
 */
static unsigned bch2_inode_hash(subvol_inum inum)
{
	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
179 
/*
 * Insert a freshly allocated VFS inode into the inode hash table — or, if
 * another thread raced us and already inserted an inode for the same
 * (subvol, inum), free ours and return theirs.
 *
 * Returns the inode that ended up in the hash table.
 */
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
	subvol_inum inum = inode_inum(inode);
	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
				      bch2_inode_hash(inum),
				      bch2_iget5_test,
				      bch2_iget5_set,
				      &inum));
	BUG_ON(!old);

	if (unlikely(old != inode)) {
		/* lost the race: discard our inode, use the one in the hash */
		__destroy_inode(&inode->v);
		kmem_cache_free(bch2_inode_cache, inode);
		inode = old;
	} else {
		/* won: track it on the per-fs list of VFS inodes */
		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		/*
		 * we really don't want insert_inode_locked2() to be setting
		 * I_NEW...
		 */
		unlock_new_inode(&inode->v);
	}

	return inode;
}
207 
/*
 * Evaluate @_do with the PF_MEMALLOC_* bits in @_flags set on the current
 * task, restoring the task's previous allocation flags afterwards.
 *
 * Must restore with memalloc_flags_restore(): it clears all PF_MEMALLOC_FLAGS
 * bits before or-ing the saved ones back in.  The previous
 * memalloc_noreclaim_restore() only clears PF_MEMALLOC, so flags such as
 * PF_MEMALLOC_NOWARN set for the scope would leak past this macro.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_flags_restore(_saved_flags);					\
	_ret;									\
})
215 
/*
 * Allocate a new inode, dropping/retaking btree locks if necessary:
 */
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	/*
	 * first attempt: no reclaim and no warning, so we don't recurse into
	 * reclaim while holding btree locks
	 */
	struct bch_inode_info *inode =
		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
				  to_bch_ei(new_inode(c->vfs_sb)));

	if (unlikely(!inode)) {
		/*
		 * retry with btree locks dropped so the allocation may block;
		 * drop_locks_do() can itself fail (relock error), in which
		 * case we must free the inode we just got
		 */
		int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
		if (ret && inode) {
			__destroy_inode(&inode->v);
			kmem_cache_free(bch2_inode_cache, inode);
		}
		if (ret)
			return ERR_PTR(ret);
	}

	return inode;
}
239 
/*
 * Get the VFS inode for @inum: fast path checks the inode hash table;
 * otherwise read the inode and its subvolume from the btree, allocate a
 * VFS inode, and insert it into the hash.
 *
 * Returns the inode or an ERR_PTR.
 */
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	if (inode)
		return &inode->v;

	struct btree_trans *trans = bch2_trans_get(c);

	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;
	int ret = lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
	if (!ret) {
		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
		/* may return a different inode if another thread raced us: */
		inode = bch2_inode_insert(c, inode);
	}
	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
}
266 
/*
 * Common inode-creation path for mknod, mkdir, symlink and tmpfile.
 *
 * @snapshot_src: source subvolume for snapshot creation; zero otherwise.
 * @flags: BCH_CREATE_* flags.  BCH_CREATE_TMPFILE creates no dirent, so
 *	   dir->ei_update_lock isn't taken in that case.
 *
 * Returns the new inode (hashed, unlocked) or an ERR_PTR.
 */
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = to_bch_ei(new_inode(c->vfs_sb));
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	/* create the inode (and dirent, unless tmpfile), then account it: */
	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	/* bi_subvol is set when we just created a new subvolume: */
	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true,
				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		/* commit failed: undo the quota accounting done above */
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 */
	inode = bch2_inode_insert(c, inode);
	bch2_trans_put(trans);
err:
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}
369 
370 /* methods */
371 
/*
 * Look up @name in directory @dir and return its VFS inode, pulling it
 * into the inode cache from the btree if it isn't cached yet.
 *
 * Runs inside a btree transaction.  Returns the inode or an ERR_PTR.
 */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	struct btree_iter dirent_iter = {};
	subvol_inum inum = {};

	/* find the dirent for @name: */
	int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
				   dir_hash_info, dir, name, 0);
	if (ret)
		return ERR_PTR(ret);

	struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	/* resolve the dirent's target to a (subvol, inum) pair: */
	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	/* fast path: inode already in the hash table */
	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	if (inode)
		goto out;

	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;
	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
	if (bch2_err_matches(ret, ENOENT)) {
		/* dirent exists but its inode doesn't: fs inconsistency */
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		bch_err(c, "%s points to missing inode", buf.buf);
		printbuf_exit(&buf);
	}
	if (ret)
		goto err;

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
	inode = bch2_inode_insert(c, inode);
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	return inode;
err:
	inode = ERR_PTR(ret);
	goto out;
}
428 
429 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
430 				  unsigned int flags)
431 {
432 	struct bch_fs *c = vdir->i_sb->s_fs_info;
433 	struct bch_inode_info *dir = to_bch_ei(vdir);
434 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
435 
436 	struct bch_inode_info *inode;
437 	bch2_trans_do(c, NULL, NULL, 0,
438 		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
439 							  &hash, &dentry->d_name)));
440 	if (IS_ERR(inode))
441 		inode = NULL;
442 
443 	return d_splice_alias(&inode->v, dentry);
444 }
445 
446 static int bch2_mknod(struct mnt_idmap *idmap,
447 		      struct inode *vdir, struct dentry *dentry,
448 		      umode_t mode, dev_t rdev)
449 {
450 	struct bch_inode_info *inode =
451 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
452 			      (subvol_inum) { 0 }, 0);
453 
454 	if (IS_ERR(inode))
455 		return bch2_err_class(PTR_ERR(inode));
456 
457 	d_instantiate(dentry, &inode->v);
458 	return 0;
459 }
460 
461 static int bch2_create(struct mnt_idmap *idmap,
462 		       struct inode *vdir, struct dentry *dentry,
463 		       umode_t mode, bool excl)
464 {
465 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
466 }
467 
468 static int __bch2_link(struct bch_fs *c,
469 		       struct bch_inode_info *inode,
470 		       struct bch_inode_info *dir,
471 		       struct dentry *dentry)
472 {
473 	struct btree_trans *trans = bch2_trans_get(c);
474 	struct bch_inode_unpacked dir_u, inode_u;
475 	int ret;
476 
477 	mutex_lock(&inode->ei_update_lock);
478 
479 	ret = commit_do(trans, NULL, NULL, 0,
480 			bch2_link_trans(trans,
481 					inode_inum(dir),   &dir_u,
482 					inode_inum(inode), &inode_u,
483 					&dentry->d_name));
484 
485 	if (likely(!ret)) {
486 		bch2_inode_update_after_write(trans, dir, &dir_u,
487 					      ATTR_MTIME|ATTR_CTIME);
488 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
489 	}
490 
491 	bch2_trans_put(trans);
492 	mutex_unlock(&inode->ei_update_lock);
493 	return ret;
494 }
495 
496 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
497 		     struct dentry *dentry)
498 {
499 	struct bch_fs *c = vdir->i_sb->s_fs_info;
500 	struct bch_inode_info *dir = to_bch_ei(vdir);
501 	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
502 	int ret;
503 
504 	lockdep_assert_held(&inode->v.i_rwsem);
505 
506 	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
507 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
508 		__bch2_link(c, inode, dir, dentry);
509 	if (unlikely(ret))
510 		return bch2_err_class(ret);
511 
512 	ihold(&inode->v);
513 	d_instantiate(dentry, &inode->v);
514 	return 0;
515 }
516 
/*
 * Remove @dentry from @vdir.  Also used when deleting a snapshot
 * (@deleting_snapshot), which is forwarded to bch2_unlink_trans().
 *
 * Takes both inodes' update locks; on success the cached dir and inode are
 * refreshed from the committed transaction.
 */
int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
		  bool deleting_snapshot)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir);
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_inode_unpacked dir_u, inode_u;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;

	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);

	ret = commit_do(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		bch2_unlink_trans(trans,
				  inode_inum(dir), &dir_u,
				  &inode_u, &dentry->d_name,
				  deleting_snapshot));
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(trans, dir, &dir_u,
				      ATTR_MTIME|ATTR_CTIME);
	bch2_inode_update_after_write(trans, inode, &inode_u,
				      ATTR_MTIME);

	if (inode_u.bi_subvol) {
		/*
		 * Subvolume deletion is asynchronous, but we still want to tell
		 * the VFS that it's been deleted here:
		 */
		set_nlink(&inode->v, 0);
	}
err:
	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
	bch2_trans_put(trans);

	return ret;
}
556 
557 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
558 {
559 	struct bch_inode_info *dir= to_bch_ei(vdir);
560 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
561 
562 	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
563 		__bch2_unlink(vdir, dentry, false);
564 	return bch2_err_class(ret);
565 }
566 
/*
 * ->symlink: create the inode as a tmpfile (no dirent yet), write the
 * target through the page cache and flush it, and only then link the inode
 * into the directory — so the dirent is created after the target data has
 * been written out.
 */
static int bch2_symlink(struct mnt_idmap *idmap,
			struct inode *vdir, struct dentry *dentry,
			const char *symname)
{
	struct bch_fs *c = vdir->i_sb->s_fs_info;
	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
	int ret;

	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
	if (IS_ERR(inode))
		return bch2_err_class(PTR_ERR(inode));

	inode_lock(&inode->v);
	/* write the target, including the NUL terminator: */
	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
	inode_unlock(&inode->v);

	if (unlikely(ret))
		goto err;

	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
	if (unlikely(ret))
		goto err;

	ret = __bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	iput(&inode->v);
	return bch2_err_class(ret);
}
601 
602 static int bch2_mkdir(struct mnt_idmap *idmap,
603 		      struct inode *vdir, struct dentry *dentry, umode_t mode)
604 {
605 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
606 }
607 
/*
 * ->rename2: supports plain rename, RENAME_NOREPLACE and RENAME_EXCHANGE.
 *
 * Takes all four inodes' update locks, transfers project quota when the
 * destination directory's project differs, commits the rename in one
 * transaction, then refreshes all affected cached inodes.  On any exit the
 * quota is re-pointed at whatever ei_inode now says (a no-op if nothing
 * changed).
 */
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
		return -EINVAL;

	if (mode == BCH_RENAME_OVERWRITE) {
		/* flush dirty pages before the destination is replaced: */
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	trans = bch2_trans_get(c);

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
	if (ret)
		goto err;

	/* moving into a directory with a different project: transfer quota */
	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_rename_trans(trans,
					  inode_inum(src_dir), &src_dir_u,
					  inode_inum(dst_dir), &dst_dir_u,
					  &src_inode_u,
					  &dst_inode_u,
					  &src_dentry->d_name,
					  &dst_dentry->d_name,
					  mode));
	if (unlikely(ret))
		goto err;

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);

	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	/*
	 * re-point quota at the inodes' current (possibly unchanged) qids;
	 * also undoes the preallocating transfers above if the rename failed
	 */
	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}
719 
/*
 * Copy the changes described by @attr into the unpacked inode @bi.
 * Only fields whose ATTR_* bit is set in attr->ia_valid are touched.
 */
static void bch2_setattr_copy(struct mnt_idmap *idmap,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;

	if (ia_valid & ATTR_UID)
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
	if (ia_valid & ATTR_GID)
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;

	if (ia_valid & ATTR_ATIME)
		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
	if (ia_valid & ATTR_MTIME)
		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
	if (ia_valid & ATTR_CTIME)
		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);

	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		/* gid the file will have after this setattr: */
		kgid_t gid = ia_valid & ATTR_GID
			? attr->ia_gid
			: inode->v.i_gid;

		/*
		 * clear the setgid bit if the caller is not in the owning
		 * group and lacks CAP_FSETID:
		 */
		if (!in_group_p(gid) &&
		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
}
755 
/*
 * setattr for everything except ATTR_SIZE (truncate is handled separately
 * by bchfs_truncate()).
 *
 * Transfers quota first if uid/gid are changing, then updates the inode —
 * and, on a mode change, its ACL — in a restartable btree transaction.
 */
int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID)
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);

	if (attr->ia_valid & ATTR_GID)
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	/* free any ACL from a previous iteration of the retry loop: */
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_INTENT);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		/* mode change may require rewriting the access ACL: */
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
825 
826 static int bch2_getattr(struct mnt_idmap *idmap,
827 			const struct path *path, struct kstat *stat,
828 			u32 request_mask, unsigned query_flags)
829 {
830 	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
831 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
832 
833 	stat->dev	= inode->v.i_sb->s_dev;
834 	stat->ino	= inode->v.i_ino;
835 	stat->mode	= inode->v.i_mode;
836 	stat->nlink	= inode->v.i_nlink;
837 	stat->uid	= inode->v.i_uid;
838 	stat->gid	= inode->v.i_gid;
839 	stat->rdev	= inode->v.i_rdev;
840 	stat->size	= i_size_read(&inode->v);
841 	stat->atime	= inode_get_atime(&inode->v);
842 	stat->mtime	= inode_get_mtime(&inode->v);
843 	stat->ctime	= inode_get_ctime(&inode->v);
844 	stat->blksize	= block_bytes(c);
845 	stat->blocks	= inode->v.i_blocks;
846 
847 	stat->subvol	= inode->ei_subvol;
848 	stat->result_mask |= STATX_SUBVOL;
849 
850 	if (request_mask & STATX_BTIME) {
851 		stat->result_mask |= STATX_BTIME;
852 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
853 	}
854 
855 	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
856 		stat->attributes |= STATX_ATTR_IMMUTABLE;
857 	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
858 
859 	if (inode->ei_inode.bi_flags & BCH_INODE_append)
860 		stat->attributes |= STATX_ATTR_APPEND;
861 	stat->attributes_mask	 |= STATX_ATTR_APPEND;
862 
863 	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
864 		stat->attributes |= STATX_ATTR_NODUMP;
865 	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
866 
867 	return 0;
868 }
869 
870 static int bch2_setattr(struct mnt_idmap *idmap,
871 			struct dentry *dentry, struct iattr *iattr)
872 {
873 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
874 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
875 	int ret;
876 
877 	lockdep_assert_held(&inode->v.i_rwsem);
878 
879 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
880 		setattr_prepare(idmap, dentry, iattr);
881 	if (ret)
882 		return ret;
883 
884 	return iattr->ia_valid & ATTR_SIZE
885 		? bchfs_truncate(idmap, inode, iattr)
886 		: bch2_setattr_nonsize(idmap, inode, iattr);
887 }
888 
889 static int bch2_tmpfile(struct mnt_idmap *idmap,
890 			struct inode *vdir, struct file *file, umode_t mode)
891 {
892 	struct bch_inode_info *inode =
893 		__bch2_create(idmap, to_bch_ei(vdir),
894 			      file->f_path.dentry, mode, 0,
895 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
896 
897 	if (IS_ERR(inode))
898 		return bch2_err_class(PTR_ERR(inode));
899 
900 	d_mark_tmpfile(file, &inode->v);
901 	d_instantiate(file->f_path.dentry, &inode->v);
902 	return finish_open_simple(file, 0);
903 }
904 
/*
 * Report one extent key to fiemap.
 *
 * Extents with data pointers emit one fiemap extent per pointer (replica),
 * with FIEMAP_EXTENT_* flags for reflinked, unwritten, compressed and
 * unaligned data.  Inline-data and reservation keys are reported with no
 * device offset.  Offsets/sizes are converted from 512-byte sectors to
 * bytes (<< 9).
 */
static int bch2_fill_extent(struct bch_fs *c,
			    struct fiemap_extent_info *info,
			    struct bkey_s_c k, unsigned flags)
{
	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		int ret;

		if (k.k->type == KEY_TYPE_reflink_v)
			flags |= FIEMAP_EXTENT_SHARED;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			int flags2 = 0;
			u64 offset = p.ptr.offset;

			if (p.ptr.unwritten)
				flags2 |= FIEMAP_EXTENT_UNWRITTEN;

			if (p.crc.compression_type)
				flags2 |= FIEMAP_EXTENT_ENCODED;
			else
				/* uncompressed: skip past the crc'd region's header */
				offset += p.crc.offset;

			if ((offset & (block_sectors(c) - 1)) ||
			    (k.k->size & (block_sectors(c) - 1)))
				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;

			ret = fiemap_fill_next_extent(info,
						bkey_start_offset(k.k) << 9,
						offset << 9,
						k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		/* callers only pass data, inline-data or reservation keys */
		BUG();
	}
}
960 
/*
 * ->fiemap: walk the extents btree for [start, start + len) and report each
 * extent.
 *
 * Extents are reported one behind the iterator (via @prev) so the final one
 * can be flagged FIEMAP_EXTENT_LAST.  Indirect (reflink) extents are
 * resolved before reporting.  Restarts the whole scan from the current
 * position on transaction restart.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	u32 snapshot;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	if (start + len < start)
		return -EINVAL;

	/* work in 512-byte sectors from here on: */
	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(ei->v.i_ino, start, snapshot), 0);

	while (!(ret = btree_trans_too_many_iters(trans)) &&
	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
	       !(ret = bkey_err(k))) {
		enum btree_id data_btree = BTREE_ID_extents;

		/* only data and reservation keys are reported: */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		/* resolve reflink pointers to the underlying extent: */
		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* trim the copied key to exactly the range we'll report: */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		if (have_extent) {
			/* unlock before copying to userspace (may fault): */
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
	}
	/* remember where we got to, so a restart resumes from here: */
	start = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/* flush the last buffered extent, marked as the final one: */
	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret < 0 ? ret : 0;
}
1064 
/*
 * VM operations for mmap'd bcachefs files: read faults and
 * write-protect faults go through bcachefs hooks; read-ahead mapping
 * uses the generic filemap helper.
 */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = bch2_page_mkwrite,
};
1070 
1071 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1072 {
1073 	file_accessed(file);
1074 
1075 	vma->vm_ops = &bch_vm_ops;
1076 	return 0;
1077 }
1078 
1079 /* Directories: */
1080 
/*
 * ->llseek for directories: directory offsets are btree positions, not
 * byte offsets, so allow the full s64 range for both size and maxsize.
 */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}
1086 
1087 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1088 {
1089 	struct bch_inode_info *inode = file_bch_inode(file);
1090 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1091 
1092 	if (!dir_emit_dots(file, ctx))
1093 		return 0;
1094 
1095 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1096 
1097 	bch_err_fn(c, ret);
1098 	return bch2_err_class(ret);
1099 }
1100 
1101 static int bch2_open(struct inode *vinode, struct file *file)
1102 {
1103 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1104 		struct bch_inode_info *inode = to_bch_ei(vinode);
1105 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1106 
1107 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1108 		if (ret)
1109 			return ret;
1110 	}
1111 
1112 	return generic_file_open(vinode, file);
1113 }
1114 
/* file_operations for regular files */
static const struct file_operations bch_file_operations = {
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};
1131 
/* inode_operations for regular files */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1142 
/* inode_operations for directories */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,	/* rmdir shares the unlink implementation */
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1162 
/* file_operations for directories */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};
1173 
/* inode_operations for symlinks: link targets are read via the page cache */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1184 
/* inode_operations for special files (device nodes, FIFOs, sockets) */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};
1194 
/*
 * address_space_operations: buffered I/O goes through bcachefs's folio
 * paths; direct I/O bypasses the page cache (noop_direct_IO, the real
 * work happens in the read/write iter paths).
 */
static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
	.direct_IO	= noop_direct_IO,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_folio = generic_error_remove_folio,
};
1210 
/*
 * NFS file handle: inode number, subvolume, and generation (to detect
 * stale handles after inode reuse). __packed so the handle layout is
 * independent of compiler padding.
 */
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;
1216 
/* File handle variant that also carries the parent directory's fid */
struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;
1221 
1222 static int bcachefs_fid_valid(int fh_len, int fh_type)
1223 {
1224 	switch (fh_type) {
1225 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1226 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1227 	case FILEID_BCACHEFS_WITH_PARENT:
1228 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1229 	default:
1230 		return false;
1231 	}
1232 }
1233 
1234 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1235 {
1236 	return (struct bcachefs_fid) {
1237 		.inum	= inode->ei_inode.bi_inum,
1238 		.subvol	= inode->ei_subvol,
1239 		.gen	= inode->ei_inode.bi_generation,
1240 	};
1241 }
1242 
1243 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1244 			  struct inode *vdir)
1245 {
1246 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1247 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1248 	int min_len;
1249 
1250 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1251 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1252 
1253 		min_len = sizeof(*fid) / sizeof(u32);
1254 		if (*len < min_len) {
1255 			*len = min_len;
1256 			return FILEID_INVALID;
1257 		}
1258 
1259 		fid->fid = bch2_inode_to_fid(inode);
1260 		fid->dir = bch2_inode_to_fid(dir);
1261 
1262 		*len = min_len;
1263 		return FILEID_BCACHEFS_WITH_PARENT;
1264 	} else {
1265 		struct bcachefs_fid *fid = (void *) fh;
1266 
1267 		min_len = sizeof(*fid) / sizeof(u32);
1268 		if (*len < min_len) {
1269 			*len = min_len;
1270 			return FILEID_INVALID;
1271 		}
1272 		*fid = bch2_inode_to_fid(inode);
1273 
1274 		*len = min_len;
1275 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1276 	}
1277 }
1278 
1279 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1280 					struct bcachefs_fid fid)
1281 {
1282 	struct bch_fs *c = sb->s_fs_info;
1283 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1284 				    .subvol = fid.subvol,
1285 				    .inum = fid.inum,
1286 	});
1287 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1288 		iput(vinode);
1289 		vinode = ERR_PTR(-ESTALE);
1290 	}
1291 	return vinode;
1292 }
1293 
1294 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1295 		int fh_len, int fh_type)
1296 {
1297 	struct bcachefs_fid *fid = (void *) _fid;
1298 
1299 	if (!bcachefs_fid_valid(fh_len, fh_type))
1300 		return NULL;
1301 
1302 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1303 }
1304 
1305 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1306 		int fh_len, int fh_type)
1307 {
1308 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1309 
1310 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1311 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1312 		return NULL;
1313 
1314 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1315 }
1316 
/*
 * ->get_parent: reconstruct the parent directory from the inode's
 * stored backref: bi_dir is the parent inode number, and
 * bi_parent_subvol is set when the parent lives in a different
 * subvolume (else the child's own subvolume is used).
 */
static struct dentry *bch2_get_parent(struct dentry *child)
{
	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum parent_inum = {
		.subvol = inode->ei_inode.bi_parent_subvol ?:
			inode->ei_subvol,
		.inum = inode->ei_inode.bi_dir,
	};

	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}
1329 
/*
 * ->get_name: find the name of @child within directory @parent, for
 * exportfs reconnection. Fast path: the inode's backref
 * (bi_dir/bi_dir_offset) points directly at the dirent. If the backref
 * names a different directory (hardlinked file), fall back to a linear
 * scan of @parent's dirents. The whole lookup runs in a btree
 * transaction and is retried on transaction restart.
 */
static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
{
	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter1;
	struct btree_iter iter2;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	struct bch_inode_unpacked inode_u;
	subvol_inum target;
	u32 snapshot;
	struct qstr dirent_name;
	unsigned name_len = 0;
	int ret;

	if (!S_ISDIR(dir->v.i_mode))
		return -EINVAL;

	trans = bch2_trans_get(c);

	/* iter1: direct backref lookup; iter2: linear-scan fallback */
	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
			     POS(dir->ei_inode.bi_inum, 0), 0);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(&iter1, snapshot);
	bch2_btree_iter_set_snapshot(&iter2, snapshot);

	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
		/* Backref names this directory - go straight to the dirent: */
		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));

		k = bch2_btree_iter_peek_slot(&iter1);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (k.k->type != KEY_TYPE_dirent) {
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
			goto err;
		}

		d = bkey_s_c_to_dirent(k);
		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
		if (ret > 0)
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
		if (ret)
			goto err;

		/* Verify the dirent really points back at our inode: */
		if (target.subvol	== inode->ei_subvol &&
		    target.inum		== inode->ei_inode.bi_inum)
			goto found;
	} else {
		/*
		 * File with multiple hardlinks and our backref is to the wrong
		 * directory - linear search:
		 */
		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
			if (k.k->p.inode > dir->ei_inode.bi_inum)
				break;

			if (k.k->type != KEY_TYPE_dirent)
				continue;

			d = bkey_s_c_to_dirent(k);
			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
			if (ret < 0)
				break;
			if (ret)
				continue;

			if (target.subvol	== inode->ei_subvol &&
			    target.inum		== inode->ei_inode.bi_inum)
				goto found;
		}
	}

	ret = -ENOENT;
	goto err;
found:
	/* Copy the name out, truncated to NAME_MAX and NUL terminated: */
	dirent_name = bch2_dirent_get_name(d);

	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
	memcpy(name, dirent_name.name, name_len);
	name[name_len] = '\0';
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_iter_exit(trans, &iter1);
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_put(trans);

	return ret;
}
1436 
/* exportfs operations (NFS export support) */
static const struct export_operations bch_export_ops = {
	.encode_fh	= bch2_encode_fh,
	.fh_to_dentry	= bch2_fh_to_dentry,
	.fh_to_parent	= bch2_fh_to_parent,
	.get_parent	= bch2_get_parent,
	.get_name	= bch2_get_name,
};
1444 
/*
 * Initialize a freshly allocated VFS inode from its unpacked btree
 * inode: copy stat data, set up per-mode inode/file operations, and
 * record the snapshot/subvolume state.
 */
static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	bch2_iget5_set(&inode->v, &inum);
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	/* Inodes in snapshots are flagged so write paths can refuse them */
	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
	else
		clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);
	inode->ei_subvol	= inum.subvol;

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	/* Hook up the right ops tables for the file type: */
	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}

	mapping_set_large_folios(inode->v.i_mapping);
}
1492 
1493 static struct inode *bch2_alloc_inode(struct super_block *sb)
1494 {
1495 	struct bch_inode_info *inode;
1496 
1497 	inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
1498 	if (!inode)
1499 		return NULL;
1500 
1501 	inode_init_once(&inode->v);
1502 	mutex_init(&inode->ei_update_lock);
1503 	two_state_lock_init(&inode->ei_pagecache_lock);
1504 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
1505 	mutex_init(&inode->ei_quota_lock);
1506 
1507 	return &inode->v;
1508 }
1509 
/* RCU callback that actually frees the inode after a grace period */
static void bch2_i_callback(struct rcu_head *head)
{
	struct inode *vinode = container_of(head, struct inode, i_rcu);
	struct bch_inode_info *inode = to_bch_ei(vinode);

	kmem_cache_free(bch2_inode_cache, inode);
}
1517 
/* ->destroy_inode: defer freeing to RCU via bch2_i_callback() */
static void bch2_destroy_inode(struct inode *vinode)
{
	call_rcu(&vinode->i_rcu, bch2_i_callback);
}
1522 
/*
 * bch2_write_inode() callback: copy the VFS inode's in-memory
 * timestamps into the unpacked btree inode about to be written.
 */
static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}
1536 
/*
 * ->write_inode: flush dirtied timestamps back to the btree inode,
 * serialized against other inode updates by ei_update_lock. Returns a
 * standard errno.
 */
static int bch2_vfs_write_inode(struct inode *vinode,
				struct writeback_control *wbc)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
1551 
/*
 * ->evict_inode: tear down the page cache, and if the inode has no
 * remaining links, release its quota accounting and delete it from the
 * btree. Finally remove it from the fs's list of live VFS inodes.
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	/* All quota reservations should have been released by now: */
	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}
1575 
/*
 * Evict all cached VFS inodes belonging to the subvolumes in @s (used
 * when deleting subvolumes/snapshots).
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			/* GFP_ATOMIC: we hold vfs_inodes_lock */
			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			/*
			 * Inode is being freed - wait on __I_NEW, dropping the
			 * lock first (rescan from scratch afterwards):
			 */
			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);

			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq, &wait.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	/* Mark the inodes we grabbed for pruning, outside the lock: */
	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	/* Repeat until we get two consecutive clean passes: */
	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}
1639 
/*
 * ->statfs: report filesystem usage. Usage numbers from
 * bch2_fs_usage_read_short() are in 512-byte sectors (hence
 * shift = s_blocksize_bits - 9 to convert to fs blocks).
 */
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}
1666 
1667 static int bch2_sync_fs(struct super_block *sb, int wait)
1668 {
1669 	struct bch_fs *c = sb->s_fs_info;
1670 	int ret;
1671 
1672 	if (c->opts.journal_flush_disabled)
1673 		return 0;
1674 
1675 	if (!wait) {
1676 		bch2_journal_flush_async(&c->journal, NULL);
1677 		return 0;
1678 	}
1679 
1680 	ret = bch2_journal_flush(&c->journal);
1681 	return bch2_err_class(ret);
1682 }
1683 
/*
 * Map a block device path to the bch_fs currently using that device,
 * or ERR_PTR(-ENOENT)/lookup error.
 *
 * NOTE(review): bch2_dev_to_fs() appears to return with a closure ref
 * held, which is dropped immediately here -- the returned pointer is
 * not pinned by us; confirm callers only dereference it in contexts
 * that otherwise keep the fs alive.
 */
static struct bch_fs *bch2_path_to_fs(const char *path)
{
	struct bch_fs *c;
	dev_t dev;
	int ret;

	ret = lookup_bdev(path, &dev);
	if (ret)
		return ERR_PTR(ret);

	c = bch2_dev_to_fs(dev);
	if (c)
		closure_put(&c->cl);
	return c ?: ERR_PTR(-ENOENT);
}
1699 
1700 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1701 {
1702 	struct bch_fs *c = sb->s_fs_info;
1703 	struct bch_opts opts = bch2_opts_empty();
1704 	int ret;
1705 
1706 	ret = bch2_parse_mount_opts(c, &opts, data);
1707 	if (ret)
1708 		goto err;
1709 
1710 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1711 
1712 	if (opts.read_only != c->opts.read_only) {
1713 		down_write(&c->state_lock);
1714 
1715 		if (opts.read_only) {
1716 			bch2_fs_read_only(c);
1717 
1718 			sb->s_flags |= SB_RDONLY;
1719 		} else {
1720 			ret = bch2_fs_read_write(c);
1721 			if (ret) {
1722 				bch_err(c, "error going rw: %i", ret);
1723 				up_write(&c->state_lock);
1724 				ret = -EINVAL;
1725 				goto err;
1726 			}
1727 
1728 			sb->s_flags &= ~SB_RDONLY;
1729 		}
1730 
1731 		c->opts.read_only = opts.read_only;
1732 
1733 		up_write(&c->state_lock);
1734 	}
1735 
1736 	if (opt_defined(opts, errors))
1737 		c->opts.errors = opts.errors;
1738 err:
1739 	return bch2_err_class(ret);
1740 }
1741 
/*
 * ->show_devname: print the member devices as a colon-separated list
 * (the same multi-device syntax accepted at mount time).
 */
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	bool first = true;

	for_each_online_member(c, ca) {
		if (!first)
			seq_putc(seq, ':');
		first = false;
		seq_puts(seq, ca->disk_sb.sb_name);
	}

	return 0;
}
1756 
/*
 * ->show_options: emit mount options that differ from their defaults.
 * Each option is prefixed with ',' since the VFS has already printed
 * the base flags before calling us.
 */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	enum bch_opt_id i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		/* Only options settable at mount time: */
		if (!(opt->flags & OPT_MOUNT))
			continue;

		/* Skip options still at their default value: */
		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		printbuf_reset(&buf);
		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
				 OPT_SHOW_MOUNT_STYLE);
		seq_putc(seq, ',');
		seq_puts(seq, buf.buf);
	}

	if (buf.allocation_failure)
		ret = -ENOMEM;
	printbuf_exit(&buf);
	return ret;
}
1786 
/* ->put_super: begin shutting the filesystem down (freed in ->kill_sb) */
static void bch2_put_super(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	__bch2_fs_stop(c);
}
1793 
1794 /*
1795  * bcachefs doesn't currently integrate intwrite freeze protection but the
1796  * internal write references serve the same purpose. Therefore reuse the
1797  * read-only transition code to perform the quiesce. The caveat is that we don't
1798  * currently have the ability to block tasks that want a write reference while
1799  * the superblock is frozen. This is fine for now, but we should either add
1800  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1801  */
1802 static int bch2_freeze(struct super_block *sb)
1803 {
1804 	struct bch_fs *c = sb->s_fs_info;
1805 
1806 	down_write(&c->state_lock);
1807 	bch2_fs_read_only(c);
1808 	up_write(&c->state_lock);
1809 	return 0;
1810 }
1811 
/*
 * ->unfreeze_fs: undo bch2_freeze() by going back read-write -- unless
 * the fs went emergency read-only while frozen, in which case stay RO.
 */
static int bch2_unfreeze(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	if (test_bit(BCH_FS_emergency_ro, &c->flags))
		return 0;

	down_write(&c->state_lock);
	ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);
	return ret;
}
1825 
/* super_operations for the bcachefs superblock */
static const struct super_operations bch_super_operations = {
	.alloc_inode	= bch2_alloc_inode,
	.destroy_inode	= bch2_destroy_inode,
	.write_inode	= bch2_vfs_write_inode,
	.evict_inode	= bch2_evict_inode,
	.sync_fs	= bch2_sync_fs,
	.statfs		= bch2_statfs,
	.show_devname	= bch2_show_devname,
	.show_options	= bch2_show_options,
	.remount_fs	= bch2_remount,
	.put_super	= bch2_put_super,
	.freeze_fs	= bch2_freeze,
	.unfreeze_fs	= bch2_unfreeze,
};
1840 
/* sget() set callback: attach the bch_fs to a newly created superblock */
static int bch2_set_super(struct super_block *s, void *data)
{
	s->s_fs_info = data;
	return 0;
}
1846 
/* sget() set callback that refuses to create a new superblock (lookup only) */
static int bch2_noset_super(struct super_block *s, void *data)
{
	return -EBUSY;
}
1851 
/* Array of bch_fs pointers, one per device path given at mount time */
typedef DARRAY(struct bch_fs *) darray_fs;

/*
 * sget() test callback: a superblock matches iff every device path
 * resolved to the same bch_fs as the one already attached to it.
 */
static int bch2_test_super(struct super_block *s, void *data)
{
	struct bch_fs *c = s->s_fs_info;
	darray_fs *d = data;

	if (!c)
		return false;

	darray_for_each(*d, i)
		if (c != *i)
			return false;
	return true;
}
1867 
/*
 * ->mount: parse options, find an existing superblock for these devices
 * or open a new bch_fs, then (for a fresh superblock) wire up the VFS
 * superblock fields and the root dentry.
 */
static struct dentry *bch2_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name, void *data)
{
	struct bch_fs *c;
	struct super_block *sb;
	struct inode *vinode;
	struct bch_opts opts = bch2_opts_empty();
	int ret;

	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);

	ret = bch2_parse_mount_opts(NULL, &opts, data);
	if (ret) {
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	if (!dev_name || strlen(dev_name) == 0)
		return ERR_PTR(-EINVAL);

	/* dev_name may be a colon-separated list of devices: */
	darray_str devs;
	ret = bch2_split_devs(dev_name, &devs);
	if (ret)
		return ERR_PTR(ret);

	darray_fs devs_to_fs = {};
	darray_for_each(devs, i) {
		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
		if (ret) {
			sb = ERR_PTR(ret);
			goto got_sb;
		}
	}

	/* Look for an existing superblock only - bch2_noset_super fails creation: */
	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
	if (!IS_ERR(sb))
		goto got_sb;

	/* No existing superblock - open the filesystem ourselves: */
	c = bch2_fs_open(devs.data, devs.nr, opts);
	if (IS_ERR(c)) {
		sb = ERR_CAST(c);
		goto got_sb;
	}

	/* Some options can't be parsed until after the fs is started: */
	ret = bch2_parse_mount_opts(c, &opts, data);
	if (ret) {
		bch2_fs_stop(c);
		sb = ERR_PTR(ret);
		goto got_sb;
	}

	bch2_opts_apply(&c->opts, opts);

	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
	if (IS_ERR(sb))
		bch2_fs_stop(c);
got_sb:
	darray_exit(&devs_to_fs);
	bch2_darray_str_exit(&devs);

	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		ret = bch2_err_class(ret);
		return ERR_PTR(ret);
	}

	c = sb->s_fs_info;

	if (sb->s_root) {
		/* Existing mount - can't change rw/ro via a second mount: */
		if ((flags ^ sb->s_flags) & SB_RDONLY) {
			ret = -EBUSY;
			goto err_put_super;
		}
		goto out;
	}

	/* Fresh superblock - initialize it: */
	sb->s_blocksize		= block_bytes(c);
	sb->s_blocksize_bits	= ilog2(block_bytes(c));
	sb->s_maxbytes		= MAX_LFS_FILESIZE;
	sb->s_op		= &bch_super_operations;
	sb->s_export_op		= &bch_export_ops;
#ifdef CONFIG_BCACHEFS_QUOTA
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
#endif
	sb->s_xattr		= bch2_xattr_handlers;
	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
	sb->s_time_gran		= c->sb.nsec_per_time_unit;
	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
	sb->s_uuid		= c->sb.user_uuid;
	c->vfs_sb		= sb;
	strscpy(sb->s_id, c->name, sizeof(sb->s_id));

	ret = super_setup_bdi(sb);
	if (ret)
		goto err_put_super;

	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;

	for_each_online_member(c, ca) {
		struct block_device *bdev = ca->disk_sb.bdev;

		/* XXX: create an anonymous device for multi device filesystems */
		sb->s_bdev	= bdev;
		sb->s_dev	= bdev->bd_dev;
		percpu_ref_put(&ca->io_ref);
		break;
	}

	c->dev = sb->s_dev;

#ifdef CONFIG_BCACHEFS_POSIX_ACL
	if (c->opts.acl)
		sb->s_flags	|= SB_POSIXACL;
#endif

	sb->s_shrink->seeks = 0;

	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
	ret = PTR_ERR_OR_ZERO(vinode);
	bch_err_msg(c, ret, "mounting: error getting root inode");
	if (ret)
		goto err_put_super;

	sb->s_root = d_make_root(vinode);
	if (!sb->s_root) {
		bch_err(c, "error mounting: error allocating root dentry");
		ret = -ENOMEM;
		goto err_put_super;
	}

	sb->s_flags |= SB_ACTIVE;
out:
	return dget(sb->s_root);

err_put_super:
	__bch2_fs_stop(c);
	deactivate_locked_super(sb);
	return ERR_PTR(bch2_err_class(ret));
}
2010 
/* ->kill_sb: tear down the VFS superblock, then free the bch_fs */
static void bch2_kill_sb(struct super_block *sb)
{
	struct bch_fs *c = sb->s_fs_info;

	generic_shutdown_super(sb);
	bch2_fs_free(c);
}
2018 
/* Filesystem type registration for "bcachefs" */
static struct file_system_type bcache_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bcachefs",
	.mount		= bch2_mount,
	.kill_sb	= bch2_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};
2026 
2027 MODULE_ALIAS_FS("bcachefs");
2028 
/* Module teardown: unregister the fs type and destroy the inode slab */
void bch2_vfs_exit(void)
{
	unregister_filesystem(&bcache_fs_type);
	kmem_cache_destroy(bch2_inode_cache);
}
2034 
/*
 * Module init: create the inode slab and register the filesystem type.
 * On failure, bch2_vfs_exit() undoes whatever succeeded
 * (kmem_cache_destroy(NULL) is a no-op; NOTE(review): the
 * unregister_filesystem() call for a never-registered type looks
 * harmless but confirm).
 */
int __init bch2_vfs_init(void)
{
	int ret = -ENOMEM;

	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_inode_cache)
		goto err;

	ret = register_filesystem(&bcache_fs_type);
	if (ret)
		goto err;

	return 0;
err:
	bch2_vfs_exit();
	return ret;
}
2052 
2053 #endif /* NO_BCACHEFS_FS */
2054