xref: /linux/fs/bcachefs/fs.c (revision 6fa4bf3dce0668a96faca0024e382f4489a9cc9b)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
43 static struct kmem_cache *bch2_inode_cache;
44 
45 static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
46 				struct bch_inode_info *,
47 				struct bch_inode_unpacked *,
48 				struct bch_subvolume *);
49 
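/*
 * Copy fields from an unpacked btree inode into the VFS inode: nlink,
 * uid/gid, mode, and whichever timestamps are selected by @fields
 * (ATTR_ATIME/MTIME/CTIME).  The btree position for this inode must be
 * locked (asserted via bch2_assert_pos_locked()) so ei_inode can't change
 * underneath us.
 */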
50 void bch2_inode_update_after_write(struct btree_trans *trans,
51 				   struct bch_inode_info *inode,
52 				   struct bch_inode_unpacked *bi,
53 				   unsigned fields)
54 {
55 	struct bch_fs *c = trans->c;
56 
57 	BUG_ON(bi->bi_inum != inode->v.i_ino);
58 
59 	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
60 			       POS(0, bi->bi_inum),
61 			       c->opts.inodes_use_key_cache);
62 
63 	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64 	i_uid_write(&inode->v, bi->bi_uid);
65 	i_gid_write(&inode->v, bi->bi_gid);
66 	inode->v.i_mode	= bi->bi_mode;
67 
68 	if (fields & ATTR_ATIME)
69 		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70 	if (fields & ATTR_MTIME)
71 		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72 	if (fields & ATTR_CTIME)
73 		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74 
75 	inode->ei_inode		= *bi;
76 
77 	bch2_inode_flags_to_vfs(inode);
78 }
79 
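/*
 * Update an inode in the btree, retrying on transaction restart: peek the
 * current on-disk inode, apply the optional @set callback to modify it, then
 * write it back and commit.  On success the in-memory copy (ei_inode and the
 * VFS fields selected by @fields) is refreshed while the btree node is still
 * locked.
 *
 * A typical caller passes a small callback; roughly (sketch, compare
 * inode_update_times_fn() later in this file):
 *
 *	static int my_set_fn(struct btree_trans *trans,
 *			     struct bch_inode_info *inode,
 *			     struct bch_inode_unpacked *bi, void *p)
 *	{
 *		bi->bi_mtime = timespec_to_bch2_time(trans->c,
 *					inode_get_mtime(&inode->v));
 *		return 0;
 *	}
 *
 *	ret = bch2_write_inode(c, inode, my_set_fn, NULL, ATTR_MTIME);
 */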
80 int __must_check bch2_write_inode(struct bch_fs *c,
81 				  struct bch_inode_info *inode,
82 				  inode_set_fn set,
83 				  void *p, unsigned fields)
84 {
85 	struct btree_trans *trans = bch2_trans_get(c);
86 	struct btree_iter iter = { NULL };
87 	struct bch_inode_unpacked inode_u;
88 	int ret;
89 retry:
90 	bch2_trans_begin(trans);
91 
92 	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93 				BTREE_ITER_intent) ?:
94 		(set ? set(trans, inode, &inode_u, p) : 0) ?:
95 		bch2_inode_write(trans, &iter, &inode_u) ?:
96 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97 
98 	/*
99 	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100 	 * this is important for inode updates via bchfs_write_index_update
101 	 */
102 	if (!ret)
103 		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104 
105 	bch2_trans_iter_exit(trans, &iter);
106 
107 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108 		goto retry;
109 
110 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111 			     "%s: inode %u:%llu not found when updating",
112 			     bch2_err_str(ret),
113 			     inode_inum(inode).subvol,
114 			     inode_inum(inode).inum);
115 
116 	bch2_trans_put(trans);
117 	return ret < 0 ? ret : 0;
118 }
119 
120 int bch2_fs_quota_transfer(struct bch_fs *c,
121 			   struct bch_inode_info *inode,
122 			   struct bch_qid new_qid,
123 			   unsigned qtypes,
124 			   enum quota_acct_mode mode)
125 {
126 	unsigned i;
127 	int ret;
128 
129 	qtypes &= enabled_qtypes(c);
130 
131 	for (i = 0; i < QTYP_NR; i++)
132 		if (new_qid.q[i] == inode->ei_qid.q[i])
133 			qtypes &= ~(1U << i);
134 
135 	if (!qtypes)
136 		return 0;
137 
138 	mutex_lock(&inode->ei_quota_lock);
139 
140 	ret = bch2_quota_transfer(c, qtypes, new_qid,
141 				  inode->ei_qid,
142 				  inode->v.i_blocks +
143 				  inode->ei_quota_reserved,
144 				  mode);
145 	if (!ret)
146 		for (i = 0; i < QTYP_NR; i++)
147 			if (qtypes & (1 << i))
148 				inode->ei_qid.q[i] = new_qid.q[i];
149 
150 	mutex_unlock(&inode->ei_quota_lock);
151 
152 	return ret;
153 }
154 
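/*
 * VFS inode cache helpers: inodes are hashed and compared on the full
 * (subvolume, inode number) pair, since the same inode number can exist in
 * multiple subvolumes.  bch2_inode_hash() below hashes the same fields the
 * test/set callbacks use.
 */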
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157 	struct bch_inode_info *inode = to_bch_ei(vinode);
158 	subvol_inum *inum = p;
159 
160 	return inode->ei_subvol == inum->subvol &&
161 		inode->ei_inode.bi_inum == inum->inum;
162 }
163 
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166 	struct bch_inode_info *inode = to_bch_ei(vinode);
167 	subvol_inum *inum = p;
168 
169 	inode->v.i_ino		= inum->inum;
170 	inode->ei_subvol	= inum->subvol;
171 	inode->ei_inode.bi_inum	= inum->inum;
172 	return 0;
173 }
174 
175 static unsigned bch2_inode_hash(subvol_inum inum)
176 {
177 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
178 }
179 
180 static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
181 {
182 	subvol_inum inum = inode_inum(inode);
183 	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
184 				      bch2_inode_hash(inum),
185 				      bch2_iget5_test,
186 				      bch2_iget5_set,
187 				      &inum));
188 	BUG_ON(!old);
189 
190 	if (unlikely(old != inode)) {
191 		/*
192 		 * bcachefs doesn't use I_NEW; we have no use for it since we
193 		 * only insert fully created inodes in the inode hash table. But
194 		 * discard_new_inode() expects it to be set...
195 		 */
196 		inode->v.i_flags |= I_NEW;
197 		discard_new_inode(&inode->v);
198 		inode = old;
199 	} else {
200 		mutex_lock(&c->vfs_inodes_lock);
201 		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
202 		mutex_unlock(&c->vfs_inodes_lock);
203 		/*
204 		 * Again, I_NEW makes no sense for bcachefs. This is only needed
205 		 * for clearing I_NEW, but since the inode was already fully
206 		 * created and initialized we didn't actually want
207 		 * inode_insert5() to set it for us.
208 		 */
209 		unlock_new_inode(&inode->v);
210 	}
211 
212 	return inode;
213 }
214 
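/*
 * Evaluate an expression with the given PF_MEMALLOC_* task flags set,
 * restoring the task's previous flags afterwards; used below to attempt a
 * non-reclaiming, no-warning inode allocation while btree locks are still
 * held.
 */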
215 #define memalloc_flags_do(_flags, _do)						\
216 ({										\
217 	unsigned _saved_flags = memalloc_flags_save(_flags);			\
218 	typeof(_do) _ret = _do;							\
219 	memalloc_noreclaim_restore(_saved_flags);				\
220 	_ret;									\
221 })
222 
223 static struct inode *bch2_alloc_inode(struct super_block *sb)
224 {
225 	BUG();
226 }
227 
228 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
229 {
230 	struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
231 	if (!inode)
232 		return NULL;
233 
234 	inode_init_once(&inode->v);
235 	mutex_init(&inode->ei_update_lock);
236 	two_state_lock_init(&inode->ei_pagecache_lock);
237 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
238 	inode->ei_flags = 0;
239 	mutex_init(&inode->ei_quota_lock);
240 	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
241 	inode->v.i_state = 0;
242 
243 	if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
244 		kmem_cache_free(bch2_inode_cache, inode);
245 		return NULL;
246 	}
247 
248 	return inode;
249 }
250 
251 /*
252  * Allocate a new inode, dropping/retaking btree locks if necessary:
253  */
254 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
255 {
256 	struct bch_inode_info *inode =
257 		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
258 				  __bch2_new_inode(trans->c));
259 
260 	if (unlikely(!inode)) {
261 		int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
262 		if (ret && inode) {
263 			__destroy_inode(&inode->v);
264 			kmem_cache_free(bch2_inode_cache, inode);
265 		}
266 		if (ret)
267 			return ERR_PTR(ret);
268 	}
269 
270 	return inode;
271 }
272 
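/*
 * Look up an inode by (subvolume, inum): check the VFS inode cache first;
 * if it's not there, read the inode and its subvolume from the btree,
 * initialize a new VFS inode and insert it into the cache.
 */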
273 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
274 {
275 	struct bch_inode_info *inode =
276 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
277 					  bch2_inode_hash(inum),
278 					  bch2_iget5_test,
279 					  &inum));
280 	if (inode)
281 		return &inode->v;
282 
283 	struct btree_trans *trans = bch2_trans_get(c);
284 
285 	struct bch_inode_unpacked inode_u;
286 	struct bch_subvolume subvol;
287 	int ret = lockrestart_do(trans,
288 		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
289 		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
290 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
291 	if (!ret) {
292 		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
293 		inode = bch2_inode_insert(c, inode);
294 	}
295 	bch2_trans_put(trans);
296 
297 	return ret ? ERR_PTR(ret) : &inode->v;
298 }
299 
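/*
 * Common create path for mknod/mkdir/symlink/tmpfile (and, via snapshot_src,
 * subvolume creation): everything that can fail (ACLs, the VFS inode itself)
 * is allocated up front, the create is done in a single btree transaction,
 * and the new inode is inserted into the inode cache before the transaction
 * is dropped so another thread can't pull in and modify a stale copy.
 */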
300 struct bch_inode_info *
301 __bch2_create(struct mnt_idmap *idmap,
302 	      struct bch_inode_info *dir, struct dentry *dentry,
303 	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
304 	      unsigned flags)
305 {
306 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
307 	struct btree_trans *trans;
308 	struct bch_inode_unpacked dir_u;
309 	struct bch_inode_info *inode;
310 	struct bch_inode_unpacked inode_u;
311 	struct posix_acl *default_acl = NULL, *acl = NULL;
312 	subvol_inum inum;
313 	struct bch_subvolume subvol;
314 	u64 journal_seq = 0;
315 	int ret;
316 
317 	/*
318 	 * preallocate acls + vfs inode before btree transaction, so that
319 	 * nothing can fail after the transaction succeeds:
320 	 */
321 #ifdef CONFIG_BCACHEFS_POSIX_ACL
322 	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
323 	if (ret)
324 		return ERR_PTR(ret);
325 #endif
326 	inode = __bch2_new_inode(c);
327 	if (unlikely(!inode)) {
328 		inode = ERR_PTR(-ENOMEM);
329 		goto err;
330 	}
331 
332 	bch2_inode_init_early(c, &inode_u);
333 
334 	if (!(flags & BCH_CREATE_TMPFILE))
335 		mutex_lock(&dir->ei_update_lock);
336 
337 	trans = bch2_trans_get(c);
338 retry:
339 	bch2_trans_begin(trans);
340 
341 	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
342 		bch2_create_trans(trans,
343 				  inode_inum(dir), &dir_u, &inode_u,
344 				  !(flags & BCH_CREATE_TMPFILE)
345 				  ? &dentry->d_name : NULL,
346 				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
347 				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
348 				  mode, rdev,
349 				  default_acl, acl, snapshot_src, flags) ?:
350 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
351 				KEY_TYPE_QUOTA_PREALLOC);
352 	if (unlikely(ret))
353 		goto err_before_quota;
354 
355 	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
356 	inum.inum = inode_u.bi_inum;
357 
358 	ret   = bch2_subvolume_get(trans, inum.subvol, true,
359 				   BTREE_ITER_with_updates, &subvol) ?:
360 		bch2_trans_commit(trans, NULL, &journal_seq, 0);
361 	if (unlikely(ret)) {
362 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
363 				KEY_TYPE_QUOTA_WARN);
364 err_before_quota:
365 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
366 			goto retry;
367 		goto err_trans;
368 	}
369 
370 	if (!(flags & BCH_CREATE_TMPFILE)) {
371 		bch2_inode_update_after_write(trans, dir, &dir_u,
372 					      ATTR_MTIME|ATTR_CTIME);
373 		mutex_unlock(&dir->ei_update_lock);
374 	}
375 
376 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
377 
378 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
379 	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
380 
381 	/*
382 	 * we must insert the new inode into the inode cache before calling
383 	 * bch2_trans_put() and dropping locks, else we could race with another
384 	 * thread pulling the inode in and modifying it:
385 	 */
386 	inode = bch2_inode_insert(c, inode);
387 	bch2_trans_put(trans);
388 err:
389 	posix_acl_release(default_acl);
390 	posix_acl_release(acl);
391 	return inode;
392 err_trans:
393 	if (!(flags & BCH_CREATE_TMPFILE))
394 		mutex_unlock(&dir->ei_update_lock);
395 
396 	bch2_trans_put(trans);
397 	make_bad_inode(&inode->v);
398 	iput(&inode->v);
399 	inode = ERR_PTR(ret);
400 	goto err;
401 }
402 
403 /* methods */
404 
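/*
 * Lookup: hash-lookup the dirent in the parent directory, resolve its target
 * to a (subvolume, inum), then get the inode from the cache or read it from
 * the btree.  Also sanity checks that the inode's backpointer
 * (bi_dir, bi_dir_offset) points back at this dirent where one is expected.
 */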
405 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
406 			subvol_inum dir, struct bch_hash_info *dir_hash_info,
407 			const struct qstr *name)
408 {
409 	struct bch_fs *c = trans->c;
410 	struct btree_iter dirent_iter = {};
411 	subvol_inum inum = {};
412 	struct printbuf buf = PRINTBUF;
413 
414 	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
415 					     dir_hash_info, dir, name, 0);
416 	int ret = bkey_err(k);
417 	if (ret)
418 		return ERR_PTR(ret);
419 
420 	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
421 	if (ret > 0)
422 		ret = -ENOENT;
423 	if (ret)
424 		goto err;
425 
426 	struct bch_inode_info *inode =
427 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
428 					  bch2_inode_hash(inum),
429 					  bch2_iget5_test,
430 					  &inum));
431 	if (inode)
432 		goto out;
433 
434 	struct bch_subvolume subvol;
435 	struct bch_inode_unpacked inode_u;
436 	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
437 		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
438 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
439 
440 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
441 				c, "dirent to missing inode:\n  %s",
442 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
443 	if (ret)
444 		goto err;
445 
446 	/* regular files may have hardlinks: */
447 	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
448 				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
449 				    c,
450 				    "dirent points to inode that does not point back:\n  %s",
451 				    (bch2_bkey_val_to_text(&buf, c, k),
452 				     prt_printf(&buf, "\n  "),
453 				     bch2_inode_unpacked_to_text(&buf, &inode_u),
454 				     buf.buf))) {
455 		ret = -ENOENT;
456 		goto err;
457 	}
458 
459 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
460 	inode = bch2_inode_insert(c, inode);
461 out:
462 	bch2_trans_iter_exit(trans, &dirent_iter);
463 	printbuf_exit(&buf);
464 	return inode;
465 err:
466 	inode = ERR_PTR(ret);
467 	goto out;
468 }
469 
470 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
471 				  unsigned int flags)
472 {
473 	struct bch_fs *c = vdir->i_sb->s_fs_info;
474 	struct bch_inode_info *dir = to_bch_ei(vdir);
475 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
476 
477 	struct bch_inode_info *inode;
478 	bch2_trans_do(c, NULL, NULL, 0,
479 		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
480 							  &hash, &dentry->d_name)));
481 	if (IS_ERR(inode))
482 		inode = NULL;
483 
484 	return d_splice_alias(&inode->v, dentry);
485 }
486 
487 static int bch2_mknod(struct mnt_idmap *idmap,
488 		      struct inode *vdir, struct dentry *dentry,
489 		      umode_t mode, dev_t rdev)
490 {
491 	struct bch_inode_info *inode =
492 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
493 			      (subvol_inum) { 0 }, 0);
494 
495 	if (IS_ERR(inode))
496 		return bch2_err_class(PTR_ERR(inode));
497 
498 	d_instantiate(dentry, &inode->v);
499 	return 0;
500 }
501 
502 static int bch2_create(struct mnt_idmap *idmap,
503 		       struct inode *vdir, struct dentry *dentry,
504 		       umode_t mode, bool excl)
505 {
506 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
507 }
508 
509 static int __bch2_link(struct bch_fs *c,
510 		       struct bch_inode_info *inode,
511 		       struct bch_inode_info *dir,
512 		       struct dentry *dentry)
513 {
514 	struct btree_trans *trans = bch2_trans_get(c);
515 	struct bch_inode_unpacked dir_u, inode_u;
516 	int ret;
517 
518 	mutex_lock(&inode->ei_update_lock);
519 
520 	ret = commit_do(trans, NULL, NULL, 0,
521 			bch2_link_trans(trans,
522 					inode_inum(dir),   &dir_u,
523 					inode_inum(inode), &inode_u,
524 					&dentry->d_name));
525 
526 	if (likely(!ret)) {
527 		bch2_inode_update_after_write(trans, dir, &dir_u,
528 					      ATTR_MTIME|ATTR_CTIME);
529 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
530 	}
531 
532 	bch2_trans_put(trans);
533 	mutex_unlock(&inode->ei_update_lock);
534 	return ret;
535 }
536 
537 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
538 		     struct dentry *dentry)
539 {
540 	struct bch_fs *c = vdir->i_sb->s_fs_info;
541 	struct bch_inode_info *dir = to_bch_ei(vdir);
542 	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
543 	int ret;
544 
545 	lockdep_assert_held(&inode->v.i_rwsem);
546 
547 	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
548 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
549 		__bch2_link(c, inode, dir, dentry);
550 	if (unlikely(ret))
551 		return bch2_err_class(ret);
552 
553 	ihold(&inode->v);
554 	d_instantiate(dentry, &inode->v);
555 	return 0;
556 }
557 
558 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
559 		  bool deleting_snapshot)
560 {
561 	struct bch_fs *c = vdir->i_sb->s_fs_info;
562 	struct bch_inode_info *dir = to_bch_ei(vdir);
563 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
564 	struct bch_inode_unpacked dir_u, inode_u;
565 	struct btree_trans *trans = bch2_trans_get(c);
566 	int ret;
567 
568 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
569 
570 	ret = commit_do(trans, NULL, NULL,
571 			BCH_TRANS_COMMIT_no_enospc,
572 		bch2_unlink_trans(trans,
573 				  inode_inum(dir), &dir_u,
574 				  &inode_u, &dentry->d_name,
575 				  deleting_snapshot));
576 	if (unlikely(ret))
577 		goto err;
578 
579 	bch2_inode_update_after_write(trans, dir, &dir_u,
580 				      ATTR_MTIME|ATTR_CTIME);
581 	bch2_inode_update_after_write(trans, inode, &inode_u,
582 				      ATTR_MTIME);
583 
584 	if (inode_u.bi_subvol) {
585 		/*
586 		 * Subvolume deletion is asynchronous, but we still want to tell
587 		 * the VFS that it's been deleted here:
588 		 */
589 		set_nlink(&inode->v, 0);
590 	}
591 err:
592 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
593 	bch2_trans_put(trans);
594 
595 	return ret;
596 }
597 
598 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
599 {
600 	struct bch_inode_info *dir = to_bch_ei(vdir);
601 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
602 
603 	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
604 		__bch2_unlink(vdir, dentry, false);
605 	return bch2_err_class(ret);
606 }
607 
608 static int bch2_symlink(struct mnt_idmap *idmap,
609 			struct inode *vdir, struct dentry *dentry,
610 			const char *symname)
611 {
612 	struct bch_fs *c = vdir->i_sb->s_fs_info;
613 	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
614 	int ret;
615 
616 	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
617 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
618 	if (IS_ERR(inode))
619 		return bch2_err_class(PTR_ERR(inode));
620 
621 	inode_lock(&inode->v);
622 	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
623 	inode_unlock(&inode->v);
624 
625 	if (unlikely(ret))
626 		goto err;
627 
628 	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
629 	if (unlikely(ret))
630 		goto err;
631 
632 	ret = __bch2_link(c, inode, dir, dentry);
633 	if (unlikely(ret))
634 		goto err;
635 
636 	d_instantiate(dentry, &inode->v);
637 	return 0;
638 err:
639 	iput(&inode->v);
640 	return bch2_err_class(ret);
641 }
642 
643 static int bch2_mkdir(struct mnt_idmap *idmap,
644 		      struct inode *vdir, struct dentry *dentry, umode_t mode)
645 {
646 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
647 }
648 
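/*
 * Rename/exchange: all (up to four) inodes involved are locked, project quota
 * is transferred first if the move changes the effective project ID, then the
 * rename itself is committed as one btree transaction and the cached inode
 * copies are refreshed from the returned unpacked inodes.
 */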
649 static int bch2_rename2(struct mnt_idmap *idmap,
650 			struct inode *src_vdir, struct dentry *src_dentry,
651 			struct inode *dst_vdir, struct dentry *dst_dentry,
652 			unsigned flags)
653 {
654 	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
655 	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
656 	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
657 	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
658 	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
659 	struct bch_inode_unpacked dst_dir_u, src_dir_u;
660 	struct bch_inode_unpacked src_inode_u, dst_inode_u;
661 	struct btree_trans *trans;
662 	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
663 		? BCH_RENAME_EXCHANGE
664 		: dst_dentry->d_inode
665 		? BCH_RENAME_OVERWRITE : BCH_RENAME;
666 	int ret;
667 
668 	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
669 		return -EINVAL;
670 
671 	if (mode == BCH_RENAME_OVERWRITE) {
672 		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
673 						   0, LLONG_MAX);
674 		if (ret)
675 			return ret;
676 	}
677 
678 	trans = bch2_trans_get(c);
679 
680 	bch2_lock_inodes(INODE_UPDATE_LOCK,
681 			 src_dir,
682 			 dst_dir,
683 			 src_inode,
684 			 dst_inode);
685 
686 	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
687 		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
688 	if (ret)
689 		goto err;
690 
691 	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
692 		ret = bch2_fs_quota_transfer(c, src_inode,
693 					     dst_dir->ei_qid,
694 					     1 << QTYP_PRJ,
695 					     KEY_TYPE_QUOTA_PREALLOC);
696 		if (ret)
697 			goto err;
698 	}
699 
700 	if (mode == BCH_RENAME_EXCHANGE &&
701 	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
702 		ret = bch2_fs_quota_transfer(c, dst_inode,
703 					     src_dir->ei_qid,
704 					     1 << QTYP_PRJ,
705 					     KEY_TYPE_QUOTA_PREALLOC);
706 		if (ret)
707 			goto err;
708 	}
709 
710 	ret = commit_do(trans, NULL, NULL, 0,
711 			bch2_rename_trans(trans,
712 					  inode_inum(src_dir), &src_dir_u,
713 					  inode_inum(dst_dir), &dst_dir_u,
714 					  &src_inode_u,
715 					  &dst_inode_u,
716 					  &src_dentry->d_name,
717 					  &dst_dentry->d_name,
718 					  mode));
719 	if (unlikely(ret))
720 		goto err;
721 
722 	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
723 	BUG_ON(dst_inode &&
724 	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
725 
726 	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
727 				      ATTR_MTIME|ATTR_CTIME);
728 
729 	if (src_dir != dst_dir)
730 		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
731 					      ATTR_MTIME|ATTR_CTIME);
732 
733 	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
734 				      ATTR_CTIME);
735 
736 	if (dst_inode)
737 		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
738 					      ATTR_CTIME);
739 err:
740 	bch2_trans_put(trans);
741 
742 	bch2_fs_quota_transfer(c, src_inode,
743 			       bch_qid(&src_inode->ei_inode),
744 			       1 << QTYP_PRJ,
745 			       KEY_TYPE_QUOTA_NOCHECK);
746 	if (dst_inode)
747 		bch2_fs_quota_transfer(c, dst_inode,
748 				       bch_qid(&dst_inode->ei_inode),
749 				       1 << QTYP_PRJ,
750 				       KEY_TYPE_QUOTA_NOCHECK);
751 
752 	bch2_unlock_inodes(INODE_UPDATE_LOCK,
753 			   src_dir,
754 			   dst_dir,
755 			   src_inode,
756 			   dst_inode);
757 
758 	return bch2_err_class(ret);
759 }
760 
761 static void bch2_setattr_copy(struct mnt_idmap *idmap,
762 			      struct bch_inode_info *inode,
763 			      struct bch_inode_unpacked *bi,
764 			      struct iattr *attr)
765 {
766 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
767 	unsigned int ia_valid = attr->ia_valid;
768 
769 	if (ia_valid & ATTR_UID)
770 		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
771 	if (ia_valid & ATTR_GID)
772 		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
773 
774 	if (ia_valid & ATTR_SIZE)
775 		bi->bi_size = attr->ia_size;
776 
777 	if (ia_valid & ATTR_ATIME)
778 		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
779 	if (ia_valid & ATTR_MTIME)
780 		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
781 	if (ia_valid & ATTR_CTIME)
782 		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
783 
784 	if (ia_valid & ATTR_MODE) {
785 		umode_t mode = attr->ia_mode;
786 		kgid_t gid = ia_valid & ATTR_GID
787 			? attr->ia_gid
788 			: inode->v.i_gid;
789 
790 		if (!in_group_p(gid) &&
791 		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
792 			mode &= ~S_ISGID;
793 		bi->bi_mode = mode;
794 	}
795 }
796 
797 int bch2_setattr_nonsize(struct mnt_idmap *idmap,
798 			 struct bch_inode_info *inode,
799 			 struct iattr *attr)
800 {
801 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
802 	struct bch_qid qid;
803 	struct btree_trans *trans;
804 	struct btree_iter inode_iter = { NULL };
805 	struct bch_inode_unpacked inode_u;
806 	struct posix_acl *acl = NULL;
807 	int ret;
808 
809 	mutex_lock(&inode->ei_update_lock);
810 
811 	qid = inode->ei_qid;
812 
813 	if (attr->ia_valid & ATTR_UID)
814 		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
815 
816 	if (attr->ia_valid & ATTR_GID)
817 		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
818 
819 	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
820 				     KEY_TYPE_QUOTA_PREALLOC);
821 	if (ret)
822 		goto err;
823 
824 	trans = bch2_trans_get(c);
825 retry:
826 	bch2_trans_begin(trans);
827 	kfree(acl);
828 	acl = NULL;
829 
830 	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
831 			      BTREE_ITER_intent);
832 	if (ret)
833 		goto btree_err;
834 
835 	bch2_setattr_copy(idmap, inode, &inode_u, attr);
836 
837 	if (attr->ia_valid & ATTR_MODE) {
838 		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
839 				     inode_u.bi_mode, &acl);
840 		if (ret)
841 			goto btree_err;
842 	}
843 
844 	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
845 		bch2_trans_commit(trans, NULL, NULL,
846 				  BCH_TRANS_COMMIT_no_enospc);
847 btree_err:
848 	bch2_trans_iter_exit(trans, &inode_iter);
849 
850 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
851 		goto retry;
852 	if (unlikely(ret))
853 		goto err_trans;
854 
855 	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
856 
857 	if (acl)
858 		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
859 err_trans:
860 	bch2_trans_put(trans);
861 err:
862 	mutex_unlock(&inode->ei_update_lock);
863 
864 	return bch2_err_class(ret);
865 }
866 
867 static int bch2_getattr(struct mnt_idmap *idmap,
868 			const struct path *path, struct kstat *stat,
869 			u32 request_mask, unsigned query_flags)
870 {
871 	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
872 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
873 
874 	stat->dev	= inode->v.i_sb->s_dev;
875 	stat->ino	= inode->v.i_ino;
876 	stat->mode	= inode->v.i_mode;
877 	stat->nlink	= inode->v.i_nlink;
878 	stat->uid	= inode->v.i_uid;
879 	stat->gid	= inode->v.i_gid;
880 	stat->rdev	= inode->v.i_rdev;
881 	stat->size	= i_size_read(&inode->v);
882 	stat->atime	= inode_get_atime(&inode->v);
883 	stat->mtime	= inode_get_mtime(&inode->v);
884 	stat->ctime	= inode_get_ctime(&inode->v);
885 	stat->blksize	= block_bytes(c);
886 	stat->blocks	= inode->v.i_blocks;
887 
888 	stat->subvol	= inode->ei_subvol;
889 	stat->result_mask |= STATX_SUBVOL;
890 
891 	if (request_mask & STATX_BTIME) {
892 		stat->result_mask |= STATX_BTIME;
893 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
894 	}
895 
896 	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
897 		stat->attributes |= STATX_ATTR_IMMUTABLE;
898 	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
899 
900 	if (inode->ei_inode.bi_flags & BCH_INODE_append)
901 		stat->attributes |= STATX_ATTR_APPEND;
902 	stat->attributes_mask	 |= STATX_ATTR_APPEND;
903 
904 	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
905 		stat->attributes |= STATX_ATTR_NODUMP;
906 	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
907 
908 	return 0;
909 }
910 
911 static int bch2_setattr(struct mnt_idmap *idmap,
912 			struct dentry *dentry, struct iattr *iattr)
913 {
914 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
915 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
916 	int ret;
917 
918 	lockdep_assert_held(&inode->v.i_rwsem);
919 
920 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
921 		setattr_prepare(idmap, dentry, iattr);
922 	if (ret)
923 		return ret;
924 
925 	return iattr->ia_valid & ATTR_SIZE
926 		? bchfs_truncate(idmap, inode, iattr)
927 		: bch2_setattr_nonsize(idmap, inode, iattr);
928 }
929 
930 static int bch2_tmpfile(struct mnt_idmap *idmap,
931 			struct inode *vdir, struct file *file, umode_t mode)
932 {
933 	struct bch_inode_info *inode =
934 		__bch2_create(idmap, to_bch_ei(vdir),
935 			      file->f_path.dentry, mode, 0,
936 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
937 
938 	if (IS_ERR(inode))
939 		return bch2_err_class(PTR_ERR(inode));
940 
941 	d_mark_tmpfile(file, &inode->v);
942 	d_instantiate(file->f_path.dentry, &inode->v);
943 	return finish_open_simple(file, 0);
944 }
945 
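/*
 * Translate one bkey into fiemap extents: extents and reflink pointers are
 * reported once per pointer (replica) with the appropriate
 * SHARED/UNWRITTEN/ENCODED flags; inline data and reservations get single
 * synthetic entries.  bcachefs keys are in 512-byte sectors, hence the << 9
 * conversions to bytes.
 */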
946 static int bch2_fill_extent(struct bch_fs *c,
947 			    struct fiemap_extent_info *info,
948 			    struct bkey_s_c k, unsigned flags)
949 {
950 	if (bkey_extent_is_direct_data(k.k)) {
951 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
952 		const union bch_extent_entry *entry;
953 		struct extent_ptr_decoded p;
954 		int ret;
955 
956 		if (k.k->type == KEY_TYPE_reflink_v)
957 			flags |= FIEMAP_EXTENT_SHARED;
958 
959 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
960 			int flags2 = 0;
961 			u64 offset = p.ptr.offset;
962 
963 			if (p.ptr.unwritten)
964 				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
965 
966 			if (p.crc.compression_type)
967 				flags2 |= FIEMAP_EXTENT_ENCODED;
968 			else
969 				offset += p.crc.offset;
970 
971 			if ((offset & (block_sectors(c) - 1)) ||
972 			    (k.k->size & (block_sectors(c) - 1)))
973 				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
974 
975 			ret = fiemap_fill_next_extent(info,
976 						bkey_start_offset(k.k) << 9,
977 						offset << 9,
978 						k.k->size << 9, flags|flags2);
979 			if (ret)
980 				return ret;
981 		}
982 
983 		return 0;
984 	} else if (bkey_extent_is_inline_data(k.k)) {
985 		return fiemap_fill_next_extent(info,
986 					       bkey_start_offset(k.k) << 9,
987 					       0, k.k->size << 9,
988 					       flags|
989 					       FIEMAP_EXTENT_DATA_INLINE);
990 	} else if (k.k->type == KEY_TYPE_reservation) {
991 		return fiemap_fill_next_extent(info,
992 					       bkey_start_offset(k.k) << 9,
993 					       0, k.k->size << 9,
994 					       flags|
995 					       FIEMAP_EXTENT_DELALLOC|
996 					       FIEMAP_EXTENT_UNWRITTEN);
997 	} else {
998 		BUG();
999 	}
1000 }
1001 
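/*
 * Walk the extents btree for this inode (in its subvolume's snapshot),
 * following indirect (reflink) extents, and report each extent to the fiemap
 * caller.  Extents are emitted one behind the iterator (via @prev) so that
 * the final one can be flagged FIEMAP_EXTENT_LAST.
 */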
1002 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
1003 		       u64 start, u64 len)
1004 {
1005 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1006 	struct bch_inode_info *ei = to_bch_ei(vinode);
1007 	struct btree_trans *trans;
1008 	struct btree_iter iter;
1009 	struct bkey_s_c k;
1010 	struct bkey_buf cur, prev;
1011 	unsigned offset_into_extent, sectors;
1012 	bool have_extent = false;
1013 	u32 snapshot;
1014 	int ret = 0;
1015 
1016 	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
1017 	if (ret)
1018 		return ret;
1019 
1020 	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
1021 	if (start + len < start)
1022 		return -EINVAL;
1023 
1024 	start >>= 9;
1025 
1026 	bch2_bkey_buf_init(&cur);
1027 	bch2_bkey_buf_init(&prev);
1028 	trans = bch2_trans_get(c);
1029 retry:
1030 	bch2_trans_begin(trans);
1031 
1032 	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
1033 	if (ret)
1034 		goto err;
1035 
1036 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1037 			     SPOS(ei->v.i_ino, start, snapshot), 0);
1038 
1039 	while (!(ret = btree_trans_too_many_iters(trans)) &&
1040 	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
1041 	       !(ret = bkey_err(k))) {
1042 		enum btree_id data_btree = BTREE_ID_extents;
1043 
1044 		if (!bkey_extent_is_data(k.k) &&
1045 		    k.k->type != KEY_TYPE_reservation) {
1046 			bch2_btree_iter_advance(&iter);
1047 			continue;
1048 		}
1049 
1050 		offset_into_extent	= iter.pos.offset -
1051 			bkey_start_offset(k.k);
1052 		sectors			= k.k->size - offset_into_extent;
1053 
1054 		bch2_bkey_buf_reassemble(&cur, c, k);
1055 
1056 		ret = bch2_read_indirect_extent(trans, &data_btree,
1057 					&offset_into_extent, &cur);
1058 		if (ret)
1059 			break;
1060 
1061 		k = bkey_i_to_s_c(cur.k);
1062 		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
1063 
1064 		sectors = min(sectors, k.k->size - offset_into_extent);
1065 
1066 		bch2_cut_front(POS(k.k->p.inode,
1067 				   bkey_start_offset(k.k) +
1068 				   offset_into_extent),
1069 			       cur.k);
1070 		bch2_key_resize(&cur.k->k, sectors);
1071 		cur.k->k.p = iter.pos;
1072 		cur.k->k.p.offset += cur.k->k.size;
1073 
1074 		if (have_extent) {
1075 			bch2_trans_unlock(trans);
1076 			ret = bch2_fill_extent(c, info,
1077 					bkey_i_to_s_c(prev.k), 0);
1078 			if (ret)
1079 				break;
1080 		}
1081 
1082 		bkey_copy(prev.k, cur.k);
1083 		have_extent = true;
1084 
1085 		bch2_btree_iter_set_pos(&iter,
1086 			POS(iter.pos.inode, iter.pos.offset + sectors));
1087 
1088 		ret = bch2_trans_relock(trans);
1089 		if (ret)
1090 			break;
1091 	}
1092 	start = iter.pos.offset;
1093 	bch2_trans_iter_exit(trans, &iter);
1094 err:
1095 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1096 		goto retry;
1097 
1098 	if (!ret && have_extent) {
1099 		bch2_trans_unlock(trans);
1100 		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
1101 				       FIEMAP_EXTENT_LAST);
1102 	}
1103 
1104 	bch2_trans_put(trans);
1105 	bch2_bkey_buf_exit(&cur, c);
1106 	bch2_bkey_buf_exit(&prev, c);
1107 	return ret < 0 ? ret : 0;
1108 }
1109 
1110 static const struct vm_operations_struct bch_vm_ops = {
1111 	.fault		= bch2_page_fault,
1112 	.map_pages	= filemap_map_pages,
1113 	.page_mkwrite   = bch2_page_mkwrite,
1114 };
1115 
1116 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1117 {
1118 	file_accessed(file);
1119 
1120 	vma->vm_ops = &bch_vm_ops;
1121 	return 0;
1122 }
1123 
1124 /* Directories: */
1125 
1126 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1127 {
1128 	return generic_file_llseek_size(file, offset, whence,
1129 					S64_MAX, S64_MAX);
1130 }
1131 
1132 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1133 {
1134 	struct bch_inode_info *inode = file_bch_inode(file);
1135 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1136 
1137 	if (!dir_emit_dots(file, ctx))
1138 		return 0;
1139 
1140 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1141 
1142 	bch_err_fn(c, ret);
1143 	return bch2_err_class(ret);
1144 }
1145 
1146 static int bch2_open(struct inode *vinode, struct file *file)
1147 {
1148 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1149 		struct bch_inode_info *inode = to_bch_ei(vinode);
1150 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1151 
1152 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1153 		if (ret)
1154 			return ret;
1155 	}
1156 
1157 	file->f_mode |= FMODE_CAN_ODIRECT;
1158 
1159 	return generic_file_open(vinode, file);
1160 }
1161 
1162 static const struct file_operations bch_file_operations = {
1163 	.open		= bch2_open,
1164 	.llseek		= bch2_llseek,
1165 	.read_iter	= bch2_read_iter,
1166 	.write_iter	= bch2_write_iter,
1167 	.mmap		= bch2_mmap,
1168 	.get_unmapped_area = thp_get_unmapped_area,
1169 	.fsync		= bch2_fsync,
1170 	.splice_read	= filemap_splice_read,
1171 	.splice_write	= iter_file_splice_write,
1172 	.fallocate	= bch2_fallocate_dispatch,
1173 	.unlocked_ioctl = bch2_fs_file_ioctl,
1174 #ifdef CONFIG_COMPAT
1175 	.compat_ioctl	= bch2_compat_fs_ioctl,
1176 #endif
1177 	.remap_file_range = bch2_remap_file_range,
1178 };
1179 
1180 static const struct inode_operations bch_file_inode_operations = {
1181 	.getattr	= bch2_getattr,
1182 	.setattr	= bch2_setattr,
1183 	.fiemap		= bch2_fiemap,
1184 	.listxattr	= bch2_xattr_list,
1185 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1186 	.get_acl	= bch2_get_acl,
1187 	.set_acl	= bch2_set_acl,
1188 #endif
1189 };
1190 
1191 static const struct inode_operations bch_dir_inode_operations = {
1192 	.lookup		= bch2_lookup,
1193 	.create		= bch2_create,
1194 	.link		= bch2_link,
1195 	.unlink		= bch2_unlink,
1196 	.symlink	= bch2_symlink,
1197 	.mkdir		= bch2_mkdir,
1198 	.rmdir		= bch2_unlink,
1199 	.mknod		= bch2_mknod,
1200 	.rename		= bch2_rename2,
1201 	.getattr	= bch2_getattr,
1202 	.setattr	= bch2_setattr,
1203 	.tmpfile	= bch2_tmpfile,
1204 	.listxattr	= bch2_xattr_list,
1205 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1206 	.get_acl	= bch2_get_acl,
1207 	.set_acl	= bch2_set_acl,
1208 #endif
1209 };
1210 
1211 static const struct file_operations bch_dir_file_operations = {
1212 	.llseek		= bch2_dir_llseek,
1213 	.read		= generic_read_dir,
1214 	.iterate_shared	= bch2_vfs_readdir,
1215 	.fsync		= bch2_fsync,
1216 	.unlocked_ioctl = bch2_fs_file_ioctl,
1217 #ifdef CONFIG_COMPAT
1218 	.compat_ioctl	= bch2_compat_fs_ioctl,
1219 #endif
1220 };
1221 
1222 static const struct inode_operations bch_symlink_inode_operations = {
1223 	.get_link	= page_get_link,
1224 	.getattr	= bch2_getattr,
1225 	.setattr	= bch2_setattr,
1226 	.listxattr	= bch2_xattr_list,
1227 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1228 	.get_acl	= bch2_get_acl,
1229 	.set_acl	= bch2_set_acl,
1230 #endif
1231 };
1232 
1233 static const struct inode_operations bch_special_inode_operations = {
1234 	.getattr	= bch2_getattr,
1235 	.setattr	= bch2_setattr,
1236 	.listxattr	= bch2_xattr_list,
1237 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1238 	.get_acl	= bch2_get_acl,
1239 	.set_acl	= bch2_set_acl,
1240 #endif
1241 };
1242 
1243 static const struct address_space_operations bch_address_space_operations = {
1244 	.read_folio	= bch2_read_folio,
1245 	.writepages	= bch2_writepages,
1246 	.readahead	= bch2_readahead,
1247 	.dirty_folio	= filemap_dirty_folio,
1248 	.write_begin	= bch2_write_begin,
1249 	.write_end	= bch2_write_end,
1250 	.invalidate_folio = bch2_invalidate_folio,
1251 	.release_folio	= bch2_release_folio,
1252 #ifdef CONFIG_MIGRATION
1253 	.migrate_folio	= filemap_migrate_folio,
1254 #endif
1255 	.error_remove_folio = generic_error_remove_folio,
1256 };
1257 
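/*
 * Export (NFS file handle) support: handles encode the inode number,
 * subvolume and generation, plus the parent directory for the "with parent"
 * variant:
 */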
1258 struct bcachefs_fid {
1259 	u64		inum;
1260 	u32		subvol;
1261 	u32		gen;
1262 } __packed;
1263 
1264 struct bcachefs_fid_with_parent {
1265 	struct bcachefs_fid	fid;
1266 	struct bcachefs_fid	dir;
1267 } __packed;
1268 
1269 static int bcachefs_fid_valid(int fh_len, int fh_type)
1270 {
1271 	switch (fh_type) {
1272 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1273 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1274 	case FILEID_BCACHEFS_WITH_PARENT:
1275 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1276 	default:
1277 		return false;
1278 	}
1279 }
1280 
1281 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1282 {
1283 	return (struct bcachefs_fid) {
1284 		.inum	= inode->ei_inode.bi_inum,
1285 		.subvol	= inode->ei_subvol,
1286 		.gen	= inode->ei_inode.bi_generation,
1287 	};
1288 }
1289 
1290 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1291 			  struct inode *vdir)
1292 {
1293 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1294 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1295 	int min_len;
1296 
1297 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1298 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1299 
1300 		min_len = sizeof(*fid) / sizeof(u32);
1301 		if (*len < min_len) {
1302 			*len = min_len;
1303 			return FILEID_INVALID;
1304 		}
1305 
1306 		fid->fid = bch2_inode_to_fid(inode);
1307 		fid->dir = bch2_inode_to_fid(dir);
1308 
1309 		*len = min_len;
1310 		return FILEID_BCACHEFS_WITH_PARENT;
1311 	} else {
1312 		struct bcachefs_fid *fid = (void *) fh;
1313 
1314 		min_len = sizeof(*fid) / sizeof(u32);
1315 		if (*len < min_len) {
1316 			*len = min_len;
1317 			return FILEID_INVALID;
1318 		}
1319 		*fid = bch2_inode_to_fid(inode);
1320 
1321 		*len = min_len;
1322 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1323 	}
1324 }
1325 
1326 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1327 					struct bcachefs_fid fid)
1328 {
1329 	struct bch_fs *c = sb->s_fs_info;
1330 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1331 				    .subvol = fid.subvol,
1332 				    .inum = fid.inum,
1333 	});
1334 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1335 		iput(vinode);
1336 		vinode = ERR_PTR(-ESTALE);
1337 	}
1338 	return vinode;
1339 }
1340 
1341 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1342 		int fh_len, int fh_type)
1343 {
1344 	struct bcachefs_fid *fid = (void *) _fid;
1345 
1346 	if (!bcachefs_fid_valid(fh_len, fh_type))
1347 		return NULL;
1348 
1349 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1350 }
1351 
1352 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1353 		int fh_len, int fh_type)
1354 {
1355 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1356 
1357 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1358 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1359 		return NULL;
1360 
1361 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1362 }
1363 
1364 static struct dentry *bch2_get_parent(struct dentry *child)
1365 {
1366 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1367 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1368 	subvol_inum parent_inum = {
1369 		.subvol = inode->ei_inode.bi_parent_subvol ?:
1370 			inode->ei_subvol,
1371 		.inum = inode->ei_inode.bi_dir,
1372 	};
1373 
1374 	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1375 }
1376 
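/*
 * exportfs get_name: find the dirent in @parent that refers to @child.  The
 * inode's backpointer (bi_dir/bi_dir_offset) is tried first; if it points at
 * a different directory (hardlinked file), fall back to a linear scan of the
 * parent directory's dirents.
 */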
1377 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1378 {
1379 	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
1380 	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
1381 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1382 	struct btree_trans *trans;
1383 	struct btree_iter iter1;
1384 	struct btree_iter iter2;
1385 	struct bkey_s_c k;
1386 	struct bkey_s_c_dirent d;
1387 	struct bch_inode_unpacked inode_u;
1388 	subvol_inum target;
1389 	u32 snapshot;
1390 	struct qstr dirent_name;
1391 	unsigned name_len = 0;
1392 	int ret;
1393 
1394 	if (!S_ISDIR(dir->v.i_mode))
1395 		return -EINVAL;
1396 
1397 	trans = bch2_trans_get(c);
1398 
1399 	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1400 			     POS(dir->ei_inode.bi_inum, 0), 0);
1401 	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1402 			     POS(dir->ei_inode.bi_inum, 0), 0);
1403 retry:
1404 	bch2_trans_begin(trans);
1405 
1406 	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
1407 	if (ret)
1408 		goto err;
1409 
1410 	bch2_btree_iter_set_snapshot(&iter1, snapshot);
1411 	bch2_btree_iter_set_snapshot(&iter2, snapshot);
1412 
1413 	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1414 	if (ret)
1415 		goto err;
1416 
1417 	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1418 		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1419 
1420 		k = bch2_btree_iter_peek_slot(&iter1);
1421 		ret = bkey_err(k);
1422 		if (ret)
1423 			goto err;
1424 
1425 		if (k.k->type != KEY_TYPE_dirent) {
1426 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1427 			goto err;
1428 		}
1429 
1430 		d = bkey_s_c_to_dirent(k);
1431 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1432 		if (ret > 0)
1433 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1434 		if (ret)
1435 			goto err;
1436 
1437 		if (target.subvol	== inode->ei_subvol &&
1438 		    target.inum		== inode->ei_inode.bi_inum)
1439 			goto found;
1440 	} else {
1441 		/*
1442 		 * File with multiple hardlinks and our backref is to the wrong
1443 		 * directory - linear search:
1444 		 */
1445 		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1446 			if (k.k->p.inode > dir->ei_inode.bi_inum)
1447 				break;
1448 
1449 			if (k.k->type != KEY_TYPE_dirent)
1450 				continue;
1451 
1452 			d = bkey_s_c_to_dirent(k);
1453 			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1454 			if (ret < 0)
1455 				break;
1456 			if (ret)
1457 				continue;
1458 
1459 			if (target.subvol	== inode->ei_subvol &&
1460 			    target.inum		== inode->ei_inode.bi_inum)
1461 				goto found;
1462 		}
1463 	}
1464 
1465 	ret = -ENOENT;
1466 	goto err;
1467 found:
1468 	dirent_name = bch2_dirent_get_name(d);
1469 
1470 	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1471 	memcpy(name, dirent_name.name, name_len);
1472 	name[name_len] = '\0';
1473 err:
1474 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1475 		goto retry;
1476 
1477 	bch2_trans_iter_exit(trans, &iter1);
1478 	bch2_trans_iter_exit(trans, &iter2);
1479 	bch2_trans_put(trans);
1480 
1481 	return ret;
1482 }
1483 
1484 static const struct export_operations bch_export_ops = {
1485 	.encode_fh	= bch2_encode_fh,
1486 	.fh_to_dentry	= bch2_fh_to_dentry,
1487 	.fh_to_parent	= bch2_fh_to_parent,
1488 	.get_parent	= bch2_get_parent,
1489 	.get_name	= bch2_get_name,
1490 };
1491 
1492 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1493 				struct bch_inode_info *inode,
1494 				struct bch_inode_unpacked *bi,
1495 				struct bch_subvolume *subvol)
1496 {
1497 	bch2_iget5_set(&inode->v, &inum);
1498 	bch2_inode_update_after_write(trans, inode, bi, ~0);
1499 
1500 	inode->v.i_blocks	= bi->bi_sectors;
1501 	inode->v.i_ino		= bi->bi_inum;
1502 	inode->v.i_rdev		= bi->bi_dev;
1503 	inode->v.i_generation	= bi->bi_generation;
1504 	inode->v.i_size		= bi->bi_size;
1505 
1506 	inode->ei_flags		= 0;
1507 	inode->ei_quota_reserved = 0;
1508 	inode->ei_qid		= bch_qid(bi);
1509 	inode->ei_subvol	= inum.subvol;
1510 
1511 	if (BCH_SUBVOLUME_SNAP(subvol))
1512 		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1513 
1514 	inode->v.i_mapping->a_ops = &bch_address_space_operations;
1515 
1516 	switch (inode->v.i_mode & S_IFMT) {
1517 	case S_IFREG:
1518 		inode->v.i_op	= &bch_file_inode_operations;
1519 		inode->v.i_fop	= &bch_file_operations;
1520 		break;
1521 	case S_IFDIR:
1522 		inode->v.i_op	= &bch_dir_inode_operations;
1523 		inode->v.i_fop	= &bch_dir_file_operations;
1524 		break;
1525 	case S_IFLNK:
1526 		inode_nohighmem(&inode->v);
1527 		inode->v.i_op	= &bch_symlink_inode_operations;
1528 		break;
1529 	default:
1530 		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1531 		inode->v.i_op	= &bch_special_inode_operations;
1532 		break;
1533 	}
1534 
1535 	mapping_set_large_folios(inode->v.i_mapping);
1536 }
1537 
1538 static void bch2_free_inode(struct inode *vinode)
1539 {
1540 	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1541 }
1542 
1543 static int inode_update_times_fn(struct btree_trans *trans,
1544 				 struct bch_inode_info *inode,
1545 				 struct bch_inode_unpacked *bi,
1546 				 void *p)
1547 {
1548 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1549 
1550 	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1551 	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1552 	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1553 
1554 	return 0;
1555 }
1556 
1557 static int bch2_vfs_write_inode(struct inode *vinode,
1558 				struct writeback_control *wbc)
1559 {
1560 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1561 	struct bch_inode_info *inode = to_bch_ei(vinode);
1562 	int ret;
1563 
1564 	mutex_lock(&inode->ei_update_lock);
1565 	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1566 			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1567 	mutex_unlock(&inode->ei_update_lock);
1568 
1569 	return bch2_err_class(ret);
1570 }
1571 
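/*
 * Final teardown of a VFS inode: if it was unlinked (i_nlink == 0), drop its
 * quota accounting and delete the inode from the btree, then take it off the
 * per-filesystem vfs_inodes_list.
 */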
1572 static void bch2_evict_inode(struct inode *vinode)
1573 {
1574 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1575 	struct bch_inode_info *inode = to_bch_ei(vinode);
1576 
1577 	truncate_inode_pages_final(&inode->v.i_data);
1578 
1579 	clear_inode(&inode->v);
1580 
1581 	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1582 
1583 	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1584 		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1585 				KEY_TYPE_QUOTA_WARN);
1586 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1587 				KEY_TYPE_QUOTA_WARN);
1588 		bch2_inode_rm(c, inode_inum(inode));
1589 	}
1590 
1591 	mutex_lock(&c->vfs_inodes_lock);
1592 	list_del_init(&inode->ei_vfs_inode_list);
1593 	mutex_unlock(&c->vfs_inodes_lock);
1594 }
1595 
1596 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1597 {
1598 	struct bch_inode_info *inode;
1599 	DARRAY(struct bch_inode_info *) grabbed;
1600 	bool clean_pass = false, this_pass_clean;
1601 
1602 	/*
1603 	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1604 	 * be pruned with d_mark_dontcache().
1605 	 *
1606 	 * Once we've had a clean pass where we didn't find any inodes without
1607 	 * I_DONTCACHE, we wait for them to be freed:
1608 	 */
1609 
1610 	darray_init(&grabbed);
1611 	darray_make_room(&grabbed, 1024);
1612 again:
1613 	cond_resched();
1614 	this_pass_clean = true;
1615 
1616 	mutex_lock(&c->vfs_inodes_lock);
1617 	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1618 		if (!snapshot_list_has_id(s, inode->ei_subvol))
1619 			continue;
1620 
1621 		if (!(inode->v.i_state & I_DONTCACHE) &&
1622 		    !(inode->v.i_state & I_FREEING) &&
1623 		    igrab(&inode->v)) {
1624 			this_pass_clean = false;
1625 
1626 			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1627 				iput(&inode->v);
1628 				break;
1629 			}
1630 		} else if (clean_pass && this_pass_clean) {
1631 			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
1632 			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
1633 
1634 			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1635 			mutex_unlock(&c->vfs_inodes_lock);
1636 
1637 			schedule();
1638 			finish_wait(wq, &wait.wq_entry);
1639 			goto again;
1640 		}
1641 	}
1642 	mutex_unlock(&c->vfs_inodes_lock);
1643 
1644 	darray_for_each(grabbed, i) {
1645 		inode = *i;
1646 		d_mark_dontcache(&inode->v);
1647 		d_prune_aliases(&inode->v);
1648 		iput(&inode->v);
1649 	}
1650 	grabbed.nr = 0;
1651 
1652 	if (!clean_pass || !this_pass_clean) {
1653 		clean_pass = this_pass_clean;
1654 		goto again;
1655 	}
1656 
1657 	darray_exit(&grabbed);
1658 }
1659 
1660 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1661 {
1662 	struct super_block *sb = dentry->d_sb;
1663 	struct bch_fs *c = sb->s_fs_info;
1664 	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1665 	unsigned shift = sb->s_blocksize_bits - 9;
1666 	/*
1667 	 * this assumes inodes take up 64 bytes, which is a decent average
1668 	 * number:
1669 	 */
1670 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1671 
1672 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
1673 	buf->f_bsize	= sb->s_blocksize;
1674 	buf->f_blocks	= usage.capacity >> shift;
1675 	buf->f_bfree	= usage.free >> shift;
1676 	buf->f_bavail	= avail_factor(usage.free) >> shift;
1677 
1678 	buf->f_files	= usage.nr_inodes + avail_inodes;
1679 	buf->f_ffree	= avail_inodes;
1680 
1681 	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
1682 	buf->f_namelen	= BCH_NAME_MAX;
1683 
1684 	return 0;
1685 }
1686 
1687 static int bch2_sync_fs(struct super_block *sb, int wait)
1688 {
1689 	struct bch_fs *c = sb->s_fs_info;
1690 	int ret;
1691 
1692 	if (c->opts.journal_flush_disabled)
1693 		return 0;
1694 
1695 	if (!wait) {
1696 		bch2_journal_flush_async(&c->journal, NULL);
1697 		return 0;
1698 	}
1699 
1700 	ret = bch2_journal_flush(&c->journal);
1701 	return bch2_err_class(ret);
1702 }
1703 
1704 static struct bch_fs *bch2_path_to_fs(const char *path)
1705 {
1706 	struct bch_fs *c;
1707 	dev_t dev;
1708 	int ret;
1709 
1710 	ret = lookup_bdev(path, &dev);
1711 	if (ret)
1712 		return ERR_PTR(ret);
1713 
1714 	c = bch2_dev_to_fs(dev);
1715 	if (c)
1716 		closure_put(&c->cl);
1717 	return c ?: ERR_PTR(-ENOENT);
1718 }
1719 
1720 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1721 {
1722 	struct bch_fs *c = sb->s_fs_info;
1723 	struct bch_opts opts = bch2_opts_empty();
1724 	int ret;
1725 
1726 	ret = bch2_parse_mount_opts(c, &opts, data);
1727 	if (ret)
1728 		goto err;
1729 
1730 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1731 
1732 	if (opts.read_only != c->opts.read_only) {
1733 		down_write(&c->state_lock);
1734 
1735 		if (opts.read_only) {
1736 			bch2_fs_read_only(c);
1737 
1738 			sb->s_flags |= SB_RDONLY;
1739 		} else {
1740 			ret = bch2_fs_read_write(c);
1741 			if (ret) {
1742 				bch_err(c, "error going rw: %i", ret);
1743 				up_write(&c->state_lock);
1744 				ret = -EINVAL;
1745 				goto err;
1746 			}
1747 
1748 			sb->s_flags &= ~SB_RDONLY;
1749 		}
1750 
1751 		c->opts.read_only = opts.read_only;
1752 
1753 		up_write(&c->state_lock);
1754 	}
1755 
1756 	if (opt_defined(opts, errors))
1757 		c->opts.errors = opts.errors;
1758 err:
1759 	return bch2_err_class(ret);
1760 }
1761 
1762 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1763 {
1764 	struct bch_fs *c = root->d_sb->s_fs_info;
1765 	bool first = true;
1766 
1767 	for_each_online_member(c, ca) {
1768 		if (!first)
1769 			seq_putc(seq, ':');
1770 		first = false;
1771 		seq_puts(seq, ca->disk_sb.sb_name);
1772 	}
1773 
1774 	return 0;
1775 }
1776 
1777 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1778 {
1779 	struct bch_fs *c = root->d_sb->s_fs_info;
1780 	enum bch_opt_id i;
1781 	struct printbuf buf = PRINTBUF;
1782 	int ret = 0;
1783 
1784 	for (i = 0; i < bch2_opts_nr; i++) {
1785 		const struct bch_option *opt = &bch2_opt_table[i];
1786 		u64 v = bch2_opt_get_by_id(&c->opts, i);
1787 
1788 		if (!(opt->flags & OPT_MOUNT))
1789 			continue;
1790 
1791 		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1792 			continue;
1793 
1794 		printbuf_reset(&buf);
1795 		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
1796 				 OPT_SHOW_MOUNT_STYLE);
1797 		seq_putc(seq, ',');
1798 		seq_puts(seq, buf.buf);
1799 	}
1800 
1801 	if (buf.allocation_failure)
1802 		ret = -ENOMEM;
1803 	printbuf_exit(&buf);
1804 	return ret;
1805 }
1806 
1807 static void bch2_put_super(struct super_block *sb)
1808 {
1809 	struct bch_fs *c = sb->s_fs_info;
1810 
1811 	__bch2_fs_stop(c);
1812 }
1813 
1814 /*
1815  * bcachefs doesn't currently integrate with the VFS intwrite freeze protection, but the
1816  * internal write references serve the same purpose. Therefore reuse the
1817  * read-only transition code to perform the quiesce. The caveat is that we don't
1818  * currently have the ability to block tasks that want a write reference while
1819  * the superblock is frozen. This is fine for now, but we should either add
1820  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1821  */
1822 static int bch2_freeze(struct super_block *sb)
1823 {
1824 	struct bch_fs *c = sb->s_fs_info;
1825 
1826 	down_write(&c->state_lock);
1827 	bch2_fs_read_only(c);
1828 	up_write(&c->state_lock);
1829 	return 0;
1830 }
1831 
1832 static int bch2_unfreeze(struct super_block *sb)
1833 {
1834 	struct bch_fs *c = sb->s_fs_info;
1835 	int ret;
1836 
1837 	if (test_bit(BCH_FS_emergency_ro, &c->flags))
1838 		return 0;
1839 
1840 	down_write(&c->state_lock);
1841 	ret = bch2_fs_read_write(c);
1842 	up_write(&c->state_lock);
1843 	return ret;
1844 }
1845 
1846 static const struct super_operations bch_super_operations = {
1847 	.alloc_inode	= bch2_alloc_inode,
1848 	.free_inode	= bch2_free_inode,
1849 	.write_inode	= bch2_vfs_write_inode,
1850 	.evict_inode	= bch2_evict_inode,
1851 	.sync_fs	= bch2_sync_fs,
1852 	.statfs		= bch2_statfs,
1853 	.show_devname	= bch2_show_devname,
1854 	.show_options	= bch2_show_options,
1855 	.remount_fs	= bch2_remount,
1856 	.put_super	= bch2_put_super,
1857 	.freeze_fs	= bch2_freeze,
1858 	.unfreeze_fs	= bch2_unfreeze,
1859 };
1860 
1861 static int bch2_set_super(struct super_block *s, void *data)
1862 {
1863 	s->s_fs_info = data;
1864 	return 0;
1865 }
1866 
1867 static int bch2_noset_super(struct super_block *s, void *data)
1868 {
1869 	return -EBUSY;
1870 }
1871 
1872 typedef DARRAY(struct bch_fs *) darray_fs;
1873 
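/*
 * sget() test callback for mounting: @data is the list of bch_fs instances
 * the requested device paths resolved to; an existing superblock matches
 * only if every path resolved to that superblock's filesystem.
 */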
1874 static int bch2_test_super(struct super_block *s, void *data)
1875 {
1876 	struct bch_fs *c = s->s_fs_info;
1877 	darray_fs *d = data;
1878 
1879 	if (!c)
1880 		return false;
1881 
1882 	darray_for_each(*d, i)
1883 		if (c != *i)
1884 			return false;
1885 	return true;
1886 }
1887 
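/*
 * Mount: parse options, split the (possibly multi-device) device name with
 * bch2_split_devs(), and use sget() to find an existing superblock for these
 * devices; if there isn't one, open the filesystem with bch2_fs_open(),
 * reparse the options that need a running filesystem, and set up a fresh
 * superblock around it.
 */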
1888 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1889 				 int flags, const char *dev_name, void *data)
1890 {
1891 	struct bch_fs *c;
1892 	struct super_block *sb;
1893 	struct inode *vinode;
1894 	struct bch_opts opts = bch2_opts_empty();
1895 	int ret;
1896 
1897 	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1898 
1899 	ret = bch2_parse_mount_opts(NULL, &opts, data);
1900 	if (ret) {
1901 		ret = bch2_err_class(ret);
1902 		return ERR_PTR(ret);
1903 	}
1904 
1905 	if (!dev_name || strlen(dev_name) == 0)
1906 		return ERR_PTR(-EINVAL);
1907 
1908 	darray_str devs;
1909 	ret = bch2_split_devs(dev_name, &devs);
1910 	if (ret)
1911 		return ERR_PTR(ret);
1912 
1913 	darray_fs devs_to_fs = {};
1914 	darray_for_each(devs, i) {
1915 		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
1916 		if (ret) {
1917 			sb = ERR_PTR(ret);
1918 			goto got_sb;
1919 		}
1920 	}
1921 
1922 	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
1923 	if (!IS_ERR(sb))
1924 		goto got_sb;
1925 
1926 	c = bch2_fs_open(devs.data, devs.nr, opts);
1927 	if (IS_ERR(c)) {
1928 		sb = ERR_CAST(c);
1929 		goto got_sb;
1930 	}
1931 
1932 	/* Some options can't be parsed until after the fs is started: */
1933 	ret = bch2_parse_mount_opts(c, &opts, data);
1934 	if (ret) {
1935 		bch2_fs_stop(c);
1936 		sb = ERR_PTR(ret);
1937 		goto got_sb;
1938 	}
1939 
1940 	bch2_opts_apply(&c->opts, opts);
1941 
1942 	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1943 	if (IS_ERR(sb))
1944 		bch2_fs_stop(c);
1945 got_sb:
1946 	darray_exit(&devs_to_fs);
1947 	bch2_darray_str_exit(&devs);
1948 
1949 	if (IS_ERR(sb)) {
1950 		ret = PTR_ERR(sb);
1951 		goto err;
1952 	}
1953 
1954 	c = sb->s_fs_info;
1955 
1956 	if (sb->s_root) {
1957 		if ((flags ^ sb->s_flags) & SB_RDONLY) {
1958 			ret = -EBUSY;
1959 			goto err_put_super;
1960 		}
1961 		goto out;
1962 	}
1963 
1964 	sb->s_blocksize		= block_bytes(c);
1965 	sb->s_blocksize_bits	= ilog2(block_bytes(c));
1966 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
1967 	sb->s_op		= &bch_super_operations;
1968 	sb->s_export_op		= &bch_export_ops;
1969 #ifdef CONFIG_BCACHEFS_QUOTA
1970 	sb->s_qcop		= &bch2_quotactl_operations;
1971 	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1972 #endif
1973 	sb->s_xattr		= bch2_xattr_handlers;
1974 	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
1975 	sb->s_time_gran		= c->sb.nsec_per_time_unit;
1976 	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1977 	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
1978 	sb->s_uuid		= c->sb.user_uuid;
1979 	sb->s_shrink->seeks	= 0;
1980 	c->vfs_sb		= sb;
1981 	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
1982 
1983 	ret = super_setup_bdi(sb);
1984 	if (ret)
1985 		goto err_put_super;
1986 
1987 	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
1988 
1989 	for_each_online_member(c, ca) {
1990 		struct block_device *bdev = ca->disk_sb.bdev;
1991 
1992 		/* XXX: create an anonymous device for multi device filesystems */
1993 		sb->s_bdev	= bdev;
1994 		sb->s_dev	= bdev->bd_dev;
1995 		percpu_ref_put(&ca->io_ref);
1996 		break;
1997 	}
1998 
1999 	c->dev = sb->s_dev;
2000 
2001 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2002 	if (c->opts.acl)
2003 		sb->s_flags	|= SB_POSIXACL;
2004 #endif
2005 
2006 	sb->s_shrink->seeks = 0;
2007 
2008 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2009 	ret = PTR_ERR_OR_ZERO(vinode);
2010 	bch_err_msg(c, ret, "mounting: error getting root inode");
2011 	if (ret)
2012 		goto err_put_super;
2013 
2014 	sb->s_root = d_make_root(vinode);
2015 	if (!sb->s_root) {
2016 		bch_err(c, "error mounting: error allocating root dentry");
2017 		ret = -ENOMEM;
2018 		goto err_put_super;
2019 	}
2020 
2021 	sb->s_flags |= SB_ACTIVE;
2022 out:
2023 	return dget(sb->s_root);
2024 
2025 err_put_super:
2026 	__bch2_fs_stop(c);
2027 	deactivate_locked_super(sb);
2028 err:
2029 	/*
2030 	 * On an inconsistency error in recovery we might see an -EROFS derived
2031 	 * error code (from the journal), but we don't want to return that to
2032 	 * userspace as that causes util-linux to retry the mount RO - which is
2033 	 * confusing:
2034 	 */
2035 	if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2036 		ret = -EIO;
2037 	return ERR_PTR(bch2_err_class(ret));
2038 }
2039 
2040 static void bch2_kill_sb(struct super_block *sb)
2041 {
2042 	struct bch_fs *c = sb->s_fs_info;
2043 
2044 	generic_shutdown_super(sb);
2045 	bch2_fs_free(c);
2046 }
2047 
2048 static struct file_system_type bcache_fs_type = {
2049 	.owner		= THIS_MODULE,
2050 	.name		= "bcachefs",
2051 	.mount		= bch2_mount,
2052 	.kill_sb	= bch2_kill_sb,
2053 	.fs_flags	= FS_REQUIRES_DEV,
2054 };
2055 
2056 MODULE_ALIAS_FS("bcachefs");
2057 
2058 void bch2_vfs_exit(void)
2059 {
2060 	unregister_filesystem(&bcache_fs_type);
2061 	kmem_cache_destroy(bch2_inode_cache);
2062 }
2063 
2064 int __init bch2_vfs_init(void)
2065 {
2066 	int ret = -ENOMEM;
2067 
2068 	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
2069 	if (!bch2_inode_cache)
2070 		goto err;
2071 
2072 	ret = register_filesystem(&bcache_fs_type);
2073 	if (ret)
2074 		goto err;
2075 
2076 	return 0;
2077 err:
2078 	bch2_vfs_exit();
2079 	return ret;
2080 }
2081 
2082 #endif /* NO_BCACHEFS_FS */
2083