xref: /linux/fs/bcachefs/fs.c (revision 3e7819886281e077e82006fe4804b0d6b0f5643b)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "acl.h"
6 #include "bkey_buf.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "chardev.h"
10 #include "dirent.h"
11 #include "errcode.h"
12 #include "extents.h"
13 #include "fs.h"
14 #include "fs-common.h"
15 #include "fs-io.h"
16 #include "fs-ioctl.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
20 #include "fsck.h"
21 #include "inode.h"
22 #include "io_read.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "quota.h"
26 #include "snapshot.h"
27 #include "super.h"
28 #include "xattr.h"
29 
30 #include <linux/aio.h>
31 #include <linux/backing-dev.h>
32 #include <linux/exportfs.h>
33 #include <linux/fiemap.h>
34 #include <linux/module.h>
35 #include <linux/pagemap.h>
36 #include <linux/posix_acl.h>
37 #include <linux/random.h>
38 #include <linux/seq_file.h>
39 #include <linux/statfs.h>
40 #include <linux/string.h>
41 #include <linux/xattr.h>
42 
/* Slab cache that struct bch_inode_info objects are allocated from and freed to */
static struct kmem_cache *bch2_inode_cache;

/* Forward declaration: used by functions that appear before its definition */
static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
				struct bch_inode_info *,
				struct bch_inode_unpacked *,
				struct bch_subvolume *);
49 
50 void bch2_inode_update_after_write(struct btree_trans *trans,
51 				   struct bch_inode_info *inode,
52 				   struct bch_inode_unpacked *bi,
53 				   unsigned fields)
54 {
55 	struct bch_fs *c = trans->c;
56 
57 	BUG_ON(bi->bi_inum != inode->v.i_ino);
58 
59 	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
60 			       POS(0, bi->bi_inum),
61 			       c->opts.inodes_use_key_cache);
62 
63 	set_nlink(&inode->v, bch2_inode_nlink_get(bi));
64 	i_uid_write(&inode->v, bi->bi_uid);
65 	i_gid_write(&inode->v, bi->bi_gid);
66 	inode->v.i_mode	= bi->bi_mode;
67 
68 	if (fields & ATTR_ATIME)
69 		inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
70 	if (fields & ATTR_MTIME)
71 		inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
72 	if (fields & ATTR_CTIME)
73 		inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
74 
75 	inode->ei_inode		= *bi;
76 
77 	bch2_inode_flags_to_vfs(inode);
78 }
79 
80 int __must_check bch2_write_inode(struct bch_fs *c,
81 				  struct bch_inode_info *inode,
82 				  inode_set_fn set,
83 				  void *p, unsigned fields)
84 {
85 	struct btree_trans *trans = bch2_trans_get(c);
86 	struct btree_iter iter = { NULL };
87 	struct bch_inode_unpacked inode_u;
88 	int ret;
89 retry:
90 	bch2_trans_begin(trans);
91 
92 	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
93 				BTREE_ITER_intent) ?:
94 		(set ? set(trans, inode, &inode_u, p) : 0) ?:
95 		bch2_inode_write(trans, &iter, &inode_u) ?:
96 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
97 
98 	/*
99 	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100 	 * this is important for inode updates via bchfs_write_index_update
101 	 */
102 	if (!ret)
103 		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
104 
105 	bch2_trans_iter_exit(trans, &iter);
106 
107 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
108 		goto retry;
109 
110 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
111 			     "%s: inode %u:%llu not found when updating",
112 			     bch2_err_str(ret),
113 			     inode_inum(inode).subvol,
114 			     inode_inum(inode).inum);
115 
116 	bch2_trans_put(trans);
117 	return ret < 0 ? ret : 0;
118 }
119 
120 int bch2_fs_quota_transfer(struct bch_fs *c,
121 			   struct bch_inode_info *inode,
122 			   struct bch_qid new_qid,
123 			   unsigned qtypes,
124 			   enum quota_acct_mode mode)
125 {
126 	unsigned i;
127 	int ret;
128 
129 	qtypes &= enabled_qtypes(c);
130 
131 	for (i = 0; i < QTYP_NR; i++)
132 		if (new_qid.q[i] == inode->ei_qid.q[i])
133 			qtypes &= ~(1U << i);
134 
135 	if (!qtypes)
136 		return 0;
137 
138 	mutex_lock(&inode->ei_quota_lock);
139 
140 	ret = bch2_quota_transfer(c, qtypes, new_qid,
141 				  inode->ei_qid,
142 				  inode->v.i_blocks +
143 				  inode->ei_quota_reserved,
144 				  mode);
145 	if (!ret)
146 		for (i = 0; i < QTYP_NR; i++)
147 			if (qtypes & (1 << i))
148 				inode->ei_qid.q[i] = new_qid.q[i];
149 
150 	mutex_unlock(&inode->ei_quota_lock);
151 
152 	return ret;
153 }
154 
155 static int bch2_iget5_test(struct inode *vinode, void *p)
156 {
157 	struct bch_inode_info *inode = to_bch_ei(vinode);
158 	subvol_inum *inum = p;
159 
160 	return inode->ei_subvol == inum->subvol &&
161 		inode->ei_inode.bi_inum == inum->inum;
162 }
163 
164 static int bch2_iget5_set(struct inode *vinode, void *p)
165 {
166 	struct bch_inode_info *inode = to_bch_ei(vinode);
167 	subvol_inum *inum = p;
168 
169 	inode->v.i_ino		= inum->inum;
170 	inode->ei_subvol	= inum->subvol;
171 	inode->ei_inode.bi_inum	= inum->inum;
172 	return 0;
173 }
174 
175 static unsigned bch2_inode_hash(subvol_inum inum)
176 {
177 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
178 }
179 
/*
 * Insert a fully initialized inode into the VFS inode hash table.
 *
 * inode_insert5() returns either @inode itself, or a different inode with the
 * same (subvol, inum) key that was already hashed.  In the latter case @inode
 * is discarded and the existing inode is returned instead.  Otherwise @inode
 * is also added to c->vfs_inodes_list and returned.
 */
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
	subvol_inum inum = inode_inum(inode);
	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
				      bch2_inode_hash(inum),
				      bch2_iget5_test,
				      bch2_iget5_set,
				      &inum));
	BUG_ON(!old);

	if (unlikely(old != inode)) {
		/*
		 * bcachefs doesn't use I_NEW; we have no use for it since we
		 * only insert fully created inodes in the inode hash table. But
		 * discard_new_inode() expects it to be set...
		 */
		inode->v.i_flags |= I_NEW;
		/*
		 * We don't want bch2_evict_inode() to delete the inode on disk,
		 * we just raced and had another inode in cache. Normally new
		 * inodes don't have nlink == 0 - except tmpfiles do...
		 */
		set_nlink(&inode->v, 1);
		discard_new_inode(&inode->v);
		/* hand back the inode that won the race */
		inode = old;
	} else {
		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		/*
		 * Again, I_NEW makes no sense for bcachefs. This is only needed
		 * for clearing I_NEW, but since the inode was already fully
		 * created and initialized we didn't actually want
		 * inode_insert5() to set it for us.
		 */
		unlock_new_inode(&inode->v);
	}

	return inode;
}
220 
/*
 * Evaluate @_do with the memalloc flags @_flags applied via
 * memalloc_flags_save(), restore the saved state afterwards, and yield the
 * value of @_do.
 *
 * NOTE(review): the save side is memalloc_flags_save() but the restore side
 * is memalloc_noreclaim_restore(); confirm these two are an intended pair.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})
228 
/*
 * Must never be called: it BUG()s unconditionally.  In this file inodes are
 * allocated through __bch2_new_inode() instead.
 */
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}
233 
234 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
235 {
236 	struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
237 	if (!inode)
238 		return NULL;
239 
240 	inode_init_once(&inode->v);
241 	mutex_init(&inode->ei_update_lock);
242 	two_state_lock_init(&inode->ei_pagecache_lock);
243 	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
244 	inode->ei_flags = 0;
245 	mutex_init(&inode->ei_quota_lock);
246 	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
247 
248 	if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
249 		kmem_cache_free(bch2_inode_cache, inode);
250 		return NULL;
251 	}
252 
253 	return inode;
254 }
255 
256 /*
257  * Allocate a new inode, dropping/retaking btree locks if necessary:
258  */
259 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
260 {
261 	struct bch_inode_info *inode =
262 		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
263 				  __bch2_new_inode(trans->c));
264 
265 	if (unlikely(!inode)) {
266 		int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
267 		if (ret && inode) {
268 			__destroy_inode(&inode->v);
269 			kmem_cache_free(bch2_inode_cache, inode);
270 		}
271 		if (ret)
272 			return ERR_PTR(ret);
273 	}
274 
275 	return inode;
276 }
277 
278 struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
279 {
280 	struct bch_inode_info *inode =
281 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
282 					  bch2_inode_hash(inum),
283 					  bch2_iget5_test,
284 					  &inum));
285 	if (inode)
286 		return &inode->v;
287 
288 	struct btree_trans *trans = bch2_trans_get(c);
289 
290 	struct bch_inode_unpacked inode_u;
291 	struct bch_subvolume subvol;
292 	int ret = lockrestart_do(trans,
293 		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
294 		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
295 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
296 	if (!ret) {
297 		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
298 		inode = bch2_inode_insert(c, inode);
299 	}
300 	bch2_trans_put(trans);
301 
302 	return ret ? ERR_PTR(ret) : &inode->v;
303 }
304 
/*
 * Create a new inode under @dir and return its VFS inode, or an ERR_PTR().
 *
 * The name from @dentry is passed to bch2_create_trans() unless
 * BCH_CREATE_TMPFILE is set in @flags; in that case NULL is passed and
 * dir->ei_update_lock is not taken.  @mode, @rdev and @snapshot_src are
 * forwarded to bch2_create_trans().  @idmap is not referenced here.
 *
 * The ACLs and the VFS inode are allocated before the transaction starts, and
 * the ACL references obtained here are released on every exit path.
 */
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = __bch2_new_inode(c);
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	/* tmpfile creation does not take the directory's update lock */
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	/* the quota charge for one inode is only reached if the create succeeded */
	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	/* fall back to the directory's subvolume when inode_u has none set */
	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true,
				   BTREE_ITER_with_updates, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		/* undo the inode quota charge taken above */
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 */
	inode = bch2_inode_insert(c, inode);
	bch2_trans_put(trans);
err:
	/* shared exit for success and failure: drop the local ACL references */
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	/* dispose of the preallocated VFS inode */
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}
407 
408 /* methods */
409 
410 static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
411 			subvol_inum dir, struct bch_hash_info *dir_hash_info,
412 			const struct qstr *name)
413 {
414 	struct bch_fs *c = trans->c;
415 	struct btree_iter dirent_iter = {};
416 	subvol_inum inum = {};
417 	struct printbuf buf = PRINTBUF;
418 
419 	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
420 					     dir_hash_info, dir, name, 0);
421 	int ret = bkey_err(k);
422 	if (ret)
423 		return ERR_PTR(ret);
424 
425 	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
426 	if (ret > 0)
427 		ret = -ENOENT;
428 	if (ret)
429 		goto err;
430 
431 	struct bch_inode_info *inode =
432 		to_bch_ei(ilookup5_nowait(c->vfs_sb,
433 					  bch2_inode_hash(inum),
434 					  bch2_iget5_test,
435 					  &inum));
436 	if (inode)
437 		goto out;
438 
439 	struct bch_subvolume subvol;
440 	struct bch_inode_unpacked inode_u;
441 	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
442 		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
443 		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
444 
445 	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
446 				c, "dirent to missing inode:\n  %s",
447 				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
448 	if (ret)
449 		goto err;
450 
451 	/* regular files may have hardlinks: */
452 	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
453 				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
454 				    c,
455 				    "dirent points to inode that does not point back:\n  %s",
456 				    (bch2_bkey_val_to_text(&buf, c, k),
457 				     prt_printf(&buf, "\n  "),
458 				     bch2_inode_unpacked_to_text(&buf, &inode_u),
459 				     buf.buf))) {
460 		ret = -ENOENT;
461 		goto err;
462 	}
463 
464 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
465 	inode = bch2_inode_insert(c, inode);
466 out:
467 	bch2_trans_iter_exit(trans, &dirent_iter);
468 	printbuf_exit(&buf);
469 	return inode;
470 err:
471 	inode = ERR_PTR(ret);
472 	goto out;
473 }
474 
475 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
476 				  unsigned int flags)
477 {
478 	struct bch_fs *c = vdir->i_sb->s_fs_info;
479 	struct bch_inode_info *dir = to_bch_ei(vdir);
480 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
481 
482 	struct bch_inode_info *inode;
483 	bch2_trans_do(c, NULL, NULL, 0,
484 		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
485 							  &hash, &dentry->d_name)));
486 	if (IS_ERR(inode))
487 		inode = NULL;
488 
489 	return d_splice_alias(&inode->v, dentry);
490 }
491 
492 static int bch2_mknod(struct mnt_idmap *idmap,
493 		      struct inode *vdir, struct dentry *dentry,
494 		      umode_t mode, dev_t rdev)
495 {
496 	struct bch_inode_info *inode =
497 		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
498 			      (subvol_inum) { 0 }, 0);
499 
500 	if (IS_ERR(inode))
501 		return bch2_err_class(PTR_ERR(inode));
502 
503 	d_instantiate(dentry, &inode->v);
504 	return 0;
505 }
506 
507 static int bch2_create(struct mnt_idmap *idmap,
508 		       struct inode *vdir, struct dentry *dentry,
509 		       umode_t mode, bool excl)
510 {
511 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
512 }
513 
514 static int __bch2_link(struct bch_fs *c,
515 		       struct bch_inode_info *inode,
516 		       struct bch_inode_info *dir,
517 		       struct dentry *dentry)
518 {
519 	struct btree_trans *trans = bch2_trans_get(c);
520 	struct bch_inode_unpacked dir_u, inode_u;
521 	int ret;
522 
523 	mutex_lock(&inode->ei_update_lock);
524 
525 	ret = commit_do(trans, NULL, NULL, 0,
526 			bch2_link_trans(trans,
527 					inode_inum(dir),   &dir_u,
528 					inode_inum(inode), &inode_u,
529 					&dentry->d_name));
530 
531 	if (likely(!ret)) {
532 		bch2_inode_update_after_write(trans, dir, &dir_u,
533 					      ATTR_MTIME|ATTR_CTIME);
534 		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
535 	}
536 
537 	bch2_trans_put(trans);
538 	mutex_unlock(&inode->ei_update_lock);
539 	return ret;
540 }
541 
542 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
543 		     struct dentry *dentry)
544 {
545 	struct bch_fs *c = vdir->i_sb->s_fs_info;
546 	struct bch_inode_info *dir = to_bch_ei(vdir);
547 	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
548 	int ret;
549 
550 	lockdep_assert_held(&inode->v.i_rwsem);
551 
552 	ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
553 		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
554 		__bch2_link(c, inode, dir, dentry);
555 	if (unlikely(ret))
556 		return bch2_err_class(ret);
557 
558 	ihold(&inode->v);
559 	d_instantiate(dentry, &inode->v);
560 	return 0;
561 }
562 
563 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
564 		  bool deleting_snapshot)
565 {
566 	struct bch_fs *c = vdir->i_sb->s_fs_info;
567 	struct bch_inode_info *dir = to_bch_ei(vdir);
568 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
569 	struct bch_inode_unpacked dir_u, inode_u;
570 	struct btree_trans *trans = bch2_trans_get(c);
571 	int ret;
572 
573 	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
574 
575 	ret = commit_do(trans, NULL, NULL,
576 			BCH_TRANS_COMMIT_no_enospc,
577 		bch2_unlink_trans(trans,
578 				  inode_inum(dir), &dir_u,
579 				  &inode_u, &dentry->d_name,
580 				  deleting_snapshot));
581 	if (unlikely(ret))
582 		goto err;
583 
584 	bch2_inode_update_after_write(trans, dir, &dir_u,
585 				      ATTR_MTIME|ATTR_CTIME);
586 	bch2_inode_update_after_write(trans, inode, &inode_u,
587 				      ATTR_MTIME);
588 
589 	if (inode_u.bi_subvol) {
590 		/*
591 		 * Subvolume deletion is asynchronous, but we still want to tell
592 		 * the VFS that it's been deleted here:
593 		 */
594 		set_nlink(&inode->v, 0);
595 	}
596 err:
597 	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
598 	bch2_trans_put(trans);
599 
600 	return ret;
601 }
602 
603 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
604 {
605 	struct bch_inode_info *dir= to_bch_ei(vdir);
606 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
607 
608 	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
609 		__bch2_unlink(vdir, dentry, false);
610 	return bch2_err_class(ret);
611 }
612 
613 static int bch2_symlink(struct mnt_idmap *idmap,
614 			struct inode *vdir, struct dentry *dentry,
615 			const char *symname)
616 {
617 	struct bch_fs *c = vdir->i_sb->s_fs_info;
618 	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
619 	int ret;
620 
621 	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
622 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
623 	if (IS_ERR(inode))
624 		return bch2_err_class(PTR_ERR(inode));
625 
626 	inode_lock(&inode->v);
627 	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
628 	inode_unlock(&inode->v);
629 
630 	if (unlikely(ret))
631 		goto err;
632 
633 	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
634 	if (unlikely(ret))
635 		goto err;
636 
637 	ret = __bch2_link(c, inode, dir, dentry);
638 	if (unlikely(ret))
639 		goto err;
640 
641 	d_instantiate(dentry, &inode->v);
642 	return 0;
643 err:
644 	iput(&inode->v);
645 	return bch2_err_class(ret);
646 }
647 
648 static int bch2_mkdir(struct mnt_idmap *idmap,
649 		      struct inode *vdir, struct dentry *dentry, umode_t mode)
650 {
651 	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
652 }
653 
/*
 * Rename @src_dentry in @src_vdir to @dst_dentry in @dst_vdir.
 *
 * @flags may contain only RENAME_NOREPLACE and RENAME_EXCHANGE; anything else
 * yields -EINVAL.  The rename mode is EXCHANGE if RENAME_EXCHANGE is set,
 * OVERWRITE if the destination dentry already has an inode, and plain RENAME
 * otherwise.
 *
 * All involved inodes are locked with INODE_UPDATE_LOCK across the
 * transaction.  Returns 0 or an error passed through bch2_err_class(); the
 * early writeback failure is returned as is.
 */
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
		return -EINVAL;

	/* when overwriting, flush the source inode's dirty pages first */
	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	trans = bch2_trans_get(c);

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
	if (ret)
		goto err;

	/* pre-transfer project quota to the destination directory's ids */
	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	/* for an exchange the destination inode moves the other way as well */
	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	ret = commit_do(trans, NULL, NULL, 0,
			bch2_rename_trans(trans,
					  inode_inum(src_dir), &src_dir_u,
					  inode_inum(dst_dir), &dst_dir_u,
					  &src_inode_u,
					  &dst_inode_u,
					  &src_dentry->d_name,
					  &dst_dentry->d_name,
					  mode));
	if (unlikely(ret))
		goto err;

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);

	/* avoid refreshing the same directory twice */
	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	/*
	 * Reached on success and on failure: transfer project quota to the ids
	 * currently held in ei_inode, without quota checks.
	 * NOTE(review): presumably this reverts the PREALLOC transfer above
	 * when the rename failed; confirm.
	 */
	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}
765 
766 static void bch2_setattr_copy(struct mnt_idmap *idmap,
767 			      struct bch_inode_info *inode,
768 			      struct bch_inode_unpacked *bi,
769 			      struct iattr *attr)
770 {
771 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
772 	unsigned int ia_valid = attr->ia_valid;
773 
774 	if (ia_valid & ATTR_UID)
775 		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
776 	if (ia_valid & ATTR_GID)
777 		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
778 
779 	if (ia_valid & ATTR_SIZE)
780 		bi->bi_size = attr->ia_size;
781 
782 	if (ia_valid & ATTR_ATIME)
783 		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
784 	if (ia_valid & ATTR_MTIME)
785 		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
786 	if (ia_valid & ATTR_CTIME)
787 		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
788 
789 	if (ia_valid & ATTR_MODE) {
790 		umode_t mode = attr->ia_mode;
791 		kgid_t gid = ia_valid & ATTR_GID
792 			? attr->ia_gid
793 			: inode->v.i_gid;
794 
795 		if (!in_group_p(gid) &&
796 		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
797 			mode &= ~S_ISGID;
798 		bi->bi_mode = mode;
799 	}
800 }
801 
/*
 * Apply a setattr request that does not change the file size.
 *
 * With inode->ei_update_lock held: quota is first transferred to the new
 * uid/gid if either changes, then the inode is read, modified via
 * bch2_setattr_copy() and committed in a btree transaction that is retried
 * on transaction restart.  If the mode changes, bch2_acl_chmod() is called
 * too, and an ACL it hands back is installed as the cached access ACL.
 *
 * Returns 0 or an error passed through bch2_err_class().
 */
int bch2_setattr_nonsize(struct mnt_idmap *idmap,
			 struct bch_inode_info *inode,
			 struct iattr *attr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_qid qid;
	struct btree_trans *trans;
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	/* start from the current quota ids and override those being changed */
	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID)
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);

	if (attr->ia_valid & ATTR_GID)
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
	if (ret)
		goto err;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	/* drop any ACL left over from a previous, restarted attempt */
	kfree(acl);
	acl = NULL;

	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);
	if (ret)
		goto btree_err;

	bch2_setattr_copy(idmap, inode, &inode_u, attr);

	if (attr->ia_valid & ATTR_MODE) {
		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
				     inode_u.bi_mode, &acl);
		if (ret)
			goto btree_err;
	}

	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
btree_err:
	bch2_trans_iter_exit(trans, &inode_iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	if (unlikely(ret))
		goto err_trans;

	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);

	if (acl)
		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
	bch2_trans_put(trans);
err:
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}
871 
872 static int bch2_getattr(struct mnt_idmap *idmap,
873 			const struct path *path, struct kstat *stat,
874 			u32 request_mask, unsigned query_flags)
875 {
876 	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
877 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
878 
879 	stat->dev	= inode->v.i_sb->s_dev;
880 	stat->ino	= inode->v.i_ino;
881 	stat->mode	= inode->v.i_mode;
882 	stat->nlink	= inode->v.i_nlink;
883 	stat->uid	= inode->v.i_uid;
884 	stat->gid	= inode->v.i_gid;
885 	stat->rdev	= inode->v.i_rdev;
886 	stat->size	= i_size_read(&inode->v);
887 	stat->atime	= inode_get_atime(&inode->v);
888 	stat->mtime	= inode_get_mtime(&inode->v);
889 	stat->ctime	= inode_get_ctime(&inode->v);
890 	stat->blksize	= block_bytes(c);
891 	stat->blocks	= inode->v.i_blocks;
892 
893 	stat->subvol	= inode->ei_subvol;
894 	stat->result_mask |= STATX_SUBVOL;
895 
896 	if (request_mask & STATX_BTIME) {
897 		stat->result_mask |= STATX_BTIME;
898 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
899 	}
900 
901 	if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
902 		stat->attributes |= STATX_ATTR_IMMUTABLE;
903 	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
904 
905 	if (inode->ei_inode.bi_flags & BCH_INODE_append)
906 		stat->attributes |= STATX_ATTR_APPEND;
907 	stat->attributes_mask	 |= STATX_ATTR_APPEND;
908 
909 	if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
910 		stat->attributes |= STATX_ATTR_NODUMP;
911 	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
912 
913 	return 0;
914 }
915 
916 static int bch2_setattr(struct mnt_idmap *idmap,
917 			struct dentry *dentry, struct iattr *iattr)
918 {
919 	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
920 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
921 	int ret;
922 
923 	lockdep_assert_held(&inode->v.i_rwsem);
924 
925 	ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
926 		setattr_prepare(idmap, dentry, iattr);
927 	if (ret)
928 		return ret;
929 
930 	return iattr->ia_valid & ATTR_SIZE
931 		? bchfs_truncate(idmap, inode, iattr)
932 		: bch2_setattr_nonsize(idmap, inode, iattr);
933 }
934 
935 static int bch2_tmpfile(struct mnt_idmap *idmap,
936 			struct inode *vdir, struct file *file, umode_t mode)
937 {
938 	struct bch_inode_info *inode =
939 		__bch2_create(idmap, to_bch_ei(vdir),
940 			      file->f_path.dentry, mode, 0,
941 			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
942 
943 	if (IS_ERR(inode))
944 		return bch2_err_class(PTR_ERR(inode));
945 
946 	d_mark_tmpfile(file, &inode->v);
947 	d_instantiate(file->f_path.dentry, &inode->v);
948 	return finish_open_simple(file, 0);
949 }
950 
951 static int bch2_fill_extent(struct bch_fs *c,
952 			    struct fiemap_extent_info *info,
953 			    struct bkey_s_c k, unsigned flags)
954 {
955 	if (bkey_extent_is_direct_data(k.k)) {
956 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
957 		const union bch_extent_entry *entry;
958 		struct extent_ptr_decoded p;
959 		int ret;
960 
961 		if (k.k->type == KEY_TYPE_reflink_v)
962 			flags |= FIEMAP_EXTENT_SHARED;
963 
964 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
965 			int flags2 = 0;
966 			u64 offset = p.ptr.offset;
967 
968 			if (p.ptr.unwritten)
969 				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
970 
971 			if (p.crc.compression_type)
972 				flags2 |= FIEMAP_EXTENT_ENCODED;
973 			else
974 				offset += p.crc.offset;
975 
976 			if ((offset & (block_sectors(c) - 1)) ||
977 			    (k.k->size & (block_sectors(c) - 1)))
978 				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
979 
980 			ret = fiemap_fill_next_extent(info,
981 						bkey_start_offset(k.k) << 9,
982 						offset << 9,
983 						k.k->size << 9, flags|flags2);
984 			if (ret)
985 				return ret;
986 		}
987 
988 		return 0;
989 	} else if (bkey_extent_is_inline_data(k.k)) {
990 		return fiemap_fill_next_extent(info,
991 					       bkey_start_offset(k.k) << 9,
992 					       0, k.k->size << 9,
993 					       flags|
994 					       FIEMAP_EXTENT_DATA_INLINE);
995 	} else if (k.k->type == KEY_TYPE_reservation) {
996 		return fiemap_fill_next_extent(info,
997 					       bkey_start_offset(k.k) << 9,
998 					       0, k.k->size << 9,
999 					       flags|
1000 					       FIEMAP_EXTENT_DELALLOC|
1001 					       FIEMAP_EXTENT_UNWRITTEN);
1002 	} else {
1003 		BUG();
1004 	}
1005 }
1006 
/*
 * fiemap: report the extents of @vinode in the byte range [@start, @start + @len).
 *
 * The extents btree is walked in the subvolume's current snapshot.  Each
 * data or reservation key is trimmed to the part at or after the iterator
 * position, with indirect extents resolved through
 * bch2_read_indirect_extent().  Emission lags one extent behind the walk
 * (held in @prev) so that the final extent can be flagged with
 * FIEMAP_EXTENT_LAST.
 *
 * Returns 0 on success or a negative error code; positive values are
 * returned as 0.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	u32 snapshot;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	/* reject ranges whose end wraps around */
	if (start + len < start)
		return -EINVAL;

	/* from here on @start is in units of 1 << 9 bytes, like the btree offsets */
	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(ei->v.i_ino, start, snapshot), 0);

	while (!(ret = btree_trans_too_many_iters(trans)) &&
	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
	       !(ret = bkey_err(k))) {
		enum btree_id data_btree = BTREE_ID_extents;

		/* skip keys that are neither data nor reservations */
		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			break;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* trim cur to the @sectors that start at the iterator position */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		/* emit the previous extent now that we know it is not the last */
		if (have_extent) {
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));

		ret = bch2_trans_relock(trans);
		if (ret)
			break;
	}
	/* remember how far we got so that a retry resumes from here */
	start = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/* the extent still held back is the last one in the range */
	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret < 0 ? ret : 0;
}
1114 
1115 static const struct vm_operations_struct bch_vm_ops = {
1116 	.fault		= bch2_page_fault,
1117 	.map_pages	= filemap_map_pages,
1118 	.page_mkwrite   = bch2_page_mkwrite,
1119 };
1120 
1121 static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
1122 {
1123 	file_accessed(file);
1124 
1125 	vma->vm_ops = &bch_vm_ops;
1126 	return 0;
1127 }
1128 
1129 /* Directories: */
1130 
1131 static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
1132 {
1133 	return generic_file_llseek_size(file, offset, whence,
1134 					S64_MAX, S64_MAX);
1135 }
1136 
1137 static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
1138 {
1139 	struct bch_inode_info *inode = file_bch_inode(file);
1140 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1141 
1142 	if (!dir_emit_dots(file, ctx))
1143 		return 0;
1144 
1145 	int ret = bch2_readdir(c, inode_inum(inode), ctx);
1146 
1147 	bch_err_fn(c, ret);
1148 	return bch2_err_class(ret);
1149 }
1150 
1151 static int bch2_open(struct inode *vinode, struct file *file)
1152 {
1153 	if (file->f_flags & (O_WRONLY|O_RDWR)) {
1154 		struct bch_inode_info *inode = to_bch_ei(vinode);
1155 		struct bch_fs *c = inode->v.i_sb->s_fs_info;
1156 
1157 		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
1158 		if (ret)
1159 			return ret;
1160 	}
1161 
1162 	file->f_mode |= FMODE_CAN_ODIRECT;
1163 
1164 	return generic_file_open(vinode, file);
1165 }
1166 
1167 static const struct file_operations bch_file_operations = {
1168 	.open		= bch2_open,
1169 	.llseek		= bch2_llseek,
1170 	.read_iter	= bch2_read_iter,
1171 	.write_iter	= bch2_write_iter,
1172 	.mmap		= bch2_mmap,
1173 	.get_unmapped_area = thp_get_unmapped_area,
1174 	.fsync		= bch2_fsync,
1175 	.splice_read	= filemap_splice_read,
1176 	.splice_write	= iter_file_splice_write,
1177 	.fallocate	= bch2_fallocate_dispatch,
1178 	.unlocked_ioctl = bch2_fs_file_ioctl,
1179 #ifdef CONFIG_COMPAT
1180 	.compat_ioctl	= bch2_compat_fs_ioctl,
1181 #endif
1182 	.remap_file_range = bch2_remap_file_range,
1183 };
1184 
1185 static const struct inode_operations bch_file_inode_operations = {
1186 	.getattr	= bch2_getattr,
1187 	.setattr	= bch2_setattr,
1188 	.fiemap		= bch2_fiemap,
1189 	.listxattr	= bch2_xattr_list,
1190 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1191 	.get_acl	= bch2_get_acl,
1192 	.set_acl	= bch2_set_acl,
1193 #endif
1194 };
1195 
1196 static const struct inode_operations bch_dir_inode_operations = {
1197 	.lookup		= bch2_lookup,
1198 	.create		= bch2_create,
1199 	.link		= bch2_link,
1200 	.unlink		= bch2_unlink,
1201 	.symlink	= bch2_symlink,
1202 	.mkdir		= bch2_mkdir,
1203 	.rmdir		= bch2_unlink,
1204 	.mknod		= bch2_mknod,
1205 	.rename		= bch2_rename2,
1206 	.getattr	= bch2_getattr,
1207 	.setattr	= bch2_setattr,
1208 	.tmpfile	= bch2_tmpfile,
1209 	.listxattr	= bch2_xattr_list,
1210 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1211 	.get_acl	= bch2_get_acl,
1212 	.set_acl	= bch2_set_acl,
1213 #endif
1214 };
1215 
1216 static const struct file_operations bch_dir_file_operations = {
1217 	.llseek		= bch2_dir_llseek,
1218 	.read		= generic_read_dir,
1219 	.iterate_shared	= bch2_vfs_readdir,
1220 	.fsync		= bch2_fsync,
1221 	.unlocked_ioctl = bch2_fs_file_ioctl,
1222 #ifdef CONFIG_COMPAT
1223 	.compat_ioctl	= bch2_compat_fs_ioctl,
1224 #endif
1225 };
1226 
1227 static const struct inode_operations bch_symlink_inode_operations = {
1228 	.get_link	= page_get_link,
1229 	.getattr	= bch2_getattr,
1230 	.setattr	= bch2_setattr,
1231 	.listxattr	= bch2_xattr_list,
1232 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1233 	.get_acl	= bch2_get_acl,
1234 	.set_acl	= bch2_set_acl,
1235 #endif
1236 };
1237 
1238 static const struct inode_operations bch_special_inode_operations = {
1239 	.getattr	= bch2_getattr,
1240 	.setattr	= bch2_setattr,
1241 	.listxattr	= bch2_xattr_list,
1242 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1243 	.get_acl	= bch2_get_acl,
1244 	.set_acl	= bch2_set_acl,
1245 #endif
1246 };
1247 
1248 static const struct address_space_operations bch_address_space_operations = {
1249 	.read_folio	= bch2_read_folio,
1250 	.writepages	= bch2_writepages,
1251 	.readahead	= bch2_readahead,
1252 	.dirty_folio	= filemap_dirty_folio,
1253 	.write_begin	= bch2_write_begin,
1254 	.write_end	= bch2_write_end,
1255 	.invalidate_folio = bch2_invalidate_folio,
1256 	.release_folio	= bch2_release_folio,
1257 #ifdef CONFIG_MIGRATION
1258 	.migrate_folio	= filemap_migrate_folio,
1259 #endif
1260 	.error_remove_folio = generic_error_remove_folio,
1261 };
1262 
1263 struct bcachefs_fid {
1264 	u64		inum;
1265 	u32		subvol;
1266 	u32		gen;
1267 } __packed;
1268 
1269 struct bcachefs_fid_with_parent {
1270 	struct bcachefs_fid	fid;
1271 	struct bcachefs_fid	dir;
1272 } __packed;
1273 
1274 static int bcachefs_fid_valid(int fh_len, int fh_type)
1275 {
1276 	switch (fh_type) {
1277 	case FILEID_BCACHEFS_WITHOUT_PARENT:
1278 		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
1279 	case FILEID_BCACHEFS_WITH_PARENT:
1280 		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
1281 	default:
1282 		return false;
1283 	}
1284 }
1285 
1286 static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
1287 {
1288 	return (struct bcachefs_fid) {
1289 		.inum	= inode->ei_inode.bi_inum,
1290 		.subvol	= inode->ei_subvol,
1291 		.gen	= inode->ei_inode.bi_generation,
1292 	};
1293 }
1294 
1295 static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
1296 			  struct inode *vdir)
1297 {
1298 	struct bch_inode_info *inode	= to_bch_ei(vinode);
1299 	struct bch_inode_info *dir	= to_bch_ei(vdir);
1300 	int min_len;
1301 
1302 	if (!S_ISDIR(inode->v.i_mode) && dir) {
1303 		struct bcachefs_fid_with_parent *fid = (void *) fh;
1304 
1305 		min_len = sizeof(*fid) / sizeof(u32);
1306 		if (*len < min_len) {
1307 			*len = min_len;
1308 			return FILEID_INVALID;
1309 		}
1310 
1311 		fid->fid = bch2_inode_to_fid(inode);
1312 		fid->dir = bch2_inode_to_fid(dir);
1313 
1314 		*len = min_len;
1315 		return FILEID_BCACHEFS_WITH_PARENT;
1316 	} else {
1317 		struct bcachefs_fid *fid = (void *) fh;
1318 
1319 		min_len = sizeof(*fid) / sizeof(u32);
1320 		if (*len < min_len) {
1321 			*len = min_len;
1322 			return FILEID_INVALID;
1323 		}
1324 		*fid = bch2_inode_to_fid(inode);
1325 
1326 		*len = min_len;
1327 		return FILEID_BCACHEFS_WITHOUT_PARENT;
1328 	}
1329 }
1330 
1331 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
1332 					struct bcachefs_fid fid)
1333 {
1334 	struct bch_fs *c = sb->s_fs_info;
1335 	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
1336 				    .subvol = fid.subvol,
1337 				    .inum = fid.inum,
1338 	});
1339 	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
1340 		iput(vinode);
1341 		vinode = ERR_PTR(-ESTALE);
1342 	}
1343 	return vinode;
1344 }
1345 
1346 static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
1347 		int fh_len, int fh_type)
1348 {
1349 	struct bcachefs_fid *fid = (void *) _fid;
1350 
1351 	if (!bcachefs_fid_valid(fh_len, fh_type))
1352 		return NULL;
1353 
1354 	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
1355 }
1356 
1357 static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
1358 		int fh_len, int fh_type)
1359 {
1360 	struct bcachefs_fid_with_parent *fid = (void *) _fid;
1361 
1362 	if (!bcachefs_fid_valid(fh_len, fh_type) ||
1363 	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
1364 		return NULL;
1365 
1366 	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
1367 }
1368 
1369 static struct dentry *bch2_get_parent(struct dentry *child)
1370 {
1371 	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
1372 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1373 	subvol_inum parent_inum = {
1374 		.subvol = inode->ei_inode.bi_parent_subvol ?:
1375 			inode->ei_subvol,
1376 		.inum = inode->ei_inode.bi_dir,
1377 	};
1378 
1379 	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
1380 }
1381 
1382 static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
1383 {
1384 	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
1385 	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
1386 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1387 	struct btree_trans *trans;
1388 	struct btree_iter iter1;
1389 	struct btree_iter iter2;
1390 	struct bkey_s_c k;
1391 	struct bkey_s_c_dirent d;
1392 	struct bch_inode_unpacked inode_u;
1393 	subvol_inum target;
1394 	u32 snapshot;
1395 	struct qstr dirent_name;
1396 	unsigned name_len = 0;
1397 	int ret;
1398 
1399 	if (!S_ISDIR(dir->v.i_mode))
1400 		return -EINVAL;
1401 
1402 	trans = bch2_trans_get(c);
1403 
1404 	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
1405 			     POS(dir->ei_inode.bi_inum, 0), 0);
1406 	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
1407 			     POS(dir->ei_inode.bi_inum, 0), 0);
1408 retry:
1409 	bch2_trans_begin(trans);
1410 
1411 	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
1412 	if (ret)
1413 		goto err;
1414 
1415 	bch2_btree_iter_set_snapshot(&iter1, snapshot);
1416 	bch2_btree_iter_set_snapshot(&iter2, snapshot);
1417 
1418 	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
1419 	if (ret)
1420 		goto err;
1421 
1422 	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
1423 		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
1424 
1425 		k = bch2_btree_iter_peek_slot(&iter1);
1426 		ret = bkey_err(k);
1427 		if (ret)
1428 			goto err;
1429 
1430 		if (k.k->type != KEY_TYPE_dirent) {
1431 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1432 			goto err;
1433 		}
1434 
1435 		d = bkey_s_c_to_dirent(k);
1436 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1437 		if (ret > 0)
1438 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1439 		if (ret)
1440 			goto err;
1441 
1442 		if (target.subvol	== inode->ei_subvol &&
1443 		    target.inum		== inode->ei_inode.bi_inum)
1444 			goto found;
1445 	} else {
1446 		/*
1447 		 * File with multiple hardlinks and our backref is to the wrong
1448 		 * directory - linear search:
1449 		 */
1450 		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
1451 			if (k.k->p.inode > dir->ei_inode.bi_inum)
1452 				break;
1453 
1454 			if (k.k->type != KEY_TYPE_dirent)
1455 				continue;
1456 
1457 			d = bkey_s_c_to_dirent(k);
1458 			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
1459 			if (ret < 0)
1460 				break;
1461 			if (ret)
1462 				continue;
1463 
1464 			if (target.subvol	== inode->ei_subvol &&
1465 			    target.inum		== inode->ei_inode.bi_inum)
1466 				goto found;
1467 		}
1468 	}
1469 
1470 	ret = -ENOENT;
1471 	goto err;
1472 found:
1473 	dirent_name = bch2_dirent_get_name(d);
1474 
1475 	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
1476 	memcpy(name, dirent_name.name, name_len);
1477 	name[name_len] = '\0';
1478 err:
1479 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1480 		goto retry;
1481 
1482 	bch2_trans_iter_exit(trans, &iter1);
1483 	bch2_trans_iter_exit(trans, &iter2);
1484 	bch2_trans_put(trans);
1485 
1486 	return ret;
1487 }
1488 
1489 static const struct export_operations bch_export_ops = {
1490 	.encode_fh	= bch2_encode_fh,
1491 	.fh_to_dentry	= bch2_fh_to_dentry,
1492 	.fh_to_parent	= bch2_fh_to_parent,
1493 	.get_parent	= bch2_get_parent,
1494 	.get_name	= bch2_get_name,
1495 };
1496 
1497 static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
1498 				struct bch_inode_info *inode,
1499 				struct bch_inode_unpacked *bi,
1500 				struct bch_subvolume *subvol)
1501 {
1502 	bch2_iget5_set(&inode->v, &inum);
1503 	bch2_inode_update_after_write(trans, inode, bi, ~0);
1504 
1505 	inode->v.i_blocks	= bi->bi_sectors;
1506 	inode->v.i_ino		= bi->bi_inum;
1507 	inode->v.i_rdev		= bi->bi_dev;
1508 	inode->v.i_generation	= bi->bi_generation;
1509 	inode->v.i_size		= bi->bi_size;
1510 
1511 	inode->ei_flags		= 0;
1512 	inode->ei_quota_reserved = 0;
1513 	inode->ei_qid		= bch_qid(bi);
1514 	inode->ei_subvol	= inum.subvol;
1515 
1516 	if (BCH_SUBVOLUME_SNAP(subvol))
1517 		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
1518 
1519 	inode->v.i_mapping->a_ops = &bch_address_space_operations;
1520 
1521 	switch (inode->v.i_mode & S_IFMT) {
1522 	case S_IFREG:
1523 		inode->v.i_op	= &bch_file_inode_operations;
1524 		inode->v.i_fop	= &bch_file_operations;
1525 		break;
1526 	case S_IFDIR:
1527 		inode->v.i_op	= &bch_dir_inode_operations;
1528 		inode->v.i_fop	= &bch_dir_file_operations;
1529 		break;
1530 	case S_IFLNK:
1531 		inode_nohighmem(&inode->v);
1532 		inode->v.i_op	= &bch_symlink_inode_operations;
1533 		break;
1534 	default:
1535 		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
1536 		inode->v.i_op	= &bch_special_inode_operations;
1537 		break;
1538 	}
1539 
1540 	mapping_set_large_folios(inode->v.i_mapping);
1541 }
1542 
1543 static void bch2_free_inode(struct inode *vinode)
1544 {
1545 	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
1546 }
1547 
1548 static int inode_update_times_fn(struct btree_trans *trans,
1549 				 struct bch_inode_info *inode,
1550 				 struct bch_inode_unpacked *bi,
1551 				 void *p)
1552 {
1553 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
1554 
1555 	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
1556 	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
1557 	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
1558 
1559 	return 0;
1560 }
1561 
1562 static int bch2_vfs_write_inode(struct inode *vinode,
1563 				struct writeback_control *wbc)
1564 {
1565 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1566 	struct bch_inode_info *inode = to_bch_ei(vinode);
1567 	int ret;
1568 
1569 	mutex_lock(&inode->ei_update_lock);
1570 	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
1571 			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
1572 	mutex_unlock(&inode->ei_update_lock);
1573 
1574 	return bch2_err_class(ret);
1575 }
1576 
1577 static void bch2_evict_inode(struct inode *vinode)
1578 {
1579 	struct bch_fs *c = vinode->i_sb->s_fs_info;
1580 	struct bch_inode_info *inode = to_bch_ei(vinode);
1581 
1582 	truncate_inode_pages_final(&inode->v.i_data);
1583 
1584 	clear_inode(&inode->v);
1585 
1586 	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
1587 
1588 	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
1589 		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
1590 				KEY_TYPE_QUOTA_WARN);
1591 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
1592 				KEY_TYPE_QUOTA_WARN);
1593 		bch2_inode_rm(c, inode_inum(inode));
1594 	}
1595 
1596 	mutex_lock(&c->vfs_inodes_lock);
1597 	list_del_init(&inode->ei_vfs_inode_list);
1598 	mutex_unlock(&c->vfs_inodes_lock);
1599 }
1600 
1601 void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
1602 {
1603 	struct bch_inode_info *inode;
1604 	DARRAY(struct bch_inode_info *) grabbed;
1605 	bool clean_pass = false, this_pass_clean;
1606 
1607 	/*
1608 	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1609 	 * be pruned with d_mark_dontcache().
1610 	 *
1611 	 * Once we've had a clean pass where we didn't find any inodes without
1612 	 * I_DONTCACHE, we wait for them to be freed:
1613 	 */
1614 
1615 	darray_init(&grabbed);
1616 	darray_make_room(&grabbed, 1024);
1617 again:
1618 	cond_resched();
1619 	this_pass_clean = true;
1620 
1621 	mutex_lock(&c->vfs_inodes_lock);
1622 	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
1623 		if (!snapshot_list_has_id(s, inode->ei_subvol))
1624 			continue;
1625 
1626 		if (!(inode->v.i_state & I_DONTCACHE) &&
1627 		    !(inode->v.i_state & I_FREEING) &&
1628 		    igrab(&inode->v)) {
1629 			this_pass_clean = false;
1630 
1631 			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
1632 				iput(&inode->v);
1633 				break;
1634 			}
1635 		} else if (clean_pass && this_pass_clean) {
1636 			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
1637 			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
1638 
1639 			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
1640 			mutex_unlock(&c->vfs_inodes_lock);
1641 
1642 			schedule();
1643 			finish_wait(wq, &wait.wq_entry);
1644 			goto again;
1645 		}
1646 	}
1647 	mutex_unlock(&c->vfs_inodes_lock);
1648 
1649 	darray_for_each(grabbed, i) {
1650 		inode = *i;
1651 		d_mark_dontcache(&inode->v);
1652 		d_prune_aliases(&inode->v);
1653 		iput(&inode->v);
1654 	}
1655 	grabbed.nr = 0;
1656 
1657 	if (!clean_pass || !this_pass_clean) {
1658 		clean_pass = this_pass_clean;
1659 		goto again;
1660 	}
1661 
1662 	darray_exit(&grabbed);
1663 }
1664 
1665 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
1666 {
1667 	struct super_block *sb = dentry->d_sb;
1668 	struct bch_fs *c = sb->s_fs_info;
1669 	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
1670 	unsigned shift = sb->s_blocksize_bits - 9;
1671 	/*
1672 	 * this assumes inodes take up 64 bytes, which is a decent average
1673 	 * number:
1674 	 */
1675 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
1676 
1677 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
1678 	buf->f_bsize	= sb->s_blocksize;
1679 	buf->f_blocks	= usage.capacity >> shift;
1680 	buf->f_bfree	= usage.free >> shift;
1681 	buf->f_bavail	= avail_factor(usage.free) >> shift;
1682 
1683 	buf->f_files	= usage.nr_inodes + avail_inodes;
1684 	buf->f_ffree	= avail_inodes;
1685 
1686 	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
1687 	buf->f_namelen	= BCH_NAME_MAX;
1688 
1689 	return 0;
1690 }
1691 
1692 static int bch2_sync_fs(struct super_block *sb, int wait)
1693 {
1694 	struct bch_fs *c = sb->s_fs_info;
1695 	int ret;
1696 
1697 	if (c->opts.journal_flush_disabled)
1698 		return 0;
1699 
1700 	if (!wait) {
1701 		bch2_journal_flush_async(&c->journal, NULL);
1702 		return 0;
1703 	}
1704 
1705 	ret = bch2_journal_flush(&c->journal);
1706 	return bch2_err_class(ret);
1707 }
1708 
1709 static struct bch_fs *bch2_path_to_fs(const char *path)
1710 {
1711 	struct bch_fs *c;
1712 	dev_t dev;
1713 	int ret;
1714 
1715 	ret = lookup_bdev(path, &dev);
1716 	if (ret)
1717 		return ERR_PTR(ret);
1718 
1719 	c = bch2_dev_to_fs(dev);
1720 	if (c)
1721 		closure_put(&c->cl);
1722 	return c ?: ERR_PTR(-ENOENT);
1723 }
1724 
1725 static int bch2_remount(struct super_block *sb, int *flags, char *data)
1726 {
1727 	struct bch_fs *c = sb->s_fs_info;
1728 	struct bch_opts opts = bch2_opts_empty();
1729 	int ret;
1730 
1731 	ret = bch2_parse_mount_opts(c, &opts, data);
1732 	if (ret)
1733 		goto err;
1734 
1735 	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
1736 
1737 	if (opts.read_only != c->opts.read_only) {
1738 		down_write(&c->state_lock);
1739 
1740 		if (opts.read_only) {
1741 			bch2_fs_read_only(c);
1742 
1743 			sb->s_flags |= SB_RDONLY;
1744 		} else {
1745 			ret = bch2_fs_read_write(c);
1746 			if (ret) {
1747 				bch_err(c, "error going rw: %i", ret);
1748 				up_write(&c->state_lock);
1749 				ret = -EINVAL;
1750 				goto err;
1751 			}
1752 
1753 			sb->s_flags &= ~SB_RDONLY;
1754 		}
1755 
1756 		c->opts.read_only = opts.read_only;
1757 
1758 		up_write(&c->state_lock);
1759 	}
1760 
1761 	if (opt_defined(opts, errors))
1762 		c->opts.errors = opts.errors;
1763 err:
1764 	return bch2_err_class(ret);
1765 }
1766 
1767 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
1768 {
1769 	struct bch_fs *c = root->d_sb->s_fs_info;
1770 	bool first = true;
1771 
1772 	for_each_online_member(c, ca) {
1773 		if (!first)
1774 			seq_putc(seq, ':');
1775 		first = false;
1776 		seq_puts(seq, ca->disk_sb.sb_name);
1777 	}
1778 
1779 	return 0;
1780 }
1781 
1782 static int bch2_show_options(struct seq_file *seq, struct dentry *root)
1783 {
1784 	struct bch_fs *c = root->d_sb->s_fs_info;
1785 	enum bch_opt_id i;
1786 	struct printbuf buf = PRINTBUF;
1787 	int ret = 0;
1788 
1789 	for (i = 0; i < bch2_opts_nr; i++) {
1790 		const struct bch_option *opt = &bch2_opt_table[i];
1791 		u64 v = bch2_opt_get_by_id(&c->opts, i);
1792 
1793 		if (!(opt->flags & OPT_MOUNT))
1794 			continue;
1795 
1796 		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
1797 			continue;
1798 
1799 		printbuf_reset(&buf);
1800 		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
1801 				 OPT_SHOW_MOUNT_STYLE);
1802 		seq_putc(seq, ',');
1803 		seq_puts(seq, buf.buf);
1804 	}
1805 
1806 	if (buf.allocation_failure)
1807 		ret = -ENOMEM;
1808 	printbuf_exit(&buf);
1809 	return ret;
1810 }
1811 
1812 static void bch2_put_super(struct super_block *sb)
1813 {
1814 	struct bch_fs *c = sb->s_fs_info;
1815 
1816 	__bch2_fs_stop(c);
1817 }
1818 
1819 /*
1820  * bcachefs doesn't currently integrate intwrite freeze protection but the
1821  * internal write references serve the same purpose. Therefore reuse the
1822  * read-only transition code to perform the quiesce. The caveat is that we don't
1823  * currently have the ability to block tasks that want a write reference while
1824  * the superblock is frozen. This is fine for now, but we should either add
1825  * blocking support or find a way to integrate sb_start_intwrite() and friends.
1826  */
1827 static int bch2_freeze(struct super_block *sb)
1828 {
1829 	struct bch_fs *c = sb->s_fs_info;
1830 
1831 	down_write(&c->state_lock);
1832 	bch2_fs_read_only(c);
1833 	up_write(&c->state_lock);
1834 	return 0;
1835 }
1836 
1837 static int bch2_unfreeze(struct super_block *sb)
1838 {
1839 	struct bch_fs *c = sb->s_fs_info;
1840 	int ret;
1841 
1842 	if (test_bit(BCH_FS_emergency_ro, &c->flags))
1843 		return 0;
1844 
1845 	down_write(&c->state_lock);
1846 	ret = bch2_fs_read_write(c);
1847 	up_write(&c->state_lock);
1848 	return ret;
1849 }
1850 
1851 static const struct super_operations bch_super_operations = {
1852 	.alloc_inode	= bch2_alloc_inode,
1853 	.free_inode	= bch2_free_inode,
1854 	.write_inode	= bch2_vfs_write_inode,
1855 	.evict_inode	= bch2_evict_inode,
1856 	.sync_fs	= bch2_sync_fs,
1857 	.statfs		= bch2_statfs,
1858 	.show_devname	= bch2_show_devname,
1859 	.show_options	= bch2_show_options,
1860 	.remount_fs	= bch2_remount,
1861 	.put_super	= bch2_put_super,
1862 	.freeze_fs	= bch2_freeze,
1863 	.unfreeze_fs	= bch2_unfreeze,
1864 };
1865 
1866 static int bch2_set_super(struct super_block *s, void *data)
1867 {
1868 	s->s_fs_info = data;
1869 	return 0;
1870 }
1871 
1872 static int bch2_noset_super(struct super_block *s, void *data)
1873 {
1874 	return -EBUSY;
1875 }
1876 
1877 typedef DARRAY(struct bch_fs *) darray_fs;
1878 
1879 static int bch2_test_super(struct super_block *s, void *data)
1880 {
1881 	struct bch_fs *c = s->s_fs_info;
1882 	darray_fs *d = data;
1883 
1884 	if (!c)
1885 		return false;
1886 
1887 	darray_for_each(*d, i)
1888 		if (c != *i)
1889 			return false;
1890 	return true;
1891 }
1892 
1893 static struct dentry *bch2_mount(struct file_system_type *fs_type,
1894 				 int flags, const char *dev_name, void *data)
1895 {
1896 	struct bch_fs *c;
1897 	struct super_block *sb;
1898 	struct inode *vinode;
1899 	struct bch_opts opts = bch2_opts_empty();
1900 	int ret;
1901 
1902 	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
1903 
1904 	ret = bch2_parse_mount_opts(NULL, &opts, data);
1905 	if (ret) {
1906 		ret = bch2_err_class(ret);
1907 		return ERR_PTR(ret);
1908 	}
1909 
1910 	if (!dev_name || strlen(dev_name) == 0)
1911 		return ERR_PTR(-EINVAL);
1912 
1913 	darray_str devs;
1914 	ret = bch2_split_devs(dev_name, &devs);
1915 	if (ret)
1916 		return ERR_PTR(ret);
1917 
1918 	darray_fs devs_to_fs = {};
1919 	darray_for_each(devs, i) {
1920 		ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
1921 		if (ret) {
1922 			sb = ERR_PTR(ret);
1923 			goto got_sb;
1924 		}
1925 	}
1926 
1927 	sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
1928 	if (!IS_ERR(sb))
1929 		goto got_sb;
1930 
1931 	c = bch2_fs_open(devs.data, devs.nr, opts);
1932 	if (IS_ERR(c)) {
1933 		sb = ERR_CAST(c);
1934 		goto got_sb;
1935 	}
1936 
1937 	/* Some options can't be parsed until after the fs is started: */
1938 	ret = bch2_parse_mount_opts(c, &opts, data);
1939 	if (ret) {
1940 		bch2_fs_stop(c);
1941 		sb = ERR_PTR(ret);
1942 		goto got_sb;
1943 	}
1944 
1945 	bch2_opts_apply(&c->opts, opts);
1946 
1947 	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
1948 	if (IS_ERR(sb))
1949 		bch2_fs_stop(c);
1950 got_sb:
1951 	darray_exit(&devs_to_fs);
1952 	bch2_darray_str_exit(&devs);
1953 
1954 	if (IS_ERR(sb)) {
1955 		ret = PTR_ERR(sb);
1956 		goto err;
1957 	}
1958 
1959 	c = sb->s_fs_info;
1960 
1961 	if (sb->s_root) {
1962 		if ((flags ^ sb->s_flags) & SB_RDONLY) {
1963 			ret = -EBUSY;
1964 			goto err_put_super;
1965 		}
1966 		goto out;
1967 	}
1968 
1969 	sb->s_blocksize		= block_bytes(c);
1970 	sb->s_blocksize_bits	= ilog2(block_bytes(c));
1971 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
1972 	sb->s_op		= &bch_super_operations;
1973 	sb->s_export_op		= &bch_export_ops;
1974 #ifdef CONFIG_BCACHEFS_QUOTA
1975 	sb->s_qcop		= &bch2_quotactl_operations;
1976 	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
1977 #endif
1978 	sb->s_xattr		= bch2_xattr_handlers;
1979 	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
1980 	sb->s_time_gran		= c->sb.nsec_per_time_unit;
1981 	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
1982 	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
1983 	sb->s_uuid		= c->sb.user_uuid;
1984 	sb->s_shrink->seeks	= 0;
1985 	c->vfs_sb		= sb;
1986 	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
1987 
1988 	ret = super_setup_bdi(sb);
1989 	if (ret)
1990 		goto err_put_super;
1991 
1992 	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
1993 
1994 	for_each_online_member(c, ca) {
1995 		struct block_device *bdev = ca->disk_sb.bdev;
1996 
1997 		/* XXX: create an anonymous device for multi device filesystems */
1998 		sb->s_bdev	= bdev;
1999 		sb->s_dev	= bdev->bd_dev;
2000 		percpu_ref_put(&ca->io_ref);
2001 		break;
2002 	}
2003 
2004 	c->dev = sb->s_dev;
2005 
2006 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2007 	if (c->opts.acl)
2008 		sb->s_flags	|= SB_POSIXACL;
2009 #endif
2010 
2011 	sb->s_shrink->seeks = 0;
2012 
2013 	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
2014 	ret = PTR_ERR_OR_ZERO(vinode);
2015 	bch_err_msg(c, ret, "mounting: error getting root inode");
2016 	if (ret)
2017 		goto err_put_super;
2018 
2019 	sb->s_root = d_make_root(vinode);
2020 	if (!sb->s_root) {
2021 		bch_err(c, "error mounting: error allocating root dentry");
2022 		ret = -ENOMEM;
2023 		goto err_put_super;
2024 	}
2025 
2026 	sb->s_flags |= SB_ACTIVE;
2027 out:
2028 	return dget(sb->s_root);
2029 
2030 err_put_super:
2031 	__bch2_fs_stop(c);
2032 	deactivate_locked_super(sb);
2033 err:
2034 	if (ret)
2035 		pr_err("error: %s", bch2_err_str(ret));
2036 	/*
2037 	 * On an inconsistency error in recovery we might see an -EROFS derived
2038 	 * errorcode (from the journal), but we don't want to return that to
2039 	 * userspace as that causes util-linux to retry the mount RO - which is
2040 	 * confusing:
2041 	 */
2042 	if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
2043 		ret = -EIO;
2044 	return ERR_PTR(bch2_err_class(ret));
2045 }
2046 
2047 static void bch2_kill_sb(struct super_block *sb)
2048 {
2049 	struct bch_fs *c = sb->s_fs_info;
2050 
2051 	generic_shutdown_super(sb);
2052 	bch2_fs_free(c);
2053 }
2054 
2055 static struct file_system_type bcache_fs_type = {
2056 	.owner		= THIS_MODULE,
2057 	.name		= "bcachefs",
2058 	.mount		= bch2_mount,
2059 	.kill_sb	= bch2_kill_sb,
2060 	.fs_flags	= FS_REQUIRES_DEV,
2061 };
2062 
2063 MODULE_ALIAS_FS("bcachefs");
2064 
2065 void bch2_vfs_exit(void)
2066 {
2067 	unregister_filesystem(&bcache_fs_type);
2068 	kmem_cache_destroy(bch2_inode_cache);
2069 }
2070 
2071 int __init bch2_vfs_init(void)
2072 {
2073 	int ret = -ENOMEM;
2074 
2075 	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
2076 	if (!bch2_inode_cache)
2077 		goto err;
2078 
2079 	ret = register_filesystem(&bcache_fs_type);
2080 	if (ret)
2081 		goto err;
2082 
2083 	return 0;
2084 err:
2085 	bch2_vfs_exit();
2086 	return ret;
2087 }
2088 
2089 #endif /* NO_BCACHEFS_FS */
2090