xref: /linux/fs/libfs.c (revision 2241f81c91f211b512bd2c3a26a4a74258d0e008)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *	fs/libfs.c
4  *	Library for filesystems writers.
5  */
6 
7 #include <linux/blkdev.h>
8 #include <linux/export.h>
9 #include <linux/pagemap.h>
10 #include <linux/slab.h>
11 #include <linux/cred.h>
12 #include <linux/mount.h>
13 #include <linux/vfs.h>
14 #include <linux/quotaops.h>
15 #include <linux/mutex.h>
16 #include <linux/namei.h>
17 #include <linux/exportfs.h>
18 #include <linux/iversion.h>
19 #include <linux/writeback.h>
20 #include <linux/buffer_head.h> /* sync_mapping_buffers */
21 #include <linux/fs_context.h>
22 #include <linux/pseudo_fs.h>
23 #include <linux/fsnotify.h>
24 #include <linux/unicode.h>
25 #include <linux/fscrypt.h>
26 
27 #include <linux/uaccess.h>
28 
29 #include "internal.h"
30 
31 int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
32 		   struct kstat *stat, u32 request_mask,
33 		   unsigned int query_flags)
34 {
35 	struct inode *inode = d_inode(path->dentry);
36 	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
37 	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
38 	return 0;
39 }
40 EXPORT_SYMBOL(simple_getattr);
41 
42 int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
43 {
44 	u64 id = huge_encode_dev(dentry->d_sb->s_dev);
45 
46 	buf->f_fsid = u64_to_fsid(id);
47 	buf->f_type = dentry->d_sb->s_magic;
48 	buf->f_bsize = PAGE_SIZE;
49 	buf->f_namelen = NAME_MAX;
50 	return 0;
51 }
52 EXPORT_SYMBOL(simple_statfs);
53 
/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 *
 * ->d_delete callback that returns 1 unconditionally; @dentry is not
 * inspected.
 */
int always_delete_dentry(const struct dentry *dentry)
{
	return 1;
}
EXPORT_SYMBOL(always_delete_dentry);
63 
/* Dentry operations whose only hook is ->d_delete = always_delete_dentry. */
const struct dentry_operations simple_dentry_operations = {
	.d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);
68 
69 /*
70  * Lookup the data. This is trivial - if the dentry didn't already
71  * exist, we know it is negative.  Set d_op to delete negative dentries.
72  */
73 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
74 {
75 	if (dentry->d_name.len > NAME_MAX)
76 		return ERR_PTR(-ENAMETOOLONG);
77 	if (!dentry->d_sb->s_d_op)
78 		d_set_d_op(dentry, &simple_dentry_operations);
79 	d_add(dentry, NULL);
80 	return NULL;
81 }
82 EXPORT_SYMBOL(simple_lookup);
83 
84 int dcache_dir_open(struct inode *inode, struct file *file)
85 {
86 	file->private_data = d_alloc_cursor(file->f_path.dentry);
87 
88 	return file->private_data ? 0 : -ENOMEM;
89 }
90 EXPORT_SYMBOL(dcache_dir_open);
91 
92 int dcache_dir_close(struct inode *inode, struct file *file)
93 {
94 	dput(file->private_data);
95 	return 0;
96 }
97 EXPORT_SYMBOL(dcache_dir_close);
98 
/* parent is locked at least shared */
/*
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
 *
 * @cursor: this file's cursor dentry; its d_parent is the directory scanned
 * @p:      list position to start after (an entry of the parent's d_subdirs,
 *          or the list head itself)
 * @count:  how many positive entries to advance by
 * @last:   dentry reference dropped with dput() before returning
 */
static struct dentry *scan_positives(struct dentry *cursor,
					struct list_head *p,
					loff_t count,
					struct dentry *last)
{
	struct dentry *dentry = cursor->d_parent, *found = NULL;

	spin_lock(&dentry->d_lock);
	/* walk forward until we wrap around to the list head */
	while ((p = p->next) != &dentry->d_subdirs) {
		struct dentry *d = list_entry(p, struct dentry, d_child);
		// we must at least skip cursors, to avoid livelocks
		if (d->d_flags & DCACHE_DENTRY_CURSOR)
			continue;
		if (simple_positive(d) && !--count) {
			/* re-check under the child's own lock before grabbing it */
			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
			if (simple_positive(d))
				found = dget_dlock(d);
			spin_unlock(&d->d_lock);
			if (likely(found))
				break;
			/* the re-check failed: settle for the next positive entry */
			count = 1;
		}
		if (need_resched()) {
			/*
			 * Park the cursor right after the current entry and
			 * continue from the cursor once the parent's lock has
			 * been dropped and retaken around cond_resched().
			 */
			list_move(&cursor->d_child, p);
			p = &cursor->d_child;
			spin_unlock(&dentry->d_lock);
			cond_resched();
			spin_lock(&dentry->d_lock);
		}
	}
	spin_unlock(&dentry->d_lock);
	/* release the caller's previous reference outside the spinlock */
	dput(last);
	return found;
}
140 
141 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
142 {
143 	struct dentry *dentry = file->f_path.dentry;
144 	switch (whence) {
145 		case 1:
146 			offset += file->f_pos;
147 			fallthrough;
148 		case 0:
149 			if (offset >= 0)
150 				break;
151 			fallthrough;
152 		default:
153 			return -EINVAL;
154 	}
155 	if (offset != file->f_pos) {
156 		struct dentry *cursor = file->private_data;
157 		struct dentry *to = NULL;
158 
159 		inode_lock_shared(dentry->d_inode);
160 
161 		if (offset > 2)
162 			to = scan_positives(cursor, &dentry->d_subdirs,
163 					    offset - 2, NULL);
164 		spin_lock(&dentry->d_lock);
165 		if (to)
166 			list_move(&cursor->d_child, &to->d_child);
167 		else
168 			list_del_init(&cursor->d_child);
169 		spin_unlock(&dentry->d_lock);
170 		dput(to);
171 
172 		file->f_pos = offset;
173 
174 		inode_unlock_shared(dentry->d_inode);
175 	}
176 	return offset;
177 }
178 EXPORT_SYMBOL(dcache_dir_lseek);
179 
/*
 * Directory is locked and all positive dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 *
 * Emits "." and ".." first, then walks the positive children starting
 * either at the head of the list (when ctx->pos is 2) or right after this
 * file's cursor.  Always returns 0.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *cursor = file->private_data;
	struct list_head *anchor = &dentry->d_subdirs;
	struct dentry *next = NULL;
	struct list_head *p;

	if (!dir_emit_dots(file, ctx))
		return 0;

	if (ctx->pos == 2)
		p = anchor;
	else if (!list_empty(&cursor->d_child))
		p = &cursor->d_child;
	else
		/* cursor is not linked into the list: nothing to resume from */
		return 0;

	/* each call also drops the reference on the previously returned entry */
	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
			      d_inode(next)->i_ino,
			      fs_umode_to_dtype(d_inode(next)->i_mode)))
			break;
		ctx->pos++;
		p = &next->d_child;
	}
	spin_lock(&dentry->d_lock);
	if (next)
		/* emit failed: park the cursor just before the unemitted entry */
		list_move_tail(&cursor->d_child, &next->d_child);
	else
		/* ran off the end of the list: unlink the cursor */
		list_del_init(&cursor->d_child);
	spin_unlock(&dentry->d_lock);
	dput(next);

	return 0;
}
EXPORT_SYMBOL(dcache_readdir);
223 
/* ->read for directories: always fails with -EISDIR; no argument is used. */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
	return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);
229 
/*
 * File operations for dcache-backed directories: cursor-based open/release,
 * seek and readdir, with read() rejected by generic_read_dir() and fsync
 * wired to noop_fsync.
 */
const struct file_operations simple_dir_operations = {
	.open		= dcache_dir_open,
	.release	= dcache_dir_close,
	.llseek		= dcache_dir_lseek,
	.read		= generic_read_dir,
	.iterate_shared	= dcache_readdir,
	.fsync		= noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);
239 
/* Directory inode operations providing only ->lookup (simple_lookup). */
const struct inode_operations simple_dir_inode_operations = {
	.lookup		= simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);
244 
245 static void offset_set(struct dentry *dentry, u32 offset)
246 {
247 	dentry->d_fsdata = (void *)((uintptr_t)(offset));
248 }
249 
250 static u32 dentry2offset(struct dentry *dentry)
251 {
252 	return (u32)((uintptr_t)(dentry->d_fsdata));
253 }
254 
255 static struct lock_class_key simple_offset_xa_lock;
256 
257 /**
258  * simple_offset_init - initialize an offset_ctx
259  * @octx: directory offset map to be initialized
260  *
261  */
262 void simple_offset_init(struct offset_ctx *octx)
263 {
264 	xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
265 	lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
266 
267 	/* 0 is '.', 1 is '..', so always start with offset 2 */
268 	octx->next_offset = 2;
269 }
270 
271 /**
272  * simple_offset_add - Add an entry to a directory's offset map
273  * @octx: directory offset ctx to be updated
274  * @dentry: new dentry being added
275  *
276  * Returns zero on success. @so_ctx and the dentry offset are updated.
277  * Otherwise, a negative errno value is returned.
278  */
279 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
280 {
281 	static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
282 	u32 offset;
283 	int ret;
284 
285 	if (dentry2offset(dentry) != 0)
286 		return -EBUSY;
287 
288 	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
289 			      &octx->next_offset, GFP_KERNEL);
290 	if (ret < 0)
291 		return ret;
292 
293 	offset_set(dentry, offset);
294 	return 0;
295 }
296 
297 /**
298  * simple_offset_remove - Remove an entry to a directory's offset map
299  * @octx: directory offset ctx to be updated
300  * @dentry: dentry being removed
301  *
302  */
303 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
304 {
305 	u32 offset;
306 
307 	offset = dentry2offset(dentry);
308 	if (offset == 0)
309 		return;
310 
311 	xa_erase(&octx->xa, offset);
312 	offset_set(dentry, 0);
313 }
314 
315 /**
316  * simple_offset_rename_exchange - exchange rename with directory offsets
317  * @old_dir: parent of dentry being moved
318  * @old_dentry: dentry being moved
319  * @new_dir: destination parent
320  * @new_dentry: destination dentry
321  *
322  * Returns zero on success. Otherwise a negative errno is returned and the
323  * rename is rolled back.
324  */
325 int simple_offset_rename_exchange(struct inode *old_dir,
326 				  struct dentry *old_dentry,
327 				  struct inode *new_dir,
328 				  struct dentry *new_dentry)
329 {
330 	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
331 	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
332 	u32 old_index = dentry2offset(old_dentry);
333 	u32 new_index = dentry2offset(new_dentry);
334 	int ret;
335 
336 	simple_offset_remove(old_ctx, old_dentry);
337 	simple_offset_remove(new_ctx, new_dentry);
338 
339 	ret = simple_offset_add(new_ctx, old_dentry);
340 	if (ret)
341 		goto out_restore;
342 
343 	ret = simple_offset_add(old_ctx, new_dentry);
344 	if (ret) {
345 		simple_offset_remove(new_ctx, old_dentry);
346 		goto out_restore;
347 	}
348 
349 	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
350 	if (ret) {
351 		simple_offset_remove(new_ctx, old_dentry);
352 		simple_offset_remove(old_ctx, new_dentry);
353 		goto out_restore;
354 	}
355 	return 0;
356 
357 out_restore:
358 	offset_set(old_dentry, old_index);
359 	xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
360 	offset_set(new_dentry, new_index);
361 	xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
362 	return ret;
363 }
364 
/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (e.g. umount), a directory's offset map might still
 * contain entries. xa_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
	xa_destroy(&octx->xa);
}
376 
377 /**
378  * offset_dir_llseek - Advance the read position of a directory descriptor
379  * @file: an open directory whose position is to be updated
380  * @offset: a byte offset
381  * @whence: enumerator describing the starting position for this update
382  *
383  * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
384  *
385  * Returns the updated read position if successful; otherwise a
386  * negative errno is returned and the read position remains unchanged.
387  */
388 static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
389 {
390 	switch (whence) {
391 	case SEEK_CUR:
392 		offset += file->f_pos;
393 		fallthrough;
394 	case SEEK_SET:
395 		if (offset >= 0)
396 			break;
397 		fallthrough;
398 	default:
399 		return -EINVAL;
400 	}
401 
402 	return vfs_setpos(file, offset, U32_MAX);
403 }
404 
405 static struct dentry *offset_find_next(struct xa_state *xas)
406 {
407 	struct dentry *child, *found = NULL;
408 
409 	rcu_read_lock();
410 	child = xas_next_entry(xas, U32_MAX);
411 	if (!child)
412 		goto out;
413 	spin_lock(&child->d_lock);
414 	if (simple_positive(child))
415 		found = dget_dlock(child);
416 	spin_unlock(&child->d_lock);
417 out:
418 	rcu_read_unlock();
419 	return found;
420 }
421 
422 static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
423 {
424 	u32 offset = dentry2offset(dentry);
425 	struct inode *inode = d_inode(dentry);
426 
427 	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
428 			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
429 }
430 
431 static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
432 {
433 	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
434 	XA_STATE(xas, &so_ctx->xa, ctx->pos);
435 	struct dentry *dentry;
436 
437 	while (true) {
438 		dentry = offset_find_next(&xas);
439 		if (!dentry)
440 			break;
441 
442 		if (!offset_dir_emit(ctx, dentry)) {
443 			dput(dentry);
444 			break;
445 		}
446 
447 		dput(dentry);
448 		ctx->pos = xas.xa_index + 1;
449 	}
450 }
451 
452 /**
453  * offset_readdir - Emit entries starting at offset @ctx->pos
454  * @file: an open directory to iterate over
455  * @ctx: directory iteration context
456  *
457  * Caller must hold @file's i_rwsem to prevent insertion or removal of
458  * entries during this call.
459  *
460  * On entry, @ctx->pos contains an offset that represents the first entry
461  * to be read from the directory.
462  *
463  * The operation continues until there are no more entries to read, or
464  * until the ctx->actor indicates there is no more space in the caller's
465  * output buffer.
466  *
467  * On return, @ctx->pos contains an offset that will read the next entry
468  * in this directory when offset_readdir() is called again with @ctx.
469  *
470  * Return values:
471  *   %0 - Complete
472  */
473 static int offset_readdir(struct file *file, struct dir_context *ctx)
474 {
475 	struct dentry *dir = file->f_path.dentry;
476 
477 	lockdep_assert_held(&d_inode(dir)->i_rwsem);
478 
479 	if (!dir_emit_dots(file, ctx))
480 		return 0;
481 
482 	offset_iterate_dir(d_inode(dir), ctx);
483 	return 0;
484 }
485 
/*
 * File operations for directories that use an offset map: seek and readdir
 * come from the offset_* helpers, read() is rejected by generic_read_dir()
 * and fsync is wired to noop_fsync.
 */
const struct file_operations simple_offset_dir_operations = {
	.llseek		= offset_dir_llseek,
	.iterate_shared	= offset_readdir,
	.read		= generic_read_dir,
	.fsync		= noop_fsync,
};
492 
/*
 * Return the next simple_positive() child of @parent following @prev (or the
 * first one when @prev is NULL) with a reference taken, or NULL if there is
 * none.  The reference on @prev is dropped before returning.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
	struct dentry *child = NULL;
	struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;

	spin_lock(&parent->d_lock);
	while ((p = p->next) != &parent->d_subdirs) {
		struct dentry *d = container_of(p, struct dentry, d_child);
		if (simple_positive(d)) {
			/* re-check under the child's own lock before grabbing it */
			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
			if (simple_positive(d))
				child = dget_dlock(d);
			spin_unlock(&d->d_lock);
			if (likely(child))
				break;
		}
	}
	spin_unlock(&parent->d_lock);
	dput(prev);
	return child;
}
514 
/*
 * Remove @dentry and everything below it.
 *
 * The tree is walked depth-first: we descend into the first positive child
 * for as long as there is one; once a node has no positive child left it is
 * killed and we ascend to its parent.  For every killed node that is still
 * simple_positive(), @callback (if non-NULL) is invoked after the fsnotify
 * event and before the reference is dropped.  The walk ends once @dentry
 * itself has been processed.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
	/* hold our own reference for the duration of the walk */
	struct dentry *this = dget(dentry);
	while (true) {
		struct dentry *victim = NULL, *child;
		struct inode *inode = this->d_inode;

		inode_lock(inode);
		if (d_is_dir(this))
			inode->i_flags |= S_DEAD;
		while ((child = find_next_child(this, victim)) == NULL) {
			// kill and ascend
			// update metadata while it's still locked
			inode_set_ctime_current(inode);
			clear_nlink(inode);
			inode_unlock(inode);
			/* step up: from here on 'inode' is the parent's inode */
			victim = this;
			this = this->d_parent;
			inode = this->d_inode;
			inode_lock(inode);
			if (simple_positive(victim)) {
				d_invalidate(victim);	// avoid lost mounts
				if (d_is_dir(victim))
					fsnotify_rmdir(inode, victim);
				else
					fsnotify_unlink(inode, victim);
				if (callback)
					callback(victim);
				dput(victim);		// unpin it
			}
			if (victim == dentry) {
				/* back at the starting point: fix up its parent and stop */
				inode_set_mtime_to_ts(inode,
						      inode_set_ctime_current(inode));
				if (d_is_dir(dentry))
					drop_nlink(inode);
				inode_unlock(inode);
				/* pairs with the dget() at the top */
				dput(dentry);
				return;
			}
		}
		/* found a positive child: descend into it */
		inode_unlock(inode);
		this = child;
	}
}
EXPORT_SYMBOL(simple_recursive_removal);
561 
/* Default super operations: only ->statfs is provided (simple_statfs). */
static const struct super_operations simple_super_operations = {
	.statfs		= simple_statfs,
};
565 
566 static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
567 {
568 	struct pseudo_fs_context *ctx = fc->fs_private;
569 	struct inode *root;
570 
571 	s->s_maxbytes = MAX_LFS_FILESIZE;
572 	s->s_blocksize = PAGE_SIZE;
573 	s->s_blocksize_bits = PAGE_SHIFT;
574 	s->s_magic = ctx->magic;
575 	s->s_op = ctx->ops ?: &simple_super_operations;
576 	s->s_xattr = ctx->xattr;
577 	s->s_time_gran = 1;
578 	root = new_inode(s);
579 	if (!root)
580 		return -ENOMEM;
581 
582 	/*
583 	 * since this is the first inode, make it number 1. New inodes created
584 	 * after this must take care not to collide with it (by passing
585 	 * max_reserved of 1 to iunique).
586 	 */
587 	root->i_ino = 1;
588 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
589 	simple_inode_init_ts(root);
590 	s->s_root = d_make_root(root);
591 	if (!s->s_root)
592 		return -ENOMEM;
593 	s->s_d_op = ctx->dops;
594 	return 0;
595 }
596 
/* ->get_tree: delegate to get_tree_nodev() with pseudo_fs_fill_super(). */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, pseudo_fs_fill_super);
}
601 
/* ->free: release the context stored in fc->fs_private (see init_pseudo()). */
static void pseudo_fs_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}
606 
/* fs_context operations installed by init_pseudo(). */
static const struct fs_context_operations pseudo_fs_context_ops = {
	.free		= pseudo_fs_free,
	.get_tree	= pseudo_fs_get_tree,
};
611 
612 /*
613  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
614  * will never be mountable)
615  */
616 struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
617 					unsigned long magic)
618 {
619 	struct pseudo_fs_context *ctx;
620 
621 	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
622 	if (likely(ctx)) {
623 		ctx->magic = magic;
624 		fc->fs_private = ctx;
625 		fc->ops = &pseudo_fs_context_ops;
626 		fc->sb_flags |= SB_NOUSER;
627 		fc->global = true;
628 	}
629 	return ctx;
630 }
631 EXPORT_SYMBOL(init_pseudo);
632 
633 int simple_open(struct inode *inode, struct file *file)
634 {
635 	if (inode->i_private)
636 		file->private_data = inode->i_private;
637 	return 0;
638 }
639 EXPORT_SYMBOL(simple_open);
640 
641 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
642 {
643 	struct inode *inode = d_inode(old_dentry);
644 
645 	inode_set_mtime_to_ts(dir,
646 			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
647 	inc_nlink(inode);
648 	ihold(inode);
649 	dget(dentry);
650 	d_instantiate(dentry, inode);
651 	return 0;
652 }
653 EXPORT_SYMBOL(simple_link);
654 
655 int simple_empty(struct dentry *dentry)
656 {
657 	struct dentry *child;
658 	int ret = 0;
659 
660 	spin_lock(&dentry->d_lock);
661 	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
662 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
663 		if (simple_positive(child)) {
664 			spin_unlock(&child->d_lock);
665 			goto out;
666 		}
667 		spin_unlock(&child->d_lock);
668 	}
669 	ret = 1;
670 out:
671 	spin_unlock(&dentry->d_lock);
672 	return ret;
673 }
674 EXPORT_SYMBOL(simple_empty);
675 
676 int simple_unlink(struct inode *dir, struct dentry *dentry)
677 {
678 	struct inode *inode = d_inode(dentry);
679 
680 	inode_set_mtime_to_ts(dir,
681 			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
682 	drop_nlink(inode);
683 	dput(dentry);
684 	return 0;
685 }
686 EXPORT_SYMBOL(simple_unlink);
687 
688 int simple_rmdir(struct inode *dir, struct dentry *dentry)
689 {
690 	if (!simple_empty(dentry))
691 		return -ENOTEMPTY;
692 
693 	drop_nlink(d_inode(dentry));
694 	simple_unlink(dir, dentry);
695 	drop_nlink(dir);
696 	return 0;
697 }
698 EXPORT_SYMBOL(simple_rmdir);
699 
700 /**
701  * simple_rename_timestamp - update the various inode timestamps for rename
702  * @old_dir: old parent directory
703  * @old_dentry: dentry that is being renamed
704  * @new_dir: new parent directory
705  * @new_dentry: target for rename
706  *
707  * POSIX mandates that the old and new parent directories have their ctime and
708  * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
709  * their ctime updated.
710  */
711 void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
712 			     struct inode *new_dir, struct dentry *new_dentry)
713 {
714 	struct inode *newino = d_inode(new_dentry);
715 
716 	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
717 	if (new_dir != old_dir)
718 		inode_set_mtime_to_ts(new_dir,
719 				      inode_set_ctime_current(new_dir));
720 	inode_set_ctime_current(d_inode(old_dentry));
721 	if (newino)
722 		inode_set_ctime_current(newino);
723 }
724 EXPORT_SYMBOL_GPL(simple_rename_timestamp);
725 
726 int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
727 			   struct inode *new_dir, struct dentry *new_dentry)
728 {
729 	bool old_is_dir = d_is_dir(old_dentry);
730 	bool new_is_dir = d_is_dir(new_dentry);
731 
732 	if (old_dir != new_dir && old_is_dir != new_is_dir) {
733 		if (old_is_dir) {
734 			drop_nlink(old_dir);
735 			inc_nlink(new_dir);
736 		} else {
737 			drop_nlink(new_dir);
738 			inc_nlink(old_dir);
739 		}
740 	}
741 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
742 	return 0;
743 }
744 EXPORT_SYMBOL_GPL(simple_rename_exchange);
745 
746 int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
747 		  struct dentry *old_dentry, struct inode *new_dir,
748 		  struct dentry *new_dentry, unsigned int flags)
749 {
750 	int they_are_dirs = d_is_dir(old_dentry);
751 
752 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
753 		return -EINVAL;
754 
755 	if (flags & RENAME_EXCHANGE)
756 		return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
757 
758 	if (!simple_empty(new_dentry))
759 		return -ENOTEMPTY;
760 
761 	if (d_really_is_positive(new_dentry)) {
762 		simple_unlink(new_dir, new_dentry);
763 		if (they_are_dirs) {
764 			drop_nlink(d_inode(new_dentry));
765 			drop_nlink(old_dir);
766 		}
767 	} else if (they_are_dirs) {
768 		drop_nlink(old_dir);
769 		inc_nlink(new_dir);
770 	}
771 
772 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
773 	return 0;
774 }
775 EXPORT_SYMBOL(simple_rename);
776 
777 /**
778  * simple_setattr - setattr for simple filesystem
779  * @idmap: idmap of the target mount
780  * @dentry: dentry
781  * @iattr: iattr structure
782  *
783  * Returns 0 on success, -error on failure.
784  *
785  * simple_setattr is a simple ->setattr implementation without a proper
786  * implementation of size changes.
787  *
788  * It can either be used for in-memory filesystems or special files
789  * on simple regular filesystems.  Anything that needs to change on-disk
790  * or wire state on size changes needs its own setattr method.
791  */
792 int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
793 		   struct iattr *iattr)
794 {
795 	struct inode *inode = d_inode(dentry);
796 	int error;
797 
798 	error = setattr_prepare(idmap, dentry, iattr);
799 	if (error)
800 		return error;
801 
802 	if (iattr->ia_valid & ATTR_SIZE)
803 		truncate_setsize(inode, iattr->ia_size);
804 	setattr_copy(idmap, inode, iattr);
805 	mark_inode_dirty(inode);
806 	return 0;
807 }
808 EXPORT_SYMBOL(simple_setattr);
809 
810 static int simple_read_folio(struct file *file, struct folio *folio)
811 {
812 	folio_zero_range(folio, 0, folio_size(folio));
813 	flush_dcache_folio(folio);
814 	folio_mark_uptodate(folio);
815 	folio_unlock(folio);
816 	return 0;
817 }
818 
819 int simple_write_begin(struct file *file, struct address_space *mapping,
820 			loff_t pos, unsigned len,
821 			struct page **pagep, void **fsdata)
822 {
823 	struct folio *folio;
824 
825 	folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
826 			mapping_gfp_mask(mapping));
827 	if (IS_ERR(folio))
828 		return PTR_ERR(folio);
829 
830 	*pagep = &folio->page;
831 
832 	if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
833 		size_t from = offset_in_folio(folio, pos);
834 
835 		folio_zero_segments(folio, 0, from,
836 				from + len, folio_size(folio));
837 	}
838 	return 0;
839 }
840 EXPORT_SYMBOL(simple_write_begin);
841 
842 /**
843  * simple_write_end - .write_end helper for non-block-device FSes
844  * @file: See .write_end of address_space_operations
845  * @mapping: 		"
846  * @pos: 		"
847  * @len: 		"
848  * @copied: 		"
849  * @page: 		"
850  * @fsdata: 		"
851  *
852  * simple_write_end does the minimum needed for updating a page after writing is
853  * done. It has the same API signature as the .write_end of
854  * address_space_operations vector. So it can just be set onto .write_end for
855  * FSes that don't need any other processing. i_mutex is assumed to be held.
856  * Block based filesystems should use generic_write_end().
857  * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
858  * is not called, so a filesystem that actually does store data in .write_inode
859  * should extend on what's done here with a call to mark_inode_dirty() in the
860  * case that i_size has changed.
861  *
862  * Use *ONLY* with simple_read_folio()
863  */
864 static int simple_write_end(struct file *file, struct address_space *mapping,
865 			loff_t pos, unsigned len, unsigned copied,
866 			struct page *page, void *fsdata)
867 {
868 	struct folio *folio = page_folio(page);
869 	struct inode *inode = folio->mapping->host;
870 	loff_t last_pos = pos + copied;
871 
872 	/* zero the stale part of the folio if we did a short copy */
873 	if (!folio_test_uptodate(folio)) {
874 		if (copied < len) {
875 			size_t from = offset_in_folio(folio, pos);
876 
877 			folio_zero_range(folio, from + copied, len - copied);
878 		}
879 		folio_mark_uptodate(folio);
880 	}
881 	/*
882 	 * No need to use i_size_read() here, the i_size
883 	 * cannot change under us because we hold the i_mutex.
884 	 */
885 	if (last_pos > inode->i_size)
886 		i_size_write(inode, last_pos);
887 
888 	folio_mark_dirty(folio);
889 	folio_unlock(folio);
890 	folio_put(folio);
891 
892 	return copied;
893 }
894 
/*
 * Provides ramfs-style behavior: data in the pagecache, but no writeback.
 *
 * Reads and writes go through the simple_* helpers in this file; dirtying a
 * folio is handled by noop_dirty_folio.
 */
const struct address_space_operations ram_aops = {
	.read_folio	= simple_read_folio,
	.write_begin	= simple_write_begin,
	.write_end	= simple_write_end,
	.dirty_folio	= noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);
905 
906 /*
907  * the inodes created here are not hashed. If you use iunique to generate
908  * unique inode values later for this filesystem, then you must take care
909  * to pass it an appropriate max_reserved value to avoid collisions.
910  */
911 int simple_fill_super(struct super_block *s, unsigned long magic,
912 		      const struct tree_descr *files)
913 {
914 	struct inode *inode;
915 	struct dentry *root;
916 	struct dentry *dentry;
917 	int i;
918 
919 	s->s_blocksize = PAGE_SIZE;
920 	s->s_blocksize_bits = PAGE_SHIFT;
921 	s->s_magic = magic;
922 	s->s_op = &simple_super_operations;
923 	s->s_time_gran = 1;
924 
925 	inode = new_inode(s);
926 	if (!inode)
927 		return -ENOMEM;
928 	/*
929 	 * because the root inode is 1, the files array must not contain an
930 	 * entry at index 1
931 	 */
932 	inode->i_ino = 1;
933 	inode->i_mode = S_IFDIR | 0755;
934 	simple_inode_init_ts(inode);
935 	inode->i_op = &simple_dir_inode_operations;
936 	inode->i_fop = &simple_dir_operations;
937 	set_nlink(inode, 2);
938 	root = d_make_root(inode);
939 	if (!root)
940 		return -ENOMEM;
941 	for (i = 0; !files->name || files->name[0]; i++, files++) {
942 		if (!files->name)
943 			continue;
944 
945 		/* warn if it tries to conflict with the root inode */
946 		if (unlikely(i == 1))
947 			printk(KERN_WARNING "%s: %s passed in a files array"
948 				"with an index of 1!\n", __func__,
949 				s->s_type->name);
950 
951 		dentry = d_alloc_name(root, files->name);
952 		if (!dentry)
953 			goto out;
954 		inode = new_inode(s);
955 		if (!inode) {
956 			dput(dentry);
957 			goto out;
958 		}
959 		inode->i_mode = S_IFREG | files->mode;
960 		simple_inode_init_ts(inode);
961 		inode->i_fop = files->ops;
962 		inode->i_ino = i;
963 		d_add(dentry, inode);
964 	}
965 	s->s_root = root;
966 	return 0;
967 out:
968 	d_genocide(root);
969 	shrink_dcache_parent(root);
970 	dput(root);
971 	return -ENOMEM;
972 }
973 EXPORT_SYMBOL(simple_fill_super);
974 
/* Protects the *mount / *count pairs handled by simple_pin_fs()/simple_release_fs(). */
static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * Take a counted reference on the shared internal mount of @type, creating
 * the mount on first use.
 *
 * @type:  filesystem type to mount when *@mount is still NULL
 * @mount: location of the shared vfsmount pointer
 * @count: location of the pin counter, incremented on success
 *
 * Returns 0 on success, or the error from vfs_kern_mount().
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
	struct vfsmount *mnt = NULL;
	spin_lock(&pin_fs_lock);
	if (unlikely(!*mount)) {
		/* the lock is dropped around the vfs_kern_mount() call */
		spin_unlock(&pin_fs_lock);
		mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
		if (IS_ERR(mnt))
			return PTR_ERR(mnt);
		spin_lock(&pin_fs_lock);
		/* publish our mount only if *mount is still unset */
		if (!*mount)
			*mount = mnt;
	}
	mntget(*mount);
	++*count;
	spin_unlock(&pin_fs_lock);
	/*
	 * mnt is NULL on the fast path; otherwise it is the mount created
	 * above, whose initial reference is released here (the pin itself is
	 * the mntget() taken under the lock).
	 */
	mntput(mnt);
	return 0;
}
EXPORT_SYMBOL(simple_pin_fs);
997 
998 void simple_release_fs(struct vfsmount **mount, int *count)
999 {
1000 	struct vfsmount *mnt;
1001 	spin_lock(&pin_fs_lock);
1002 	mnt = *mount;
1003 	if (!--*count)
1004 		*mount = NULL;
1005 	spin_unlock(&pin_fs_lock);
1006 	mntput(mnt);
1007 }
1008 EXPORT_SYMBOL(simple_release_fs);
1009 
1010 /**
1011  * simple_read_from_buffer - copy data from the buffer to user space
1012  * @to: the user space buffer to read to
1013  * @count: the maximum number of bytes to read
1014  * @ppos: the current position in the buffer
1015  * @from: the buffer to read from
1016  * @available: the size of the buffer
1017  *
1018  * The simple_read_from_buffer() function reads up to @count bytes from the
1019  * buffer @from at offset @ppos into the user space address starting at @to.
1020  *
1021  * On success, the number of bytes read is returned and the offset @ppos is
1022  * advanced by this number, or negative value is returned on error.
1023  **/
1024 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
1025 				const void *from, size_t available)
1026 {
1027 	loff_t pos = *ppos;
1028 	size_t ret;
1029 
1030 	if (pos < 0)
1031 		return -EINVAL;
1032 	if (pos >= available || !count)
1033 		return 0;
1034 	if (count > available - pos)
1035 		count = available - pos;
1036 	ret = copy_to_user(to, from + pos, count);
1037 	if (ret == count)
1038 		return -EFAULT;
1039 	count -= ret;
1040 	*ppos = pos + count;
1041 	return count;
1042 }
1043 EXPORT_SYMBOL(simple_read_from_buffer);
1044 
1045 /**
1046  * simple_write_to_buffer - copy data from user space to the buffer
1047  * @to: the buffer to write to
1048  * @available: the size of the buffer
1049  * @ppos: the current position in the buffer
1050  * @from: the user space buffer to read from
1051  * @count: the maximum number of bytes to read
1052  *
1053  * The simple_write_to_buffer() function reads up to @count bytes from the user
1054  * space address starting at @from into the buffer @to at offset @ppos.
1055  *
1056  * On success, the number of bytes written is returned and the offset @ppos is
1057  * advanced by this number, or negative value is returned on error.
1058  **/
1059 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
1060 		const void __user *from, size_t count)
1061 {
1062 	loff_t pos = *ppos;
1063 	size_t res;
1064 
1065 	if (pos < 0)
1066 		return -EINVAL;
1067 	if (pos >= available || !count)
1068 		return 0;
1069 	if (count > available - pos)
1070 		count = available - pos;
1071 	res = copy_from_user(to + pos, from, count);
1072 	if (res == count)
1073 		return -EFAULT;
1074 	count -= res;
1075 	*ppos = pos + count;
1076 	return count;
1077 }
1078 EXPORT_SYMBOL(simple_write_to_buffer);
1079 
1080 /**
1081  * memory_read_from_buffer - copy data from the buffer
1082  * @to: the kernel space buffer to read to
1083  * @count: the maximum number of bytes to read
1084  * @ppos: the current position in the buffer
1085  * @from: the buffer to read from
1086  * @available: the size of the buffer
1087  *
1088  * The memory_read_from_buffer() function reads up to @count bytes from the
1089  * buffer @from at offset @ppos into the kernel space address starting at @to.
1090  *
1091  * On success, the number of bytes read is returned and the offset @ppos is
1092  * advanced by this number, or negative value is returned on error.
1093  **/
1094 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
1095 				const void *from, size_t available)
1096 {
1097 	loff_t pos = *ppos;
1098 
1099 	if (pos < 0)
1100 		return -EINVAL;
1101 	if (pos >= available)
1102 		return 0;
1103 	if (count > available - pos)
1104 		count = available - pos;
1105 	memcpy(to, from + pos, count);
1106 	*ppos = pos + count;
1107 
1108 	return count;
1109 }
1110 EXPORT_SYMBOL(memory_read_from_buffer);
1111 
1112 /*
1113  * Transaction based IO.
1114  * The file expects a single write which triggers the transaction, and then
1115  * possibly a read which collects the result - which is stored in a
1116  * file-local buffer.
1117  */
1118 
1119 void simple_transaction_set(struct file *file, size_t n)
1120 {
1121 	struct simple_transaction_argresp *ar = file->private_data;
1122 
1123 	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
1124 
1125 	/*
1126 	 * The barrier ensures that ar->size will really remain zero until
1127 	 * ar->data is ready for reading.
1128 	 */
1129 	smp_mb();
1130 	ar->size = n;
1131 }
1132 EXPORT_SYMBOL(simple_transaction_set);
1133 
1134 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
1135 {
1136 	struct simple_transaction_argresp *ar;
1137 	static DEFINE_SPINLOCK(simple_transaction_lock);
1138 
1139 	if (size > SIMPLE_TRANSACTION_LIMIT - 1)
1140 		return ERR_PTR(-EFBIG);
1141 
1142 	ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
1143 	if (!ar)
1144 		return ERR_PTR(-ENOMEM);
1145 
1146 	spin_lock(&simple_transaction_lock);
1147 
1148 	/* only one write allowed per open */
1149 	if (file->private_data) {
1150 		spin_unlock(&simple_transaction_lock);
1151 		free_page((unsigned long)ar);
1152 		return ERR_PTR(-EBUSY);
1153 	}
1154 
1155 	file->private_data = ar;
1156 
1157 	spin_unlock(&simple_transaction_lock);
1158 
1159 	if (copy_from_user(ar->data, buf, size))
1160 		return ERR_PTR(-EFAULT);
1161 
1162 	return ar->data;
1163 }
1164 EXPORT_SYMBOL(simple_transaction_get);
1165 
1166 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
1167 {
1168 	struct simple_transaction_argresp *ar = file->private_data;
1169 
1170 	if (!ar)
1171 		return 0;
1172 	return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
1173 }
1174 EXPORT_SYMBOL(simple_transaction_read);
1175 
1176 int simple_transaction_release(struct inode *inode, struct file *file)
1177 {
1178 	free_page((unsigned long)file->private_data);
1179 	return 0;
1180 }
1181 EXPORT_SYMBOL(simple_transaction_release);
1182 
1183 /* Simple attribute files */
1184 
1185 struct simple_attr {
1186 	int (*get)(void *, u64 *);
1187 	int (*set)(void *, u64);
1188 	char get_buf[24];	/* enough to store a u64 and "\n\0" */
1189 	char set_buf[24];
1190 	void *data;
1191 	const char *fmt;	/* format for read operation */
1192 	struct mutex mutex;	/* protects access to these buffers */
1193 };
1194 
1195 /* simple_attr_open is called by an actual attribute open file operation
1196  * to set the attribute specific access operations. */
1197 int simple_attr_open(struct inode *inode, struct file *file,
1198 		     int (*get)(void *, u64 *), int (*set)(void *, u64),
1199 		     const char *fmt)
1200 {
1201 	struct simple_attr *attr;
1202 
1203 	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
1204 	if (!attr)
1205 		return -ENOMEM;
1206 
1207 	attr->get = get;
1208 	attr->set = set;
1209 	attr->data = inode->i_private;
1210 	attr->fmt = fmt;
1211 	mutex_init(&attr->mutex);
1212 
1213 	file->private_data = attr;
1214 
1215 	return nonseekable_open(inode, file);
1216 }
1217 EXPORT_SYMBOL_GPL(simple_attr_open);
1218 
1219 int simple_attr_release(struct inode *inode, struct file *file)
1220 {
1221 	kfree(file->private_data);
1222 	return 0;
1223 }
1224 EXPORT_SYMBOL_GPL(simple_attr_release);	/* GPL-only?  This?  Really? */
1225 
1226 /* read from the buffer that is filled with the get function */
1227 ssize_t simple_attr_read(struct file *file, char __user *buf,
1228 			 size_t len, loff_t *ppos)
1229 {
1230 	struct simple_attr *attr;
1231 	size_t size;
1232 	ssize_t ret;
1233 
1234 	attr = file->private_data;
1235 
1236 	if (!attr->get)
1237 		return -EACCES;
1238 
1239 	ret = mutex_lock_interruptible(&attr->mutex);
1240 	if (ret)
1241 		return ret;
1242 
1243 	if (*ppos && attr->get_buf[0]) {
1244 		/* continued read */
1245 		size = strlen(attr->get_buf);
1246 	} else {
1247 		/* first read */
1248 		u64 val;
1249 		ret = attr->get(attr->data, &val);
1250 		if (ret)
1251 			goto out;
1252 
1253 		size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
1254 				 attr->fmt, (unsigned long long)val);
1255 	}
1256 
1257 	ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
1258 out:
1259 	mutex_unlock(&attr->mutex);
1260 	return ret;
1261 }
1262 EXPORT_SYMBOL_GPL(simple_attr_read);
1263 
1264 /* interpret the buffer as a number to call the set function with */
1265 static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
1266 			  size_t len, loff_t *ppos, bool is_signed)
1267 {
1268 	struct simple_attr *attr;
1269 	unsigned long long val;
1270 	size_t size;
1271 	ssize_t ret;
1272 
1273 	attr = file->private_data;
1274 	if (!attr->set)
1275 		return -EACCES;
1276 
1277 	ret = mutex_lock_interruptible(&attr->mutex);
1278 	if (ret)
1279 		return ret;
1280 
1281 	ret = -EFAULT;
1282 	size = min(sizeof(attr->set_buf) - 1, len);
1283 	if (copy_from_user(attr->set_buf, buf, size))
1284 		goto out;
1285 
1286 	attr->set_buf[size] = '\0';
1287 	if (is_signed)
1288 		ret = kstrtoll(attr->set_buf, 0, &val);
1289 	else
1290 		ret = kstrtoull(attr->set_buf, 0, &val);
1291 	if (ret)
1292 		goto out;
1293 	ret = attr->set(attr->data, val);
1294 	if (ret == 0)
1295 		ret = len; /* on success, claim we got the whole input */
1296 out:
1297 	mutex_unlock(&attr->mutex);
1298 	return ret;
1299 }
1300 
1301 ssize_t simple_attr_write(struct file *file, const char __user *buf,
1302 			  size_t len, loff_t *ppos)
1303 {
1304 	return simple_attr_write_xsigned(file, buf, len, ppos, false);
1305 }
1306 EXPORT_SYMBOL_GPL(simple_attr_write);
1307 
1308 ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
1309 			  size_t len, loff_t *ppos)
1310 {
1311 	return simple_attr_write_xsigned(file, buf, len, ppos, true);
1312 }
1313 EXPORT_SYMBOL_GPL(simple_attr_write_signed);
1314 
1315 /**
1316  * generic_encode_ino32_fh - generic export_operations->encode_fh function
1317  * @inode:   the object to encode
1318  * @fh:      where to store the file handle fragment
1319  * @max_len: maximum length to store there (in 4 byte units)
1320  * @parent:  parent directory inode, if wanted
1321  *
1322  * This generic encode_fh function assumes that the 32 inode number
1323  * is suitable for locating an inode, and that the generation number
1324  * can be used to check that it is still valid.  It places them in the
1325  * filehandle fragment where export_decode_fh expects to find them.
1326  */
1327 int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
1328 			    struct inode *parent)
1329 {
1330 	struct fid *fid = (void *)fh;
1331 	int len = *max_len;
1332 	int type = FILEID_INO32_GEN;
1333 
1334 	if (parent && (len < 4)) {
1335 		*max_len = 4;
1336 		return FILEID_INVALID;
1337 	} else if (len < 2) {
1338 		*max_len = 2;
1339 		return FILEID_INVALID;
1340 	}
1341 
1342 	len = 2;
1343 	fid->i32.ino = inode->i_ino;
1344 	fid->i32.gen = inode->i_generation;
1345 	if (parent) {
1346 		fid->i32.parent_ino = parent->i_ino;
1347 		fid->i32.parent_gen = parent->i_generation;
1348 		len = 4;
1349 		type = FILEID_INO32_GEN_PARENT;
1350 	}
1351 	*max_len = len;
1352 	return type;
1353 }
1354 EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
1355 
1356 /**
1357  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
1358  * @sb:		filesystem to do the file handle conversion on
1359  * @fid:	file handle to convert
1360  * @fh_len:	length of the file handle in bytes
1361  * @fh_type:	type of file handle
1362  * @get_inode:	filesystem callback to retrieve inode
1363  *
1364  * This function decodes @fid as long as it has one of the well-known
1365  * Linux filehandle types and calls @get_inode on it to retrieve the
1366  * inode for the object specified in the file handle.
1367  */
1368 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
1369 		int fh_len, int fh_type, struct inode *(*get_inode)
1370 			(struct super_block *sb, u64 ino, u32 gen))
1371 {
1372 	struct inode *inode = NULL;
1373 
1374 	if (fh_len < 2)
1375 		return NULL;
1376 
1377 	switch (fh_type) {
1378 	case FILEID_INO32_GEN:
1379 	case FILEID_INO32_GEN_PARENT:
1380 		inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
1381 		break;
1382 	}
1383 
1384 	return d_obtain_alias(inode);
1385 }
1386 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
1387 
1388 /**
1389  * generic_fh_to_parent - generic helper for the fh_to_parent export operation
1390  * @sb:		filesystem to do the file handle conversion on
1391  * @fid:	file handle to convert
1392  * @fh_len:	length of the file handle in bytes
1393  * @fh_type:	type of file handle
1394  * @get_inode:	filesystem callback to retrieve inode
1395  *
1396  * This function decodes @fid as long as it has one of the well-known
1397  * Linux filehandle types and calls @get_inode on it to retrieve the
1398  * inode for the _parent_ object specified in the file handle if it
1399  * is specified in the file handle, or NULL otherwise.
1400  */
1401 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
1402 		int fh_len, int fh_type, struct inode *(*get_inode)
1403 			(struct super_block *sb, u64 ino, u32 gen))
1404 {
1405 	struct inode *inode = NULL;
1406 
1407 	if (fh_len <= 2)
1408 		return NULL;
1409 
1410 	switch (fh_type) {
1411 	case FILEID_INO32_GEN_PARENT:
1412 		inode = get_inode(sb, fid->i32.parent_ino,
1413 				  (fh_len > 3 ? fid->i32.parent_gen : 0));
1414 		break;
1415 	}
1416 
1417 	return d_obtain_alias(inode);
1418 }
1419 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
1420 
1421 /**
1422  * __generic_file_fsync - generic fsync implementation for simple filesystems
1423  *
1424  * @file:	file to synchronize
1425  * @start:	start offset in bytes
1426  * @end:	end offset in bytes (inclusive)
1427  * @datasync:	only synchronize essential metadata if true
1428  *
1429  * This is a generic implementation of the fsync method for simple
1430  * filesystems which track all non-inode metadata in the buffers list
1431  * hanging off the address_space structure.
1432  */
1433 int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
1434 				 int datasync)
1435 {
1436 	struct inode *inode = file->f_mapping->host;
1437 	int err;
1438 	int ret;
1439 
1440 	err = file_write_and_wait_range(file, start, end);
1441 	if (err)
1442 		return err;
1443 
1444 	inode_lock(inode);
1445 	ret = sync_mapping_buffers(inode->i_mapping);
1446 	if (!(inode->i_state & I_DIRTY_ALL))
1447 		goto out;
1448 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
1449 		goto out;
1450 
1451 	err = sync_inode_metadata(inode, 1);
1452 	if (ret == 0)
1453 		ret = err;
1454 
1455 out:
1456 	inode_unlock(inode);
1457 	/* check and advance again to catch errors after syncing out buffers */
1458 	err = file_check_and_advance_wb_err(file);
1459 	if (ret == 0)
1460 		ret = err;
1461 	return ret;
1462 }
1463 EXPORT_SYMBOL(__generic_file_fsync);
1464 
1465 /**
1466  * generic_file_fsync - generic fsync implementation for simple filesystems
1467  *			with flush
1468  * @file:	file to synchronize
1469  * @start:	start offset in bytes
1470  * @end:	end offset in bytes (inclusive)
1471  * @datasync:	only synchronize essential metadata if true
1472  *
1473  */
1474 
1475 int generic_file_fsync(struct file *file, loff_t start, loff_t end,
1476 		       int datasync)
1477 {
1478 	struct inode *inode = file->f_mapping->host;
1479 	int err;
1480 
1481 	err = __generic_file_fsync(file, start, end, datasync);
1482 	if (err)
1483 		return err;
1484 	return blkdev_issue_flush(inode->i_sb->s_bdev);
1485 }
1486 EXPORT_SYMBOL(generic_file_fsync);
1487 
1488 /**
1489  * generic_check_addressable - Check addressability of file system
1490  * @blocksize_bits:	log of file system block size
1491  * @num_blocks:		number of blocks in file system
1492  *
1493  * Determine whether a file system with @num_blocks blocks (and a
1494  * block size of 2**@blocksize_bits) is addressable by the sector_t
1495  * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
1496  */
1497 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
1498 {
1499 	u64 last_fs_block = num_blocks - 1;
1500 	u64 last_fs_page =
1501 		last_fs_block >> (PAGE_SHIFT - blocksize_bits);
1502 
1503 	if (unlikely(num_blocks == 0))
1504 		return 0;
1505 
1506 	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
1507 		return -EINVAL;
1508 
1509 	if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
1510 	    (last_fs_page > (pgoff_t)(~0ULL))) {
1511 		return -EFBIG;
1512 	}
1513 	return 0;
1514 }
1515 EXPORT_SYMBOL(generic_check_addressable);
1516 
1517 /*
1518  * No-op implementation of ->fsync for in-memory filesystems.
1519  */
1520 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
1521 {
1522 	return 0;
1523 }
1524 EXPORT_SYMBOL(noop_fsync);
1525 
/*
 * Placeholder ->direct_IO method that always fails with -EINVAL.
 *
 * iomap based filesystems support direct I/O without need for this
 * callback. However, it still needs to be set in inode->a_ops so that
 * open/fcntl know that direct I/O is generally supported.
 */
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	return -EINVAL;
}
1536 EXPORT_SYMBOL_GPL(noop_direct_IO);
1537 
/*
 * Thin wrapper with a plain void(void *) signature, because kfree itself
 * isn't assignment-compatible with void(void*) ;-/
 */
void kfree_link(void *p)
{
	kfree(p);
}
1543 EXPORT_SYMBOL(kfree_link);
1544 
1545 struct inode *alloc_anon_inode(struct super_block *s)
1546 {
1547 	static const struct address_space_operations anon_aops = {
1548 		.dirty_folio	= noop_dirty_folio,
1549 	};
1550 	struct inode *inode = new_inode_pseudo(s);
1551 
1552 	if (!inode)
1553 		return ERR_PTR(-ENOMEM);
1554 
1555 	inode->i_ino = get_next_ino();
1556 	inode->i_mapping->a_ops = &anon_aops;
1557 
1558 	/*
1559 	 * Mark the inode dirty from the very beginning,
1560 	 * that way it will never be moved to the dirty
1561 	 * list because mark_inode_dirty() will think
1562 	 * that it already _is_ on the dirty list.
1563 	 */
1564 	inode->i_state = I_DIRTY;
1565 	inode->i_mode = S_IRUSR | S_IWUSR;
1566 	inode->i_uid = current_fsuid();
1567 	inode->i_gid = current_fsgid();
1568 	inode->i_flags |= S_PRIVATE;
1569 	simple_inode_init_ts(inode);
1570 	return inode;
1571 }
1572 EXPORT_SYMBOL(alloc_anon_inode);
1573 
1574 /**
1575  * simple_nosetlease - generic helper for prohibiting leases
1576  * @filp: file pointer
1577  * @arg: type of lease to obtain
1578  * @flp: new lease supplied for insertion
1579  * @priv: private data for lm_setup operation
1580  *
1581  * Generic helper for filesystems that do not wish to allow leases to be set.
1582  * All arguments are ignored and it just returns -EINVAL.
1583  */
int simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
		      void **priv)
{
	/* Leases are never granted; every argument is ignored. */
	return -EINVAL;
}
1590 EXPORT_SYMBOL(simple_nosetlease);
1591 
1592 /**
1593  * simple_get_link - generic helper to get the target of "fast" symlinks
1594  * @dentry: not used here
1595  * @inode: the symlink inode
1596  * @done: not used here
1597  *
1598  * Generic helper for filesystems to use for symlink inodes where a pointer to
1599  * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
1600  * since as an optimization the path lookup code uses any non-NULL ->i_link
1601  * directly, without calling ->get_link().  But ->get_link() still must be set,
1602  * to mark the inode_operations as being for a symlink.
1603  *
1604  * Return: the symlink target
1605  */
1606 const char *simple_get_link(struct dentry *dentry, struct inode *inode,
1607 			    struct delayed_call *done)
1608 {
1609 	return inode->i_link;
1610 }
1611 EXPORT_SYMBOL(simple_get_link);
1612 
1613 const struct inode_operations simple_symlink_inode_operations = {
1614 	.get_link = simple_get_link,
1615 };
1616 EXPORT_SYMBOL(simple_symlink_inode_operations);
1617 
1618 /*
1619  * Operations for a permanently empty directory.
1620  */
/* Every lookup in an empty directory fails with -ENOENT. */
static struct dentry *empty_dir_lookup(struct inode *dir,
				       struct dentry *dentry,
				       unsigned int flags)
{
	return ERR_PTR(-ENOENT);
}
1625 
1626 static int empty_dir_getattr(struct mnt_idmap *idmap,
1627 			     const struct path *path, struct kstat *stat,
1628 			     u32 request_mask, unsigned int query_flags)
1629 {
1630 	struct inode *inode = d_inode(path->dentry);
1631 	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
1632 	return 0;
1633 }
1634 
/* Attribute changes on an empty directory are refused with -EPERM. */
static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			     struct iattr *attr)
{
	return -EPERM;
}
1640 
/* Listing extended attributes always fails with -EOPNOTSUPP. */
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list,
				   size_t size)
{
	return -EOPNOTSUPP;
}
1645 
1646 static const struct inode_operations empty_dir_inode_operations = {
1647 	.lookup		= empty_dir_lookup,
1648 	.permission	= generic_permission,
1649 	.setattr	= empty_dir_setattr,
1650 	.getattr	= empty_dir_getattr,
1651 	.listxattr	= empty_dir_listxattr,
1652 };
1653 
1654 static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
1655 {
1656 	/* An empty directory has two entries . and .. at offsets 0 and 1 */
1657 	return generic_file_llseek_size(file, offset, whence, 2, 2);
1658 }
1659 
/* Emit only the dot entries; always returns 0. */
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
	/* The result of dir_emit_dots() is deliberately not checked. */
	dir_emit_dots(file, ctx);
	return 0;
}
1665 
1666 static const struct file_operations empty_dir_operations = {
1667 	.llseek		= empty_dir_llseek,
1668 	.read		= generic_read_dir,
1669 	.iterate_shared	= empty_dir_readdir,
1670 	.fsync		= noop_fsync,
1671 };
1672 
1673 
1674 void make_empty_dir_inode(struct inode *inode)
1675 {
1676 	set_nlink(inode, 2);
1677 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
1678 	inode->i_uid = GLOBAL_ROOT_UID;
1679 	inode->i_gid = GLOBAL_ROOT_GID;
1680 	inode->i_rdev = 0;
1681 	inode->i_size = 0;
1682 	inode->i_blkbits = PAGE_SHIFT;
1683 	inode->i_blocks = 0;
1684 
1685 	inode->i_op = &empty_dir_inode_operations;
1686 	inode->i_opflags &= ~IOP_XATTR;
1687 	inode->i_fop = &empty_dir_operations;
1688 }
1689 
1690 bool is_empty_dir_inode(struct inode *inode)
1691 {
1692 	return (inode->i_fop == &empty_dir_operations) &&
1693 		(inode->i_op == &empty_dir_inode_operations);
1694 }
1695 
1696 #if IS_ENABLED(CONFIG_UNICODE)
1697 /**
1698  * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
1699  * @dentry:	dentry whose name we are checking against
1700  * @len:	len of name of dentry
1701  * @str:	str pointer to name of dentry
1702  * @name:	Name to compare against
1703  *
1704  * Return: 0 if names match, 1 if mismatch, or -ERRNO
1705  */
1706 static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
1707 				const char *str, const struct qstr *name)
1708 {
1709 	const struct dentry *parent = READ_ONCE(dentry->d_parent);
1710 	const struct inode *dir = READ_ONCE(parent->d_inode);
1711 	const struct super_block *sb = dentry->d_sb;
1712 	const struct unicode_map *um = sb->s_encoding;
1713 	struct qstr qstr = QSTR_INIT(str, len);
1714 	char strbuf[DNAME_INLINE_LEN];
1715 	int ret;
1716 
1717 	if (!dir || !IS_CASEFOLDED(dir))
1718 		goto fallback;
1719 	/*
1720 	 * If the dentry name is stored in-line, then it may be concurrently
1721 	 * modified by a rename.  If this happens, the VFS will eventually retry
1722 	 * the lookup, so it doesn't matter what ->d_compare() returns.
1723 	 * However, it's unsafe to call utf8_strncasecmp() with an unstable
1724 	 * string.  Therefore, we have to copy the name into a temporary buffer.
1725 	 */
1726 	if (len <= DNAME_INLINE_LEN - 1) {
1727 		memcpy(strbuf, str, len);
1728 		strbuf[len] = 0;
1729 		qstr.name = strbuf;
1730 		/* prevent compiler from optimizing out the temporary buffer */
1731 		barrier();
1732 	}
1733 	ret = utf8_strncasecmp(um, name, &qstr);
1734 	if (ret >= 0)
1735 		return ret;
1736 
1737 	if (sb_has_strict_encoding(sb))
1738 		return -EINVAL;
1739 fallback:
1740 	if (len != name->len)
1741 		return 1;
1742 	return !!memcmp(str, name->name, len);
1743 }
1744 
1745 /**
1746  * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
1747  * @dentry:	dentry of the parent directory
1748  * @str:	qstr of name whose hash we should fill in
1749  *
1750  * Return: 0 if hash was successful or unchanged, and -EINVAL on error
1751  */
1752 static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
1753 {
1754 	const struct inode *dir = READ_ONCE(dentry->d_inode);
1755 	struct super_block *sb = dentry->d_sb;
1756 	const struct unicode_map *um = sb->s_encoding;
1757 	int ret = 0;
1758 
1759 	if (!dir || !IS_CASEFOLDED(dir))
1760 		return 0;
1761 
1762 	ret = utf8_casefold_hash(um, dentry, str);
1763 	if (ret < 0 && sb_has_strict_encoding(sb))
1764 		return -EINVAL;
1765 	return 0;
1766 }
1767 
1768 static const struct dentry_operations generic_ci_dentry_ops = {
1769 	.d_hash = generic_ci_d_hash,
1770 	.d_compare = generic_ci_d_compare,
1771 };
1772 #endif
1773 
1774 #ifdef CONFIG_FS_ENCRYPTION
1775 static const struct dentry_operations generic_encrypted_dentry_ops = {
1776 	.d_revalidate = fscrypt_d_revalidate,
1777 };
1778 #endif
1779 
1780 #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
1781 static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
1782 	.d_hash = generic_ci_d_hash,
1783 	.d_compare = generic_ci_d_compare,
1784 	.d_revalidate = fscrypt_d_revalidate,
1785 };
1786 #endif
1787 
1788 /**
1789  * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
1790  * @dentry:	dentry to set ops on
1791  *
1792  * Casefolded directories need d_hash and d_compare set, so that the dentries
1793  * contained in them are handled case-insensitively.  Note that these operations
1794  * are needed on the parent directory rather than on the dentries in it, and
1795  * while the casefolding flag can be toggled on and off on an empty directory,
1796  * dentry_operations can't be changed later.  As a result, if the filesystem has
1797  * casefolding support enabled at all, we have to give all dentries the
1798  * casefolding operations even if their inode doesn't have the casefolding flag
1799  * currently (and thus the casefolding ops would be no-ops for now).
1800  *
1801  * Encryption works differently in that the only dentry operation it needs is
1802  * d_revalidate, which it only needs on dentries that have the no-key name flag.
1803  * The no-key flag can't be set "later", so we don't have to worry about that.
1804  *
1805  * Finally, to maximize compatibility with overlayfs (which isn't compatible
1806  * with certain dentry operations) and to avoid taking an unnecessary
1807  * performance hit, we use custom dentry_operations for each possible
1808  * combination rather than always installing all operations.
1809  */
1810 void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
1811 {
1812 #ifdef CONFIG_FS_ENCRYPTION
1813 	bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
1814 #endif
1815 #if IS_ENABLED(CONFIG_UNICODE)
1816 	bool needs_ci_ops = dentry->d_sb->s_encoding;
1817 #endif
1818 #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
1819 	if (needs_encrypt_ops && needs_ci_ops) {
1820 		d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
1821 		return;
1822 	}
1823 #endif
1824 #ifdef CONFIG_FS_ENCRYPTION
1825 	if (needs_encrypt_ops) {
1826 		d_set_d_op(dentry, &generic_encrypted_dentry_ops);
1827 		return;
1828 	}
1829 #endif
1830 #if IS_ENABLED(CONFIG_UNICODE)
1831 	if (needs_ci_ops) {
1832 		d_set_d_op(dentry, &generic_ci_dentry_ops);
1833 		return;
1834 	}
1835 #endif
1836 }
1837 EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
1838 
1839 /**
1840  * inode_maybe_inc_iversion - increments i_version
1841  * @inode: inode with the i_version that should be updated
1842  * @force: increment the counter even if it's not necessary?
1843  *
1844  * Every time the inode is modified, the i_version field must be seen to have
1845  * changed by any observer.
1846  *
1847  * If "force" is set or the QUERIED flag is set, then ensure that we increment
1848  * the value, and clear the queried flag.
1849  *
1850  * In the common case where neither is set, then we can return "false" without
1851  * updating i_version.
1852  *
1853  * If this function returns false, and no other metadata has changed, then we
1854  * can avoid logging the metadata.
1855  */
1856 bool inode_maybe_inc_iversion(struct inode *inode, bool force)
1857 {
1858 	u64 cur, new;
1859 
1860 	/*
1861 	 * The i_version field is not strictly ordered with any other inode
1862 	 * information, but the legacy inode_inc_iversion code used a spinlock
1863 	 * to serialize increments.
1864 	 *
1865 	 * Here, we add full memory barriers to ensure that any de-facto
1866 	 * ordering with other info is preserved.
1867 	 *
1868 	 * This barrier pairs with the barrier in inode_query_iversion()
1869 	 */
1870 	smp_mb();
1871 	cur = inode_peek_iversion_raw(inode);
1872 	do {
1873 		/* If flag is clear then we needn't do anything */
1874 		if (!force && !(cur & I_VERSION_QUERIED))
1875 			return false;
1876 
1877 		/* Since lowest bit is flag, add 2 to avoid it */
1878 		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
1879 	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
1880 	return true;
1881 }
1882 EXPORT_SYMBOL(inode_maybe_inc_iversion);
1883 
1884 /**
1885  * inode_query_iversion - read i_version for later use
1886  * @inode: inode from which i_version should be read
1887  *
1888  * Read the inode i_version counter. This should be used by callers that wish
1889  * to store the returned i_version for later comparison. This will guarantee
1890  * that a later query of the i_version will result in a different value if
1891  * anything has changed.
1892  *
1893  * In this implementation, we fetch the current value, set the QUERIED flag and
1894  * then try to swap it into place with a cmpxchg, if it wasn't already set. If
1895  * that fails, we try again with the newly fetched value from the cmpxchg.
1896  */
1897 u64 inode_query_iversion(struct inode *inode)
1898 {
1899 	u64 cur, new;
1900 
1901 	cur = inode_peek_iversion_raw(inode);
1902 	do {
1903 		/* If flag is already set, then no need to swap */
1904 		if (cur & I_VERSION_QUERIED) {
1905 			/*
1906 			 * This barrier (and the implicit barrier in the
1907 			 * cmpxchg below) pairs with the barrier in
1908 			 * inode_maybe_inc_iversion().
1909 			 */
1910 			smp_mb();
1911 			break;
1912 		}
1913 
1914 		new = cur | I_VERSION_QUERIED;
1915 	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
1916 	return cur >> I_VERSION_QUERIED_SHIFT;
1917 }
1918 EXPORT_SYMBOL(inode_query_iversion);
1919 
1920 ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
1921 		ssize_t direct_written, ssize_t buffered_written)
1922 {
1923 	struct address_space *mapping = iocb->ki_filp->f_mapping;
1924 	loff_t pos = iocb->ki_pos - buffered_written;
1925 	loff_t end = iocb->ki_pos - 1;
1926 	int err;
1927 
1928 	/*
1929 	 * If the buffered write fallback returned an error, we want to return
1930 	 * the number of bytes which were written by direct I/O, or the error
1931 	 * code if that was zero.
1932 	 *
1933 	 * Note that this differs from normal direct-io semantics, which will
1934 	 * return -EFOO even if some bytes were written.
1935 	 */
1936 	if (unlikely(buffered_written < 0)) {
1937 		if (direct_written)
1938 			return direct_written;
1939 		return buffered_written;
1940 	}
1941 
1942 	/*
1943 	 * We need to ensure that the page cache pages are written to disk and
1944 	 * invalidated to preserve the expected O_DIRECT semantics.
1945 	 */
1946 	err = filemap_write_and_wait_range(mapping, pos, end);
1947 	if (err < 0) {
1948 		/*
1949 		 * We don't know how much we wrote, so just return the number of
1950 		 * bytes which were direct-written
1951 		 */
1952 		iocb->ki_pos -= buffered_written;
1953 		if (direct_written)
1954 			return direct_written;
1955 		return err;
1956 	}
1957 	invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
1958 	return direct_written + buffered_written;
1959 }
1960 EXPORT_SYMBOL_GPL(direct_write_fallback);
1961 
1962 /**
1963  * simple_inode_init_ts - initialize the timestamps for a new inode
1964  * @inode: inode to be initialized
1965  *
1966  * When a new inode is created, most filesystems set the timestamps to the
1967  * current time. Add a helper to do this.
1968  */
1969 struct timespec64 simple_inode_init_ts(struct inode *inode)
1970 {
1971 	struct timespec64 ts = inode_set_ctime_current(inode);
1972 
1973 	inode_set_atime_to_ts(inode, ts);
1974 	inode_set_mtime_to_ts(inode, ts);
1975 	return ts;
1976 }
1977 EXPORT_SYMBOL(simple_inode_init_ts);
1978