xref: /linux/fs/overlayfs/super.c (revision d3d90cc2891c9cf4ecba7b85c0af716ab755c7e5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *
4  * Copyright (C) 2011 Novell Inc.
5  */
6 
7 #include <uapi/linux/magic.h>
8 #include <linux/fs.h>
9 #include <linux/namei.h>
10 #include <linux/xattr.h>
11 #include <linux/mount.h>
12 #include <linux/parser.h>
13 #include <linux/module.h>
14 #include <linux/statfs.h>
15 #include <linux/seq_file.h>
16 #include <linux/posix_acl_xattr.h>
17 #include <linux/exportfs.h>
18 #include <linux/file.h>
19 #include <linux/fs_context.h>
20 #include <linux/fs_parser.h>
21 #include "overlayfs.h"
22 #include "params.h"
23 
24 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
25 MODULE_DESCRIPTION("Overlay filesystem");
26 MODULE_LICENSE("GPL");
27 
28 
29 struct ovl_dir_cache;
30 
ovl_d_real(struct dentry * dentry,enum d_real_type type)31 static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
32 {
33 	struct dentry *upper, *lower;
34 	int err;
35 
36 	switch (type) {
37 	case D_REAL_DATA:
38 	case D_REAL_METADATA:
39 		break;
40 	default:
41 		goto bug;
42 	}
43 
44 	if (!d_is_reg(dentry)) {
45 		/* d_real_inode() is only relevant for regular files */
46 		return dentry;
47 	}
48 
49 	upper = ovl_dentry_upper(dentry);
50 	if (upper && (type == D_REAL_METADATA ||
51 		      ovl_has_upperdata(d_inode(dentry))))
52 		return upper;
53 
54 	if (type == D_REAL_METADATA) {
55 		lower = ovl_dentry_lower(dentry);
56 		goto real_lower;
57 	}
58 
59 	/*
60 	 * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
61 	 * the real lowerdata dentry.  The only current caller of d_real() with
62 	 * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
63 	 * likely going to be followed reading from the file, before placing
64 	 * uprobes on offset within the file, so lowerdata should be available
65 	 * when setting the uprobe.
66 	 */
67 	err = ovl_verify_lowerdata(dentry);
68 	if (err)
69 		goto bug;
70 	lower = ovl_dentry_lowerdata(dentry);
71 	if (!lower)
72 		goto bug;
73 
74 real_lower:
75 	/* Handle recursion into stacked lower fs */
76 	return d_real(lower, type);
77 
78 bug:
79 	WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
80 	return dentry;
81 }
82 
/*
 * Revalidate one real (underlying) dentry @d by calling its own
 * ->d_weak_revalidate() or ->d_revalidate() operation, depending on @weak.
 *
 * Returns 1 when @d is NULL or has no matching operation.  The result of a
 * weak revalidation is passed through untouched.  For a normal revalidation
 * a zero result is turned into -ESTALE (after invalidating @d when not in
 * RCU walk), and -ECHILD is returned if the parent has no inode in RCU walk.
 */
static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
{
	const bool rcu_walk = flags & LOOKUP_RCU;
	struct name_snapshot snap;
	struct dentry *parent;
	struct inode *dir;
	int ret;

	if (!d)
		return 1;

	if (weak) {
		if (!(d->d_flags & DCACHE_OP_WEAK_REVALIDATE))
			return 1;
		return d->d_op->d_weak_revalidate(d, flags);
	}

	if (!(d->d_flags & DCACHE_OP_REVALIDATE))
		return 1;

	if (rcu_walk) {
		/* No reference is taken on the parent in RCU walk */
		parent = READ_ONCE(d->d_parent);
		dir = d_inode_rcu(parent);
		if (!dir)
			return -ECHILD;
	} else {
		parent = dget_parent(d);
		dir = d_inode(parent);
	}

	take_dentry_name_snapshot(&snap, d);
	ret = d->d_op->d_revalidate(dir, &snap.name, d, flags);
	release_dentry_name_snapshot(&snap);

	/* Pairs with dget_parent() above */
	if (!rcu_walk)
		dput(parent);

	if (ret)
		return ret;

	/* The real dentry is no longer valid */
	if (!rcu_walk)
		d_invalidate(d);
	return -ESTALE;
}
120 
/*
 * Shared body of ->d_revalidate() and ->d_weak_revalidate(): revalidate the
 * upper dentry (if any) and then each lower stack entry in order, stopping at
 * the first result that is not positive.
 */
static int ovl_dentry_revalidate_common(struct dentry *dentry,
					unsigned int flags, bool weak)
{
	struct inode *inode = d_inode_rcu(dentry);
	struct ovl_entry *oe;
	struct ovl_path *stack;
	struct dentry *upper;
	unsigned int idx = 0;
	int ret = 1;

	/* Careful in RCU mode */
	if (!inode)
		return -ECHILD;

	oe = OVL_I_E(inode);
	stack = ovl_lowerstack(oe);
	upper = ovl_i_dentry_upper(inode);
	if (upper)
		ret = ovl_revalidate_real(upper, flags, weak);

	while (ret > 0 && idx < ovl_numlower(oe)) {
		ret = ovl_revalidate_real(stack[idx].dentry, flags, weak);
		idx++;
	}

	return ret;
}
146 
ovl_dentry_revalidate(struct inode * dir,const struct qstr * name,struct dentry * dentry,unsigned int flags)147 static int ovl_dentry_revalidate(struct inode *dir, const struct qstr *name,
148 				 struct dentry *dentry, unsigned int flags)
149 {
150 	return ovl_dentry_revalidate_common(dentry, flags, false);
151 }
152 
ovl_dentry_weak_revalidate(struct dentry * dentry,unsigned int flags)153 static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
154 {
155 	return ovl_dentry_revalidate_common(dentry, flags, true);
156 }
157 
158 static const struct dentry_operations ovl_dentry_operations = {
159 	.d_real = ovl_d_real,
160 	.d_revalidate = ovl_dentry_revalidate,
161 	.d_weak_revalidate = ovl_dentry_weak_revalidate,
162 };
163 
164 static struct kmem_cache *ovl_inode_cachep;
165 
ovl_alloc_inode(struct super_block * sb)166 static struct inode *ovl_alloc_inode(struct super_block *sb)
167 {
168 	struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL);
169 
170 	if (!oi)
171 		return NULL;
172 
173 	oi->cache = NULL;
174 	oi->redirect = NULL;
175 	oi->version = 0;
176 	oi->flags = 0;
177 	oi->__upperdentry = NULL;
178 	oi->lowerdata_redirect = NULL;
179 	oi->oe = NULL;
180 	mutex_init(&oi->lock);
181 
182 	return &oi->vfs_inode;
183 }
184 
ovl_free_inode(struct inode * inode)185 static void ovl_free_inode(struct inode *inode)
186 {
187 	struct ovl_inode *oi = OVL_I(inode);
188 
189 	kfree(oi->redirect);
190 	kfree(oi->oe);
191 	mutex_destroy(&oi->lock);
192 	kmem_cache_free(ovl_inode_cachep, oi);
193 }
194 
ovl_destroy_inode(struct inode * inode)195 static void ovl_destroy_inode(struct inode *inode)
196 {
197 	struct ovl_inode *oi = OVL_I(inode);
198 
199 	dput(oi->__upperdentry);
200 	ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
201 	if (S_ISDIR(inode->i_mode))
202 		ovl_dir_cache_free(inode);
203 	else
204 		kfree(oi->lowerdata_redirect);
205 }
206 
/* ->put_super(): free the overlay fs private data, if there is any. */
static void ovl_put_super(struct super_block *sb)
{
	struct ovl_fs *ofs = OVL_FS(sb);

	if (!ofs)
		return;

	ovl_free_fs(ofs);
}
214 
215 /* Sync real dirty inodes in upper filesystem (if it exists) */
ovl_sync_fs(struct super_block * sb,int wait)216 static int ovl_sync_fs(struct super_block *sb, int wait)
217 {
218 	struct ovl_fs *ofs = OVL_FS(sb);
219 	struct super_block *upper_sb;
220 	int ret;
221 
222 	ret = ovl_sync_status(ofs);
223 
224 	if (ret < 0)
225 		return -EIO;
226 
227 	if (!ret)
228 		return ret;
229 
230 	/*
231 	 * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
232 	 * All the super blocks will be iterated, including upper_sb.
233 	 *
234 	 * If this is a syncfs(2) call, then we do need to call
235 	 * sync_filesystem() on upper_sb, but enough if we do it when being
236 	 * called with wait == 1.
237 	 */
238 	if (!wait)
239 		return 0;
240 
241 	upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
242 
243 	down_read(&upper_sb->s_umount);
244 	ret = sync_filesystem(upper_sb);
245 	up_read(&upper_sb->s_umount);
246 
247 	return ret;
248 }
249 
250 /**
251  * ovl_statfs
252  * @dentry: The dentry to query
253  * @buf: The struct kstatfs to fill in with stats
254  *
255  * Get the filesystem statistics.  As writes always target the upper layer
256  * filesystem pass the statfs to the upper filesystem (if it exists)
257  */
ovl_statfs(struct dentry * dentry,struct kstatfs * buf)258 static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
259 {
260 	struct super_block *sb = dentry->d_sb;
261 	struct ovl_fs *ofs = OVL_FS(sb);
262 	struct dentry *root_dentry = sb->s_root;
263 	struct path path;
264 	int err;
265 
266 	ovl_path_real(root_dentry, &path);
267 
268 	err = vfs_statfs(&path, buf);
269 	if (!err) {
270 		buf->f_namelen = ofs->namelen;
271 		buf->f_type = OVERLAYFS_SUPER_MAGIC;
272 		if (ovl_has_fsid(ofs))
273 			buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
274 	}
275 
276 	return err;
277 }
278 
279 static const struct super_operations ovl_super_operations = {
280 	.alloc_inode	= ovl_alloc_inode,
281 	.free_inode	= ovl_free_inode,
282 	.destroy_inode	= ovl_destroy_inode,
283 	.drop_inode	= generic_delete_inode,
284 	.put_super	= ovl_put_super,
285 	.sync_fs	= ovl_sync_fs,
286 	.statfs		= ovl_statfs,
287 	.show_options	= ovl_show_options,
288 };
289 
290 #define OVL_WORKDIR_NAME "work"
291 #define OVL_INDEXDIR_NAME "index"
292 
ovl_workdir_create(struct ovl_fs * ofs,const char * name,bool persist)293 static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
294 					 const char *name, bool persist)
295 {
296 	struct inode *dir =  ofs->workbasedir->d_inode;
297 	struct vfsmount *mnt = ovl_upper_mnt(ofs);
298 	struct dentry *work;
299 	int err;
300 	bool retried = false;
301 
302 	inode_lock_nested(dir, I_MUTEX_PARENT);
303 retry:
304 	work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
305 
306 	if (!IS_ERR(work)) {
307 		struct iattr attr = {
308 			.ia_valid = ATTR_MODE,
309 			.ia_mode = S_IFDIR | 0,
310 		};
311 
312 		if (work->d_inode) {
313 			err = -EEXIST;
314 			if (retried)
315 				goto out_dput;
316 
317 			if (persist)
318 				goto out_unlock;
319 
320 			retried = true;
321 			err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
322 			dput(work);
323 			if (err == -EINVAL) {
324 				work = ERR_PTR(err);
325 				goto out_unlock;
326 			}
327 			goto retry;
328 		}
329 
330 		err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
331 		if (err)
332 			goto out_dput;
333 
334 		/* Weird filesystem returning with hashed negative (kernfs)? */
335 		err = -EINVAL;
336 		if (d_really_is_negative(work))
337 			goto out_dput;
338 
339 		/*
340 		 * Try to remove POSIX ACL xattrs from workdir.  We are good if:
341 		 *
342 		 * a) success (there was a POSIX ACL xattr and was removed)
343 		 * b) -ENODATA (there was no POSIX ACL xattr)
344 		 * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported)
345 		 *
346 		 * There are various other error values that could effectively
347 		 * mean that the xattr doesn't exist (e.g. -ERANGE is returned
348 		 * if the xattr name is too long), but the set of filesystems
349 		 * allowed as upper are limited to "normal" ones, where checking
350 		 * for the above two errors is sufficient.
351 		 */
352 		err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT);
353 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
354 			goto out_dput;
355 
356 		err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS);
357 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
358 			goto out_dput;
359 
360 		/* Clear any inherited mode bits */
361 		inode_lock(work->d_inode);
362 		err = ovl_do_notify_change(ofs, work, &attr);
363 		inode_unlock(work->d_inode);
364 		if (err)
365 			goto out_dput;
366 	} else {
367 		err = PTR_ERR(work);
368 		goto out_err;
369 	}
370 out_unlock:
371 	inode_unlock(dir);
372 	return work;
373 
374 out_dput:
375 	dput(work);
376 out_err:
377 	pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
378 		ofs->config.workdir, name, -err);
379 	work = NULL;
380 	goto out_unlock;
381 }
382 
ovl_check_namelen(const struct path * path,struct ovl_fs * ofs,const char * name)383 static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
384 			     const char *name)
385 {
386 	struct kstatfs statfs;
387 	int err = vfs_statfs(path, &statfs);
388 
389 	if (err)
390 		pr_err("statfs failed on '%s'\n", name);
391 	else
392 		ofs->namelen = max(ofs->namelen, statfs.f_namelen);
393 
394 	return err;
395 }
396 
ovl_lower_dir(const char * name,struct path * path,struct ovl_fs * ofs,int * stack_depth)397 static int ovl_lower_dir(const char *name, struct path *path,
398 			 struct ovl_fs *ofs, int *stack_depth)
399 {
400 	int fh_type;
401 	int err;
402 
403 	err = ovl_check_namelen(path, ofs, name);
404 	if (err)
405 		return err;
406 
407 	*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
408 
409 	/*
410 	 * The inodes index feature and NFS export need to encode and decode
411 	 * file handles, so they require that all layers support them.
412 	 */
413 	fh_type = ovl_can_decode_fh(path->dentry->d_sb);
414 	if ((ofs->config.nfs_export ||
415 	     (ofs->config.index && ofs->config.upperdir)) && !fh_type) {
416 		ofs->config.index = false;
417 		ofs->config.nfs_export = false;
418 		pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
419 			name);
420 	}
421 	ofs->nofh |= !fh_type;
422 	/*
423 	 * Decoding origin file handle is required for persistent st_ino.
424 	 * Without persistent st_ino, xino=auto falls back to xino=off.
425 	 */
426 	if (ofs->config.xino == OVL_XINO_AUTO &&
427 	    ofs->config.upperdir && !fh_type) {
428 		ofs->config.xino = OVL_XINO_OFF;
429 		pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n",
430 			name);
431 	}
432 
433 	/* Check if lower fs has 32bit inode numbers */
434 	if (fh_type != FILEID_INO32_GEN)
435 		ofs->xino_mode = -1;
436 
437 	return 0;
438 }
439 
440 /* Workdir should not be subdir of upperdir and vice versa */
ovl_workdir_ok(struct dentry * workdir,struct dentry * upperdir)441 static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
442 {
443 	bool ok = false;
444 
445 	if (workdir != upperdir) {
446 		struct dentry *trap = lock_rename(workdir, upperdir);
447 		if (!IS_ERR(trap))
448 			unlock_rename(workdir, upperdir);
449 		ok = (trap == NULL);
450 	}
451 	return ok;
452 }
453 
ovl_setup_trap(struct super_block * sb,struct dentry * dir,struct inode ** ptrap,const char * name)454 static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
455 			  struct inode **ptrap, const char *name)
456 {
457 	struct inode *trap;
458 	int err;
459 
460 	trap = ovl_get_trap_inode(sb, dir);
461 	err = PTR_ERR_OR_ZERO(trap);
462 	if (err) {
463 		if (err == -ELOOP)
464 			pr_err("conflicting %s path\n", name);
465 		return err;
466 	}
467 
468 	*ptrap = trap;
469 	return 0;
470 }
471 
472 /*
473  * Determine how we treat concurrent use of upperdir/workdir based on the
474  * index feature. This is papering over mount leaks of container runtimes,
475  * for example, an old overlay mount is leaked and now its upperdir is
476  * attempted to be used as a lower layer in a new overlay mount.
477  */
ovl_report_in_use(struct ovl_fs * ofs,const char * name)478 static int ovl_report_in_use(struct ovl_fs *ofs, const char *name)
479 {
480 	if (ofs->config.index) {
481 		pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n",
482 		       name);
483 		return -EBUSY;
484 	} else {
485 		pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n",
486 			name);
487 		return 0;
488 	}
489 }
490 
ovl_get_upper(struct super_block * sb,struct ovl_fs * ofs,struct ovl_layer * upper_layer,const struct path * upperpath)491 static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
492 			 struct ovl_layer *upper_layer,
493 			 const struct path *upperpath)
494 {
495 	struct vfsmount *upper_mnt;
496 	int err;
497 
498 	/* Upperdir path should not be r/o */
499 	if (__mnt_is_readonly(upperpath->mnt)) {
500 		pr_err("upper fs is r/o, try multi-lower layers mount\n");
501 		err = -EINVAL;
502 		goto out;
503 	}
504 
505 	err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir);
506 	if (err)
507 		goto out;
508 
509 	err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap,
510 			     "upperdir");
511 	if (err)
512 		goto out;
513 
514 	upper_mnt = clone_private_mount(upperpath);
515 	err = PTR_ERR(upper_mnt);
516 	if (IS_ERR(upper_mnt)) {
517 		pr_err("failed to clone upperpath\n");
518 		goto out;
519 	}
520 
521 	/* Don't inherit atime flags */
522 	upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
523 	upper_layer->mnt = upper_mnt;
524 	upper_layer->idx = 0;
525 	upper_layer->fsid = 0;
526 
527 	/*
528 	 * Inherit SB_NOSEC flag from upperdir.
529 	 *
530 	 * This optimization changes behavior when a security related attribute
531 	 * (suid/sgid/security.*) is changed on an underlying layer.  This is
532 	 * okay because we don't yet have guarantees in that case, but it will
533 	 * need careful treatment once we want to honour changes to underlying
534 	 * filesystems.
535 	 */
536 	if (upper_mnt->mnt_sb->s_flags & SB_NOSEC)
537 		sb->s_flags |= SB_NOSEC;
538 
539 	if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) {
540 		ofs->upperdir_locked = true;
541 	} else {
542 		err = ovl_report_in_use(ofs, "upperdir");
543 		if (err)
544 			goto out;
545 	}
546 
547 	err = 0;
548 out:
549 	return err;
550 }
551 
552 /*
553  * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
554  * negative values if error is encountered.
555  */
ovl_check_rename_whiteout(struct ovl_fs * ofs)556 static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
557 {
558 	struct dentry *workdir = ofs->workdir;
559 	struct inode *dir = d_inode(workdir);
560 	struct dentry *temp;
561 	struct dentry *dest;
562 	struct dentry *whiteout;
563 	struct name_snapshot name;
564 	int err;
565 
566 	inode_lock_nested(dir, I_MUTEX_PARENT);
567 
568 	temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
569 	err = PTR_ERR(temp);
570 	if (IS_ERR(temp))
571 		goto out_unlock;
572 
573 	dest = ovl_lookup_temp(ofs, workdir);
574 	err = PTR_ERR(dest);
575 	if (IS_ERR(dest)) {
576 		dput(temp);
577 		goto out_unlock;
578 	}
579 
580 	/* Name is inline and stable - using snapshot as a copy helper */
581 	take_dentry_name_snapshot(&name, temp);
582 	err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
583 	if (err) {
584 		if (err == -EINVAL)
585 			err = 0;
586 		goto cleanup_temp;
587 	}
588 
589 	whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
590 	err = PTR_ERR(whiteout);
591 	if (IS_ERR(whiteout))
592 		goto cleanup_temp;
593 
594 	err = ovl_upper_is_whiteout(ofs, whiteout);
595 
596 	/* Best effort cleanup of whiteout and temp file */
597 	if (err)
598 		ovl_cleanup(ofs, dir, whiteout);
599 	dput(whiteout);
600 
601 cleanup_temp:
602 	ovl_cleanup(ofs, dir, temp);
603 	release_dentry_name_snapshot(&name);
604 	dput(temp);
605 	dput(dest);
606 
607 out_unlock:
608 	inode_unlock(dir);
609 
610 	return err;
611 }
612 
/*
 * Look up @name under @parent and, if the result is negative, create it with
 * @mode.  The caller's reference on @parent is always dropped before
 * returning.
 *
 * Returns the child dentry or an ERR_PTR() from the lookup or the creation.
 */
static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
					   struct dentry *parent,
					   const char *name, umode_t mode)
{
	struct inode *dir = parent->d_inode;
	struct dentry *child;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	child = ovl_lookup_upper(ofs, name, parent, strlen(name));
	if (!IS_ERR(child) && !child->d_inode)
		child = ovl_create_real(ofs, dir, child, OVL_CATTR(mode));

	inode_unlock(dir);
	dput(parent);

	return child;
}
630 
631 /*
632  * Creates $workdir/work/incompat/volatile/dirty file if it is not already
633  * present.
634  */
ovl_create_volatile_dirty(struct ovl_fs * ofs)635 static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
636 {
637 	unsigned int ctr;
638 	struct dentry *d = dget(ofs->workbasedir);
639 	static const char *const volatile_path[] = {
640 		OVL_WORKDIR_NAME, "incompat", "volatile", "dirty"
641 	};
642 	const char *const *name = volatile_path;
643 
644 	for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) {
645 		d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
646 		if (IS_ERR(d))
647 			return PTR_ERR(d);
648 	}
649 	dput(d);
650 	return 0;
651 }
652 
ovl_make_workdir(struct super_block * sb,struct ovl_fs * ofs,const struct path * workpath)653 static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
654 			    const struct path *workpath)
655 {
656 	struct vfsmount *mnt = ovl_upper_mnt(ofs);
657 	struct dentry *workdir;
658 	struct file *tmpfile;
659 	bool rename_whiteout;
660 	bool d_type;
661 	int fh_type;
662 	int err;
663 
664 	err = mnt_want_write(mnt);
665 	if (err)
666 		return err;
667 
668 	workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
669 	err = PTR_ERR(workdir);
670 	if (IS_ERR_OR_NULL(workdir))
671 		goto out;
672 
673 	ofs->workdir = workdir;
674 
675 	err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir");
676 	if (err)
677 		goto out;
678 
679 	/*
680 	 * Upper should support d_type, else whiteouts are visible.  Given
681 	 * workdir and upper are on same fs, we can do iterate_dir() on
682 	 * workdir. This check requires successful creation of workdir in
683 	 * previous step.
684 	 */
685 	err = ovl_check_d_type_supported(workpath);
686 	if (err < 0)
687 		goto out;
688 
689 	d_type = err;
690 	if (!d_type)
691 		pr_warn("upper fs needs to support d_type.\n");
692 
693 	/* Check if upper/work fs supports O_TMPFILE */
694 	tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
695 	ofs->tmpfile = !IS_ERR(tmpfile);
696 	if (ofs->tmpfile)
697 		fput(tmpfile);
698 	else
699 		pr_warn("upper fs does not support tmpfile.\n");
700 
701 
702 	/* Check if upper/work fs supports RENAME_WHITEOUT */
703 	err = ovl_check_rename_whiteout(ofs);
704 	if (err < 0)
705 		goto out;
706 
707 	rename_whiteout = err;
708 	if (!rename_whiteout)
709 		pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
710 
711 	/*
712 	 * Check if upper/work fs supports (trusted|user).overlay.* xattr
713 	 */
714 	err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
715 	if (err) {
716 		pr_warn("failed to set xattr on upper\n");
717 		ofs->noxattr = true;
718 		if (ovl_redirect_follow(ofs)) {
719 			ofs->config.redirect_mode = OVL_REDIRECT_NOFOLLOW;
720 			pr_warn("...falling back to redirect_dir=nofollow.\n");
721 		}
722 		if (ofs->config.metacopy) {
723 			ofs->config.metacopy = false;
724 			pr_warn("...falling back to metacopy=off.\n");
725 		}
726 		if (ofs->config.index) {
727 			ofs->config.index = false;
728 			pr_warn("...falling back to index=off.\n");
729 		}
730 		if (ovl_has_fsid(ofs)) {
731 			ofs->config.uuid = OVL_UUID_NULL;
732 			pr_warn("...falling back to uuid=null.\n");
733 		}
734 		/*
735 		 * xattr support is required for persistent st_ino.
736 		 * Without persistent st_ino, xino=auto falls back to xino=off.
737 		 */
738 		if (ofs->config.xino == OVL_XINO_AUTO) {
739 			ofs->config.xino = OVL_XINO_OFF;
740 			pr_warn("...falling back to xino=off.\n");
741 		}
742 		if (err == -EPERM && !ofs->config.userxattr)
743 			pr_info("try mounting with 'userxattr' option\n");
744 		err = 0;
745 	} else {
746 		ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
747 	}
748 
749 	/*
750 	 * We allowed sub-optimal upper fs configuration and don't want to break
751 	 * users over kernel upgrade, but we never allowed remote upper fs, so
752 	 * we can enforce strict requirements for remote upper fs.
753 	 */
754 	if (ovl_dentry_remote(ofs->workdir) &&
755 	    (!d_type || !rename_whiteout || ofs->noxattr)) {
756 		pr_err("upper fs missing required features.\n");
757 		err = -EINVAL;
758 		goto out;
759 	}
760 
761 	/*
762 	 * For volatile mount, create a incompat/volatile/dirty file to keep
763 	 * track of it.
764 	 */
765 	if (ofs->config.ovl_volatile) {
766 		err = ovl_create_volatile_dirty(ofs);
767 		if (err < 0) {
768 			pr_err("Failed to create volatile/dirty file.\n");
769 			goto out;
770 		}
771 	}
772 
773 	/* Check if upper/work fs supports file handles */
774 	fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
775 	if (ofs->config.index && !fh_type) {
776 		ofs->config.index = false;
777 		pr_warn("upper fs does not support file handles, falling back to index=off.\n");
778 	}
779 	ofs->nofh |= !fh_type;
780 
781 	/* Check if upper fs has 32bit inode numbers */
782 	if (fh_type != FILEID_INO32_GEN)
783 		ofs->xino_mode = -1;
784 
785 	/* NFS export of r/w mount depends on index */
786 	if (ofs->config.nfs_export && !ofs->config.index) {
787 		pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n");
788 		ofs->config.nfs_export = false;
789 	}
790 out:
791 	mnt_drop_write(mnt);
792 	return err;
793 }
794 
ovl_get_workdir(struct super_block * sb,struct ovl_fs * ofs,const struct path * upperpath,const struct path * workpath)795 static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
796 			   const struct path *upperpath,
797 			   const struct path *workpath)
798 {
799 	int err;
800 
801 	err = -EINVAL;
802 	if (upperpath->mnt != workpath->mnt) {
803 		pr_err("workdir and upperdir must reside under the same mount\n");
804 		return err;
805 	}
806 	if (!ovl_workdir_ok(workpath->dentry, upperpath->dentry)) {
807 		pr_err("workdir and upperdir must be separate subtrees\n");
808 		return err;
809 	}
810 
811 	ofs->workbasedir = dget(workpath->dentry);
812 
813 	if (ovl_inuse_trylock(ofs->workbasedir)) {
814 		ofs->workdir_locked = true;
815 	} else {
816 		err = ovl_report_in_use(ofs, "workdir");
817 		if (err)
818 			return err;
819 	}
820 
821 	err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap,
822 			     "workdir");
823 	if (err)
824 		return err;
825 
826 	return ovl_make_workdir(sb, ofs, workpath);
827 }
828 
ovl_get_indexdir(struct super_block * sb,struct ovl_fs * ofs,struct ovl_entry * oe,const struct path * upperpath)829 static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
830 			    struct ovl_entry *oe, const struct path *upperpath)
831 {
832 	struct vfsmount *mnt = ovl_upper_mnt(ofs);
833 	struct dentry *indexdir;
834 	struct dentry *origin = ovl_lowerstack(oe)->dentry;
835 	const struct ovl_fh *fh;
836 	int err;
837 
838 	fh = ovl_get_origin_fh(ofs, origin);
839 	if (IS_ERR(fh))
840 		return PTR_ERR(fh);
841 
842 	err = mnt_want_write(mnt);
843 	if (err)
844 		goto out_free_fh;
845 
846 	/* Verify lower root is upper root origin */
847 	err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true);
848 	if (err) {
849 		pr_err("failed to verify upper root origin\n");
850 		goto out;
851 	}
852 
853 	/* index dir will act also as workdir */
854 	iput(ofs->workdir_trap);
855 	ofs->workdir_trap = NULL;
856 	dput(ofs->workdir);
857 	ofs->workdir = NULL;
858 	indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
859 	if (IS_ERR(indexdir)) {
860 		err = PTR_ERR(indexdir);
861 	} else if (indexdir) {
862 		ofs->workdir = indexdir;
863 		err = ovl_setup_trap(sb, indexdir, &ofs->workdir_trap,
864 				     "indexdir");
865 		if (err)
866 			goto out;
867 
868 		/*
869 		 * Verify upper root is exclusively associated with index dir.
870 		 * Older kernels stored upper fh in ".overlay.origin"
871 		 * xattr. If that xattr exists, verify that it is a match to
872 		 * upper dir file handle. In any case, verify or set xattr
873 		 * ".overlay.upper" to indicate that index may have
874 		 * directory entries.
875 		 */
876 		if (ovl_check_origin_xattr(ofs, indexdir)) {
877 			err = ovl_verify_origin_xattr(ofs, indexdir,
878 						      OVL_XATTR_ORIGIN,
879 						      upperpath->dentry, true,
880 						      false);
881 			if (err)
882 				pr_err("failed to verify index dir 'origin' xattr\n");
883 		}
884 		err = ovl_verify_upper(ofs, indexdir, upperpath->dentry, true);
885 		if (err)
886 			pr_err("failed to verify index dir 'upper' xattr\n");
887 
888 		/* Cleanup bad/stale/orphan index entries */
889 		if (!err)
890 			err = ovl_indexdir_cleanup(ofs);
891 	}
892 	if (err || !indexdir)
893 		pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
894 
895 out:
896 	mnt_drop_write(mnt);
897 out_free_fh:
898 	kfree(fh);
899 	return err;
900 }
901 
ovl_lower_uuid_ok(struct ovl_fs * ofs,const uuid_t * uuid)902 static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
903 {
904 	unsigned int i;
905 
906 	if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs))
907 		return true;
908 
909 	/*
910 	 * We allow using single lower with null uuid for index and nfs_export
911 	 * for example to support those features with single lower squashfs.
912 	 * To avoid regressions in setups of overlay with re-formatted lower
913 	 * squashfs, do not allow decoding origin with lower null uuid unless
914 	 * user opted-in to one of the new features that require following the
915 	 * lower inode of non-dir upper.
916 	 */
917 	if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid))
918 		return false;
919 
920 	for (i = 0; i < ofs->numfs; i++) {
921 		/*
922 		 * We use uuid to associate an overlay lower file handle with a
923 		 * lower layer, so we can accept lower fs with null uuid as long
924 		 * as all lower layers with null uuid are on the same fs.
925 		 * if we detect multiple lower fs with the same uuid, we
926 		 * disable lower file handle decoding on all of them.
927 		 */
928 		if (ofs->fs[i].is_lower &&
929 		    uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) {
930 			ofs->fs[i].bad_uuid = true;
931 			return false;
932 		}
933 	}
934 	return true;
935 }
936 
937 /* Get a unique fsid for the layer */
ovl_get_fsid(struct ovl_fs * ofs,const struct path * path)938 static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
939 {
940 	struct super_block *sb = path->mnt->mnt_sb;
941 	unsigned int i;
942 	dev_t dev;
943 	int err;
944 	bool bad_uuid = false;
945 	bool warn = false;
946 
947 	for (i = 0; i < ofs->numfs; i++) {
948 		if (ofs->fs[i].sb == sb)
949 			return i;
950 	}
951 
952 	if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
953 		bad_uuid = true;
954 		if (ofs->config.xino == OVL_XINO_AUTO) {
955 			ofs->config.xino = OVL_XINO_OFF;
956 			warn = true;
957 		}
958 		if (ofs->config.index || ofs->config.nfs_export) {
959 			ofs->config.index = false;
960 			ofs->config.nfs_export = false;
961 			warn = true;
962 		}
963 		if (warn) {
964 			pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n",
965 				uuid_is_null(&sb->s_uuid) ? "null" :
966 							    "conflicting",
967 				path->dentry, ovl_xino_mode(&ofs->config));
968 		}
969 	}
970 
971 	err = get_anon_bdev(&dev);
972 	if (err) {
973 		pr_err("failed to get anonymous bdev for lowerpath\n");
974 		return err;
975 	}
976 
977 	ofs->fs[ofs->numfs].sb = sb;
978 	ofs->fs[ofs->numfs].pseudo_dev = dev;
979 	ofs->fs[ofs->numfs].bad_uuid = bad_uuid;
980 
981 	return ofs->numfs++;
982 }
983 
984 /*
985  * The fsid after the last lower fsid is used for the data layers.
986  * It is a "null fs" with a null sb, null uuid, and no pseudo dev.
987  */
ovl_get_data_fsid(struct ovl_fs * ofs)988 static int ovl_get_data_fsid(struct ovl_fs *ofs)
989 {
990 	return ofs->numfs;
991 }
992 
993 
ovl_get_layers(struct super_block * sb,struct ovl_fs * ofs,struct ovl_fs_context * ctx,struct ovl_layer * layers)994 static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
995 			  struct ovl_fs_context *ctx, struct ovl_layer *layers)
996 {
997 	int err;
998 	unsigned int i;
999 	size_t nr_merged_lower;
1000 
1001 	ofs->fs = kcalloc(ctx->nr + 2, sizeof(struct ovl_sb), GFP_KERNEL);
1002 	if (ofs->fs == NULL)
1003 		return -ENOMEM;
1004 
1005 	/*
1006 	 * idx/fsid 0 are reserved for upper fs even with lower only overlay
1007 	 * and the last fsid is reserved for "null fs" of the data layers.
1008 	 */
1009 	ofs->numfs++;
1010 
1011 	/*
1012 	 * All lower layers that share the same fs as upper layer, use the same
1013 	 * pseudo_dev as upper layer.  Allocate fs[0].pseudo_dev even for lower
1014 	 * only overlay to simplify ovl_fs_free().
1015 	 * is_lower will be set if upper fs is shared with a lower layer.
1016 	 */
1017 	err = get_anon_bdev(&ofs->fs[0].pseudo_dev);
1018 	if (err) {
1019 		pr_err("failed to get anonymous bdev for upper fs\n");
1020 		return err;
1021 	}
1022 
1023 	if (ovl_upper_mnt(ofs)) {
1024 		ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb;
1025 		ofs->fs[0].is_lower = false;
1026 	}
1027 
1028 	nr_merged_lower = ctx->nr - ctx->nr_data;
1029 	for (i = 0; i < ctx->nr; i++) {
1030 		struct ovl_fs_context_layer *l = &ctx->lower[i];
1031 		struct vfsmount *mnt;
1032 		struct inode *trap;
1033 		int fsid;
1034 
1035 		if (i < nr_merged_lower)
1036 			fsid = ovl_get_fsid(ofs, &l->path);
1037 		else
1038 			fsid = ovl_get_data_fsid(ofs);
1039 		if (fsid < 0)
1040 			return fsid;
1041 
1042 		/*
1043 		 * Check if lower root conflicts with this overlay layers before
1044 		 * checking if it is in-use as upperdir/workdir of "another"
1045 		 * mount, because we do not bother to check in ovl_is_inuse() if
1046 		 * the upperdir/workdir is in fact in-use by our
1047 		 * upperdir/workdir.
1048 		 */
1049 		err = ovl_setup_trap(sb, l->path.dentry, &trap, "lowerdir");
1050 		if (err)
1051 			return err;
1052 
1053 		if (ovl_is_inuse(l->path.dentry)) {
1054 			err = ovl_report_in_use(ofs, "lowerdir");
1055 			if (err) {
1056 				iput(trap);
1057 				return err;
1058 			}
1059 		}
1060 
1061 		mnt = clone_private_mount(&l->path);
1062 		err = PTR_ERR(mnt);
1063 		if (IS_ERR(mnt)) {
1064 			pr_err("failed to clone lowerpath\n");
1065 			iput(trap);
1066 			return err;
1067 		}
1068 
1069 		/*
1070 		 * Make lower layers R/O.  That way fchmod/fchown on lower file
1071 		 * will fail instead of modifying lower fs.
1072 		 */
1073 		mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
1074 
1075 		layers[ofs->numlayer].trap = trap;
1076 		layers[ofs->numlayer].mnt = mnt;
1077 		layers[ofs->numlayer].idx = ofs->numlayer;
1078 		layers[ofs->numlayer].fsid = fsid;
1079 		layers[ofs->numlayer].fs = &ofs->fs[fsid];
1080 		/* Store for printing lowerdir=... in ovl_show_options() */
1081 		ofs->config.lowerdirs[ofs->numlayer] = l->name;
1082 		l->name = NULL;
1083 		ofs->numlayer++;
1084 		ofs->fs[fsid].is_lower = true;
1085 	}
1086 
1087 	/*
1088 	 * When all layers on same fs, overlay can use real inode numbers.
1089 	 * With mount option "xino=<on|auto>", mounter declares that there are
1090 	 * enough free high bits in underlying fs to hold the unique fsid.
1091 	 * If overlayfs does encounter underlying inodes using the high xino
1092 	 * bits reserved for fsid, it emits a warning and uses the original
1093 	 * inode number or a non persistent inode number allocated from a
1094 	 * dedicated range.
1095 	 */
1096 	if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) {
1097 		if (ofs->config.xino == OVL_XINO_ON)
1098 			pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
1099 		ofs->xino_mode = 0;
1100 	} else if (ofs->config.xino == OVL_XINO_OFF) {
1101 		ofs->xino_mode = -1;
1102 	} else if (ofs->xino_mode < 0) {
1103 		/*
1104 		 * This is a roundup of number of bits needed for encoding
1105 		 * fsid, where fsid 0 is reserved for upper fs (even with
1106 		 * lower only overlay) +1 extra bit is reserved for the non
1107 		 * persistent inode number range that is used for resolving
1108 		 * xino lower bits overflow.
1109 		 */
1110 		BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
1111 		ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
1112 	}
1113 
1114 	if (ofs->xino_mode > 0) {
1115 		pr_info("\"xino\" feature enabled using %d upper inode bits.\n",
1116 			ofs->xino_mode);
1117 	}
1118 
1119 	return 0;
1120 }
1121 
ovl_get_lowerstack(struct super_block * sb,struct ovl_fs_context * ctx,struct ovl_fs * ofs,struct ovl_layer * layers)1122 static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
1123 					    struct ovl_fs_context *ctx,
1124 					    struct ovl_fs *ofs,
1125 					    struct ovl_layer *layers)
1126 {
1127 	int err;
1128 	unsigned int i;
1129 	size_t nr_merged_lower;
1130 	struct ovl_entry *oe;
1131 	struct ovl_path *lowerstack;
1132 
1133 	struct ovl_fs_context_layer *l;
1134 
1135 	if (!ofs->config.upperdir && ctx->nr == 1) {
1136 		pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n");
1137 		return ERR_PTR(-EINVAL);
1138 	}
1139 
1140 	err = -EINVAL;
1141 	for (i = 0; i < ctx->nr; i++) {
1142 		l = &ctx->lower[i];
1143 
1144 		err = ovl_lower_dir(l->name, &l->path, ofs, &sb->s_stack_depth);
1145 		if (err)
1146 			return ERR_PTR(err);
1147 	}
1148 
1149 	err = -EINVAL;
1150 	sb->s_stack_depth++;
1151 	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
1152 		pr_err("maximum fs stacking depth exceeded\n");
1153 		return ERR_PTR(err);
1154 	}
1155 
1156 	err = ovl_get_layers(sb, ofs, ctx, layers);
1157 	if (err)
1158 		return ERR_PTR(err);
1159 
1160 	err = -ENOMEM;
1161 	/* Data-only layers are not merged in root directory */
1162 	nr_merged_lower = ctx->nr - ctx->nr_data;
1163 	oe = ovl_alloc_entry(nr_merged_lower);
1164 	if (!oe)
1165 		return ERR_PTR(err);
1166 
1167 	lowerstack = ovl_lowerstack(oe);
1168 	for (i = 0; i < nr_merged_lower; i++) {
1169 		l = &ctx->lower[i];
1170 		lowerstack[i].dentry = dget(l->path.dentry);
1171 		lowerstack[i].layer = &ofs->layers[i + 1];
1172 	}
1173 	ofs->numdatalayer = ctx->nr_data;
1174 
1175 	return oe;
1176 }
1177 
1178 /*
1179  * Check if this layer root is a descendant of:
1180  * - another layer of this overlayfs instance
1181  * - upper/work dir of any overlayfs instance
1182  */
ovl_check_layer(struct super_block * sb,struct ovl_fs * ofs,struct dentry * dentry,const char * name,bool is_lower)1183 static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
1184 			   struct dentry *dentry, const char *name,
1185 			   bool is_lower)
1186 {
1187 	struct dentry *next = dentry, *parent;
1188 	int err = 0;
1189 
1190 	if (!dentry)
1191 		return 0;
1192 
1193 	parent = dget_parent(next);
1194 
1195 	/* Walk back ancestors to root (inclusive) looking for traps */
1196 	while (!err && parent != next) {
1197 		if (is_lower && ovl_lookup_trap_inode(sb, parent)) {
1198 			err = -ELOOP;
1199 			pr_err("overlapping %s path\n", name);
1200 		} else if (ovl_is_inuse(parent)) {
1201 			err = ovl_report_in_use(ofs, name);
1202 		}
1203 		next = parent;
1204 		parent = dget_parent(next);
1205 		dput(next);
1206 	}
1207 
1208 	dput(parent);
1209 
1210 	return err;
1211 }
1212 
1213 /*
1214  * Check if any of the layers or work dirs overlap.
1215  */
ovl_check_overlapping_layers(struct super_block * sb,struct ovl_fs * ofs)1216 static int ovl_check_overlapping_layers(struct super_block *sb,
1217 					struct ovl_fs *ofs)
1218 {
1219 	int i, err;
1220 
1221 	if (ovl_upper_mnt(ofs)) {
1222 		err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root,
1223 				      "upperdir", false);
1224 		if (err)
1225 			return err;
1226 
1227 		/*
1228 		 * Checking workbasedir avoids hitting ovl_is_inuse(parent) of
1229 		 * this instance and covers overlapping work and index dirs,
1230 		 * unless work or index dir have been moved since created inside
1231 		 * workbasedir.  In that case, we already have their traps in
1232 		 * inode cache and we will catch that case on lookup.
1233 		 */
1234 		err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir",
1235 				      false);
1236 		if (err)
1237 			return err;
1238 	}
1239 
1240 	for (i = 1; i < ofs->numlayer; i++) {
1241 		err = ovl_check_layer(sb, ofs,
1242 				      ofs->layers[i].mnt->mnt_root,
1243 				      "lowerdir", true);
1244 		if (err)
1245 			return err;
1246 	}
1247 
1248 	return 0;
1249 }
1250 
ovl_get_root(struct super_block * sb,struct dentry * upperdentry,struct ovl_entry * oe)1251 static struct dentry *ovl_get_root(struct super_block *sb,
1252 				   struct dentry *upperdentry,
1253 				   struct ovl_entry *oe)
1254 {
1255 	struct dentry *root;
1256 	struct ovl_fs *ofs = OVL_FS(sb);
1257 	struct ovl_path *lowerpath = ovl_lowerstack(oe);
1258 	unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
1259 	int fsid = lowerpath->layer->fsid;
1260 	struct ovl_inode_params oip = {
1261 		.upperdentry = upperdentry,
1262 		.oe = oe,
1263 	};
1264 
1265 	root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
1266 	if (!root)
1267 		return NULL;
1268 
1269 	if (upperdentry) {
1270 		/* Root inode uses upper st_ino/i_ino */
1271 		ino = d_inode(upperdentry)->i_ino;
1272 		fsid = 0;
1273 		ovl_dentry_set_upper_alias(root);
1274 		if (ovl_is_impuredir(sb, upperdentry))
1275 			ovl_set_flag(OVL_IMPURE, d_inode(root));
1276 	}
1277 
1278 	/* Look for xwhiteouts marker except in the lowermost layer */
1279 	for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
1280 		struct path path = {
1281 			.mnt = lowerpath->layer->mnt,
1282 			.dentry = lowerpath->dentry,
1283 		};
1284 
1285 		/* overlay.opaque=x means xwhiteouts directory */
1286 		if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
1287 			ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
1288 			ovl_dentry_set_xwhiteouts(root);
1289 		}
1290 	}
1291 
1292 	/* Root is always merge -> can have whiteouts */
1293 	ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
1294 	ovl_dentry_set_flag(OVL_E_CONNECTED, root);
1295 	ovl_set_upperdata(d_inode(root));
1296 	ovl_inode_init(d_inode(root), &oip, ino, fsid);
1297 	ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE);
1298 	/* root keeps a reference of upperdentry */
1299 	dget(upperdentry);
1300 
1301 	return root;
1302 }
1303 
ovl_fill_super(struct super_block * sb,struct fs_context * fc)1304 int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
1305 {
1306 	struct ovl_fs *ofs = sb->s_fs_info;
1307 	struct ovl_fs_context *ctx = fc->fs_private;
1308 	struct dentry *root_dentry;
1309 	struct ovl_entry *oe;
1310 	struct ovl_layer *layers;
1311 	struct cred *cred;
1312 	int err;
1313 
1314 	err = -EIO;
1315 	if (WARN_ON(fc->user_ns != current_user_ns()))
1316 		goto out_err;
1317 
1318 	sb->s_d_op = &ovl_dentry_operations;
1319 
1320 	err = -ENOMEM;
1321 	ofs->creator_cred = cred = prepare_creds();
1322 	if (!cred)
1323 		goto out_err;
1324 
1325 	err = ovl_fs_params_verify(ctx, &ofs->config);
1326 	if (err)
1327 		goto out_err;
1328 
1329 	err = -EINVAL;
1330 	if (ctx->nr == 0) {
1331 		if (!(fc->sb_flags & SB_SILENT))
1332 			pr_err("missing 'lowerdir'\n");
1333 		goto out_err;
1334 	}
1335 
1336 	err = -ENOMEM;
1337 	layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL);
1338 	if (!layers)
1339 		goto out_err;
1340 
1341 	ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL);
1342 	if (!ofs->config.lowerdirs) {
1343 		kfree(layers);
1344 		goto out_err;
1345 	}
1346 	ofs->layers = layers;
1347 	/*
1348 	 * Layer 0 is reserved for upper even if there's no upper.
1349 	 * config.lowerdirs[0] is used for storing the user provided colon
1350 	 * separated lowerdir string.
1351 	 */
1352 	ofs->config.lowerdirs[0] = ctx->lowerdir_all;
1353 	ctx->lowerdir_all = NULL;
1354 	ofs->numlayer = 1;
1355 
1356 	sb->s_stack_depth = 0;
1357 	sb->s_maxbytes = MAX_LFS_FILESIZE;
1358 	atomic_long_set(&ofs->last_ino, 1);
1359 	/* Assume underlying fs uses 32bit inodes unless proven otherwise */
1360 	if (ofs->config.xino != OVL_XINO_OFF) {
1361 		ofs->xino_mode = BITS_PER_LONG - 32;
1362 		if (!ofs->xino_mode) {
1363 			pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
1364 			ofs->config.xino = OVL_XINO_OFF;
1365 		}
1366 	}
1367 
1368 	/* alloc/destroy_inode needed for setting up traps in inode cache */
1369 	sb->s_op = &ovl_super_operations;
1370 
1371 	if (ofs->config.upperdir) {
1372 		struct super_block *upper_sb;
1373 
1374 		err = -EINVAL;
1375 		if (!ofs->config.workdir) {
1376 			pr_err("missing 'workdir'\n");
1377 			goto out_err;
1378 		}
1379 
1380 		err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper);
1381 		if (err)
1382 			goto out_err;
1383 
1384 		upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
1385 		if (!ovl_should_sync(ofs)) {
1386 			ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
1387 			if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
1388 				err = -EIO;
1389 				pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
1390 				goto out_err;
1391 			}
1392 		}
1393 
1394 		err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work);
1395 		if (err)
1396 			goto out_err;
1397 
1398 		if (!ofs->workdir)
1399 			sb->s_flags |= SB_RDONLY;
1400 
1401 		sb->s_stack_depth = upper_sb->s_stack_depth;
1402 		sb->s_time_gran = upper_sb->s_time_gran;
1403 	}
1404 	oe = ovl_get_lowerstack(sb, ctx, ofs, layers);
1405 	err = PTR_ERR(oe);
1406 	if (IS_ERR(oe))
1407 		goto out_err;
1408 
1409 	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
1410 	if (!ovl_upper_mnt(ofs))
1411 		sb->s_flags |= SB_RDONLY;
1412 
1413 	if (!ovl_origin_uuid(ofs) && ofs->numfs > 1) {
1414 		pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=null.\n");
1415 		ofs->config.uuid = OVL_UUID_NULL;
1416 	} else if (ovl_has_fsid(ofs) && ovl_upper_mnt(ofs)) {
1417 		/* Use per instance persistent uuid/fsid */
1418 		ovl_init_uuid_xattr(sb, ofs, &ctx->upper);
1419 	}
1420 
1421 	if (!ovl_force_readonly(ofs) && ofs->config.index) {
1422 		err = ovl_get_indexdir(sb, ofs, oe, &ctx->upper);
1423 		if (err)
1424 			goto out_free_oe;
1425 
1426 		/* Force r/o mount with no index dir */
1427 		if (!ofs->workdir)
1428 			sb->s_flags |= SB_RDONLY;
1429 	}
1430 
1431 	err = ovl_check_overlapping_layers(sb, ofs);
1432 	if (err)
1433 		goto out_free_oe;
1434 
1435 	/* Show index=off in /proc/mounts for forced r/o mount */
1436 	if (!ofs->workdir) {
1437 		ofs->config.index = false;
1438 		if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
1439 			pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
1440 			ofs->config.nfs_export = false;
1441 		}
1442 	}
1443 
1444 	if (ofs->config.metacopy && ofs->config.nfs_export) {
1445 		pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
1446 		ofs->config.nfs_export = false;
1447 	}
1448 
1449 	/*
1450 	 * Support encoding decodable file handles with nfs_export=on
1451 	 * and encoding non-decodable file handles with nfs_export=off
1452 	 * if all layers support file handles.
1453 	 */
1454 	if (ofs->config.nfs_export)
1455 		sb->s_export_op = &ovl_export_operations;
1456 	else if (!ofs->nofh)
1457 		sb->s_export_op = &ovl_export_fid_operations;
1458 
1459 	/* Never override disk quota limits or use reserved space */
1460 	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
1461 
1462 	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
1463 	sb->s_xattr = ovl_xattr_handlers(ofs);
1464 	sb->s_fs_info = ofs;
1465 #ifdef CONFIG_FS_POSIX_ACL
1466 	sb->s_flags |= SB_POSIXACL;
1467 #endif
1468 	sb->s_iflags |= SB_I_SKIP_SYNC;
1469 	/*
1470 	 * Ensure that umask handling is done by the filesystems used
1471 	 * for the the upper layer instead of overlayfs as that would
1472 	 * lead to unexpected results.
1473 	 */
1474 	sb->s_iflags |= SB_I_NOUMASK;
1475 	sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED;
1476 
1477 	err = -ENOMEM;
1478 	root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
1479 	if (!root_dentry)
1480 		goto out_free_oe;
1481 
1482 	sb->s_root = root_dentry;
1483 
1484 	return 0;
1485 
1486 out_free_oe:
1487 	ovl_free_entry(oe);
1488 out_err:
1489 	ovl_free_fs(ofs);
1490 	sb->s_fs_info = NULL;
1491 	return err;
1492 }
1493 
1494 struct file_system_type ovl_fs_type = {
1495 	.owner			= THIS_MODULE,
1496 	.name			= "overlay",
1497 	.init_fs_context	= ovl_init_fs_context,
1498 	.parameters		= ovl_parameter_spec,
1499 	.fs_flags		= FS_USERNS_MOUNT,
1500 	.kill_sb		= kill_anon_super,
1501 };
1502 MODULE_ALIAS_FS("overlay");
1503 
ovl_inode_init_once(void * foo)1504 static void ovl_inode_init_once(void *foo)
1505 {
1506 	struct ovl_inode *oi = foo;
1507 
1508 	inode_init_once(&oi->vfs_inode);
1509 }
1510 
ovl_init(void)1511 static int __init ovl_init(void)
1512 {
1513 	int err;
1514 
1515 	ovl_inode_cachep = kmem_cache_create("ovl_inode",
1516 					     sizeof(struct ovl_inode), 0,
1517 					     (SLAB_RECLAIM_ACCOUNT|
1518 					      SLAB_ACCOUNT),
1519 					     ovl_inode_init_once);
1520 	if (ovl_inode_cachep == NULL)
1521 		return -ENOMEM;
1522 
1523 	err = register_filesystem(&ovl_fs_type);
1524 	if (!err)
1525 		return 0;
1526 
1527 	kmem_cache_destroy(ovl_inode_cachep);
1528 
1529 	return err;
1530 }
1531 
ovl_exit(void)1532 static void __exit ovl_exit(void)
1533 {
1534 	unregister_filesystem(&ovl_fs_type);
1535 
1536 	/*
1537 	 * Make sure all delayed rcu free inodes are flushed before we
1538 	 * destroy cache.
1539 	 */
1540 	rcu_barrier();
1541 	kmem_cache_destroy(ovl_inode_cachep);
1542 }
1543 
1544 module_init(ovl_init);
1545 module_exit(ovl_exit);
1546