xref: /linux/fs/overlayfs/readdir.c (revision f82811e22b480a203a438d8e1f29af9c93ccbb0c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *
4  * Copyright (C) 2011 Novell Inc.
5  */
6 
7 #include <linux/fs.h>
8 #include <linux/slab.h>
9 #include <linux/namei.h>
10 #include <linux/file.h>
11 #include <linux/xattr.h>
12 #include <linux/rbtree.h>
13 #include <linux/security.h>
14 #include <linux/cred.h>
15 #include <linux/ratelimit.h>
16 #include "overlayfs.h"
17 
18 struct ovl_cache_entry {
19 	unsigned int len;
20 	unsigned int type;
21 	u64 real_ino;
22 	u64 ino;
23 	struct list_head l_node;
24 	struct rb_node node;
25 	struct ovl_cache_entry *next_maybe_whiteout;
26 	bool is_upper;
27 	bool is_whiteout;
28 	bool check_xwhiteout;
29 	char name[];
30 };
31 
32 struct ovl_dir_cache {
33 	long refcount;
34 	u64 version;
35 	struct list_head entries;
36 	struct rb_root root;
37 };
38 
39 struct ovl_readdir_data {
40 	struct dir_context ctx;
41 	struct dentry *dentry;
42 	bool is_lowest;
43 	struct rb_root *root;
44 	struct list_head *list;
45 	struct list_head middle;
46 	struct ovl_cache_entry *first_maybe_whiteout;
47 	int count;
48 	int err;
49 	bool is_upper;
50 	bool d_type_supported;
51 	bool in_xwhiteouts_dir;
52 };
53 
/*
 * Per-open state of an overlay directory, kept in file->private_data.
 *
 * is_real:    readdir is served from the real directory, not a merged cache
 * is_upper:   the real directory opened at open time was an upper one
 * cache:      merged cache this open file holds a reference on, if any
 * cursor:     current position in cache->entries
 * realfile:   real directory opened at open time
 * upperfile:  upper directory opened lazily by ovl_dir_real_file()
 */
struct ovl_dir_file {
	bool is_real, is_upper;
	struct ovl_dir_cache *cache;
	struct list_head *cursor;
	struct file *realfile, *upperfile;
};
62 
63 static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
64 {
65 	return rb_entry(n, struct ovl_cache_entry, node);
66 }
67 
68 static bool ovl_cache_entry_find_link(const char *name, int len,
69 				      struct rb_node ***link,
70 				      struct rb_node **parent)
71 {
72 	bool found = false;
73 	struct rb_node **newp = *link;
74 
75 	while (!found && *newp) {
76 		int cmp;
77 		struct ovl_cache_entry *tmp;
78 
79 		*parent = *newp;
80 		tmp = ovl_cache_entry_from_node(*newp);
81 		cmp = strncmp(name, tmp->name, len);
82 		if (cmp > 0)
83 			newp = &tmp->node.rb_right;
84 		else if (cmp < 0 || len < tmp->len)
85 			newp = &tmp->node.rb_left;
86 		else
87 			found = true;
88 	}
89 	*link = newp;
90 
91 	return found;
92 }
93 
94 static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
95 						    const char *name, int len)
96 {
97 	struct rb_node *node = root->rb_node;
98 	int cmp;
99 
100 	while (node) {
101 		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
102 
103 		cmp = strncmp(name, p->name, len);
104 		if (cmp > 0)
105 			node = p->node.rb_right;
106 		else if (cmp < 0 || len < p->len)
107 			node = p->node.rb_left;
108 		else
109 			return p;
110 	}
111 
112 	return NULL;
113 }
114 
115 static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
116 			   struct ovl_cache_entry *p)
117 {
118 	/* Don't care if not doing ovl_iter() */
119 	if (!rdd->dentry)
120 		return false;
121 
122 	/* Always recalc d_ino when remapping lower inode numbers */
123 	if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb)))
124 		return true;
125 
126 	/* Always recalc d_ino for parent */
127 	if (strcmp(p->name, "..") == 0)
128 		return true;
129 
130 	/* If this is lower, then native d_ino will do */
131 	if (!rdd->is_upper)
132 		return false;
133 
134 	/*
135 	 * Recalc d_ino for '.' and for all entries if dir is impure (contains
136 	 * copied up entries)
137 	 */
138 	if ((p->name[0] == '.' && p->len == 1) ||
139 	    ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry)))
140 		return true;
141 
142 	return false;
143 }
144 
145 static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
146 						   const char *name, int len,
147 						   u64 ino, unsigned int d_type)
148 {
149 	struct ovl_cache_entry *p;
150 	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
151 
152 	p = kmalloc(size, GFP_KERNEL);
153 	if (!p)
154 		return NULL;
155 
156 	memcpy(p->name, name, len);
157 	p->name[len] = '\0';
158 	p->len = len;
159 	p->type = d_type;
160 	p->real_ino = ino;
161 	p->ino = ino;
162 	/* Defer setting d_ino for upper entry to ovl_iterate() */
163 	if (ovl_calc_d_ino(rdd, p))
164 		p->ino = 0;
165 	p->is_upper = rdd->is_upper;
166 	p->is_whiteout = false;
167 	/* Defer check for overlay.whiteout to ovl_iterate() */
168 	p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
169 
170 	if (d_type == DT_CHR) {
171 		p->next_maybe_whiteout = rdd->first_maybe_whiteout;
172 		rdd->first_maybe_whiteout = p;
173 	}
174 	return p;
175 }
176 
177 static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
178 				  const char *name, int len, u64 ino,
179 				  unsigned int d_type)
180 {
181 	struct rb_node **newp = &rdd->root->rb_node;
182 	struct rb_node *parent = NULL;
183 	struct ovl_cache_entry *p;
184 
185 	if (ovl_cache_entry_find_link(name, len, &newp, &parent))
186 		return true;
187 
188 	p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
189 	if (p == NULL) {
190 		rdd->err = -ENOMEM;
191 		return false;
192 	}
193 
194 	list_add_tail(&p->l_node, rdd->list);
195 	rb_link_node(&p->node, parent, newp);
196 	rb_insert_color(&p->node, rdd->root);
197 
198 	return true;
199 }
200 
201 static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
202 			   const char *name, int namelen,
203 			   loff_t offset, u64 ino, unsigned int d_type)
204 {
205 	struct ovl_cache_entry *p;
206 
207 	p = ovl_cache_entry_find(rdd->root, name, namelen);
208 	if (p) {
209 		list_move_tail(&p->l_node, &rdd->middle);
210 	} else {
211 		p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
212 		if (p == NULL)
213 			rdd->err = -ENOMEM;
214 		else
215 			list_add_tail(&p->l_node, &rdd->middle);
216 	}
217 
218 	return rdd->err == 0;
219 }
220 
221 void ovl_cache_free(struct list_head *list)
222 {
223 	struct ovl_cache_entry *p;
224 	struct ovl_cache_entry *n;
225 
226 	list_for_each_entry_safe(p, n, list, l_node)
227 		kfree(p);
228 
229 	INIT_LIST_HEAD(list);
230 }
231 
232 void ovl_dir_cache_free(struct inode *inode)
233 {
234 	struct ovl_dir_cache *cache = ovl_dir_cache(inode);
235 
236 	if (cache) {
237 		ovl_cache_free(&cache->entries);
238 		kfree(cache);
239 	}
240 }
241 
242 static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
243 {
244 	struct ovl_dir_cache *cache = od->cache;
245 
246 	WARN_ON(cache->refcount <= 0);
247 	cache->refcount--;
248 	if (!cache->refcount) {
249 		if (ovl_dir_cache(inode) == cache)
250 			ovl_set_dir_cache(inode, NULL);
251 
252 		ovl_cache_free(&cache->entries);
253 		kfree(cache);
254 	}
255 }
256 
257 static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
258 			  int namelen, loff_t offset, u64 ino,
259 			  unsigned int d_type)
260 {
261 	struct ovl_readdir_data *rdd =
262 		container_of(ctx, struct ovl_readdir_data, ctx);
263 
264 	rdd->count++;
265 	if (!rdd->is_lowest)
266 		return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
267 	else
268 		return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
269 }
270 
271 static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
272 {
273 	int err;
274 	struct ovl_cache_entry *p;
275 	struct dentry *dentry, *dir = path->dentry;
276 	const struct cred *old_cred;
277 
278 	old_cred = ovl_override_creds(rdd->dentry->d_sb);
279 
280 	err = down_write_killable(&dir->d_inode->i_rwsem);
281 	if (!err) {
282 		while (rdd->first_maybe_whiteout) {
283 			p = rdd->first_maybe_whiteout;
284 			rdd->first_maybe_whiteout = p->next_maybe_whiteout;
285 			dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
286 			if (!IS_ERR(dentry)) {
287 				p->is_whiteout = ovl_is_whiteout(dentry);
288 				dput(dentry);
289 			}
290 		}
291 		inode_unlock(dir->d_inode);
292 	}
293 	revert_creds(old_cred);
294 
295 	return err;
296 }
297 
298 static inline int ovl_dir_read(const struct path *realpath,
299 			       struct ovl_readdir_data *rdd)
300 {
301 	struct file *realfile;
302 	int err;
303 
304 	realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE);
305 	if (IS_ERR(realfile))
306 		return PTR_ERR(realfile);
307 
308 	rdd->first_maybe_whiteout = NULL;
309 	rdd->ctx.pos = 0;
310 	do {
311 		rdd->count = 0;
312 		rdd->err = 0;
313 		err = iterate_dir(realfile, &rdd->ctx);
314 		if (err >= 0)
315 			err = rdd->err;
316 	} while (!err && rdd->count);
317 
318 	if (!err && rdd->first_maybe_whiteout && rdd->dentry)
319 		err = ovl_check_whiteouts(realpath, rdd);
320 
321 	fput(realfile);
322 
323 	return err;
324 }
325 
326 static void ovl_dir_reset(struct file *file)
327 {
328 	struct ovl_dir_file *od = file->private_data;
329 	struct ovl_dir_cache *cache = od->cache;
330 	struct inode *inode = file_inode(file);
331 	bool is_real;
332 
333 	if (cache && ovl_inode_version_get(inode) != cache->version) {
334 		ovl_cache_put(od, inode);
335 		od->cache = NULL;
336 		od->cursor = NULL;
337 	}
338 	is_real = ovl_dir_is_real(inode);
339 	if (od->is_real != is_real) {
340 		/* is_real can only become false when dir is copied up */
341 		if (WARN_ON(is_real))
342 			return;
343 		od->is_real = false;
344 	}
345 }
346 
347 static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
348 	struct rb_root *root)
349 {
350 	int err;
351 	struct path realpath;
352 	struct ovl_readdir_data rdd = {
353 		.ctx.actor = ovl_fill_merge,
354 		.dentry = dentry,
355 		.list = list,
356 		.root = root,
357 		.is_lowest = false,
358 	};
359 	int idx, next;
360 	const struct ovl_layer *layer;
361 
362 	for (idx = 0; idx != -1; idx = next) {
363 		next = ovl_path_next(idx, dentry, &realpath, &layer);
364 		rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
365 		rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
366 					ovl_dentry_has_xwhiteouts(dentry);
367 
368 		if (next != -1) {
369 			err = ovl_dir_read(&realpath, &rdd);
370 			if (err)
371 				break;
372 		} else {
373 			/*
374 			 * Insert lowest layer entries before upper ones, this
375 			 * allows offsets to be reasonably constant
376 			 */
377 			list_add(&rdd.middle, rdd.list);
378 			rdd.is_lowest = true;
379 			err = ovl_dir_read(&realpath, &rdd);
380 			list_del(&rdd.middle);
381 		}
382 	}
383 	return err;
384 }
385 
386 static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
387 {
388 	struct list_head *p;
389 	loff_t off = 0;
390 
391 	list_for_each(p, &od->cache->entries) {
392 		if (off >= pos)
393 			break;
394 		off++;
395 	}
396 	/* Cursor is safe since the cache is stable */
397 	od->cursor = p;
398 }
399 
400 static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
401 {
402 	int res;
403 	struct ovl_dir_cache *cache;
404 	struct inode *inode = d_inode(dentry);
405 
406 	cache = ovl_dir_cache(inode);
407 	if (cache && ovl_inode_version_get(inode) == cache->version) {
408 		WARN_ON(!cache->refcount);
409 		cache->refcount++;
410 		return cache;
411 	}
412 	ovl_set_dir_cache(d_inode(dentry), NULL);
413 
414 	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
415 	if (!cache)
416 		return ERR_PTR(-ENOMEM);
417 
418 	cache->refcount = 1;
419 	INIT_LIST_HEAD(&cache->entries);
420 	cache->root = RB_ROOT;
421 
422 	res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root);
423 	if (res) {
424 		ovl_cache_free(&cache->entries);
425 		kfree(cache);
426 		return ERR_PTR(res);
427 	}
428 
429 	cache->version = ovl_inode_version_get(inode);
430 	ovl_set_dir_cache(inode, cache);
431 
432 	return cache;
433 }
434 
435 /* Map inode number to lower fs unique range */
436 static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
437 			       const char *name, int namelen, bool warn)
438 {
439 	unsigned int xinoshift = 64 - xinobits;
440 
441 	if (unlikely(ino >> xinoshift)) {
442 		if (warn) {
443 			pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
444 					    namelen, name, ino, xinobits);
445 		}
446 		return ino;
447 	}
448 
449 	/*
450 	 * The lowest xinobit is reserved for mapping the non-peresistent inode
451 	 * numbers range, but this range is only exposed via st_ino, not here.
452 	 */
453 	return ino | ((u64)fsid) << (xinoshift + 1);
454 }
455 
456 /*
457  * Set d_ino for upper entries if needed. Non-upper entries should always report
458  * the uppermost real inode ino and should not call this function.
459  *
460  * When not all layer are on same fs, report real ino also for upper.
461  *
462  * When all layers are on the same fs, and upper has a reference to
463  * copy up origin, call vfs_getattr() on the overlay entry to make
464  * sure that d_ino will be consistent with st_ino from stat(2).
465  *
466  * Also checks the overlay.whiteout xattr by doing a full lookup which will return
467  * negative in this case.
468  */
469 static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)
470 
471 {
472 	struct dentry *dir = path->dentry;
473 	struct ovl_fs *ofs = OVL_FS(dir->d_sb);
474 	struct dentry *this = NULL;
475 	enum ovl_path_type type;
476 	u64 ino = p->real_ino;
477 	int xinobits = ovl_xino_bits(ofs);
478 	int err = 0;
479 
480 	if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
481 		goto out;
482 
483 	if (p->name[0] == '.') {
484 		if (p->len == 1) {
485 			this = dget(dir);
486 			goto get;
487 		}
488 		if (p->len == 2 && p->name[1] == '.') {
489 			/* we shall not be moved */
490 			this = dget(dir->d_parent);
491 			goto get;
492 		}
493 	}
494 	/* This checks also for xwhiteouts */
495 	this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
496 	if (IS_ERR_OR_NULL(this) || !this->d_inode) {
497 		/* Mark a stale entry */
498 		p->is_whiteout = true;
499 		if (IS_ERR(this)) {
500 			err = PTR_ERR(this);
501 			this = NULL;
502 			goto fail;
503 		}
504 		goto out;
505 	}
506 
507 get:
508 	if (!ovl_same_dev(ofs) || !update_ino)
509 		goto out;
510 
511 	type = ovl_path_type(this);
512 	if (OVL_TYPE_ORIGIN(type)) {
513 		struct kstat stat;
514 		struct path statpath = *path;
515 
516 		statpath.dentry = this;
517 		err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
518 		if (err)
519 			goto fail;
520 
521 		/*
522 		 * Directory inode is always on overlay st_dev.
523 		 * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
524 		 * of xino bits overflow.
525 		 */
526 		WARN_ON_ONCE(S_ISDIR(stat.mode) &&
527 			     dir->d_sb->s_dev != stat.dev);
528 		ino = stat.ino;
529 	} else if (xinobits && !OVL_TYPE_UPPER(type)) {
530 		ino = ovl_remap_lower_ino(ino, xinobits,
531 					  ovl_layer_lower(this)->fsid,
532 					  p->name, p->len,
533 					  ovl_xino_warn(ofs));
534 	}
535 
536 out:
537 	p->ino = ino;
538 	dput(this);
539 	return err;
540 
541 fail:
542 	pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n",
543 			    p->name, err);
544 	goto out;
545 }
546 
547 static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
548 			  int namelen, loff_t offset, u64 ino,
549 			  unsigned int d_type)
550 {
551 	struct ovl_cache_entry *p;
552 	struct ovl_readdir_data *rdd =
553 		container_of(ctx, struct ovl_readdir_data, ctx);
554 
555 	rdd->count++;
556 	p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
557 	if (p == NULL) {
558 		rdd->err = -ENOMEM;
559 		return false;
560 	}
561 	list_add_tail(&p->l_node, rdd->list);
562 
563 	return true;
564 }
565 
566 static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
567 			       struct rb_root *root)
568 {
569 	int err;
570 	struct path realpath;
571 	struct ovl_cache_entry *p, *n;
572 	struct ovl_readdir_data rdd = {
573 		.ctx.actor = ovl_fill_plain,
574 		.list = list,
575 		.root = root,
576 	};
577 
578 	INIT_LIST_HEAD(list);
579 	*root = RB_ROOT;
580 	ovl_path_upper(path->dentry, &realpath);
581 
582 	err = ovl_dir_read(&realpath, &rdd);
583 	if (err)
584 		return err;
585 
586 	list_for_each_entry_safe(p, n, list, l_node) {
587 		if (strcmp(p->name, ".") != 0 &&
588 		    strcmp(p->name, "..") != 0) {
589 			err = ovl_cache_update(path, p, true);
590 			if (err)
591 				return err;
592 		}
593 		if (p->ino == p->real_ino) {
594 			list_del(&p->l_node);
595 			kfree(p);
596 		} else {
597 			struct rb_node **newp = &root->rb_node;
598 			struct rb_node *parent = NULL;
599 
600 			if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
601 							      &newp, &parent)))
602 				return -EIO;
603 
604 			rb_link_node(&p->node, parent, newp);
605 			rb_insert_color(&p->node, root);
606 		}
607 	}
608 	return 0;
609 }
610 
611 static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
612 {
613 	int res;
614 	struct dentry *dentry = path->dentry;
615 	struct inode *inode = d_inode(dentry);
616 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
617 	struct ovl_dir_cache *cache;
618 
619 	cache = ovl_dir_cache(inode);
620 	if (cache && ovl_inode_version_get(inode) == cache->version)
621 		return cache;
622 
623 	/* Impure cache is not refcounted, free it here */
624 	ovl_dir_cache_free(inode);
625 	ovl_set_dir_cache(inode, NULL);
626 
627 	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
628 	if (!cache)
629 		return ERR_PTR(-ENOMEM);
630 
631 	res = ovl_dir_read_impure(path, &cache->entries, &cache->root);
632 	if (res) {
633 		ovl_cache_free(&cache->entries);
634 		kfree(cache);
635 		return ERR_PTR(res);
636 	}
637 	if (list_empty(&cache->entries)) {
638 		/*
639 		 * A good opportunity to get rid of an unneeded "impure" flag.
640 		 * Removing the "impure" xattr is best effort.
641 		 */
642 		if (!ovl_want_write(dentry)) {
643 			ovl_removexattr(ofs, ovl_dentry_upper(dentry),
644 					OVL_XATTR_IMPURE);
645 			ovl_drop_write(dentry);
646 		}
647 		ovl_clear_flag(OVL_IMPURE, inode);
648 		kfree(cache);
649 		return NULL;
650 	}
651 
652 	cache->version = ovl_inode_version_get(inode);
653 	ovl_set_dir_cache(inode, cache);
654 
655 	return cache;
656 }
657 
658 struct ovl_readdir_translate {
659 	struct dir_context *orig_ctx;
660 	struct ovl_dir_cache *cache;
661 	struct dir_context ctx;
662 	u64 parent_ino;
663 	int fsid;
664 	int xinobits;
665 	bool xinowarn;
666 };
667 
668 static bool ovl_fill_real(struct dir_context *ctx, const char *name,
669 			   int namelen, loff_t offset, u64 ino,
670 			   unsigned int d_type)
671 {
672 	struct ovl_readdir_translate *rdt =
673 		container_of(ctx, struct ovl_readdir_translate, ctx);
674 	struct dir_context *orig_ctx = rdt->orig_ctx;
675 
676 	if (rdt->parent_ino && strcmp(name, "..") == 0) {
677 		ino = rdt->parent_ino;
678 	} else if (rdt->cache) {
679 		struct ovl_cache_entry *p;
680 
681 		p = ovl_cache_entry_find(&rdt->cache->root, name, namelen);
682 		if (p)
683 			ino = p->ino;
684 	} else if (rdt->xinobits) {
685 		ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
686 					  name, namelen, rdt->xinowarn);
687 	}
688 
689 	return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
690 }
691 
692 static bool ovl_is_impure_dir(struct file *file)
693 {
694 	struct ovl_dir_file *od = file->private_data;
695 	struct inode *dir = file_inode(file);
696 
697 	/*
698 	 * Only upper dir can be impure, but if we are in the middle of
699 	 * iterating a lower real dir, dir could be copied up and marked
700 	 * impure. We only want the impure cache if we started iterating
701 	 * a real upper dir to begin with.
702 	 */
703 	return od->is_upper && ovl_test_flag(OVL_IMPURE, dir);
704 
705 }
706 
707 static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
708 {
709 	int err;
710 	struct ovl_dir_file *od = file->private_data;
711 	struct dentry *dir = file->f_path.dentry;
712 	struct ovl_fs *ofs = OVL_FS(dir->d_sb);
713 	const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
714 	struct ovl_readdir_translate rdt = {
715 		.ctx.actor = ovl_fill_real,
716 		.orig_ctx = ctx,
717 		.xinobits = ovl_xino_bits(ofs),
718 		.xinowarn = ovl_xino_warn(ofs),
719 	};
720 
721 	if (rdt.xinobits && lower_layer)
722 		rdt.fsid = lower_layer->fsid;
723 
724 	if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
725 		struct kstat stat;
726 		struct path statpath = file->f_path;
727 
728 		statpath.dentry = dir->d_parent;
729 		err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
730 		if (err)
731 			return err;
732 
733 		WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
734 		rdt.parent_ino = stat.ino;
735 	}
736 
737 	if (ovl_is_impure_dir(file)) {
738 		rdt.cache = ovl_cache_get_impure(&file->f_path);
739 		if (IS_ERR(rdt.cache))
740 			return PTR_ERR(rdt.cache);
741 	}
742 
743 	err = iterate_dir(od->realfile, &rdt.ctx);
744 	ctx->pos = rdt.ctx.pos;
745 
746 	return err;
747 }
748 
749 
750 static int ovl_iterate(struct file *file, struct dir_context *ctx)
751 {
752 	struct ovl_dir_file *od = file->private_data;
753 	struct dentry *dentry = file->f_path.dentry;
754 	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
755 	struct ovl_cache_entry *p;
756 	const struct cred *old_cred;
757 	int err;
758 
759 	old_cred = ovl_override_creds(dentry->d_sb);
760 	if (!ctx->pos)
761 		ovl_dir_reset(file);
762 
763 	if (od->is_real) {
764 		/*
765 		 * If parent is merge, then need to adjust d_ino for '..', if
766 		 * dir is impure then need to adjust d_ino for copied up
767 		 * entries.
768 		 */
769 		if (ovl_xino_bits(ofs) ||
770 		    (ovl_same_fs(ofs) &&
771 		     (ovl_is_impure_dir(file) ||
772 		      OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
773 			err = ovl_iterate_real(file, ctx);
774 		} else {
775 			err = iterate_dir(od->realfile, ctx);
776 		}
777 		goto out;
778 	}
779 
780 	if (!od->cache) {
781 		struct ovl_dir_cache *cache;
782 
783 		cache = ovl_cache_get(dentry);
784 		err = PTR_ERR(cache);
785 		if (IS_ERR(cache))
786 			goto out;
787 
788 		od->cache = cache;
789 		ovl_seek_cursor(od, ctx->pos);
790 	}
791 
792 	while (od->cursor != &od->cache->entries) {
793 		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
794 		if (!p->is_whiteout) {
795 			if (!p->ino || p->check_xwhiteout) {
796 				err = ovl_cache_update(&file->f_path, p, !p->ino);
797 				if (err)
798 					goto out;
799 			}
800 		}
801 		/* ovl_cache_update() sets is_whiteout on stale entry */
802 		if (!p->is_whiteout) {
803 			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
804 				break;
805 		}
806 		od->cursor = p->l_node.next;
807 		ctx->pos++;
808 	}
809 	err = 0;
810 out:
811 	revert_creds(old_cred);
812 	return err;
813 }
814 
815 static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
816 {
817 	loff_t res;
818 	struct ovl_dir_file *od = file->private_data;
819 
820 	inode_lock(file_inode(file));
821 	if (!file->f_pos)
822 		ovl_dir_reset(file);
823 
824 	if (od->is_real) {
825 		res = vfs_llseek(od->realfile, offset, origin);
826 		file->f_pos = od->realfile->f_pos;
827 	} else {
828 		res = -EINVAL;
829 
830 		switch (origin) {
831 		case SEEK_CUR:
832 			offset += file->f_pos;
833 			break;
834 		case SEEK_SET:
835 			break;
836 		default:
837 			goto out_unlock;
838 		}
839 		if (offset < 0)
840 			goto out_unlock;
841 
842 		if (offset != file->f_pos) {
843 			file->f_pos = offset;
844 			if (od->cache)
845 				ovl_seek_cursor(od, offset);
846 		}
847 		res = offset;
848 	}
849 out_unlock:
850 	inode_unlock(file_inode(file));
851 
852 	return res;
853 }
854 
855 static struct file *ovl_dir_open_realfile(const struct file *file,
856 					  const struct path *realpath)
857 {
858 	struct file *res;
859 	const struct cred *old_cred;
860 
861 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
862 	res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
863 	revert_creds(old_cred);
864 
865 	return res;
866 }
867 
868 /*
869  * Like ovl_real_fdget(), returns upperfile if dir was copied up since open.
870  * Unlike ovl_real_fdget(), this caches upperfile in file->private_data.
871  *
872  * TODO: use same abstract type for file->private_data of dir and file so
873  * upperfile could also be cached for files as well.
874  */
875 struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
876 {
877 
878 	struct ovl_dir_file *od = file->private_data;
879 	struct dentry *dentry = file->f_path.dentry;
880 	struct file *old, *realfile = od->realfile;
881 
882 	if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
883 		return want_upper ? NULL : realfile;
884 
885 	/*
886 	 * Need to check if we started out being a lower dir, but got copied up
887 	 */
888 	if (!od->is_upper) {
889 		realfile = READ_ONCE(od->upperfile);
890 		if (!realfile) {
891 			struct path upperpath;
892 
893 			ovl_path_upper(dentry, &upperpath);
894 			realfile = ovl_dir_open_realfile(file, &upperpath);
895 			if (IS_ERR(realfile))
896 				return realfile;
897 
898 			old = cmpxchg_release(&od->upperfile, NULL, realfile);
899 			if (old) {
900 				fput(realfile);
901 				realfile = old;
902 			}
903 		}
904 	}
905 
906 	return realfile;
907 }
908 
909 static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
910 			 int datasync)
911 {
912 	struct file *realfile;
913 	int err;
914 
915 	err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
916 	if (err <= 0)
917 		return err;
918 
919 	realfile = ovl_dir_real_file(file, true);
920 	err = PTR_ERR_OR_ZERO(realfile);
921 
922 	/* Nothing to sync for lower */
923 	if (!realfile || err)
924 		return err;
925 
926 	return vfs_fsync_range(realfile, start, end, datasync);
927 }
928 
929 static int ovl_dir_release(struct inode *inode, struct file *file)
930 {
931 	struct ovl_dir_file *od = file->private_data;
932 
933 	if (od->cache) {
934 		inode_lock(inode);
935 		ovl_cache_put(od, inode);
936 		inode_unlock(inode);
937 	}
938 	fput(od->realfile);
939 	if (od->upperfile)
940 		fput(od->upperfile);
941 	kfree(od);
942 
943 	return 0;
944 }
945 
946 static int ovl_dir_open(struct inode *inode, struct file *file)
947 {
948 	struct path realpath;
949 	struct file *realfile;
950 	struct ovl_dir_file *od;
951 	enum ovl_path_type type;
952 
953 	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
954 	if (!od)
955 		return -ENOMEM;
956 
957 	type = ovl_path_real(file->f_path.dentry, &realpath);
958 	realfile = ovl_dir_open_realfile(file, &realpath);
959 	if (IS_ERR(realfile)) {
960 		kfree(od);
961 		return PTR_ERR(realfile);
962 	}
963 	od->realfile = realfile;
964 	od->is_real = ovl_dir_is_real(inode);
965 	od->is_upper = OVL_TYPE_UPPER(type);
966 	file->private_data = od;
967 
968 	return 0;
969 }
970 
971 WRAP_DIR_ITER(ovl_iterate) // FIXME!
972 const struct file_operations ovl_dir_operations = {
973 	.read		= generic_read_dir,
974 	.open		= ovl_dir_open,
975 	.iterate_shared	= shared_ovl_iterate,
976 	.llseek		= ovl_dir_llseek,
977 	.fsync		= ovl_dir_fsync,
978 	.release	= ovl_dir_release,
979 };
980 
981 int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
982 {
983 	int err;
984 	struct ovl_cache_entry *p, *n;
985 	struct rb_root root = RB_ROOT;
986 	const struct cred *old_cred;
987 
988 	old_cred = ovl_override_creds(dentry->d_sb);
989 	err = ovl_dir_read_merged(dentry, list, &root);
990 	revert_creds(old_cred);
991 	if (err)
992 		return err;
993 
994 	err = 0;
995 
996 	list_for_each_entry_safe(p, n, list, l_node) {
997 		/*
998 		 * Select whiteouts in upperdir, they should
999 		 * be cleared when deleting this directory.
1000 		 */
1001 		if (p->is_whiteout) {
1002 			if (p->is_upper)
1003 				continue;
1004 			goto del_entry;
1005 		}
1006 
1007 		if (p->name[0] == '.') {
1008 			if (p->len == 1)
1009 				goto del_entry;
1010 			if (p->len == 2 && p->name[1] == '.')
1011 				goto del_entry;
1012 		}
1013 		err = -ENOTEMPTY;
1014 		break;
1015 
1016 del_entry:
1017 		list_del(&p->l_node);
1018 		kfree(p);
1019 	}
1020 
1021 	return err;
1022 }
1023 
1024 void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
1025 			   struct list_head *list)
1026 {
1027 	struct ovl_cache_entry *p;
1028 
1029 	inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
1030 	list_for_each_entry(p, list, l_node) {
1031 		struct dentry *dentry;
1032 
1033 		if (WARN_ON(!p->is_whiteout || !p->is_upper))
1034 			continue;
1035 
1036 		dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
1037 		if (IS_ERR(dentry)) {
1038 			pr_err("lookup '%s/%.*s' failed (%i)\n",
1039 			       upper->d_name.name, p->len, p->name,
1040 			       (int) PTR_ERR(dentry));
1041 			continue;
1042 		}
1043 		if (dentry->d_inode)
1044 			ovl_cleanup(ofs, upper->d_inode, dentry);
1045 		dput(dentry);
1046 	}
1047 	inode_unlock(upper->d_inode);
1048 }
1049 
1050 static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
1051 			  int namelen, loff_t offset, u64 ino,
1052 			  unsigned int d_type)
1053 {
1054 	struct ovl_readdir_data *rdd =
1055 		container_of(ctx, struct ovl_readdir_data, ctx);
1056 
1057 	/* Even if d_type is not supported, DT_DIR is returned for . and .. */
1058 	if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
1059 		return true;
1060 
1061 	if (d_type != DT_UNKNOWN)
1062 		rdd->d_type_supported = true;
1063 
1064 	return true;
1065 }
1066 
1067 /*
1068  * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
1069  * if error is encountered.
1070  */
1071 int ovl_check_d_type_supported(const struct path *realpath)
1072 {
1073 	int err;
1074 	struct ovl_readdir_data rdd = {
1075 		.ctx.actor = ovl_check_d_type,
1076 		.d_type_supported = false,
1077 	};
1078 
1079 	err = ovl_dir_read(realpath, &rdd);
1080 	if (err)
1081 		return err;
1082 
1083 	return rdd.d_type_supported;
1084 }
1085 
1086 #define OVL_INCOMPATDIR_NAME "incompat"
1087 
1088 static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
1089 				       int level)
1090 {
1091 	int err;
1092 	struct inode *dir = path->dentry->d_inode;
1093 	LIST_HEAD(list);
1094 	struct ovl_cache_entry *p;
1095 	struct ovl_readdir_data rdd = {
1096 		.ctx.actor = ovl_fill_plain,
1097 		.list = &list,
1098 	};
1099 	bool incompat = false;
1100 
1101 	/*
1102 	 * The "work/incompat" directory is treated specially - if it is not
1103 	 * empty, instead of printing a generic error and mounting read-only,
1104 	 * we will error about incompat features and fail the mount.
1105 	 *
1106 	 * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name
1107 	 * starts with '#'.
1108 	 */
1109 	if (level == 2 &&
1110 	    !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME))
1111 		incompat = true;
1112 
1113 	err = ovl_dir_read(path, &rdd);
1114 	if (err)
1115 		goto out;
1116 
1117 	inode_lock_nested(dir, I_MUTEX_PARENT);
1118 	list_for_each_entry(p, &list, l_node) {
1119 		struct dentry *dentry;
1120 
1121 		if (p->name[0] == '.') {
1122 			if (p->len == 1)
1123 				continue;
1124 			if (p->len == 2 && p->name[1] == '.')
1125 				continue;
1126 		} else if (incompat) {
1127 			pr_err("overlay with incompat feature '%s' cannot be mounted\n",
1128 				p->name);
1129 			err = -EINVAL;
1130 			break;
1131 		}
1132 		dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
1133 		if (IS_ERR(dentry))
1134 			continue;
1135 		if (dentry->d_inode)
1136 			err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
1137 		dput(dentry);
1138 		if (err)
1139 			break;
1140 	}
1141 	inode_unlock(dir);
1142 out:
1143 	ovl_cache_free(&list);
1144 	return err;
1145 }
1146 
1147 int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
1148 			struct vfsmount *mnt, struct dentry *dentry, int level)
1149 {
1150 	int err;
1151 
1152 	if (!d_is_dir(dentry) || level > 1) {
1153 		return ovl_cleanup(ofs, dir, dentry);
1154 	}
1155 
1156 	err = ovl_do_rmdir(ofs, dir, dentry);
1157 	if (err) {
1158 		struct path path = { .mnt = mnt, .dentry = dentry };
1159 
1160 		inode_unlock(dir);
1161 		err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
1162 		inode_lock_nested(dir, I_MUTEX_PARENT);
1163 		if (!err)
1164 			err = ovl_cleanup(ofs, dir, dentry);
1165 	}
1166 
1167 	return err;
1168 }
1169 
1170 int ovl_indexdir_cleanup(struct ovl_fs *ofs)
1171 {
1172 	int err;
1173 	struct dentry *indexdir = ofs->workdir;
1174 	struct dentry *index = NULL;
1175 	struct inode *dir = indexdir->d_inode;
1176 	struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
1177 	LIST_HEAD(list);
1178 	struct ovl_cache_entry *p;
1179 	struct ovl_readdir_data rdd = {
1180 		.ctx.actor = ovl_fill_plain,
1181 		.list = &list,
1182 	};
1183 
1184 	err = ovl_dir_read(&path, &rdd);
1185 	if (err)
1186 		goto out;
1187 
1188 	inode_lock_nested(dir, I_MUTEX_PARENT);
1189 	list_for_each_entry(p, &list, l_node) {
1190 		if (p->name[0] == '.') {
1191 			if (p->len == 1)
1192 				continue;
1193 			if (p->len == 2 && p->name[1] == '.')
1194 				continue;
1195 		}
1196 		index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
1197 		if (IS_ERR(index)) {
1198 			err = PTR_ERR(index);
1199 			index = NULL;
1200 			break;
1201 		}
1202 		/* Cleanup leftover from index create/cleanup attempt */
1203 		if (index->d_name.name[0] == '#') {
1204 			err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
1205 			if (err)
1206 				break;
1207 			goto next;
1208 		}
1209 		err = ovl_verify_index(ofs, index);
1210 		if (!err) {
1211 			goto next;
1212 		} else if (err == -ESTALE) {
1213 			/* Cleanup stale index entries */
1214 			err = ovl_cleanup(ofs, dir, index);
1215 		} else if (err != -ENOENT) {
1216 			/*
1217 			 * Abort mount to avoid corrupting the index if
1218 			 * an incompatible index entry was found or on out
1219 			 * of memory.
1220 			 */
1221 			break;
1222 		} else if (ofs->config.nfs_export) {
1223 			/*
1224 			 * Whiteout orphan index to block future open by
1225 			 * handle after overlay nlink dropped to zero.
1226 			 */
1227 			err = ovl_cleanup_and_whiteout(ofs, dir, index);
1228 		} else {
1229 			/* Cleanup orphan index entries */
1230 			err = ovl_cleanup(ofs, dir, index);
1231 		}
1232 
1233 		if (err)
1234 			break;
1235 
1236 next:
1237 		dput(index);
1238 		index = NULL;
1239 	}
1240 	dput(index);
1241 	inode_unlock(dir);
1242 out:
1243 	ovl_cache_free(&list);
1244 	if (err)
1245 		pr_err("failed index dir cleanup (%i)\n", err);
1246 	return err;
1247 }
1248