// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/overlayfs/readdir.c
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/ratelimit.h>
#include <linux/overflow.h>
#include "overlayfs.h"

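/*
 * A single directory entry in the merged dir cache.  Entries live both on
 * a linked list (iteration order) and in an rb-tree, keyed by name, or by
 * the casefolded name c_name when casefolding is enabled.
 */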
struct ovl_cache_entry {
	unsigned int len;
	unsigned int type;
	u64 real_ino;
	u64 ino;
	struct list_head l_node;
	struct rb_node node;
	struct ovl_cache_entry *next_maybe_whiteout;
	bool is_upper;
	bool is_whiteout;
	bool check_xwhiteout;
	const char *c_name;
	int c_len;
	char name[];
};

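/*
 * Refcounted cache of merged dir entries, shared by all open instances of
 * the same directory and invalidated when the inode version changes.
 */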
struct ovl_dir_cache {
	long refcount;
	u64 version;
	struct list_head entries;
	struct rb_root root;
};

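/* State shared by the dir_context actors while reading one real dir */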
struct ovl_readdir_data {
	struct dir_context ctx;
	struct dentry *dentry;
	bool is_lowest;
	struct rb_root *root;
	struct list_head *list;
	struct list_head middle;
	struct ovl_cache_entry *first_maybe_whiteout;
	struct unicode_map *map;
	int count;
	int err;
	bool is_upper;
	bool d_type_supported;
	bool in_xwhiteouts_dir;
};

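/* Private data of an open overlay directory file */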
struct ovl_dir_file {
	bool is_real;
	bool is_upper;
	struct ovl_dir_cache *cache;
	struct list_head *cursor;
	struct file *realfile;
	struct file *upperfile;
};

static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return rb_entry(n, struct ovl_cache_entry, node);
}

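/*
 * Casefold @str into a newly allocated buffer.  Returns the casefolded
 * length and sets *dst (owned by the caller) on success, 0 if casefolding
 * is not applicable (no casefold map, or "." / ".."), or a negative error.
 * On allocation failure, rdd->err is set as well.
 */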
static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len,
			char **dst)
{
	const struct qstr qstr = { .name = str, .len = len };
	char *cf_name;
	int cf_len;

	if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len))
		return 0;

	cf_name = kmalloc(NAME_MAX, GFP_KERNEL);
	if (!cf_name) {
		rdd->err = -ENOMEM;
		return -ENOMEM;
	}

	cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX);
	if (cf_len > 0)
		*dst = cf_name;
	else
		kfree(cf_name);

	return cf_len;
}

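/*
 * Find the rb-tree insertion point for @name.  Returns true if an entry
 * with this name already exists; otherwise *link and *parent are set up
 * for rb_link_node().
 */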
static bool ovl_cache_entry_find_link(const char *name, int len,
				      struct rb_node ***link,
				      struct rb_node **parent)
{
	bool found = false;
	struct rb_node **newp = *link;

	while (!found && *newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		*parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		cmp = strncmp(name, tmp->c_name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->c_len)
			newp = &tmp->node.rb_left;
		else
			found = true;
	}
	*link = newp;

	return found;
}

static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->c_name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->c_len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}

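/*
 * Decide at cache fill time whether d_ino needs to be recalculated later;
 * if so, the entry's ino is cleared and resolved lazily in ovl_iterate().
 */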
static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
			   struct ovl_cache_entry *p)
{
	/* Don't care if not doing ovl_iterate() */
	if (!rdd->dentry)
		return false;

	/* Always recalc d_ino when remapping lower inode numbers */
	if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb)))
		return true;

	/* Always recalc d_ino for parent */
	if (strcmp(p->name, "..") == 0)
		return true;

	/* If this is lower, then native d_ino will do */
	if (!rdd->is_upper)
		return false;

	/*
	 * Recalc d_ino for '.' and for all entries if dir is impure (contains
	 * copied up entries)
	 */
	if ((p->name[0] == '.' && p->len == 1) ||
	    ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry)))
		return true;

	return false;
}

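/*
 * Allocate a cache entry.  DT_CHR entries are chained up as whiteout
 * candidates, to be verified later by ovl_check_whiteouts().
 */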
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   const char *c_name, int c_len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->real_ino = ino;
	p->ino = ino;
	/* Defer setting d_ino for upper entry to ovl_iterate() */
	if (ovl_calc_d_ino(rdd, p))
		p->ino = 0;
	p->is_upper = rdd->is_upper;
	p->is_whiteout = false;
	/* Defer check for overlay.whiteout to ovl_iterate() */
	p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;

	if (c_name && c_name != name) {
		p->c_name = c_name;
		p->c_len = c_len;
	} else {
		p->c_name = p->name;
		p->c_len = len;
	}

	if (d_type == DT_CHR) {
		p->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = p;
	}
	return p;
}

/* Return 0 for found, 1 for added, <0 for error */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len,
				  const char *c_name, int c_len,
				  u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root->rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent))
		return 0;

	p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type);
	if (p == NULL) {
		rdd->err = -ENOMEM;
		return -ENOMEM;
	}

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, rdd->root);

	return 1;
}

/* Return 0 for found, 1 for added, <0 for error */
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
			   const char *name, int namelen,
			   const char *c_name, int c_len,
			   loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(rdd->root, c_name, c_len);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
		return 0;
	} else {
		p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len,
					ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err ?: 1;
}

static void ovl_cache_entry_free(struct ovl_cache_entry *p)
{
	if (p->c_name != p->name)
		kfree(p->c_name);
	kfree(p);
}

void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *p;
	struct ovl_cache_entry *n;

	list_for_each_entry_safe(p, n, list, l_node)
		ovl_cache_entry_free(p);

	INIT_LIST_HEAD(list);
}

void ovl_dir_cache_free(struct inode *inode)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(inode);

	if (cache) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}

static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		if (ovl_dir_cache(inode) == cache)
			ovl_set_dir_cache(inode, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}

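/*
 * dir_context actor for merging layers: adds each entry to the rb-tree
 * unless a higher layer already provided it; the lowest layer is handled
 * via ovl_fill_lowest() to keep offsets reasonably stable.  Comparison
 * uses the casefolded name when casefolding is enabled.
 */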
static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
			   int namelen, loff_t offset, u64 ino,
			   unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);
	struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb);
	const char *c_name = NULL;
	char *cf_name = NULL;
	int c_len = 0, ret;

	if (ofs->casefold)
		c_len = ovl_casefold(rdd, name, namelen, &cf_name);

	if (rdd->err)
		return false;

	if (c_len <= 0) {
		c_name = name;
		c_len = namelen;
	} else {
		c_name = cf_name;
	}

	rdd->count++;
	if (!rdd->is_lowest)
		ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type);
	else
		ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type);

	/*
	 * If ret == 1, c_name is now owned by the struct ovl_cache_entry and
	 * will be freed by ovl_cache_free().  Otherwise the entry was already
	 * in the rb-tree, so free c_name here.
	 */
	if (ret != 1 && c_name != name)
		kfree(c_name);

	return ret >= 0;
}

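/*
 * Look up all DT_CHR entries collected during the read and mark the real
 * whiteouts.  Returns -EINTR if a lookup was interrupted by a fatal
 * signal.
 */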
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
	struct dentry *dentry, *dir = path->dentry;

	while (rdd->first_maybe_whiteout) {
		struct ovl_cache_entry *p = rdd->first_maybe_whiteout;

		rdd->first_maybe_whiteout = p->next_maybe_whiteout;
		dentry = lookup_one_positive_killable(mnt_idmap(path->mnt),
						      &QSTR_LEN(p->name, p->len),
						      dir);
		if (!IS_ERR(dentry)) {
			p->is_whiteout = ovl_is_whiteout(dentry);
			dput(dentry);
		} else if (PTR_ERR(dentry) == -EINTR) {
			return -EINTR;
		}
	}

	return 0;
}

static inline int ovl_dir_read(const struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->first_maybe_whiteout = NULL;
	rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);

	if (!err && rdd->first_maybe_whiteout && rdd->dentry)
		err = ovl_check_whiteouts(realpath, rdd);

	fput(realfile);

	return err;
}

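/*
 * Called with f_pos at zero: drop a stale cache and note if the dir has
 * been copied up since open.
 */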
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct inode *inode = file_inode(file);
	bool is_real;

	if (cache && ovl_inode_version_get(inode) != cache->version) {
		ovl_cache_put(od, inode);
		od->cache = NULL;
		od->cursor = NULL;
	}
	is_real = ovl_dir_is_real(inode);
	if (od->is_real != is_real) {
		/* is_real can only become false when dir is copied up */
		if (WARN_ON(is_real))
			return;
		od->is_real = false;
	}
}

static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
			       struct rb_root *root)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.ctx.count = INT_MAX,
		.dentry = dentry,
		.list = list,
		.root = root,
		.is_lowest = false,
		.map = NULL,
	};
	int idx, next;
	const struct ovl_layer *layer;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);

	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath, &layer);

		if (ofs->casefold)
			rdd.map = sb_encoding(realpath.dentry->d_sb);

		rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
		rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
					ovl_dentry_has_xwhiteouts(dentry);

		if (next != -1) {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones;
			 * this allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_lowest = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}

static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *p;
	loff_t off = 0;

	list_for_each(p, &od->cache->entries) {
		if (off >= pos)
			break;
		off++;
	}
	/* Cursor is safe since the cache is stable */
	od->cursor = p;
}

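/*
 * Get a reference to the merged dir cache for this inode, rebuilding the
 * cache if the stored version is stale.
 */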
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;
	struct inode *inode = d_inode(dentry);

	cache = ovl_dir_cache(inode);
	if (cache && ovl_inode_version_get(inode) == cache->version) {
		WARN_ON(!cache->refcount);
		cache->refcount++;
		return cache;
	}
	ovl_set_dir_cache(inode, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);
	cache->root = RB_ROOT;

	res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	cache->version = ovl_inode_version_get(inode);
	ovl_set_dir_cache(inode, cache);

	return cache;
}

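/*
 * A worked example of the mapping below: with xinobits == 2 (so
 * xinoshift == 62), lower inode number 42 on the layer with fsid 1 maps
 * to 42 | (1ULL << 63).  Bit 62, the lowest xino bit, stays reserved for
 * the non-persistent inode number range, which is only exposed via st_ino.
 */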
/* Map inode number to lower fs unique range */
static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
			       const char *name, int namelen, bool warn)
{
	unsigned int xinoshift = 64 - xinobits;

	if (unlikely(ino >> xinoshift)) {
		if (warn) {
			pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
					    namelen, name, ino, xinobits);
		}
		return ino;
	}

	/*
	 * The lowest xinobit is reserved for mapping the non-persistent inode
	 * numbers range, but this range is only exposed via st_ino, not here.
	 */
	return ino | ((u64)fsid) << (xinoshift + 1);
}

/*
 * Set d_ino for upper entries if needed. Non-upper entries should always report
 * the uppermost real inode ino and should not call this function.
 *
 * When not all layers are on the same fs, report the real ino also for upper.
 *
 * When all layers are on the same fs, and upper has a reference to
 * copy up origin, call vfs_getattr() on the overlay entry to make
 * sure that d_ino will be consistent with st_ino from stat(2).
 *
 * Also check for the overlay.whiteout xattr by doing a full lookup, which
 * returns a negative dentry in that case.
 */
static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)
{
	struct dentry *dir = path->dentry;
	struct ovl_fs *ofs = OVL_FS(dir->d_sb);
	struct dentry *this = NULL;
	enum ovl_path_type type;
	u64 ino = p->real_ino;
	int xinobits = ovl_xino_bits(ofs);
	int err = 0;

	if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
		goto out;

	if (p->name[0] == '.') {
		if (p->len == 1) {
			this = dget(dir);
			goto get;
		}
		if (p->len == 2 && p->name[1] == '.') {
			/* we shall not be moved */
			this = dget(dir->d_parent);
			goto get;
		}
	}
	/* This checks also for xwhiteouts */
	this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
	if (IS_ERR_OR_NULL(this) || !this->d_inode) {
		/* Mark a stale entry */
		p->is_whiteout = true;
		if (IS_ERR(this)) {
			err = PTR_ERR(this);
			this = NULL;
			goto fail;
		}
		goto out;
	}

get:
	if (!ovl_same_dev(ofs) || !update_ino)
		goto out;

	type = ovl_path_type(this);
	if (OVL_TYPE_ORIGIN(type)) {
		struct kstat stat;
		struct path statpath = *path;

		statpath.dentry = this;
		err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
		if (err)
			goto fail;

		/*
		 * Directory inode is always on overlay st_dev.
		 * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
		 * of xino bits overflow.
		 */
		WARN_ON_ONCE(S_ISDIR(stat.mode) &&
			     dir->d_sb->s_dev != stat.dev);
		ino = stat.ino;
	} else if (xinobits && !OVL_TYPE_UPPER(type)) {
		ino = ovl_remap_lower_ino(ino, xinobits,
					  ovl_layer_lower(this)->fsid,
					  p->name, p->len,
					  ovl_xino_warn(ofs));
	}

out:
	p->ino = ino;
	dput(this);
	return err;

fail:
	pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n",
			    p->name, err);
	goto out;
}

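/*
 * dir_context actor for a single real dir: append every entry to the list
 * without any merging.
 */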
static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
			   int namelen, loff_t offset, u64 ino,
			   unsigned int d_type)
{
	struct ovl_cache_entry *p;
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;
	p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type);
	if (p == NULL) {
		rdd->err = -ENOMEM;
		return false;
	}
	list_add_tail(&p->l_node, rdd->list);

	return true;
}

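/*
 * Read the upper dir and keep only entries whose overlay d_ino differs
 * from the real inode number (i.e. copied up entries), indexed by name
 * for ovl_fill_real() to use during translation.
 */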
static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
			       struct rb_root *root)
{
	int err;
	struct path realpath;
	struct ovl_cache_entry *p, *n;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_plain,
		.ctx.count = INT_MAX,
		.list = list,
		.root = root,
	};

	INIT_LIST_HEAD(list);
	*root = RB_ROOT;
	ovl_path_upper(path->dentry, &realpath);

	err = ovl_dir_read(&realpath, &rdd);
	if (err)
		return err;

	list_for_each_entry_safe(p, n, list, l_node) {
		if (strcmp(p->name, ".") != 0 &&
		    strcmp(p->name, "..") != 0) {
			err = ovl_cache_update(path, p, true);
			if (err)
				return err;
		}
		if (p->ino == p->real_ino) {
			list_del(&p->l_node);
			ovl_cache_entry_free(p);
		} else {
			struct rb_node **newp = &root->rb_node;
			struct rb_node *parent = NULL;

			if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
							      &newp, &parent)))
				return -EIO;

			rb_link_node(&p->node, parent, newp);
			rb_insert_color(&p->node, root);
		}
	}
	return 0;
}

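/*
 * Get the impure dir cache used for d_ino translation.  Unlike the merged
 * dir cache this one is not refcounted.  Returns NULL if the dir turns out
 * to be pure, in which case the "impure" xattr and flag are also cleared.
 */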
static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
	int res;
	struct dentry *dentry = path->dentry;
	struct inode *inode = d_inode(dentry);
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(inode);
	if (cache && ovl_inode_version_get(inode) == cache->version)
		return cache;

	/* Impure cache is not refcounted, free it here */
	ovl_dir_cache_free(inode);
	ovl_set_dir_cache(inode, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	res = ovl_dir_read_impure(path, &cache->entries, &cache->root);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}
	if (list_empty(&cache->entries)) {
		/*
		 * A good opportunity to get rid of an unneeded "impure" flag.
		 * Removing the "impure" xattr is best effort.
		 */
		if (!ovl_want_write(dentry)) {
			ovl_removexattr(ofs, ovl_dentry_upper(dentry),
					OVL_XATTR_IMPURE);
			ovl_drop_write(dentry);
		}
		ovl_clear_flag(OVL_IMPURE, inode);
		kfree(cache);
		return NULL;
	}

	cache->version = ovl_inode_version_get(inode);
	ovl_set_dir_cache(inode, cache);

	return cache;
}

struct ovl_readdir_translate {
	struct dir_context *orig_ctx;
	struct ovl_dir_cache *cache;
	struct dir_context ctx;
	u64 parent_ino;
	int fsid;
	int xinobits;
	bool xinowarn;
};

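/*
 * dir_context actor that wraps the caller's context for real dir
 * iteration, fixing up d_ino on the fly: '..' is replaced by the parent
 * ino when set, names found in the impure cache get their stored ino, and
 * otherwise lower entries are remapped into the xino range.
 */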
static bool ovl_fill_real(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_translate *rdt =
		container_of(ctx, struct ovl_readdir_translate, ctx);
	struct dir_context *orig_ctx = rdt->orig_ctx;
	bool res;

	if (rdt->parent_ino && strcmp(name, "..") == 0) {
		ino = rdt->parent_ino;
	} else if (rdt->cache) {
		struct ovl_cache_entry *p;

		p = ovl_cache_entry_find(&rdt->cache->root, name, namelen);
		if (p)
			ino = p->ino;
	} else if (rdt->xinobits) {
		ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
					  name, namelen, rdt->xinowarn);
	}

	res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
	ctx->count = orig_ctx->count;

	return res;
}

static bool ovl_is_impure_dir(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct inode *dir = file_inode(file);

	/*
	 * Only upper dir can be impure, but if we are in the middle of
	 * iterating a lower real dir, dir could be copied up and marked
	 * impure. We only want the impure cache if we started iterating
	 * a real upper dir to begin with.
	 */
	return od->is_upper && ovl_test_flag(OVL_IMPURE, dir);
}

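/*
 * Iterate the real dir, translating d_ino through ovl_fill_real().  Used
 * instead of plain iterate_dir() when d_ino needs adjustment (see
 * ovl_need_adjust_d_ino()).
 */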
static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
{
	int err;
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dir = file->f_path.dentry;
	struct ovl_fs *ofs = OVL_FS(dir->d_sb);
	const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
	struct ovl_readdir_translate rdt = {
		.ctx.actor = ovl_fill_real,
		.ctx.count = ctx->count,
		.orig_ctx = ctx,
		.xinobits = ovl_xino_bits(ofs),
		.xinowarn = ovl_xino_warn(ofs),
	};

	if (rdt.xinobits && lower_layer)
		rdt.fsid = lower_layer->fsid;

	if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
		struct kstat stat;
		struct path statpath = file->f_path;

		statpath.dentry = dir->d_parent;
		err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
		if (err)
			return err;

		WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
		rdt.parent_ino = stat.ino;
	}

	if (ovl_is_impure_dir(file)) {
		rdt.cache = ovl_cache_get_impure(&file->f_path);
		if (IS_ERR(rdt.cache))
			return PTR_ERR(rdt.cache);
	}

	err = iterate_dir(od->realfile, &rdt.ctx);
	ctx->pos = rdt.ctx.pos;

	return err;
}

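/*
 * Emit entries from the merged dir cache, resolving deferred d_ino values
 * and deferred xwhiteout checks via ovl_cache_update() as the cursor
 * advances.
 */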
static int ovl_iterate_merged(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;
	int err;

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout) {
			if (!p->ino || p->check_xwhiteout) {
				err = ovl_cache_update(&file->f_path, p, !p->ino);
				if (err)
					return err;
			}
		}
		/* ovl_cache_update() sets is_whiteout on stale entry */
		if (!p->is_whiteout) {
			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
				break;
		}
		od->cursor = p->l_node.next;
		ctx->pos++;
	}
	return 0;
}

static bool ovl_need_adjust_d_ino(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);

	/* Always need to adjust d_ino when remapping lower inode numbers */
	if (ovl_xino_bits(ofs))
		return true;

	/* Can't do consistent inode numbering across different fs */
	if (!ovl_same_fs(ofs))
		return false;

	/*
	 * If dir is impure then need to adjust d_ino for copied up entries;
	 * if parent is merge then need to adjust d_ino for '..'.
	 */
	if (ovl_is_impure_dir(file) ||
	    OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))
		return true;

	/* Pure dir on same fs: no need to adjust d_ino */
	return false;
}

static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;

	if (!ctx->pos)
		ovl_dir_reset(file);

	with_ovl_creds(file_dentry(file)->d_sb) {
		if (!od->is_real)
			return ovl_iterate_merged(file, ctx);

		if (ovl_need_adjust_d_ino(file))
			return ovl_iterate_real(file, ctx);

		return iterate_dir(od->realfile, ctx);
	}
}

static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	inode_lock(file_inode(file));
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	inode_unlock(file_inode(file));

	return res;
}

static struct file *ovl_dir_open_realfile(const struct file *file,
					  const struct path *realpath)
{
	with_ovl_creds(file_inode(file)->i_sb)
		return ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
}

/*
 * Like ovl_real_fdget(), returns upperfile if dir was copied up since open.
 * Unlike ovl_real_fdget(), this caches upperfile in file->private_data.
 *
 * TODO: use same abstract type for file->private_data of dir and file so
 * upperfile could also be cached for files as well.
 */
struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *old, *realfile = od->realfile;

	if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
		return want_upper ? NULL : realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper) {
		realfile = READ_ONCE(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_dir_open_realfile(file, &upperpath);
			if (IS_ERR(realfile))
				return realfile;

			old = cmpxchg_release(&od->upperfile, NULL, realfile);
			if (old) {
				fput(realfile);
				realfile = old;
			}
		}
	}

	return realfile;
}

static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct file *realfile;
	int err;

	err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
	if (err <= 0)
		return err;

	realfile = ovl_dir_real_file(file, true);
	err = PTR_ERR_OR_ZERO(realfile);

	/* Nothing to sync for lower */
	if (!realfile || err)
		return err;

	return vfs_fsync_range(realfile, start, end, datasync);
}

static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		inode_lock(inode);
		ovl_cache_put(od, inode);
		inode_unlock(inode);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}

static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_dir_open_realfile(file, &realpath);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = ovl_dir_is_real(inode);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}

WRAP_DIR_ITER(ovl_iterate) // FIXME!
const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate_shared	= shared_ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};

int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct ovl_cache_entry *p, *n;
	struct rb_root root = RB_ROOT;

	with_ovl_creds(dentry->d_sb)
		err = ovl_dir_read_merged(dentry, list, &root);
	if (err)
		return err;

	err = 0;

	list_for_each_entry_safe(p, n, list, l_node) {
		/*
		 * Select whiteouts in upperdir; they should be cleared
		 * when deleting this directory.
		 */
		if (p->is_whiteout) {
			if (p->is_upper)
				continue;
			goto del_entry;
		}

		if (p->name[0] == '.') {
			if (p->len == 1)
				goto del_entry;
			if (p->len == 2 && p->name[1] == '.')
				goto del_entry;
		}
		err = -ENOTEMPTY;
		break;

del_entry:
		list_del(&p->l_node);
		ovl_cache_entry_free(p);
	}

	return err;
}

void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
			   struct list_head *list)
{
	struct ovl_cache_entry *p;

	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (WARN_ON(!p->is_whiteout || !p->is_upper))
			continue;

		dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		if (dentry->d_inode)
			ovl_cleanup(ofs, upper, dentry);
		dput(dentry);
	}
}

static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
			     int namelen, loff_t offset, u64 ino,
			     unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	/* Even if d_type is not supported, DT_DIR is returned for . and .. */
	if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
		return true;

	if (d_type != DT_UNKNOWN)
		rdd->d_type_supported = true;

	return true;
}

/*
 * Returns 1 if d_type is supported, 0 if not supported or unknown, and a
 * negative error value if an error is encountered.
 */
int ovl_check_d_type_supported(const struct path *realpath)
{
	int err;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_check_d_type,
		.ctx.count = INT_MAX,
		.d_type_supported = false,
	};

	err = ovl_dir_read(realpath, &rdd);
	if (err)
		return err;

	return rdd.d_type_supported;
}

#define OVL_INCOMPATDIR_NAME "incompat"

static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
				       int level)
{
	int err;
	LIST_HEAD(list);
	struct ovl_cache_entry *p;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_plain,
		.ctx.count = INT_MAX,
		.list = &list,
	};
	bool incompat = false;

	/*
	 * The "work/incompat" directory is treated specially - if it is not
	 * empty, instead of printing a generic error and mounting read-only,
	 * we will error about incompat features and fail the mount.
	 *
	 * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name
	 * starts with '#'.
	 */
	if (level == 2 &&
	    !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME))
		incompat = true;

	err = ovl_dir_read(path, &rdd);
	if (err)
		goto out;

	list_for_each_entry(p, &list, l_node) {
		struct dentry *dentry;

		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		} else if (incompat) {
			pr_err("overlay with incompat feature '%s' cannot be mounted\n",
				p->name);
			err = -EINVAL;
			break;
		}
		dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len);
		if (IS_ERR(dentry))
			continue;
		if (dentry->d_inode)
			err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt,
						  dentry, level);
		dput(dentry);
		if (err)
			break;
	}
out:
	ovl_cache_free(&list);
	return err;
}

int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
			struct vfsmount *mnt, struct dentry *dentry, int level)
{
	int err;

	if (!d_is_dir(dentry) || level > 1)
		return ovl_cleanup(ofs, parent, dentry);

	dentry = start_removing_dentry(parent, dentry);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	err = ovl_do_rmdir(ofs, parent->d_inode, dentry);
	end_removing(dentry);
	if (err) {
		struct path path = { .mnt = mnt, .dentry = dentry };

		err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
		if (!err)
			err = ovl_cleanup(ofs, parent, dentry);
	}

	return err;
}

int ovl_indexdir_cleanup(struct ovl_fs *ofs)
{
	int err;
	struct dentry *indexdir = ofs->workdir;
	struct dentry *index = NULL;
	struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
	LIST_HEAD(list);
	struct ovl_cache_entry *p;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_plain,
		.ctx.count = INT_MAX,
		.list = &list,
	};

	err = ovl_dir_read(&path, &rdd);
	if (err)
		goto out;

	list_for_each_entry(p, &list, l_node) {
		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		}
		index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len);
		if (IS_ERR(index)) {
			err = PTR_ERR(index);
			index = NULL;
			break;
		}
		/* Cleanup leftover from index create/cleanup attempt */
		if (index->d_name.name[0] == '#') {
			err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1);
			if (err)
				break;
			goto next;
		}
		err = ovl_verify_index(ofs, index);
		if (!err) {
			goto next;
		} else if (err == -ESTALE) {
			/* Cleanup stale index entries */
			err = ovl_cleanup(ofs, indexdir, index);
		} else if (err != -ENOENT) {
			/*
			 * Abort mount to avoid corrupting the index if
			 * an incompatible index entry was found or on out
			 * of memory.
			 */
			break;
		} else if (ofs->config.nfs_export) {
			/*
			 * Whiteout orphan index to block future open by
			 * handle after overlay nlink dropped to zero.
			 */
			err = ovl_cleanup_and_whiteout(ofs, indexdir, index);
		} else {
			/* Cleanup orphan index entries */
			err = ovl_cleanup(ofs, indexdir, index);
		}

		if (err)
			break;

next:
		dput(index);
		index = NULL;
	}
	dput(index);
out:
	ovl_cache_free(&list);
	if (err)
		pr_err("failed index dir cleanup (%i)\n", err);
	return err;
}
1337