xref: /linux/fs/ntfs/dir.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NTFS kernel directory operations.
4  *
5  * Copyright (c) 2001-2007 Anton Altaparmakov
6  * Copyright (c) 2002 Richard Russon
7  * Copyright (c) 2025 LG Electronics Co., Ltd.
8  */
9 
10 #include <linux/blkdev.h>
11 
12 #include "dir.h"
13 #include "mft.h"
14 #include "ntfs.h"
15 #include "index.h"
16 #include "reparse.h"
17 
18 #include <linux/filelock.h>
19 
20 /*
21  * The little endian Unicode string $I30 as a global constant.
22  */
23 __le16 I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
24 		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
25 
26 /*
27  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
28  * @dir_ni:	ntfs inode of the directory in which to search for the name
29  * @uname:	Unicode name for which to search in the directory
30  * @uname_len:	length of the name @uname in Unicode characters
31  * @res:	return the found file name if necessary (see below)
32  *
33  * Look for an inode with name @uname in the directory with inode @dir_ni.
34  * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
35  * the Unicode name. If the name is found in the directory, the corresponding
36  * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
37  * is a 64-bit number containing the sequence number.
38  *
39  * On error, a negative value is returned corresponding to the error code. In
40  * particular if the inode is not found -ENOENT is returned. Note that you
41  * can't just check the return value for being negative, you have to check the
42  * inode number for being negative which you can extract using MREC(return
43  * value).
44  *
45  * Note, @uname_len does not include the (optional) terminating NULL character.
46  *
47  * Note, we look for a case sensitive match first but we also look for a case
48  * insensitive match at the same time. If we find a case insensitive match, we
49  * save that for the case that we don't find an exact match, where we return
50  * the case insensitive match and setup @res (which we allocate!) with the mft
51  * reference, the file name type, length and with a copy of the little endian
52  * Unicode file name itself. If we match a file name which is in the DOS name
53  * space, we only return the mft reference and file name type in @res.
54  * ntfs_lookup() then uses this to find the long file name in the inode itself.
55  * This is to avoid polluting the dcache with short file names. We want them to
56  * work but we don't care for how quickly one can access them. This also fixes
57  * the dcache aliasing issues.
58  *
59  * Locking:  - Caller must hold i_mutex on the directory.
60  *	     - Each page cache page in the index allocation mapping must be
61  *	       locked whilst being accessed otherwise we may find a corrupt
62  *	       page due to it being under ->writepage at the moment which
63  *	       applies the mst protection fixups before writing out and then
64  *	       removes them again after the write is complete after which it
65  *	       unlocks the page.
66  */
67 u64 ntfs_lookup_inode_by_name(struct ntfs_inode *dir_ni, const __le16 *uname,
68 		const int uname_len, struct ntfs_name **res)
69 {
70 	struct ntfs_volume *vol = dir_ni->vol;
71 	struct super_block *sb = vol->sb;
72 	struct inode *ia_vi = NULL;
73 	struct mft_record *m;
74 	struct index_root *ir;
75 	struct index_entry *ie;
76 	struct index_block *ia;
77 	u8 *index_end;
78 	u64 mref;
79 	struct ntfs_attr_search_ctx *ctx;
80 	int err, rc;
81 	s64 vcn, old_vcn;
82 	struct address_space *ia_mapping;
83 	struct folio *folio;
84 	u8 *kaddr = NULL;
85 	struct ntfs_name *name = NULL;
86 
87 	/* Get hold of the mft record for the directory. */
88 	m = map_mft_record(dir_ni);
89 	if (IS_ERR(m)) {
90 		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
91 				-PTR_ERR(m));
92 		return ERR_MREF(PTR_ERR(m));
93 	}
94 	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
95 	if (unlikely(!ctx)) {
96 		err = -ENOMEM;
97 		goto err_out;
98 	}
99 	/* Find the index root attribute in the mft record. */
100 	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
101 			0, ctx);
102 	if (unlikely(err)) {
103 		if (err == -ENOENT) {
104 			ntfs_error(sb,
105 				"Index root attribute missing in directory inode 0x%llx.",
106 				dir_ni->mft_no);
107 			err = -EIO;
108 		}
109 		goto err_out;
110 	}
111 	/* Get to the index root value (it's been verified in read_inode). */
112 	ir = (struct index_root *)((u8 *)ctx->attr +
113 			le16_to_cpu(ctx->attr->data.resident.value_offset));
114 	index_end = (u8 *)&ir->index + le32_to_cpu(ir->index.index_length);
115 	/* The first index entry. */
116 	ie = (struct index_entry *)((u8 *)&ir->index +
117 			le32_to_cpu(ir->index.entries_offset));
118 	/*
119 	 * Loop until we exceed valid memory (corruption case) or until we
120 	 * reach the last entry.
121 	 */
122 	for (;; ie = (struct index_entry *)((u8 *)ie + le16_to_cpu(ie->length))) {
123 		/* Bounds checks. */
124 		if ((u8 *)ie < (u8 *)ctx->mrec ||
125 		    (u8 *)ie + sizeof(struct index_entry_header) > index_end ||
126 		    (u8 *)ie + sizeof(struct index_entry_header) + le16_to_cpu(ie->key_length) >
127 				index_end || (u8 *)ie + le16_to_cpu(ie->length) > index_end)
128 			goto dir_err_out;
129 		/*
130 		 * The last entry cannot contain a name. It can however contain
131 		 * a pointer to a child node in the B+tree so we just break out.
132 		 */
133 		if (ie->flags & INDEX_ENTRY_END)
134 			break;
135 		/* Key length should not be zero if it is not last entry. */
136 		if (!ie->key_length)
137 			goto dir_err_out;
138 		/*
139 		 * We perform a case sensitive comparison and if that matches
140 		 * we are done and return the mft reference of the inode (i.e.
141 		 * the inode number together with the sequence number for
142 		 * consistency checking). We convert it to cpu format before
143 		 * returning.
144 		 */
145 		if (ntfs_are_names_equal(uname, uname_len,
146 				(__le16 *)&ie->key.file_name.file_name,
147 				ie->key.file_name.file_name_length,
148 				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
149 found_it:
150 			/*
151 			 * We have a perfect match, so we don't need to care
152 			 * about having matched imperfectly before, so we can
153 			 * free name and set *res to NULL.
154 			 * However, if the perfect match is a short file name,
155 			 * we need to signal this through *res, so that
156 			 * ntfs_lookup() can fix dcache aliasing issues.
157 			 * As an optimization we just reuse an existing
158 			 * allocation of *res.
159 			 */
160 			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
161 				if (!name) {
162 					name = kmalloc(sizeof(struct ntfs_name),
163 							GFP_NOFS);
164 					if (!name) {
165 						err = -ENOMEM;
166 						goto err_out;
167 					}
168 				}
169 				name->mref = le64_to_cpu(
170 						ie->data.dir.indexed_file);
171 				name->type = FILE_NAME_DOS;
172 				name->len = 0;
173 				*res = name;
174 			} else {
175 				kfree(name);
176 				*res = NULL;
177 			}
178 			mref = le64_to_cpu(ie->data.dir.indexed_file);
179 			ntfs_attr_put_search_ctx(ctx);
180 			unmap_mft_record(dir_ni);
181 			return mref;
182 		}
183 		/*
184 		 * For a case insensitive mount, we also perform a case
185 		 * insensitive comparison (provided the file name is not in the
186 		 * POSIX namespace). If the comparison matches, and the name is
187 		 * in the WIN32 namespace, we cache the filename in *res so
188 		 * that the caller, ntfs_lookup(), can work on it. If the
189 		 * comparison matches, and the name is in the DOS namespace, we
190 		 * only cache the mft reference and the file name type (we set
191 		 * the name length to zero for simplicity).
192 		 */
193 		if ((!NVolCaseSensitive(vol) ||
194 		     ie->key.file_name.file_name_type == FILE_NAME_DOS) &&
195 		    ntfs_are_names_equal(uname, uname_len,
196 					 (__le16 *)&ie->key.file_name.file_name,
197 					 ie->key.file_name.file_name_length,
198 					 IGNORE_CASE, vol->upcase,
199 					 vol->upcase_len)) {
200 			int name_size = sizeof(struct ntfs_name);
201 			u8 type = ie->key.file_name.file_name_type;
202 			u8 len = ie->key.file_name.file_name_length;
203 
204 			/* Only one case insensitive matching name allowed. */
205 			if (name) {
206 				ntfs_error(sb,
207 					"Found already allocated name in phase 1. Please run chkdsk");
208 				goto dir_err_out;
209 			}
210 
211 			if (type != FILE_NAME_DOS)
212 				name_size += len * sizeof(__le16);
213 			name = kmalloc(name_size, GFP_NOFS);
214 			if (!name) {
215 				err = -ENOMEM;
216 				goto err_out;
217 			}
218 			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
219 			name->type = type;
220 			if (type != FILE_NAME_DOS) {
221 				name->len = len;
222 				memcpy(name->name, ie->key.file_name.file_name,
223 						len * sizeof(__le16));
224 			} else
225 				name->len = 0;
226 			*res = name;
227 		}
228 		/*
229 		 * Not a perfect match, need to do full blown collation so we
230 		 * know which way in the B+tree we have to go.
231 		 */
232 		rc = ntfs_collate_names(uname, uname_len,
233 				(__le16 *)&ie->key.file_name.file_name,
234 				ie->key.file_name.file_name_length, 1,
235 				IGNORE_CASE, vol->upcase, vol->upcase_len);
236 		/*
237 		 * If uname collates before the name of the current entry, there
238 		 * is definitely no such name in this index but we might need to
239 		 * descend into the B+tree so we just break out of the loop.
240 		 */
241 		if (rc == -1)
242 			break;
243 		/* The names are not equal, continue the search. */
244 		if (rc)
245 			continue;
246 		/*
247 		 * Names match with case insensitive comparison, now try the
248 		 * case sensitive comparison, which is required for proper
249 		 * collation.
250 		 */
251 		rc = ntfs_collate_names(uname, uname_len,
252 				(__le16 *)&ie->key.file_name.file_name,
253 				ie->key.file_name.file_name_length, 1,
254 				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
255 		if (rc == -1)
256 			break;
257 		if (rc)
258 			continue;
259 		/*
260 		 * Perfect match, this will never happen as the
261 		 * ntfs_are_names_equal() call will have gotten a match but we
262 		 * still treat it correctly.
263 		 */
264 		goto found_it;
265 	}
266 	/*
267 	 * We have finished with this index without success. Check for the
268 	 * presence of a child node and if not present return -ENOENT, unless
269 	 * we have got a matching name cached in name in which case return the
270 	 * mft reference associated with it.
271 	 */
272 	if (!(ie->flags & INDEX_ENTRY_NODE)) {
273 		if (name) {
274 			ntfs_attr_put_search_ctx(ctx);
275 			unmap_mft_record(dir_ni);
276 			return name->mref;
277 		}
278 		ntfs_debug("Entry not found.");
279 		err = -ENOENT;
280 		goto err_out;
281 	} /* Child node present, descend into it. */
282 
283 	/* Get the starting vcn of the index_block holding the child node. */
284 	vcn = le64_to_cpup((__le64 *)((u8 *)ie + le16_to_cpu(ie->length) - 8));
285 
286 	/*
287 	 * We are done with the index root and the mft record. Release them,
288 	 * otherwise we deadlock with read_mapping_folio().
289 	 */
290 	ntfs_attr_put_search_ctx(ctx);
291 	unmap_mft_record(dir_ni);
292 	m = NULL;
293 	ctx = NULL;
294 
295 	ia_vi = ntfs_index_iget(VFS_I(dir_ni), I30, 4);
296 	if (IS_ERR(ia_vi)) {
297 		err = PTR_ERR(ia_vi);
298 		goto err_out;
299 	}
300 
301 	ia_mapping = ia_vi->i_mapping;
302 descend_into_child_node:
303 	/*
304 	 * Convert vcn to index into the index allocation attribute in units
305 	 * of PAGE_SIZE and map the page cache page, reading it from
306 	 * disk if necessary.
307 	 */
308 	folio = read_mapping_folio(ia_mapping, vcn <<
309 			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT, NULL);
310 	if (IS_ERR(folio)) {
311 		ntfs_error(sb, "Failed to map directory index page, error %ld.",
312 				-PTR_ERR(folio));
313 		err = PTR_ERR(folio);
314 		goto err_out;
315 	}
316 
317 	folio_lock(folio);
318 	kaddr = kmalloc(PAGE_SIZE, GFP_NOFS);
319 	if (!kaddr) {
320 		err = -ENOMEM;
321 		folio_unlock(folio);
322 		folio_put(folio);
323 		goto unm_err_out;
324 	}
325 
326 	memcpy_from_folio(kaddr, folio, 0, PAGE_SIZE);
327 	post_read_mst_fixup((struct ntfs_record *)kaddr, PAGE_SIZE);
328 	folio_unlock(folio);
329 	folio_put(folio);
330 fast_descend_into_child_node:
331 	/* Get to the index allocation block. */
332 	ia = (struct index_block *)(kaddr + ((vcn <<
333 			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
334 	/* Bounds checks. */
335 	if ((u8 *)ia < kaddr || (u8 *)ia > kaddr + PAGE_SIZE) {
336 		ntfs_error(sb,
337 			"Out of bounds check failed. Corrupt directory inode 0x%llx or driver bug.",
338 			dir_ni->mft_no);
339 		goto unm_err_out;
340 	}
341 	index_end = (u8 *)ia + dir_ni->itype.index.block_size;
342 	if (index_end > kaddr + PAGE_SIZE) {
343 		ntfs_error(sb,
344 			   "Index buffer (VCN 0x%llx) of directory inode 0x%llx crosses page boundary. Impossible! Cannot access! This is probably a bug in the driver.",
345 			   vcn, dir_ni->mft_no);
346 		goto unm_err_out;
347 	}
348 	err = ntfs_index_block_inconsistent(vol, ia,
349 					    dir_ni->itype.index.block_size,
350 					    vcn, COLLATION_FILE_NAME,
351 					    dir_ni->mft_no);
352 	if (err)
353 		goto unm_err_out;
354 	index_end = (u8 *)&ia->index + le32_to_cpu(ia->index.index_length);
355 	/* The first index entry. */
356 	ie = (struct index_entry *)((u8 *)&ia->index +
357 			le32_to_cpu(ia->index.entries_offset));
358 	/*
359 	 * Iterate similar to above big loop but applied to index buffer, thus
360 	 * loop until we exceed valid memory (corruption case) or until we
361 	 * reach the last entry.
362 	 */
363 	for (;; ie = (struct index_entry *)((u8 *)ie + le16_to_cpu(ie->length))) {
364 		/*
365 		 * The last entry cannot contain a name. It can however contain
366 		 * a pointer to a child node in the B+tree so we just break out.
367 		 */
368 		if (ie->flags & INDEX_ENTRY_END)
369 			break;
370 		/* Key length should not be zero if it is not last entry. */
371 		if (!ie->key_length)
372 			goto unm_err_out;
373 		/*
374 		 * We perform a case sensitive comparison and if that matches
375 		 * we are done and return the mft reference of the inode (i.e.
376 		 * the inode number together with the sequence number for
377 		 * consistency checking). We convert it to cpu format before
378 		 * returning.
379 		 */
380 		if (ntfs_are_names_equal(uname, uname_len,
381 				(__le16 *)&ie->key.file_name.file_name,
382 				ie->key.file_name.file_name_length,
383 				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
384 found_it2:
385 			/*
386 			 * We have a perfect match, so we don't need to care
387 			 * about having matched imperfectly before, so we can
388 			 * free name and set *res to NULL.
389 			 * However, if the perfect match is a short file name,
390 			 * we need to signal this through *res, so that
391 			 * ntfs_lookup() can fix dcache aliasing issues.
392 			 * As an optimization we just reuse an existing
393 			 * allocation of *res.
394 			 */
395 			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
396 				if (!name) {
397 					name = kmalloc(sizeof(struct ntfs_name),
398 							GFP_NOFS);
399 					if (!name) {
400 						err = -ENOMEM;
401 						goto unm_err_out;
402 					}
403 				}
404 				name->mref = le64_to_cpu(
405 						ie->data.dir.indexed_file);
406 				name->type = FILE_NAME_DOS;
407 				name->len = 0;
408 				*res = name;
409 			} else {
410 				kfree(name);
411 				*res = NULL;
412 			}
413 			mref = le64_to_cpu(ie->data.dir.indexed_file);
414 			kfree(kaddr);
415 			iput(ia_vi);
416 			return mref;
417 		}
418 		/*
419 		 * For a case insensitive mount, we also perform a case
420 		 * insensitive comparison (provided the file name is not in the
421 		 * POSIX namespace). If the comparison matches, and the name is
422 		 * in the WIN32 namespace, we cache the filename in *res so
423 		 * that the caller, ntfs_lookup(), can work on it. If the
424 		 * comparison matches, and the name is in the DOS namespace, we
425 		 * only cache the mft reference and the file name type (we set
426 		 * the name length to zero for simplicity).
427 		 */
428 		if ((!NVolCaseSensitive(vol) ||
429 		     ie->key.file_name.file_name_type == FILE_NAME_DOS) &&
430 		    ntfs_are_names_equal(uname, uname_len,
431 					 (__le16 *)&ie->key.file_name.file_name,
432 					 ie->key.file_name.file_name_length,
433 					 IGNORE_CASE, vol->upcase,
434 					 vol->upcase_len)) {
435 			int name_size = sizeof(struct ntfs_name);
436 			u8 type = ie->key.file_name.file_name_type;
437 			u8 len = ie->key.file_name.file_name_length;
438 
439 			/* Only one case insensitive matching name allowed. */
440 			if (name) {
441 				ntfs_error(sb,
442 					"Found already allocated name in phase 2. Please run chkdsk");
443 				kfree(kaddr);
444 				goto dir_err_out;
445 			}
446 
447 			if (type != FILE_NAME_DOS)
448 				name_size += len * sizeof(__le16);
449 			name = kmalloc(name_size, GFP_NOFS);
450 			if (!name) {
451 				err = -ENOMEM;
452 				goto unm_err_out;
453 			}
454 			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
455 			name->type = type;
456 			if (type != FILE_NAME_DOS) {
457 				name->len = len;
458 				memcpy(name->name, ie->key.file_name.file_name,
459 						len * sizeof(__le16));
460 			} else
461 				name->len = 0;
462 			*res = name;
463 		}
464 		/*
465 		 * Not a perfect match, need to do full blown collation so we
466 		 * know which way in the B+tree we have to go.
467 		 */
468 		rc = ntfs_collate_names(uname, uname_len,
469 				(__le16 *)&ie->key.file_name.file_name,
470 				ie->key.file_name.file_name_length, 1,
471 				IGNORE_CASE, vol->upcase, vol->upcase_len);
472 		/*
473 		 * If uname collates before the name of the current entry, there
474 		 * is definitely no such name in this index but we might need to
475 		 * descend into the B+tree so we just break out of the loop.
476 		 */
477 		if (rc == -1)
478 			break;
479 		/* The names are not equal, continue the search. */
480 		if (rc)
481 			continue;
482 		/*
483 		 * Names match with case insensitive comparison, now try the
484 		 * case sensitive comparison, which is required for proper
485 		 * collation.
486 		 */
487 		rc = ntfs_collate_names(uname, uname_len,
488 				(__le16 *)&ie->key.file_name.file_name,
489 				ie->key.file_name.file_name_length, 1,
490 				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
491 		if (rc == -1)
492 			break;
493 		if (rc)
494 			continue;
495 		/*
496 		 * Perfect match, this will never happen as the
497 		 * ntfs_are_names_equal() call will have gotten a match but we
498 		 * still treat it correctly.
499 		 */
500 		goto found_it2;
501 	}
502 	/*
503 	 * We have finished with this index buffer without success. Check for
504 	 * the presence of a child node.
505 	 */
506 	if (ie->flags & INDEX_ENTRY_NODE) {
507 		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
508 			ntfs_error(sb,
509 				"Index entry with child node found in a leaf node in directory inode 0x%llx.",
510 				dir_ni->mft_no);
511 			goto unm_err_out;
512 		}
513 		/* Child node present, descend into it. */
514 		old_vcn = vcn;
515 		vcn = le64_to_cpup((__le64 *)((u8 *)ie +
516 				le16_to_cpu(ie->length) - 8));
517 		if (vcn >= 0) {
518 			/*
519 			 * If vcn is in the same page cache page as old_vcn we
520 			 * recycle the mapped page.
521 			 */
522 			if (ntfs_cluster_to_pidx(vol, old_vcn) ==
523 			    ntfs_cluster_to_pidx(vol, vcn))
524 				goto fast_descend_into_child_node;
525 			kfree(kaddr);
526 			kaddr = NULL;
527 			goto descend_into_child_node;
528 		}
529 		ntfs_error(sb, "Negative child node vcn in directory inode 0x%llx.",
530 				dir_ni->mft_no);
531 		goto unm_err_out;
532 	}
533 	/*
534 	 * No child node present, return -ENOENT, unless we have got a matching
535 	 * name cached in name in which case return the mft reference
536 	 * associated with it.
537 	 */
538 	if (name) {
539 		kfree(kaddr);
540 		iput(ia_vi);
541 		return name->mref;
542 	}
543 	ntfs_debug("Entry not found.");
544 	err = -ENOENT;
545 unm_err_out:
546 	kfree(kaddr);
547 err_out:
548 	if (!err)
549 		err = -EIO;
550 	if (ctx)
551 		ntfs_attr_put_search_ctx(ctx);
552 	if (m)
553 		unmap_mft_record(dir_ni);
554 	kfree(name);
555 	*res = NULL;
556 	if (!IS_ERR_OR_NULL(ia_vi))
557 		iput(ia_vi);
558 	return ERR_MREF(err);
559 dir_err_out:
560 	ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
561 	goto err_out;
562 }
563 
564 /*
565  * ntfs_filldir - ntfs specific filldir method
566  * @vol:	current ntfs volume
567  * @ndir:	ntfs inode of current directory
568  * @ia_page:	page in which the index allocation buffer @ie is in resides
569  * @ie:		current index entry
570  * @name:	buffer to use for the converted name
571  * @actor:	what to feed the entries to
572  *
573  * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
574  * callback.
575  *
576  * If @ia_page is not NULL it is the locked page containing the index
577  * allocation block containing the index entry @ie.
578  *
579  * Note, we drop (and then reacquire) the page lock on @ia_page across the
580  * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
581  * since ntfs_lookup() will lock the same page.  As an optimization, we do not
582  * retake the lock if we are returning a non-zero value as ntfs_readdir()
583  * would need to drop the lock immediately anyway.
584  */
585 static inline int ntfs_filldir(struct ntfs_volume *vol,
586 		struct ntfs_inode *ndir, struct page *ia_page, struct index_entry *ie,
587 		u8 *name, struct dir_context *actor)
588 {
589 	unsigned long mref;
590 	int name_len;
591 	unsigned int dt_type;
592 	u8 name_type;
593 
594 	name_type = ie->key.file_name.file_name_type;
595 	if (name_type == FILE_NAME_DOS) {
596 		ntfs_debug("Skipping DOS name space entry.");
597 		return 0;
598 	}
599 	if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
600 		ntfs_debug("Skipping root directory self reference entry.");
601 		return 0;
602 	}
603 	if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
604 			!NVolShowSystemFiles(vol)) {
605 		ntfs_debug("Skipping system file.");
606 		return 0;
607 	}
608 	if (!NVolShowHiddenFiles(vol) &&
609 	    (ie->key.file_name.file_attributes & FILE_ATTR_HIDDEN)) {
610 		ntfs_debug("Skipping hidden file.");
611 		return 0;
612 	}
613 
614 	name_len = ntfs_ucstonls(vol, (__le16 *)&ie->key.file_name.file_name,
615 			ie->key.file_name.file_name_length, &name,
616 			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
617 	if (name_len <= 0) {
618 		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
619 				(long long)MREF_LE(ie->data.dir.indexed_file));
620 		return 0;
621 	}
622 
623 	mref = MREF_LE(ie->data.dir.indexed_file);
624 	if (ie->key.file_name.file_attributes &
625 			FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
626 		dt_type = DT_DIR;
627 	else if (ie->key.file_name.file_attributes & FILE_ATTR_REPARSE_POINT)
628 		dt_type = ntfs_reparse_tag_dt_types(vol, mref);
629 	else
630 		dt_type = DT_REG;
631 
632 	/*
633 	 * Drop the page lock otherwise we deadlock with NFS when it calls
634 	 * ->lookup since ntfs_lookup() will lock the same page.
635 	 */
636 	if (ia_page)
637 		unlock_page(ia_page);
638 	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode 0x%lx, DT_%s.",
639 		name, name_len, actor->pos, mref, dt_type == DT_DIR ? "DIR" : "REG");
640 	if (!dir_emit(actor, name, name_len, mref, dt_type))
641 		return 1;
642 	/* Relock the page but not if we are aborting ->readdir. */
643 	if (ia_page)
644 		lock_page(ia_page);
645 	return 0;
646 }
647 
648 struct ntfs_file_private {
649 	void *key;
650 	__le16 key_length;
651 	bool end_in_iterate;
652 	loff_t curr_pos;
653 };
654 
655 struct ntfs_index_ra {
656 	unsigned long start_index;
657 	unsigned int count;
658 	struct rb_node rb_node;
659 };
660 
661 static void ntfs_insert_rb(struct ntfs_index_ra *nir, struct rb_root *root)
662 {
663 	struct rb_node **new = &root->rb_node, *parent = NULL;
664 	struct ntfs_index_ra *cnir;
665 
666 	while (*new) {
667 		parent = *new;
668 		cnir = rb_entry(parent, struct ntfs_index_ra, rb_node);
669 		if (nir->start_index < cnir->start_index)
670 			new = &parent->rb_left;
671 		else if (nir->start_index >= cnir->start_index + cnir->count)
672 			new = &parent->rb_right;
673 		else {
674 			pr_err("nir start index : %ld, count : %d, cnir start_index : %ld, count : %d\n",
675 				nir->start_index, nir->count, cnir->start_index, cnir->count);
676 			return;
677 		}
678 	}
679 
680 	rb_link_node(&nir->rb_node, parent, new);
681 	rb_insert_color(&nir->rb_node, root);
682 }
683 
684 static int ntfs_ia_blocks_readahead(struct ntfs_inode *ia_ni, loff_t pos)
685 {
686 	unsigned long dir_start_index, dir_end_index;
687 	struct inode *ia_vi = VFS_I(ia_ni);
688 	struct file_ra_state *dir_ra;
689 
690 	dir_end_index = (i_size_read(ia_vi) + PAGE_SIZE - 1) >> PAGE_SHIFT;
691 	dir_start_index = (pos + PAGE_SIZE - 1) >> PAGE_SHIFT;
692 
693 	if (dir_start_index >= dir_end_index)
694 		return 0;
695 
696 	dir_ra = kzalloc(sizeof(*dir_ra), GFP_NOFS);
697 	if (!dir_ra)
698 		return -ENOMEM;
699 
700 	file_ra_state_init(dir_ra, ia_vi->i_mapping);
701 	dir_end_index = (i_size_read(ia_vi) + PAGE_SIZE - 1) >> PAGE_SHIFT;
702 	dir_start_index = (pos + PAGE_SIZE - 1) >> PAGE_SHIFT;
703 	dir_ra->ra_pages = dir_end_index - dir_start_index;
704 	page_cache_sync_readahead(ia_vi->i_mapping, dir_ra, NULL,
705 			dir_start_index, dir_end_index - dir_start_index);
706 	kfree(dir_ra);
707 
708 	return 0;
709 }
710 
711 static int ntfs_readdir(struct file *file, struct dir_context *actor)
712 {
713 	struct inode *vdir = file_inode(file);
714 	struct super_block *sb = vdir->i_sb;
715 	struct ntfs_inode *ndir = NTFS_I(vdir);
716 	struct ntfs_volume *vol = NTFS_SB(sb);
717 	struct ntfs_attr_search_ctx *ctx = NULL;
718 	struct ntfs_index_context *ictx = NULL;
719 	u8 *name;
720 	struct index_root *ir;
721 	struct index_entry *next = NULL;
722 	struct ntfs_file_private *private = NULL;
723 	int err = 0;
724 	loff_t ie_pos = 2; /* initialize it with dot and dotdot size */
725 	struct ntfs_index_ra *nir = NULL;
726 	unsigned long index;
727 	struct rb_root ra_root = RB_ROOT;
728 	struct file_ra_state *ra;
729 
730 	ntfs_debug("Entering for inode 0x%llx, fpos 0x%llx.",
731 			ndir->mft_no, actor->pos);
732 
733 	if (file->private_data) {
734 		private = file->private_data;
735 
736 		if (actor->pos != private->curr_pos) {
737 			/*
738 			 * If actor->pos is different from the previous passed
739 			 * one, Discard the private->key and fill dirent buffer
740 			 * with linear lookup.
741 			 */
742 			kfree(private->key);
743 			private->key = NULL;
744 			private->end_in_iterate = false;
745 		} else if (private->end_in_iterate) {
746 			kfree(private->key);
747 			kfree(file->private_data);
748 			file->private_data = NULL;
749 			return 0;
750 		}
751 	}
752 
753 	/* Emulate . and .. for all directories. */
754 	if (!dir_emit_dots(file, actor))
755 		return 0;
756 
757 	/*
758 	 * Allocate a buffer to store the current name being processed
759 	 * converted to format determined by current NLS.
760 	 */
761 	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
762 	if (unlikely(!name))
763 		return -ENOMEM;
764 
765 	mutex_lock_nested(&ndir->mrec_lock, NTFS_INODE_MUTEX_PARENT);
766 	ictx = ntfs_index_ctx_get(ndir, I30, 4);
767 	if (!ictx) {
768 		kfree(name);
769 		mutex_unlock(&ndir->mrec_lock);
770 		return -ENOMEM;
771 	}
772 
773 	ra = kzalloc(sizeof(struct file_ra_state), GFP_NOFS);
774 	if (!ra) {
775 		kfree(name);
776 		ntfs_index_ctx_put(ictx);
777 		mutex_unlock(&ndir->mrec_lock);
778 		return -ENOMEM;
779 	}
780 	file_ra_state_init(ra, vol->mft_ino->i_mapping);
781 
782 	if (private && private->key) {
783 		/*
784 		 * Find index witk private->key using ntfs_index_lookup()
785 		 * instead of linear index lookup.
786 		 */
787 		err = ntfs_index_lookup(private->key,
788 					le16_to_cpu(private->key_length),
789 					ictx);
790 		if (!err) {
791 			next = ictx->entry;
792 			/*
793 			 * Update ie_pos with private->curr_pos
794 			 * to make next d_off of dirent correct.
795 			 */
796 			ie_pos = private->curr_pos;
797 
798 			if (actor->pos > vol->mft_record_size && ictx->ia_ni) {
799 				err = ntfs_ia_blocks_readahead(ictx->ia_ni, actor->pos);
800 				if (err)
801 					goto out;
802 			}
803 
804 			goto nextdir;
805 		} else {
806 			goto out;
807 		}
808 	} else if (!private) {
809 		private = kzalloc(sizeof(struct ntfs_file_private), GFP_KERNEL);
810 		if (!private) {
811 			err = -ENOMEM;
812 			goto out;
813 		}
814 		file->private_data = private;
815 	}
816 
817 	ctx = ntfs_attr_get_search_ctx(ndir, NULL);
818 	if (!ctx) {
819 		err = -ENOMEM;
820 		goto out;
821 	}
822 
823 	/* Find the index root attribute in the mft record. */
824 	if (ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0,
825 				ctx)) {
826 		ntfs_error(sb, "Index root attribute missing in directory inode %llu",
827 				ndir->mft_no);
828 		ntfs_attr_put_search_ctx(ctx);
829 		err = -ENOMEM;
830 		goto out;
831 	}
832 
833 	/* Get to the index root value. */
834 	ir = (struct index_root *)((u8 *)ctx->attr +
835 			le16_to_cpu(ctx->attr->data.resident.value_offset));
836 
837 	ictx->ir = ir;
838 	ictx->actx = ctx;
839 	ictx->parent_vcn[ictx->pindex] = VCN_INDEX_ROOT_PARENT;
840 	ictx->is_in_root = true;
841 	ictx->parent_pos[ictx->pindex] = 0;
842 
843 	ictx->block_size = le32_to_cpu(ir->index_block_size);
844 	if (ictx->block_size < NTFS_BLOCK_SIZE) {
845 		ntfs_error(sb, "Index block size (%d) is smaller than the sector size (%d)",
846 				ictx->block_size, NTFS_BLOCK_SIZE);
847 		err = -EIO;
848 		goto out;
849 	}
850 
851 	if (vol->cluster_size <= ictx->block_size)
852 		ictx->vcn_size_bits = vol->cluster_size_bits;
853 	else
854 		ictx->vcn_size_bits = NTFS_BLOCK_SIZE_BITS;
855 	ictx->cr = ir->collation_rule;
856 
857 	/* The first index entry. */
858 	next = (struct index_entry *)((u8 *)&ir->index +
859 			le32_to_cpu(ir->index.entries_offset));
860 
861 	if (next->flags & INDEX_ENTRY_NODE) {
862 		ictx->ia_ni = ntfs_ia_open(ictx, ictx->idx_ni);
863 		if (!ictx->ia_ni) {
864 			err = -EINVAL;
865 			goto out;
866 		}
867 
868 		err = ntfs_ia_blocks_readahead(ictx->ia_ni, actor->pos);
869 		if (err)
870 			goto out;
871 	}
872 
873 	if (next->flags & INDEX_ENTRY_NODE) {
874 		next = ntfs_index_walk_down(next, ictx);
875 		if (IS_ERR(next)) {
876 			err = PTR_ERR(next);
877 			goto out;
878 		}
879 	}
880 
881 	if (next && !(next->flags & INDEX_ENTRY_END))
882 		goto nextdir;
883 
884 	while (1) {
885 		next = ntfs_index_next(next, ictx);
886 		if (IS_ERR(next)) {
887 			err = PTR_ERR(next);
888 			goto out;
889 		}
890 		if (!next)
891 			break;
892 nextdir:
893 		if (ie_pos < actor->pos) {
894 			ie_pos += le16_to_cpu(next->length);
895 			continue;
896 		}
897 
898 		actor->pos = ie_pos;
899 
900 		index = ntfs_mft_no_to_pidx(vol,
901 				MREF_LE(next->data.dir.indexed_file));
902 		if (nir) {
903 			struct ntfs_index_ra *cnir;
904 			struct rb_node *node = ra_root.rb_node;
905 
906 			if (nir->start_index <= index &&
907 			    index < nir->start_index + nir->count) {
908 				/* No behavior */
909 				goto filldir;
910 			}
911 
912 			while (node) {
913 				cnir = rb_entry(node, struct ntfs_index_ra, rb_node);
914 				if (cnir->start_index <= index &&
915 				    index < cnir->start_index + cnir->count) {
916 					goto filldir;
917 				} else if (cnir->start_index + cnir->count == index) {
918 					cnir->count++;
919 					goto filldir;
920 				} else if (!cnir->start_index && cnir->start_index - 1 == index) {
921 					cnir->start_index = index;
922 					goto filldir;
923 				}
924 
925 				if (index < cnir->start_index)
926 					node = node->rb_left;
927 				else if (index >= cnir->start_index + cnir->count)
928 					node = node->rb_right;
929 			}
930 
931 			if (nir->start_index + nir->count == index) {
932 				nir->count++;
933 			} else if (!nir->start_index && nir->start_index - 1 == index) {
934 				nir->start_index = index;
935 			} else if (nir->count > 2) {
936 				ntfs_insert_rb(nir, &ra_root);
937 				nir = NULL;
938 			} else {
939 				nir->start_index = index;
940 				nir->count = 1;
941 			}
942 		}
943 
944 		if (!nir) {
945 			nir = kzalloc(sizeof(struct ntfs_index_ra), GFP_KERNEL);
946 			if (nir) {
947 				nir->start_index = index;
948 				nir->count = 1;
949 			}
950 		}
951 
952 filldir:
953 		/* Submit the name to the filldir callback. */
954 		err = ntfs_filldir(vol, ndir, NULL, next, name, actor);
955 		if (err) {
956 			/*
957 			 * Store index key value to file private_data to start
958 			 * from current index offset on next round.
959 			 */
960 			private = file->private_data;
961 			kfree(private->key);
962 			private->key = kmalloc(le16_to_cpu(next->key_length), GFP_KERNEL);
963 			if (!private->key) {
964 				err = -ENOMEM;
965 				goto out;
966 			}
967 
968 			memcpy(private->key, &next->key.file_name, le16_to_cpu(next->key_length));
969 			private->key_length = next->key_length;
970 			break;
971 		}
972 		ie_pos += le16_to_cpu(next->length);
973 	}
974 
975 	if (!err)
976 		private->end_in_iterate = true;
977 	else
978 		err = 0;
979 
980 	private->curr_pos = actor->pos = ie_pos;
981 out:
982 	while (!RB_EMPTY_ROOT(&ra_root)) {
983 		struct ntfs_index_ra *cnir;
984 		struct rb_node *node;
985 
986 		node = rb_first(&ra_root);
987 		cnir = rb_entry(node, struct ntfs_index_ra, rb_node);
988 		ra->ra_pages = cnir->count;
989 		page_cache_sync_readahead(vol->mft_ino->i_mapping, ra, NULL,
990 				cnir->start_index, cnir->count);
991 		rb_erase(node, &ra_root);
992 		kfree(cnir);
993 	}
994 
995 	if (err) {
996 		if (private) {
997 			private->curr_pos = actor->pos;
998 			private->end_in_iterate = true;
999 		}
1000 		err = 0;
1001 	}
1002 	ntfs_index_ctx_put(ictx);
1003 	kfree(name);
1004 	kfree(nir);
1005 	kfree(ra);
1006 	mutex_unlock(&ndir->mrec_lock);
1007 	return err;
1008 }
1009 
1010 int ntfs_check_empty_dir(struct ntfs_inode *ni, struct mft_record *ni_mrec)
1011 {
1012 	struct ntfs_attr_search_ctx *ctx;
1013 	int ret = 0;
1014 
1015 	if (!(ni_mrec->flags & MFT_RECORD_IS_DIRECTORY))
1016 		return 0;
1017 
1018 	ctx = ntfs_attr_get_search_ctx(ni, NULL);
1019 	if (!ctx) {
1020 		ntfs_error(ni->vol->sb, "Failed to get search context");
1021 		return -ENOMEM;
1022 	}
1023 
1024 	/* Find the index root attribute in the mft record. */
1025 	ret = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
1026 				0, ctx);
1027 	if (ret) {
1028 		ntfs_error(ni->vol->sb, "Index root attribute missing in directory inode %llu",
1029 				ni->mft_no);
1030 		ntfs_attr_put_search_ctx(ctx);
1031 		return ret;
1032 	}
1033 
1034 	/* Non-empty directory? */
1035 	if (le32_to_cpu(ctx->attr->data.resident.value_length) !=
1036 	    sizeof(struct index_root) + sizeof(struct index_entry_header)) {
1037 		/* Both ENOTEMPTY and EEXIST are ok. We use the more common. */
1038 		ret = -ENOTEMPTY;
1039 		ntfs_debug("Directory is not empty\n");
1040 	}
1041 
1042 	ntfs_attr_put_search_ctx(ctx);
1043 
1044 	return ret;
1045 }
1046 
1047 /*
1048  * ntfs_dir_open - called when an inode is about to be opened
1049  * @vi:		inode to be opened
1050  * @filp:	file structure describing the inode
1051  *
1052  * Limit directory size to the page cache limit on architectures where unsigned
1053  * long is 32-bits. This is the most we can do for now without overflowing the
1054  * page cache page index. Doing it this way means we don't run into problems
1055  * because of existing too large directories. It would be better to allow the
1056  * user to read the accessible part of the directory but I doubt very much
1057  * anyone is going to hit this check on a 32-bit architecture, so there is no
1058  * point in adding the extra complexity required to support this.
1059  *
1060  * On 64-bit architectures, the check is hopefully optimized away by the
1061  * compiler.
1062  */
1063 static int ntfs_dir_open(struct inode *vi, struct file *filp)
1064 {
1065 	if (sizeof(unsigned long) < 8) {
1066 		if (i_size_read(vi) > MAX_LFS_FILESIZE)
1067 			return -EFBIG;
1068 	}
1069 	return 0;
1070 }
1071 
1072 static int ntfs_dir_release(struct inode *vi, struct file *filp)
1073 {
1074 	if (filp->private_data) {
1075 		kfree(((struct ntfs_file_private *)filp->private_data)->key);
1076 		kfree(filp->private_data);
1077 		filp->private_data = NULL;
1078 	}
1079 	return 0;
1080 }
1081 
1082 /*
1083  * ntfs_dir_fsync - sync a directory to disk
1084  * @filp:	file describing the directory to be synced
1085  * @start:	start offset to be synced
1086  * @end:	end offset to be synced
1087  * @datasync:	if non-zero only flush user data and not metadata
1088  *
1089  * Data integrity sync of a directory to disk.  Used for fsync, fdatasync, and
1090  * msync system calls.  This function is based on file.c::ntfs_file_fsync().
1091  *
1092  * Write the mft record and all associated extent mft records as well as the
1093  * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
1094  *
1095  * If @datasync is true, we do not wait on the inode(s) to be written out
1096  * but we always wait on the page cache pages to be written out.
1097  *
1098  * Note: In the past @filp could be NULL so we ignore it as we don't need it
1099  * anyway.
1100  *
1101  * Locking: Caller must hold i_mutex on the inode.
1102  */
1103 static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
1104 			  int datasync)
1105 {
1106 	struct inode *bmp_vi, *vi = filp->f_mapping->host;
1107 	struct ntfs_volume *vol = NTFS_I(vi)->vol;
1108 	struct ntfs_inode *ni = NTFS_I(vi);
1109 	struct ntfs_attr_search_ctx *ctx;
1110 	struct inode *parent_vi, *ia_vi;
1111 	int err, ret;
1112 	struct ntfs_attr na;
1113 
1114 	ntfs_debug("Entering for inode 0x%llx.", ni->mft_no);
1115 
1116 	if (NVolShutdown(vol))
1117 		return -EIO;
1118 
1119 	ctx = ntfs_attr_get_search_ctx(ni, NULL);
1120 	if (!ctx)
1121 		return -ENOMEM;
1122 
1123 	mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL_CHILD);
1124 	while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, ctx))) {
1125 		struct file_name_attr *fn = (struct file_name_attr *)((u8 *)ctx->attr +
1126 				le16_to_cpu(ctx->attr->data.resident.value_offset));
1127 
1128 		if (MREF_LE(fn->parent_directory) == ni->mft_no)
1129 			continue;
1130 
1131 		parent_vi = ntfs_iget(vi->i_sb, MREF_LE(fn->parent_directory));
1132 		if (IS_ERR(parent_vi))
1133 			continue;
1134 		mutex_lock_nested(&NTFS_I(parent_vi)->mrec_lock, NTFS_INODE_MUTEX_NORMAL);
1135 		ia_vi = ntfs_index_iget(parent_vi, I30, 4);
1136 		mutex_unlock(&NTFS_I(parent_vi)->mrec_lock);
1137 		if (IS_ERR(ia_vi)) {
1138 			iput(parent_vi);
1139 			continue;
1140 		}
1141 		write_inode_now(ia_vi, 1);
1142 		iput(ia_vi);
1143 		write_inode_now(parent_vi, 1);
1144 		iput(parent_vi);
1145 	}
1146 	mutex_unlock(&ni->mrec_lock);
1147 	ntfs_attr_put_search_ctx(ctx);
1148 
1149 	err = file_write_and_wait_range(filp, start, end);
1150 	if (err)
1151 		return err;
1152 	inode_lock(vi);
1153 
1154 	/* If the bitmap attribute inode is in memory sync it, too. */
1155 	na.mft_no = vi->i_ino;
1156 	na.type = AT_BITMAP;
1157 	na.name = I30;
1158 	na.name_len = 4;
1159 	bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
1160 	if (bmp_vi) {
1161 		write_inode_now(bmp_vi, !datasync);
1162 		iput(bmp_vi);
1163 	}
1164 	ret = __ntfs_write_inode(vi, 1);
1165 
1166 	write_inode_now(vi, !datasync);
1167 
1168 	write_inode_now(vol->mftbmp_ino, 1);
1169 	down_write(&vol->lcnbmp_lock);
1170 	write_inode_now(vol->lcnbmp_ino, 1);
1171 	up_write(&vol->lcnbmp_lock);
1172 	write_inode_now(vol->mft_ino, 1);
1173 
1174 	err = sync_blockdev(vi->i_sb->s_bdev);
1175 	if (unlikely(err && !ret))
1176 		ret = err;
1177 	if (likely(!ret))
1178 		ntfs_debug("Done.");
1179 	else
1180 		ntfs_warning(vi->i_sb,
1181 			"Failed to f%ssync inode 0x%llx.  Error %u.",
1182 			datasync ? "data" : "", ni->mft_no, -ret);
1183 	inode_unlock(vi);
1184 	return ret;
1185 }
1186 
1187 const struct file_operations ntfs_dir_ops = {
1188 	.llseek		= generic_file_llseek,	/* Seek inside directory. */
1189 	.read		= generic_read_dir,	/* Return -EISDIR. */
1190 	.iterate_shared	= ntfs_readdir,		/* Read directory contents. */
1191 	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
1192 	.open		= ntfs_dir_open,	/* Open directory. */
1193 	.release	= ntfs_dir_release,
1194 	.unlocked_ioctl	= ntfs_ioctl,
1195 #ifdef CONFIG_COMPAT
1196 	.compat_ioctl	= ntfs_compat_ioctl,
1197 #endif
1198 	.setlease	= generic_setlease,
1199 };
1200