xref: /linux/fs/ntfs/mft.c (revision 679ee5afd5b4764911656b4d4b83b9abee2b5572)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NTFS kernel mft record operations.
4  * Part of this file is based on code from the NTFS-3G.
5  *
6  * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
7  * Copyright (c) 2002 Richard Russon
8  * Copyright (c) 2025 LG Electronics Co., Ltd.
9  */
10 
11 #include <linux/writeback.h>
12 #include <linux/bio.h>
13 #include <linux/iomap.h>
14 
15 #include "bitmap.h"
16 #include "lcnalloc.h"
17 #include "mft.h"
18 #include "ntfs.h"
19 
20 /*
21  * ntfs_mft_record_check - Check the consistency of an MFT record
22  *
23  * Make sure its general fields are safe, then examine all its
24  * attributes and apply generic checks to them.
25  *
26  * Returns 0 if the checks are successful. If not, return -EIO.
27  */
28 int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m,
29 		u64 mft_no)
30 {
31 	struct attr_record *a;
32 	struct super_block *sb = vol->sb;
33 	u16 attrs_offset;
34 	u32 bytes_in_use;
35 
36 	if (!ntfs_is_file_record(m->magic)) {
37 		ntfs_error(sb, "Record %llu has no FILE magic (0x%x)\n",
38 				mft_no, le32_to_cpu(*(__le32 *)m));
39 		goto err_out;
40 	}
41 
42 	if (le16_to_cpu(m->usa_ofs) & 0x1 ||
43 	    (vol->mft_record_size >> NTFS_BLOCK_SIZE_BITS) + 1 != le16_to_cpu(m->usa_count) ||
44 	    le16_to_cpu(m->usa_ofs) + le16_to_cpu(m->usa_count) * 2 > vol->mft_record_size) {
45 		ntfs_error(sb, "Record %llu has corrupt fix-up values fields\n",
46 				mft_no);
47 		goto err_out;
48 	}
49 
50 	if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
51 		ntfs_error(sb, "Record %llu has corrupt allocation size (%u <> %u)\n",
52 				mft_no, vol->mft_record_size,
53 				le32_to_cpu(m->bytes_allocated));
54 		goto err_out;
55 	}
56 
57 	if (le32_to_cpu(m->bytes_in_use) > vol->mft_record_size) {
58 		ntfs_error(sb, "Record %llu has corrupt in-use size (%u > %u)\n",
59 				mft_no, le32_to_cpu(m->bytes_in_use),
60 				vol->mft_record_size);
61 		goto err_out;
62 	}
63 
64 	if (le16_to_cpu(m->attrs_offset) & 7) {
65 		ntfs_error(sb, "Attributes badly aligned in record %llu\n",
66 				mft_no);
67 		goto err_out;
68 	}
69 
70 	attrs_offset = le16_to_cpu(m->attrs_offset);
71 	bytes_in_use = le32_to_cpu(m->bytes_in_use);
72 
73 	if (attrs_offset > bytes_in_use ||
74 	    bytes_in_use - attrs_offset < sizeof_field(struct attr_record, type)) {
75 		ntfs_error(sb, "Record %llu has corrupt attribute offset\n", mft_no);
76 		goto err_out;
77 	}
78 
79 	a = (struct attr_record *)((char *)m + attrs_offset);
80 	if ((char *)a < (char *)m || (char *)a > (char *)m + vol->mft_record_size) {
81 		ntfs_error(sb, "Record %llu is corrupt\n", mft_no);
82 		goto err_out;
83 	}
84 
85 	return 0;
86 
87 err_out:
88 	return -EIO;
89 }
90 
91 /*
92  * map_mft_record_folio - map the folio in which a specific mft record resides
93  * @ni:		ntfs inode whose mft record page to map
94  *
95  * This maps the folio in which the mft record of the ntfs inode @ni is
96  * situated.
97  *
98  * This allocates a new buffer (@ni->mrec), copies the MFT record data from
99  * the mapped folio into this buffer, and applies the MST (Multi Sector
100  * Transfer) fixups on the copy.
101  *
102  * The folio is pinned (referenced) in @ni->folio to ensure the data remains
103  * valid in the page cache, but the returned pointer is the allocated copy.
104  *
105  * Return: A pointer to the allocated and fixed-up mft record (@ni->mrec).
106  * The return value needs to be checked with IS_ERR(). If it is true,
107  * PTR_ERR() contains the negative error code.
108  */
109 static inline struct mft_record *map_mft_record_folio(struct ntfs_inode *ni)
110 {
111 	loff_t i_size;
112 	struct ntfs_volume *vol = ni->vol;
113 	struct inode *mft_vi = vol->mft_ino;
114 	struct folio *folio;
115 	unsigned long index, end_index;
116 	unsigned int ofs;
117 
118 	WARN_ON(ni->folio);
119 	/*
120 	 * The index into the page cache and the offset within the page cache
121 	 * page of the wanted mft record.
122 	 */
123 	index = NTFS_MFT_NR_TO_PIDX(vol, ni->mft_no);
124 	ofs = NTFS_MFT_NR_TO_POFS(vol, ni->mft_no);
125 
126 	i_size = i_size_read(mft_vi);
127 	/* The maximum valid index into the page cache for $MFT's data. */
128 	end_index = i_size >> PAGE_SHIFT;
129 
130 	/* If the wanted index is out of bounds the mft record doesn't exist. */
131 	if (unlikely(index >= end_index)) {
132 		if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
133 				vol->mft_record_size) {
134 			folio = ERR_PTR(-ENOENT);
135 			ntfs_error(vol->sb,
136 				"Attempt to read mft record 0x%llx, which is beyond the end of the mft. This is probably a bug in the ntfs driver.",
137 				ni->mft_no);
138 			goto err_out;
139 		}
140 	}
141 
142 	/* Read, map, and pin the folio. */
143 	folio = read_mapping_folio(mft_vi->i_mapping, index, NULL);
144 	if (!IS_ERR(folio)) {
145 		u8 *addr;
146 
147 		ni->mrec = kmalloc(vol->mft_record_size, GFP_NOFS);
148 		if (!ni->mrec) {
149 			folio_put(folio);
150 			folio = ERR_PTR(-ENOMEM);
151 			goto err_out;
152 		}
153 
154 		addr = kmap_local_folio(folio, 0);
155 		memcpy(ni->mrec, addr + ofs, vol->mft_record_size);
156 		post_read_mst_fixup((struct ntfs_record *)ni->mrec, vol->mft_record_size);
157 
158 		/* Catch multi sector transfer fixup errors. */
159 		if (!ntfs_mft_record_check(vol, (struct mft_record *)ni->mrec, ni->mft_no)) {
160 			kunmap_local(addr);
161 			ni->folio = folio;
162 			ni->folio_ofs = ofs;
163 			return ni->mrec;
164 		}
165 		kunmap_local(addr);
166 		folio_put(folio);
167 		kfree(ni->mrec);
168 		ni->mrec = NULL;
169 		folio = ERR_PTR(-EIO);
170 		NVolSetErrors(vol);
171 	}
172 err_out:
173 	ni->folio = NULL;
174 	ni->folio_ofs = 0;
175 	return (struct mft_record *)folio;
176 }
177 
178 /*
179  * map_mft_record - map and pin an mft record
180  * @ni:		ntfs inode whose MFT record to map
181  *
182  * This function ensures the MFT record for the given inode is mapped and
183  * accessible.
184  *
185  * It increments the reference count of the ntfs inode. If the record is
186  * already mapped (@ni->folio is set), it returns the cached record
187  * immediately.
188  *
189  * Otherwise, it calls map_mft_record_folio() to read the folio from disk
190  * (if necessary via read_mapping_folio), allocate a buffer, and copy the
191  * record data.
192  *
193  * Return: A pointer to the mft record. You need to check the returned
194  * pointer with IS_ERR().
195  */
196 struct mft_record *map_mft_record(struct ntfs_inode *ni)
197 {
198 	struct mft_record *m;
199 
200 	if (!ni)
201 		return ERR_PTR(-EINVAL);
202 
203 	ntfs_debug("Entering for mft_no 0x%llx.", ni->mft_no);
204 
205 	/* Make sure the ntfs inode doesn't go away. */
206 	atomic_inc(&ni->count);
207 
208 	if (ni->folio)
209 		return (struct mft_record *)ni->mrec;
210 
211 	m = map_mft_record_folio(ni);
212 	if (!IS_ERR(m))
213 		return m;
214 
215 	atomic_dec(&ni->count);
216 	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
217 	return m;
218 }
219 
220 /*
221  * unmap_mft_record - release a reference to a mapped mft record
222  * @ni:		ntfs inode whose MFT record to unmap
223  *
224  * This decrements the reference count of the ntfs inode.
225  *
226  * It releases the caller's hold on the inode. If the reference count indicates
227  * that there are still other users (count > 1), the function returns
228  * immediately, keeping the resources (folio and mrec buffer) pinned for
229  * those users.
230  *
231  * NOTE: If caller has modified the mft record, it is imperative to set the mft
232  * record dirty BEFORE calling unmap_mft_record().
233  */
234 void unmap_mft_record(struct ntfs_inode *ni)
235 {
236 	struct folio *folio;
237 
238 	if (!ni)
239 		return;
240 
241 	ntfs_debug("Entering for mft_no 0x%llx.", ni->mft_no);
242 
243 	folio = ni->folio;
244 	if (atomic_dec_return(&ni->count) > 1)
245 		return;
246 	WARN_ON(!folio);
247 }
248 
249 /*
250  * map_extent_mft_record - load an extent inode and attach it to its base
251  * @base_ni:	base ntfs inode
252  * @mref:	mft reference of the extent inode to load
253  * @ntfs_ino:	on successful return, pointer to the struct ntfs_inode structure
254  *
255  * Load the extent mft record @mref and attach it to its base inode @base_ni.
256  * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
257  * PTR_ERR(result) gives the negative error code.
258  *
259  * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
260  * structure of the mapped extent inode.
261  */
262 struct mft_record *map_extent_mft_record(struct ntfs_inode *base_ni, u64 mref,
263 		struct ntfs_inode **ntfs_ino)
264 {
265 	struct mft_record *m;
266 	struct ntfs_inode *ni = NULL;
267 	struct ntfs_inode **extent_nis = NULL;
268 	int i;
269 	u64 mft_no = MREF(mref);
270 	u16 seq_no = MSEQNO(mref);
271 	bool destroy_ni = false;
272 
273 	ntfs_debug("Mapping extent mft record 0x%llx (base mft record 0x%llx).",
274 			mft_no, base_ni->mft_no);
275 	/* Make sure the base ntfs inode doesn't go away. */
276 	atomic_inc(&base_ni->count);
277 	/*
278 	 * Check if this extent inode has already been added to the base inode,
279 	 * in which case just return it. If not found, add it to the base
280 	 * inode before returning it.
281 	 */
282 retry:
283 	mutex_lock(&base_ni->extent_lock);
284 	if (base_ni->nr_extents > 0) {
285 		extent_nis = base_ni->ext.extent_ntfs_inos;
286 		for (i = 0; i < base_ni->nr_extents; i++) {
287 			if (mft_no != extent_nis[i]->mft_no)
288 				continue;
289 			ni = extent_nis[i];
290 			/* Make sure the ntfs inode doesn't go away. */
291 			atomic_inc(&ni->count);
292 			break;
293 		}
294 	}
295 	if (likely(ni != NULL)) {
296 		mutex_unlock(&base_ni->extent_lock);
297 		atomic_dec(&base_ni->count);
298 		/* We found the record; just have to map and return it. */
299 		m = map_mft_record(ni);
300 		/* map_mft_record() has incremented this on success. */
301 		atomic_dec(&ni->count);
302 		if (!IS_ERR(m)) {
303 			/* Verify the sequence number. */
304 			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
305 				ntfs_debug("Done 1.");
306 				*ntfs_ino = ni;
307 				return m;
308 			}
309 			unmap_mft_record(ni);
310 			ntfs_error(base_ni->vol->sb,
311 					"Found stale extent mft reference! Corrupt filesystem. Run chkdsk.");
312 			return ERR_PTR(-EIO);
313 		}
314 map_err_out:
315 		ntfs_error(base_ni->vol->sb,
316 				"Failed to map extent mft record, error code %ld.",
317 				-PTR_ERR(m));
318 		return m;
319 	}
320 	mutex_unlock(&base_ni->extent_lock);
321 
322 	/* Record wasn't there. Get a new ntfs inode and initialize it. */
323 	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
324 	if (unlikely(!ni)) {
325 		atomic_dec(&base_ni->count);
326 		return ERR_PTR(-ENOMEM);
327 	}
328 	ni->vol = base_ni->vol;
329 	ni->seq_no = seq_no;
330 	ni->nr_extents = -1;
331 	ni->ext.base_ntfs_ino = base_ni;
332 	/* Now map the record. */
333 	m = map_mft_record(ni);
334 	if (IS_ERR(m)) {
335 		atomic_dec(&base_ni->count);
336 		ntfs_clear_extent_inode(ni);
337 		goto map_err_out;
338 	}
339 	/* Verify the sequence number if it is present. */
340 	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
341 		ntfs_error(base_ni->vol->sb,
342 				"Found stale extent mft reference! Corrupt filesystem. Run chkdsk.");
343 		destroy_ni = true;
344 		m = ERR_PTR(-EIO);
345 		goto unm_nolock_err_out;
346 	}
347 
348 	mutex_lock(&base_ni->extent_lock);
349 	for (i = 0; i < base_ni->nr_extents; i++) {
350 		if (mft_no == extent_nis[i]->mft_no) {
351 			mutex_unlock(&base_ni->extent_lock);
352 			ntfs_clear_extent_inode(ni);
353 			goto retry;
354 		}
355 	}
356 	/* Attach extent inode to base inode, reallocating memory if needed. */
357 	if (!(base_ni->nr_extents & 3)) {
358 		struct ntfs_inode **tmp;
359 		int new_size = (base_ni->nr_extents + 4) * sizeof(struct ntfs_inode *);
360 
361 		tmp = kvzalloc(new_size, GFP_NOFS);
362 		if (unlikely(!tmp)) {
363 			ntfs_error(base_ni->vol->sb, "Failed to allocate internal buffer.");
364 			destroy_ni = true;
365 			m = ERR_PTR(-ENOMEM);
366 			goto unm_err_out;
367 		}
368 		if (base_ni->nr_extents) {
369 			WARN_ON(!base_ni->ext.extent_ntfs_inos);
370 			memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
371 					4 * sizeof(struct ntfs_inode *));
372 			kvfree(base_ni->ext.extent_ntfs_inos);
373 		}
374 		base_ni->ext.extent_ntfs_inos = tmp;
375 	}
376 	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
377 	mutex_unlock(&base_ni->extent_lock);
378 	atomic_dec(&base_ni->count);
379 	ntfs_debug("Done 2.");
380 	*ntfs_ino = ni;
381 	return m;
382 unm_err_out:
383 	mutex_unlock(&base_ni->extent_lock);
384 unm_nolock_err_out:
385 	unmap_mft_record(ni);
386 	atomic_dec(&base_ni->count);
387 	/*
388 	 * If the extent inode was not attached to the base inode we need to
389 	 * release it or we will leak memory.
390 	 */
391 	if (destroy_ni)
392 		ntfs_clear_extent_inode(ni);
393 	return m;
394 }
395 
396 /*
397  * __mark_mft_record_dirty - mark the base vfs inode dirty
398  * @ni:		ntfs inode describing the mapped mft record
399  *
400  * Internal function.  Users should call mark_mft_record_dirty() instead.
401  *
402  * This function determines the base ntfs inode (in case @ni is an extent
403  * inode) and marks the corresponding VFS inode dirty.
404  *
405  * NOTE:  We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
406  * on the base vfs inode, because even though file data may have been modified,
407  * it is dirty in the inode meta data rather than the data page cache of the
408  * inode, and thus there are no data pages that need writing out.  Therefore, a
409  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
410  * other hand, is not sufficient, because ->write_inode needs to be called even
411  * in case of fdatasync. This needs to happen or the file data would not
412  * necessarily hit the device synchronously, even though the vfs inode has the
413  * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
414  * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
415  * which is not what I_DIRTY_SYNC on its own would suggest.
416  */
417 void __mark_mft_record_dirty(struct ntfs_inode *ni)
418 {
419 	struct ntfs_inode *base_ni;
420 
421 	ntfs_debug("Entering for inode 0x%llx.", ni->mft_no);
422 	WARN_ON(NInoAttr(ni));
423 	/* Determine the base vfs inode and mark it dirty, too. */
424 	if (likely(ni->nr_extents >= 0))
425 		base_ni = ni;
426 	else
427 		base_ni = ni->ext.base_ntfs_ino;
428 	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
429 }
430 
431 /*
432  * ntfs_bio_end_io - bio completion callback for MFT record writes
433  *
434  * Decrements the folio reference count that was incremented before
435  * submit_bio(). This prevents a race condition where umount could
436  * evict the inode and release the folio while I/O is still in flight,
437  * potentially causing data corruption or use-after-free.
438  */
439 static void ntfs_bio_end_io(struct bio *bio)
440 {
441 	if (bio->bi_private)
442 		folio_put((struct folio *)bio->bi_private);
443 	bio_put(bio);
444 }
445 
446 /*
447  * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
448  * @vol:	ntfs volume on which the mft record to synchronize resides
449  * @mft_no:	mft record number of mft record to synchronize
450  * @m:		mapped, mst protected (extent) mft record to synchronize
451  *
452  * Write the mapped, mst protected (extent) mft record @m with mft record
453  * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
454  *
455  * On success return 0.  On error return -errno and set the volume errors flag
456  * in the ntfs volume @vol.
457  *
458  * NOTE:  We always perform synchronous i/o.
459  */
460 int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
461 		struct mft_record *m)
462 {
463 	u8 *kmirr;
464 	struct folio *folio;
465 	unsigned int folio_ofs, lcn_folio_off = 0;
466 	int err = 0;
467 	struct bio *bio;
468 
469 	ntfs_debug("Entering for inode 0x%llx.", mft_no);
470 
471 	if (unlikely(!vol->mftmirr_ino)) {
472 		/* This could happen during umount... */
473 		err = -EIO;
474 		goto err_out;
475 	}
476 	/* Get the page containing the mirror copy of the mft record @m. */
477 	folio = read_mapping_folio(vol->mftmirr_ino->i_mapping,
478 			NTFS_MFT_NR_TO_PIDX(vol, mft_no), NULL);
479 	if (IS_ERR(folio)) {
480 		ntfs_error(vol->sb, "Failed to map mft mirror page.");
481 		err = PTR_ERR(folio);
482 		goto err_out;
483 	}
484 
485 	folio_lock(folio);
486 	folio_clear_uptodate(folio);
487 	/* Offset of the mft mirror record inside the page. */
488 	folio_ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no);
489 	/* The address in the page of the mirror copy of the mft record @m. */
490 	kmirr = kmap_local_folio(folio, 0) + folio_ofs;
491 	/* Copy the mst protected mft record to the mirror. */
492 	memcpy(kmirr, m, vol->mft_record_size);
493 	kunmap_local(kmirr);
494 
495 	if (vol->cluster_size_bits > PAGE_SHIFT) {
496 		lcn_folio_off = folio->index << PAGE_SHIFT;
497 		lcn_folio_off &= vol->cluster_size_mask;
498 	}
499 
500 	bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO);
501 	bio->bi_iter.bi_sector =
502 		NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) +
503 				 lcn_folio_off + folio_ofs);
504 
505 	if (bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs))
506 		err = submit_bio_wait(bio);
507 	else
508 		err = -EIO;
509 	bio_put(bio);
510 
511 	/*
512 	 * The in-memory mirror is now valid because we just memcpy()'d the
513 	 * mst-protected mft record into it.  Mark the folio uptodate even on
514 	 * write error so a subsequent read_mapping_folio() does not refetch
515 	 * the stale on-disk mirror and overwrite this copy.  The error is
516 	 * propagated to the caller via @err.
517 	 */
518 	folio_mark_uptodate(folio);
519 
520 	folio_unlock(folio);
521 	folio_put(folio);
522 	if (likely(!err)) {
523 		ntfs_debug("Done.");
524 	} else {
525 		ntfs_error(vol->sb, "I/O error while writing mft mirror record 0x%llx!", mft_no);
526 err_out:
527 		ntfs_error(vol->sb,
528 			"Failed to synchronize $MFTMirr (error code %i).  Volume will be left marked dirty on umount.  Run chkdsk on the partition after umounting to correct this.",
529 			err);
530 		NVolSetErrors(vol);
531 	}
532 	return err;
533 }
534 
535 /*
536  * write_mft_record_nolock - write out a mapped (extent) mft record
537  * @ni:		ntfs inode describing the mapped (extent) mft record
538  * @m:		mapped (extent) mft record to write
539  * @sync:	if true, wait for i/o completion
540  *
541  * Write the mapped (extent) mft record @m described by the (regular or extent)
542  * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
543  * the mft mirror, that is also updated.
544  *
545  * We only write the mft record if the ntfs inode @ni is dirty.
546  *
547  * On success, clean the mft record and return 0.
548  * On error (specifically ENOMEM), we redirty the record so it can be retried.
549  * For other errors, we mark the volume with errors.
550  */
551 int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int sync)
552 {
553 	struct ntfs_volume *vol = ni->vol;
554 	struct folio *folio = ni->folio;
555 	int err = 0, i = 0;
556 	u8 *kaddr;
557 	struct mft_record *fixup_m;
558 	struct bio *bio;
559 	unsigned int offset = 0, folio_size;
560 
561 	ntfs_debug("Entering for inode 0x%llx.", ni->mft_no);
562 
563 	WARN_ON(NInoAttr(ni));
564 	WARN_ON(!folio_test_locked(folio));
565 
566 	/*
567 	 * If the struct ntfs_inode is clean no need to do anything.  If it is dirty,
568 	 * mark it as clean now so that it can be redirtied later on if needed.
569 	 * There is no danger of races since the caller is holding the locks
570 	 * for the mft record @m and the page it is in.
571 	 */
572 	if (!NInoTestClearDirty(ni))
573 		goto done;
574 
575 	kaddr = kmap_local_folio(folio, 0);
576 	fixup_m = (struct mft_record *)(kaddr + ni->folio_ofs);
577 	memcpy(fixup_m, m, vol->mft_record_size);
578 
579 	/* Apply the mst protection fixups. */
580 	err = pre_write_mst_fixup((struct ntfs_record *)fixup_m, vol->mft_record_size);
581 	if (err) {
582 		ntfs_error(vol->sb, "Failed to apply mst fixups!");
583 		goto err_out;
584 	}
585 
586 	folio_size = vol->mft_record_size / ni->mft_lcn_count;
587 	while (i < ni->mft_lcn_count) {
588 		unsigned int clu_off;
589 
590 		clu_off = (unsigned int)((s64)ni->mft_no * vol->mft_record_size + offset) &
591 			vol->cluster_size_mask;
592 
593 		bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO);
594 		bio->bi_iter.bi_sector =
595 			NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, ni->mft_lcn[i]) +
596 					 clu_off);
597 
598 		if (!bio_add_folio(bio, folio, folio_size,
599 				   ni->folio_ofs + offset)) {
600 			err = -EIO;
601 			goto put_bio_out;
602 		}
603 
604 		/* Synchronize the mft mirror now if not @sync. */
605 		if (!sync && ni->mft_no < vol->mftmirr_size) {
606 			int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no,
607 							   fixup_m);
608 			if (unlikely(sub_err) && !err)
609 				err = sub_err;
610 		}
611 
612 		if (sync) {
613 			int sub_err = submit_bio_wait(bio);
614 
615 			bio_put(bio);
616 			if (unlikely(sub_err) && !err)
617 				err = sub_err;
618 		} else {
619 			folio_get(folio);
620 			bio->bi_private = folio;
621 			bio->bi_end_io = ntfs_bio_end_io;
622 			submit_bio(bio);
623 		}
624 		offset += vol->cluster_size;
625 		i++;
626 	}
627 
628 	/* If @sync, now synchronize the mft mirror. */
629 	if (sync && ni->mft_no < vol->mftmirr_size) {
630 		int sub_err = ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
631 
632 		if (unlikely(sub_err) && !err)
633 			err = sub_err;
634 	}
635 	kunmap_local(kaddr);
636 	if (unlikely(err)) {
637 		/* I/O error during writing.  This is really bad! */
638 		ntfs_error(vol->sb,
639 			"I/O error while writing mft record 0x%llx!  Marking base inode as bad.  You should unmount the volume and run chkdsk.",
640 			ni->mft_no);
641 		goto err_out;
642 	}
643 done:
644 	ntfs_debug("Done.");
645 	return 0;
646 put_bio_out:
647 	bio_put(bio);
648 err_out:
649 	/*
650 	 * The caller should mark the base inode as bad so no more I/O
651 	 * happens. ->drop_inode() will still be invoked so all extent inodes
652 	 * and other allocated memory will be freed. ENOMEM is retried by
653 	 * redirtying the mft record below.
654 	 */
655 	if (err == -ENOMEM) {
656 		ntfs_error(vol->sb,
657 			"Not enough memory to write mft record. Redirtying so the write is retried later.");
658 		mark_mft_record_dirty(ni);
659 		err = 0;
660 	} else
661 		NVolSetErrors(vol);
662 	return err;
663 }
664 
665 static int ntfs_test_inode_wb(struct inode *vi, u64 ino, void *data)
666 {
667 	struct ntfs_attr *na = data;
668 
669 	if (!ntfs_test_inode(vi, na))
670 		return 0;
671 
672 	/*
673 	 * Without this, ntfs_write_mst_block() could call iput_final()
674 	 * , and ntfs_evict_big_inode() could try to unlink this inode
675 	 * and the contex could be blocked infinitly in map_mft_record().
676 	 */
677 	if (NInoBeingDeleted(NTFS_I(vi))) {
678 		na->state = NI_BeingDeleted;
679 		return -1;
680 	}
681 
682 	/*
683 	 * This condition can prevent ntfs_write_mst_block()
684 	 * from applying/undo fixups while ntfs_create() being
685 	 * called
686 	 */
687 	spin_lock(&vi->i_lock);
688 	if (inode_state_read_once(vi) & I_CREATING) {
689 		spin_unlock(&vi->i_lock);
690 		na->state = NI_BeingCreated;
691 		return -1;
692 	}
693 	spin_unlock(&vi->i_lock);
694 
695 	return igrab(vi) ? 1 : -1;
696 }
697 
698 /*
699  * ntfs_may_write_mft_record - check if an mft record may be written out
700  * @vol:	[IN]  ntfs volume on which the mft record to check resides
701  * @mft_no:	[IN]  mft record number of the mft record to check
702  * @m:		[IN]  mapped mft record to check
703  * @locked_ni:	[OUT] caller has to unlock this ntfs inode if one is returned
704  * @ref_vi:	[OUT] caller has to drop this vfs inode if one is returned
705  *
706  * Check if the mapped (base or extent) mft record @m with mft record number
707  * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
708  * and possible the ntfs inode of the mft record is locked and the base vfs
709  * inode is pinned.  The locked ntfs inode is then returned in @locked_ni.  The
710  * caller is responsible for unlocking the ntfs inode and unpinning the base
711  * vfs inode.
712  *
713  * To avoid deadlock when the caller holds a folio lock, if the function
714  * returns @ref_vi it defers dropping the vfs inode reference by returning
715  * it in @ref_vi instead of calling iput() directly.  The caller must call
716  * iput() on @ref_vi after releasing the folio lock.
717  *
718  * Return 'true' if the mft record may be written out and 'false' if not.
719  *
720  * The caller has locked the page and cleared the uptodate flag on it which
721  * means that we can safely write out any dirty mft records that do not have
722  * their inodes in icache as determined by find_inode_nowait().
723  *
724  * Here is a description of the tests we perform:
725  *
726  * If the inode is found in icache we know the mft record must be a base mft
727  * record.  If it is dirty, we do not write it and return 'false' as the vfs
728  * inode write paths will result in the access times being updated which would
729  * cause the base mft record to be redirtied and written out again.
730  *
731  * If the inode is in icache and not dirty, we attempt to lock the mft record
732  * and if we find the lock was already taken, it is not safe to write the mft
733  * record and we return 'false'.
734  *
735  * If we manage to obtain the lock we have exclusive access to the mft record,
736  * which also allows us safe writeout of the mft record.  We then set
737  * @locked_ni to the locked ntfs inode and return 'true'.
738  *
739  * Note we cannot just lock the mft record and sleep while waiting for the lock
740  * because this would deadlock due to lock reversal.
741  *
742  * If the inode is not in icache we need to perform further checks.
743  *
744  * If the mft record is not a FILE record or it is a base mft record, we can
745  * safely write it and return 'true'.
746  *
747  * We now know the mft record is an extent mft record.  We check if the inode
748  * corresponding to its base mft record is in icache. If it is not, we cannot
749  * safely determine the state of the extent inode, so we return 'false'.
750  *
751  * We now have the base inode for the extent mft record.  We check if it has an
752  * ntfs inode for the extent mft record attached. If not, it is safe to write
753  * the extent mft record and we return 'true'.
754  *
755  * If the extent inode is attached, we check if it is dirty. If so, we return
756  * 'false' (letting the standard write_inode path handle it).
757  *
758  * If it is not dirty, we attempt to lock the extent mft record. If the lock
759  * was already taken, it is not safe to write and we return 'false'.
760  *
761  * If we manage to obtain the lock we have exclusive access to the extent mft
762  * record. We set @locked_ni to the now locked ntfs inode and return 'true'.
763  */
764 static bool ntfs_may_write_mft_record(struct ntfs_volume *vol, const u64 mft_no,
765 		const struct mft_record *m, struct ntfs_inode **locked_ni,
766 		struct inode **ref_vi)
767 {
768 	struct super_block *sb = vol->sb;
769 	struct inode *mft_vi = vol->mft_ino;
770 	struct inode *vi;
771 	struct ntfs_inode *ni, *eni, **extent_nis;
772 	int i;
773 	struct ntfs_attr na = {0};
774 
775 	ntfs_debug("Entering for inode 0x%llx.", mft_no);
776 	/*
777 	 * Normally we do not return a locked inode so set @locked_ni to NULL.
778 	 */
779 	*locked_ni = NULL;
780 	*ref_vi = NULL;
781 
782 	/*
783 	 * Check if the inode corresponding to this mft record is in the VFS
784 	 * inode cache and obtain a reference to it if it is.
785 	 */
786 	ntfs_debug("Looking for inode 0x%llx in icache.", mft_no);
787 	na.mft_no = mft_no;
788 	na.type = AT_UNUSED;
789 	/*
790 	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
791 	 * we get here for it rather often.
792 	 */
793 	if (!mft_no) {
794 		/* Balance the below iput(). */
795 		vi = igrab(mft_vi);
796 		WARN_ON(vi != mft_vi);
797 	} else {
798 		/*
799 		 * Have to use find_inode_nowait() since ilookup5_nowait()
800 		 * waits for inode with I_FREEING, which causes ntfs to deadlock
801 		 * when inodes are unlinked concurrently
802 		 */
803 		vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na);
804 		if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated)
805 			return false;
806 	}
807 	if (vi) {
808 		ntfs_debug("Base inode 0x%llx is in icache.", mft_no);
809 		/* The inode is in icache. */
810 		ni = NTFS_I(vi);
811 		/* Take a reference to the ntfs inode. */
812 		atomic_inc(&ni->count);
813 		/* If the inode is dirty, do not write this record. */
814 		if (NInoDirty(ni)) {
815 			ntfs_debug("Inode 0x%llx is dirty, do not write it.",
816 					mft_no);
817 			atomic_dec(&ni->count);
818 			*ref_vi = vi;
819 			return false;
820 		}
821 		ntfs_debug("Inode 0x%llx is not dirty.", mft_no);
822 		/* The inode is not dirty, try to take the mft record lock. */
823 		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
824 			ntfs_debug("Mft record 0x%llx is already locked, do not write it.", mft_no);
825 			atomic_dec(&ni->count);
826 			*ref_vi = vi;
827 			return false;
828 		}
829 		ntfs_debug("Managed to lock mft record 0x%llx, write it.",
830 				mft_no);
831 		/*
832 		 * The write has to occur while we hold the mft record lock so
833 		 * return the locked ntfs inode.
834 		 */
835 		*locked_ni = ni;
836 		return true;
837 	}
838 	ntfs_debug("Inode 0x%llx is not in icache.", mft_no);
839 	/* The inode is not in icache. */
840 	/* Write the record if it is not a mft record (type "FILE"). */
841 	if (!ntfs_is_mft_record(m->magic)) {
842 		ntfs_debug("Mft record 0x%llx is not a FILE record, write it.",
843 				mft_no);
844 		return true;
845 	}
846 	/* Write the mft record if it is a base inode. */
847 	if (!m->base_mft_record) {
848 		ntfs_debug("Mft record 0x%llx is a base record, write it.",
849 				mft_no);
850 		return true;
851 	}
852 	/*
853 	 * This is an extent mft record.  Check if the inode corresponding to
854 	 * its base mft record is in icache and obtain a reference to it if it
855 	 * is.
856 	 */
857 	na.mft_no = MREF_LE(m->base_mft_record);
858 	na.state = 0;
859 	ntfs_debug("Mft record 0x%llx is an extent record.  Looking for base inode 0x%llx in icache.",
860 			mft_no, na.mft_no);
861 	if (!na.mft_no) {
862 		/* Balance the below iput(). */
863 		vi = igrab(mft_vi);
864 		WARN_ON(vi != mft_vi);
865 	} else {
866 		vi = find_inode_nowait(sb, na.mft_no, ntfs_test_inode_wb, &na);
867 		if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated)
868 			return false;
869 	}
870 
871 	if (!vi)
872 		return false;
873 	ntfs_debug("Base inode 0x%llx is in icache.", na.mft_no);
874 	/*
875 	 * The base inode is in icache.  Check if it has the extent inode
876 	 * corresponding to this extent mft record attached.
877 	 */
878 	ni = NTFS_I(vi);
879 	mutex_lock(&ni->extent_lock);
880 	if (ni->nr_extents <= 0) {
881 		/*
882 		 * The base inode has no attached extent inodes, write this
883 		 * extent mft record.
884 		 */
885 		mutex_unlock(&ni->extent_lock);
886 		*ref_vi = vi;
887 		ntfs_debug("Base inode 0x%llx has no attached extent inodes, write the extent record.",
888 				na.mft_no);
889 		return true;
890 	}
891 	/* Iterate over the attached extent inodes. */
892 	extent_nis = ni->ext.extent_ntfs_inos;
893 	for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
894 		if (mft_no == extent_nis[i]->mft_no) {
895 			/*
896 			 * Found the extent inode corresponding to this extent
897 			 * mft record.
898 			 */
899 			eni = extent_nis[i];
900 			break;
901 		}
902 	}
903 	/*
904 	 * If the extent inode was not attached to the base inode, write this
905 	 * extent mft record.
906 	 */
907 	if (!eni) {
908 		mutex_unlock(&ni->extent_lock);
909 		*ref_vi = vi;
910 		ntfs_debug("Extent inode 0x%llx is not attached to its base inode 0x%llx, write the extent record.",
911 				mft_no, na.mft_no);
912 		return true;
913 	}
914 	ntfs_debug("Extent inode 0x%llx is attached to its base inode 0x%llx.",
915 			mft_no, na.mft_no);
916 	/* Take a reference to the extent ntfs inode. */
917 	atomic_inc(&eni->count);
918 	mutex_unlock(&ni->extent_lock);
919 
920 	/* if extent inode is dirty, write_inode will write it */
921 	if (NInoDirty(eni)) {
922 		atomic_dec(&eni->count);
923 		*ref_vi = vi;
924 		return false;
925 	}
926 
927 	/*
928 	 * Found the extent inode corresponding to this extent mft record.
929 	 * Try to take the mft record lock.
930 	 */
931 	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
932 		atomic_dec(&eni->count);
933 		*ref_vi = vi;
934 		ntfs_debug("Extent mft record 0x%llx is already locked, do not write it.",
935 				mft_no);
936 		return false;
937 	}
938 	ntfs_debug("Managed to lock extent mft record 0x%llx, write it.",
939 			mft_no);
940 	/*
941 	 * The write has to occur while we hold the mft record lock so return
942 	 * the locked extent ntfs inode.
943 	 */
944 	*locked_ni = eni;
945 	return true;
946 }
947 
948 static const char *es = "  Leaving inconsistent metadata.  Unmount and run chkdsk."; /* suffix for errors logged when an undo step fails */
949 /* Mft record numbers below this are skipped by the free record search unless $MFT itself is being extended. */
950 #define RESERVED_MFT_RECORDS	64
951 
952 /*
953  * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - find and set a clear bit
954  * @vol:	volume on which to search for a free mft record
955  * @base_ni:	open base inode if allocating an extent mft record or NULL
956  *
957  * Search for a clear bit in the mft bitmap attribute on the ntfs volume @vol,
958  * set it and mark the bitmap folio dirty.
959  *
960  * If @base_ni is NULL start the search at vol->mft_data_pos, otherwise at the
961  * mft record after @base_ni, but never below RESERVED_MFT_RECORDS.  A second
962  * pass then scans from RESERVED_MFT_RECORDS up to the first pass' start.  If
963  * @base_ni is mft record FILE_MFT a single pass starting at bit 0 is made and
964  * given up with -ENOSPC once the bit offset being examined exceeds 400.
965  * Return the allocated mft record number on success and -errno on error.
966  * -ENOSPC means that there are no free mft records in the currently
967  * initialized mft bitmap or that the record number would not fit in 32 bits.
968  *
969  * Locking: Caller must hold vol->mftbmp_lock for writing.
970  */
971 static s64 ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(struct ntfs_volume *vol,
972 		struct ntfs_inode *base_ni)
973 {
974 	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
975 	unsigned long flags;
976 	struct address_space *mftbmp_mapping;
977 	u8 *buf = NULL, *byte;
978 	struct folio *folio;
979 	unsigned int folio_ofs, size;
980 	u8 pass, b;
981 
982 	ntfs_debug("Searching for free mft record in the currently initialized mft bitmap.");
983 	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
984 	/*
985 	 * Set the end of the pass to the smaller of the number of allocated
986 	 * mft records and the number of initialized mft bitmap bits.
987 	 */
988 	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
989 	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
990 			vol->mft_record_size_bits;
991 	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
992 	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
993 	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; /* bytes -> bits */
994 	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
995 	if (pass_end > ll)
996 		pass_end = ll;
997 	pass = 1;
998 	if (!base_ni)
999 		data_pos = vol->mft_data_pos;
1000 	else
1001 		data_pos = base_ni->mft_no + 1;
1002 	if (data_pos < RESERVED_MFT_RECORDS)
1003 		data_pos = RESERVED_MFT_RECORDS;
1004 	if (data_pos >= pass_end) {
1005 		data_pos = RESERVED_MFT_RECORDS;
1006 		pass = 2;
1007 		/* This happens on a freshly formatted volume. */
1008 		if (data_pos >= pass_end)
1009 			return -ENOSPC;
1010 	}
1011 	/* When extending $MFT itself, do a single pass starting at bit 0. */
1012 	if (base_ni && base_ni->mft_no == FILE_MFT) {
1013 		data_pos = 0;
1014 		pass = 2;
1015 	}
1016 
1017 	pass_start = data_pos;
1018 	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, pass_end 0x%llx, data_pos 0x%llx.",
1019 			pass, pass_start, pass_end, data_pos);
1020 	/* Loop until a free mft record is found or both passes are done. */
1021 	for (; pass <= 2;) {
1022 		/* Number of bytes left in this page, capped to pass_end. */
1023 		ofs = data_pos >> 3; /* byte offset into the mft bitmap */
1024 		folio_ofs = ofs & ~PAGE_MASK;
1025 		size = PAGE_SIZE - folio_ofs;
1026 		ll = ((pass_end + 7) >> 3) - ofs;
1027 		if (size > ll)
1028 			size = ll;
1029 		size <<= 3; /* bytes -> bits */
1030 		/*
1031 		 * If we are still within the active pass, search the next page
1032 		 * for a zero bit.
1033 		 */
1034 		if (size) {
1035 			folio = read_mapping_folio(mftbmp_mapping,
1036 					ofs >> PAGE_SHIFT, NULL);
1037 			if (IS_ERR(folio)) {
1038 				ntfs_error(vol->sb, "Failed to read mft bitmap, aborting.");
1039 				return PTR_ERR(folio);
1040 			}
1041 			folio_lock(folio);
1042 			buf = (u8 *)kmap_local_folio(folio, 0) + folio_ofs;
1043 			bit = data_pos & 7; /* @bit is relative to the byte-aligned data_pos */
1044 			data_pos &= ~7ull;
1045 			ntfs_debug("Before inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx",
1046 					size, data_pos, bit);
1047 			for (; bit < size && data_pos + bit < pass_end;
1048 					bit &= ~7ull, bit += 8) {
1049 				/*
1050 				 * If we're extending $MFT and have gone past bit 400,
1051 				 * give up since there is no guarantee that the found
1052 				 * record will be accessible.  NOTE(review): origin of 400?
1053 				 */
1054 				if (base_ni && base_ni->mft_no == FILE_MFT && bit > 400) {
1055 					folio_unlock(folio);
1056 					kunmap_local(buf);
1057 					folio_put(folio);
1058 					return -ENOSPC;
1059 				}
1060 
1061 				byte = buf + (bit >> 3);
1062 				if (*byte == 0xff) /* all eight bits set, nothing free here */
1063 					continue;
1064 				b = ffz((unsigned long)*byte); /* first zero bit in the byte */
1065 				if (b < 8 && b >= (bit & 7)) { /* must not lie below the start bit */
1066 					ll = data_pos + (bit & ~7ull) + b;
1067 					if (unlikely(ll >= (1ll << 32))) { /* must fit in 32 bits */
1068 						folio_unlock(folio);
1069 						kunmap_local(buf);
1070 						folio_put(folio);
1071 						return -ENOSPC;
1072 					}
1073 					*byte |= 1 << b; /* claim the record */
1074 					folio_mark_dirty(folio);
1075 					folio_unlock(folio);
1076 					kunmap_local(buf);
1077 					folio_put(folio);
1078 					ntfs_debug("Done.  (Found and allocated mft record 0x%llx.)",
1079 							ll);
1080 					return ll;
1081 				}
1082 			}
1083 			ntfs_debug("After inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx",
1084 					size, data_pos, bit);
1085 			data_pos += size;
1086 			folio_unlock(folio);
1087 			kunmap_local(buf);
1088 			folio_put(folio);
1089 			/*
1090 			 * If the end of the pass has not been reached yet,
1091 			 * continue searching the mft bitmap for a zero bit.
1092 			 */
1093 			if (data_pos < pass_end)
1094 				continue;
1095 		}
1096 		/* Do the next pass. */
1097 		if (++pass == 2) {
1098 			/*
1099 			 * Starting the second pass, in which we scan the first
1100 			 * part of the zone which we omitted earlier.
1101 			 */
1102 			pass_end = pass_start;
1103 			data_pos = pass_start = RESERVED_MFT_RECORDS;
1104 			ntfs_debug("pass %i, pass_start 0x%llx, pass_end 0x%llx.",
1105 					pass, pass_start, pass_end);
1106 			if (data_pos >= pass_end)
1107 				break;
1108 		}
1109 	}
1110 	/* No free mft records in currently initialized mft bitmap. */
1111 	ntfs_debug("Done.  (No free mft records left in currently initialized mft bitmap.)");
1112 	return -ENOSPC;
1113 }
1114 
1115 /*
1116  * ntfs_mft_attr_extend - add an attribute list or update the mapping pairs
1117  * @ni:	ntfs inode whose mapping pairs are to be updated
1118  *
1119  * If the base inode has no attribute list yet, add one and return -EAGAIN
1120  * without touching the mapping pairs; otherwise update the mapping pairs.
1121  *
1122  * Return 0 on success, -EAGAIN if the attribute list was just added, and
1123  * another -errno on error.
1124  */
1125 static int ntfs_mft_attr_extend(struct ntfs_inode *ni)
1126 {
1127 	struct ntfs_inode *base_ni = NInoAttr(ni) ? ni->ext.base_ntfs_ino : ni;
1128 	int err;
1129 
1130 	if (!NInoAttrList(base_ni)) {
1131 		err = ntfs_inode_add_attrlist(base_ni);
1132 		if (!err)
1133 			return -EAGAIN;
1134 		pr_err("Can not add attrlist\n");
1135 		return err;
1136 	}
1137 
1138 	err = ntfs_attr_update_mapping_pairs(ni, 0);
1139 	if (err)
1140 		pr_err("MP update failed\n");
1141 	return err;
1142 }
1143 
1144 /*
1145  * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
1146  * @vol:	volume on which to extend the mft bitmap attribute
1147  *
1148  * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster: take
1149  * the cluster right after its last run if free, else allocate from DATA_ZONE.
1150  * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1151  * data_size.  On failure the runlist and lcn bitmap changes are rolled back;
1152  * if part of the rollback fails NVolSetErrors() is called on @vol.
1153  * Return 0 on success and -errno on error.
1154  *
1155  * Locking: - Caller must hold vol->mftbmp_lock for writing.
1156  *	    - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
1157  *	      writing and releases it before returning.
1158  *	    - This function takes vol->lcnbmp_lock for writing and releases it
1159  *	      before returning.
1160  */
1161 static int ntfs_mft_bitmap_extend_allocation_nolock(struct ntfs_volume *vol)
1162 {
1163 	s64 lcn;
1164 	s64 ll;
1165 	unsigned long flags;
1166 	struct folio *folio;
1167 	struct ntfs_inode *mft_ni, *mftbmp_ni;
1168 	struct runlist_element *rl, *rl2 = NULL;
1169 	struct ntfs_attr_search_ctx *ctx = NULL;
1170 	struct mft_record *mrec;
1171 	struct attr_record *a = NULL;
1172 	int ret, mp_size;
1173 	u32 old_alen = 0; /* attribute record length before the resize */
1174 	u8 *b, tb;
1175 	struct {
1176 		u8 added_cluster:1; /* last run was lengthened by one cluster */
1177 		u8 added_run:1; /* a newly allocated run was merged in */
1178 		u8 mp_rebuilt:1; /* attribute record was resized for new mapping pairs */
1179 		u8 mp_extended:1; /* ntfs_mft_attr_extend() failed, not with -EAGAIN */
1180 	} status = { 0, 0, 0, 0 };
1181 	size_t new_rl_count;
1182 
1183 	ntfs_debug("Extending mft bitmap allocation.");
1184 	mft_ni = NTFS_I(vol->mft_ino);
1185 	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
1186 	/*
1187 	 * Determine the last lcn of the mft bitmap.  The allocated size of the
1188 	 * mft bitmap cannot be zero so we are ok to do this.
1189 	 */
1190 	down_write(&mftbmp_ni->runlist.lock);
1191 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1192 	ll = mftbmp_ni->allocated_size;
1193 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1194 	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
1195 			NTFS_B_TO_CLU(vol, ll - 1), NULL);
1196 	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
1197 		up_write(&mftbmp_ni->runlist.lock);
1198 		ntfs_error(vol->sb,
1199 			"Failed to determine last allocated cluster of mft bitmap attribute.");
1200 		if (!IS_ERR(rl))
1201 			ret = -EIO;
1202 		else
1203 			ret = PTR_ERR(rl);
1204 		return ret;
1205 	}
1206 	lcn = rl->lcn + rl->length; /* first cluster after the last run */
1207 	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
1208 			(long long)lcn);
1209 	/*
1210 	 * Attempt to get the cluster following the last allocated cluster by
1211 	 * hand as it may be in the MFT zone so the allocator would not give it
1212 	 * to us.
1213 	 */
1214 	ll = lcn >> 3; /* byte offset of @lcn in the lcn bitmap */
1215 	folio = read_mapping_folio(vol->lcnbmp_ino->i_mapping,
1216 			ll >> PAGE_SHIFT, NULL);
1217 	if (IS_ERR(folio)) {
1218 		up_write(&mftbmp_ni->runlist.lock);
1219 		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
1220 		return PTR_ERR(folio);
1221 	}
1222 
1223 	down_write(&vol->lcnbmp_lock);
1224 	folio_lock(folio);
1225 	b = (u8 *)kmap_local_folio(folio, 0) + (ll & ~PAGE_MASK);
1226 	tb = 1 << (lcn & 7ull); /* mask of @lcn within its bitmap byte */
1227 	if (*b != 0xff && !(*b & tb)) {
1228 		/* Next cluster is free, allocate it directly in the lcn bitmap. */
1229 		*b |= tb;
1230 		folio_mark_dirty(folio);
1231 		folio_unlock(folio);
1232 		kunmap_local(b);
1233 		folio_put(folio);
1234 		up_write(&vol->lcnbmp_lock);
1235 		/* Update the mft bitmap runlist.  NOTE(review): free cluster count not adjusted here? */
1236 		rl->length++;
1237 		rl[1].vcn++;
1238 		status.added_cluster = 1;
1239 		ntfs_debug("Appending one cluster to mft bitmap.");
1240 	} else {
1241 		folio_unlock(folio);
1242 		kunmap_local(b);
1243 		folio_put(folio);
1244 		up_write(&vol->lcnbmp_lock);
1245 		/* Allocate a cluster from the DATA_ZONE. */
1246 		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
1247 				true, false, false);
1248 		if (IS_ERR(rl2)) {
1249 			up_write(&mftbmp_ni->runlist.lock);
1250 			ntfs_error(vol->sb,
1251 					"Failed to allocate a cluster for the mft bitmap.");
1252 			return PTR_ERR(rl2);
1253 		}
1254 		rl = ntfs_runlists_merge(&mftbmp_ni->runlist, rl2, 0, &new_rl_count);
1255 		if (IS_ERR(rl)) {
1256 			up_write(&mftbmp_ni->runlist.lock);
1257 			ntfs_error(vol->sb, "Failed to merge runlists for mft bitmap.");
1258 			if (ntfs_cluster_free_from_rl(vol, rl2)) {
1259 				ntfs_error(vol->sb, "Failed to deallocate allocated cluster.%s",
1260 						es);
1261 				NVolSetErrors(vol);
1262 			}
1263 			kvfree(rl2);
1264 			return PTR_ERR(rl);
1265 		}
1266 		mftbmp_ni->runlist.rl = rl;
1267 		mftbmp_ni->runlist.count = new_rl_count;
1268 		status.added_run = 1;
1269 		ntfs_debug("Adding one run to mft bitmap.");
1270 		/* Find the last run in the new runlist. */
1271 		for (; rl[1].length; rl++)
1272 			;
1273 	}
1274 	/*
1275 	 * Update the attribute record as well.  Note: @rl is the last
1276 	 * (non-terminator) runlist element of mft bitmap.
1277 	 */
1278 	mrec = map_mft_record(mft_ni);
1279 	if (IS_ERR(mrec)) {
1280 		ntfs_error(vol->sb, "Failed to map mft record.");
1281 		ret = PTR_ERR(mrec);
1282 		goto undo_alloc;
1283 	}
1284 	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1285 	if (unlikely(!ctx)) {
1286 		ntfs_error(vol->sb, "Failed to get search context.");
1287 		ret = -ENOMEM;
1288 		goto undo_alloc;
1289 	}
1290 	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1291 			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1292 			0, ctx);
1293 	if (unlikely(ret)) {
1294 		ntfs_error(vol->sb,
1295 			"Failed to find last attribute extent of mft bitmap attribute.");
1296 		if (ret == -ENOENT)
1297 			ret = -EIO;
1298 		goto undo_alloc;
1299 	}
1300 	a = ctx->attr;
1301 	ll = le64_to_cpu(a->data.non_resident.lowest_vcn);
1302 	/* Search back for the run containing the lowest_vcn of this attribute extent. */
1303 	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
1304 		if (ll >= rl2->vcn)
1305 			break;
1306 	}
1307 	WARN_ON(ll < rl2->vcn);
1308 	WARN_ON(ll >= rl2->vcn + rl2->length);
1309 	/* Get the size for the new mapping pairs array for this extent. */
1310 	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1);
1311 	if (unlikely(mp_size <= 0)) {
1312 		ntfs_error(vol->sb,
1313 			"Get size for mapping pairs failed for mft bitmap attribute extent.");
1314 		ret = mp_size;
1315 		if (!ret)
1316 			ret = -EIO;
1317 		goto undo_alloc;
1318 	}
1319 	/* Expand the attribute record if necessary. */
1320 	old_alen = le32_to_cpu(a->length);
1321 	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1322 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1323 	if (unlikely(ret)) { /* resize failed: fall back to ntfs_mft_attr_extend() */
1324 		ret = ntfs_mft_attr_extend(mftbmp_ni);
1325 		if (!ret)
1326 			goto extended_ok;
1327 		if (ret != -EAGAIN)
1328 			status.mp_extended = 1;
1329 		goto undo_alloc;
1330 	}
1331 	status.mp_rebuilt = 1;
1332 	/* Generate the mapping pairs array directly into the attr record. */
1333 	ret = ntfs_mapping_pairs_build(vol, (u8 *)a +
1334 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1335 			mp_size, rl2, ll, -1, NULL, NULL, NULL);
1336 	if (unlikely(ret)) {
1337 		ntfs_error(vol->sb,
1338 			"Failed to build mapping pairs array for mft bitmap attribute.");
1339 		goto undo_alloc;
1340 	}
1341 	/* Update the highest_vcn. */
1342 	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1);
1343 	/*
1344 	 * We now have extended the mft bitmap allocated_size by one cluster.
1345 	 * Reflect this in the struct ntfs_inode structure and the attribute record.
1346 	 */
1347 	if (a->data.non_resident.lowest_vcn) {
1348 		/*
1349 		 * We are not in the first attribute extent, switch to it, but
1350 		 * first ensure the changes will make it to disk later.
1351 		 */
1352 		mark_mft_record_dirty(ctx->ntfs_ino);
1353 extended_ok:
1354 		ntfs_attr_reinit_search_ctx(ctx);
1355 		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1356 				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
1357 				0, ctx);
1358 		if (unlikely(ret)) {
1359 			ntfs_error(vol->sb,
1360 				"Failed to find first attribute extent of mft bitmap attribute.");
1361 			goto restore_undo_alloc;
1362 		}
1363 		a = ctx->attr;
1364 	}
1365 
1366 	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1367 	mftbmp_ni->allocated_size += vol->cluster_size;
1368 	a->data.non_resident.allocated_size =
1369 			cpu_to_le64(mftbmp_ni->allocated_size);
1370 	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1371 	/* Ensure the changes make it to disk. */
1372 	mark_mft_record_dirty(ctx->ntfs_ino);
1373 	ntfs_attr_put_search_ctx(ctx);
1374 	unmap_mft_record(mft_ni);
1375 	up_write(&mftbmp_ni->runlist.lock);
1376 	ntfs_debug("Done.");
1377 	return 0;
1378 	/* Go back to the last attribute extent so that its highest_vcn can be reset. */
1379 restore_undo_alloc:
1380 	ntfs_attr_reinit_search_ctx(ctx);
1381 	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1382 			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
1383 			0, ctx)) {
1384 		ntfs_error(vol->sb,
1385 			"Failed to find last attribute extent of mft bitmap attribute.%s", es);
1386 		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1387 		mftbmp_ni->allocated_size += vol->cluster_size;
1388 		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1389 		ntfs_attr_put_search_ctx(ctx);
1390 		unmap_mft_record(mft_ni);
1391 		up_write(&mftbmp_ni->runlist.lock);
1392 		/*
1393 		 * The only thing that is now wrong is ->allocated_size of the
1394 		 * base attribute extent which chkdsk should be able to fix.
1395 		 */
1396 		NVolSetErrors(vol);
1397 		return ret;
1398 	}
1399 	a = ctx->attr;
1400 	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 2); /* value before the one cluster was added */
1401 undo_alloc:
1402 	if (status.added_cluster) {
1403 		/* Truncate the last run in the runlist by one cluster. */
1404 		rl->length--;
1405 		rl[1].vcn--;
1406 	} else if (status.added_run) {
1407 		lcn = rl->lcn;
1408 		/* Remove the last run from the runlist. */
1409 		rl->lcn = rl[1].lcn;
1410 		rl->length = 0;
1411 		mftbmp_ni->runlist.count--;
1412 	}
1413 	/* Deallocate the cluster. */
1414 	down_write(&vol->lcnbmp_lock);
1415 	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1416 		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
1417 		NVolSetErrors(vol);
1418 	} else
1419 		ntfs_inc_free_clusters(vol, 1);
1420 	up_write(&vol->lcnbmp_lock);
1421 	if (status.mp_rebuilt) { /* put back the old mapping pairs and record length */
1422 		if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
1423 				a->data.non_resident.mapping_pairs_offset),
1424 				old_alen - le16_to_cpu(
1425 				a->data.non_resident.mapping_pairs_offset),
1426 				rl2, ll, -1, NULL, NULL, NULL)) {
1427 			ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es);
1428 			NVolSetErrors(vol);
1429 		}
1430 		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1431 			ntfs_error(vol->sb, "Failed to restore attribute record.%s", es);
1432 			NVolSetErrors(vol);
1433 		}
1434 		mark_mft_record_dirty(ctx->ntfs_ino);
1435 	} else if (status.mp_extended && ntfs_attr_update_mapping_pairs(mftbmp_ni, 0)) {
1436 		ntfs_error(vol->sb, "Failed to restore mapping pairs.%s", es);
1437 		NVolSetErrors(vol);
1438 	}
1439 	if (ctx)
1440 		ntfs_attr_put_search_ctx(ctx);
1441 	if (!IS_ERR(mrec))
1442 		unmap_mft_record(mft_ni);
1443 	up_write(&mftbmp_ni->runlist.lock);
1444 	return ret;
1445 }
1446 
1447 /*
1448  * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1449  * @vol:	volume on which to extend the mft bitmap attribute
1450  *
1451  * Extend the initialized portion of the mft bitmap attribute on the ntfs
1452  * volume @vol by 8 bytes, zero them and account 64 more free mft records.
1453  * If zeroing fails, an attempt is made to restore the previous sizes.
1454  * Note:  Only changes initialized_size and data_size, i.e. requires that
1455  * allocated_size is big enough to fit the new initialized_size.
1456  *
1457  * Return 0 on success and -errno on error.
1458  *
1459  * Locking: Caller must hold vol->mftbmp_lock for writing.
1460  */
1461 static int ntfs_mft_bitmap_extend_initialized_nolock(struct ntfs_volume *vol)
1462 {
1463 	s64 old_data_size, old_initialized_size;
1464 	unsigned long flags;
1465 	struct inode *mftbmp_vi;
1466 	struct ntfs_inode *mft_ni, *mftbmp_ni;
1467 	struct ntfs_attr_search_ctx *ctx;
1468 	struct mft_record *mrec;
1469 	struct attr_record *a;
1470 	int ret;
1471 
1472 	ntfs_debug("Extending mft bitmap initialized (and data) size.");
1473 	mft_ni = NTFS_I(vol->mft_ino);
1474 	mftbmp_vi = vol->mftbmp_ino;
1475 	mftbmp_ni = NTFS_I(mftbmp_vi);
1476 	/* Get the attribute record. */
1477 	mrec = map_mft_record(mft_ni);
1478 	if (IS_ERR(mrec)) {
1479 		ntfs_error(vol->sb, "Failed to map mft record.");
1480 		return PTR_ERR(mrec);
1481 	}
1482 	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1483 	if (unlikely(!ctx)) {
1484 		ntfs_error(vol->sb, "Failed to get search context.");
1485 		ret = -ENOMEM;
1486 		goto unm_err_out;
1487 	}
1488 	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1489 			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
1490 	if (unlikely(ret)) {
1491 		ntfs_error(vol->sb,
1492 			"Failed to find first attribute extent of mft bitmap attribute.");
1493 		if (ret == -ENOENT)
1494 			ret = -EIO;
1495 		goto put_err_out;
1496 	}
1497 	a = ctx->attr;
1498 	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1499 	old_data_size = i_size_read(mftbmp_vi);
1500 	old_initialized_size = mftbmp_ni->initialized_size;
1501 	/*
1502 	 * We can simply update the initialized_size before filling the space
1503 	 * with zeroes because the caller is holding the mft bitmap lock for
1504 	 * writing which ensures that no one else is trying to access the data.
1505 	 */
1506 	mftbmp_ni->initialized_size += 8;
1507 	a->data.non_resident.initialized_size =
1508 			cpu_to_le64(mftbmp_ni->initialized_size);
1509 	if (mftbmp_ni->initialized_size > old_data_size) { /* grow data_size along with it */
1510 		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
1511 		a->data.non_resident.data_size =
1512 				cpu_to_le64(mftbmp_ni->initialized_size);
1513 	}
1514 	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1515 	/* Ensure the changes make it to disk. */
1516 	mark_mft_record_dirty(ctx->ntfs_ino);
1517 	ntfs_attr_put_search_ctx(ctx);
1518 	unmap_mft_record(mft_ni);
1519 	/* Initialize the mft bitmap attribute value with zeroes. */
1520 	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1521 	if (likely(!ret)) {
1522 		ntfs_debug("Done.  (Wrote eight initialized bytes to mft bitmap.");
1523 		ntfs_inc_free_mft_records(vol, 8 * 8); /* 8 bytes of bitmap == 64 records */
1524 		return 0;
1525 	}
1526 	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1527 	/* Try to recover from the error by restoring the old sizes. */
1528 	mrec = map_mft_record(mft_ni);
1529 	if (IS_ERR(mrec)) {
1530 		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1531 		NVolSetErrors(vol);
1532 		return ret;
1533 	}
1534 	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1535 	if (unlikely(!ctx)) {
1536 		ntfs_error(vol->sb, "Failed to get search context.%s", es);
1537 		NVolSetErrors(vol);
1538 		goto unm_err_out;
1539 	}
1540 	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1541 			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
1542 		ntfs_error(vol->sb,
1543 			"Failed to find first attribute extent of mft bitmap attribute.%s", es);
1544 		NVolSetErrors(vol);
1545 put_err_out: /* also reached by goto from the failure paths above */
1546 		ntfs_attr_put_search_ctx(ctx);
1547 unm_err_out:
1548 		unmap_mft_record(mft_ni);
1549 		goto err_out;
1550 	}
1551 	a = ctx->attr;
1552 	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1553 	mftbmp_ni->initialized_size = old_initialized_size;
1554 	a->data.non_resident.initialized_size =
1555 			cpu_to_le64(old_initialized_size);
1556 	if (i_size_read(mftbmp_vi) != old_data_size) { /* data_size was grown above, shrink it back */
1557 		i_size_write(mftbmp_vi, old_data_size);
1558 		a->data.non_resident.data_size = cpu_to_le64(old_data_size);
1559 	}
1560 	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1561 	mark_mft_record_dirty(ctx->ntfs_ino);
1562 	ntfs_attr_put_search_ctx(ctx);
1563 	unmap_mft_record(mft_ni);
1564 #ifdef DEBUG
1565 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1566 	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
1567 			mftbmp_ni->allocated_size, i_size_read(mftbmp_vi),
1568 			mftbmp_ni->initialized_size);
1569 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1570 #endif /* DEBUG */
1571 err_out: /* @ret still holds the error that caused the failure */
1572 	return ret;
1573 }
1574 
1575 /*
1576  * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1577  * @vol:	volume on which to extend the mft data attribute
1578  *
1579  * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1580  * worth of clusters or if not enough space for this by one mft record worth
1581  * of clusters.
1582  *
1583  * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
1584  * data_size.
1585  *
1586  * Return 0 on success and -errno on error.
1587  *
1588  * Locking: - Caller must hold vol->mftbmp_lock for writing.
1589  *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1590  *	      writing and releases it before returning.
1591  *	    - This function calls functions which take vol->lcnbmp_lock for
1592  *	      writing and release it before returning.
1593  */
1594 static int ntfs_mft_data_extend_allocation_nolock(struct ntfs_volume *vol)
1595 {
1596 	s64 lcn;
1597 	s64 old_last_vcn;
1598 	s64 min_nr, nr, ll;
1599 	unsigned long flags;
1600 	struct ntfs_inode *mft_ni;
1601 	struct runlist_element *rl, *rl2;
1602 	struct ntfs_attr_search_ctx *ctx = NULL;
1603 	struct mft_record *mrec;
1604 	struct attr_record *a = NULL;
1605 	int ret, mp_size;
1606 	u32 old_alen = 0;
1607 	bool mp_rebuilt = false, mp_extended = false;
1608 	size_t new_rl_count;
1609 
1610 	ntfs_debug("Extending mft data allocation.");
1611 	mft_ni = NTFS_I(vol->mft_ino);
1612 	/*
1613 	 * Determine the preferred allocation location, i.e. the last lcn of
1614 	 * the mft data attribute.  The allocated size of the mft data
1615 	 * attribute cannot be zero so we are ok to do this.
1616 	 */
1617 	down_write(&mft_ni->runlist.lock);
1618 	read_lock_irqsave(&mft_ni->size_lock, flags);
1619 	ll = mft_ni->allocated_size;
1620 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1621 	rl = ntfs_attr_find_vcn_nolock(mft_ni,
1622 			NTFS_B_TO_CLU(vol, ll - 1), NULL);
1623 	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
1624 		up_write(&mft_ni->runlist.lock);
1625 		ntfs_error(vol->sb,
1626 			"Failed to determine last allocated cluster of mft data attribute.");
1627 		if (!IS_ERR(rl))
1628 			ret = -EIO;
1629 		else
1630 			ret = PTR_ERR(rl);
1631 		return ret;
1632 	}
1633 	lcn = rl->lcn + rl->length;
1634 	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", lcn);
1635 	/* Minimum allocation is one mft record worth of clusters. */
1636 	min_nr = NTFS_B_TO_CLU(vol, vol->mft_record_size);
1637 	if (!min_nr)
1638 		min_nr = 1;
1639 	/* Want to allocate 16 mft records worth of clusters. */
1640 	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
1641 	if (!nr)
1642 		nr = min_nr;
1643 	/* Ensure we do not go above 2^32-1 mft records. */
1644 	read_lock_irqsave(&mft_ni->size_lock, flags);
1645 	ll = mft_ni->allocated_size;
1646 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1647 	if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >>
1648 			vol->mft_record_size_bits >= (1ll << 32))) {
1649 		nr = min_nr;
1650 		if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >>
1651 				vol->mft_record_size_bits >= (1ll << 32))) {
1652 			ntfs_warning(vol->sb,
1653 				"Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached.");
1654 			up_write(&mft_ni->runlist.lock);
1655 			return -ENOSPC;
1656 		}
1657 	}
1658 	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
1659 			nr > min_nr ? "default" : "minimal", (long long)nr);
1660 	old_last_vcn = rl[1].vcn;
1661 	/*
1662 	 * We can release the mft_ni runlist lock because this function is
1663 	 * the only one that extends the $MFT data attribute and is called
1664 	 * with mft_ni->mrec_lock held.
1665 	 * This is required for the lock order, vol->lcnbmp_lock =>
1666 	 * mft_ni->runlist.lock.
1667 	 */
1668 	up_write(&mft_ni->runlist.lock);
1669 
1670 	do {
1671 		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
1672 				true, false, false);
1673 		if (!IS_ERR(rl2))
1674 			break;
1675 		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
1676 			ntfs_error(vol->sb,
1677 				"Failed to allocate the minimal number of clusters (%lli) for the mft data attribute.",
1678 				nr);
1679 			return PTR_ERR(rl2);
1680 		}
1681 		/*
1682 		 * There is not enough space to do the allocation, but there
1683 		 * might be enough space to do a minimal allocation so try that
1684 		 * before failing.
1685 		 */
1686 		nr = min_nr;
1687 		ntfs_debug("Retrying mft data allocation with minimal cluster count %lli.", nr);
1688 	} while (1);
1689 
1690 	down_write(&mft_ni->runlist.lock);
1691 	rl = ntfs_runlists_merge(&mft_ni->runlist, rl2, 0, &new_rl_count);
1692 	if (IS_ERR(rl)) {
1693 		up_write(&mft_ni->runlist.lock);
1694 		ntfs_error(vol->sb, "Failed to merge runlists for mft data attribute.");
1695 		if (ntfs_cluster_free_from_rl(vol, rl2)) {
1696 			ntfs_error(vol->sb,
1697 				"Failed to deallocate clusters from the mft data attribute.%s", es);
1698 			NVolSetErrors(vol);
1699 		}
1700 		kvfree(rl2);
1701 		return PTR_ERR(rl);
1702 	}
1703 	mft_ni->runlist.rl = rl;
1704 	mft_ni->runlist.count = new_rl_count;
1705 	ntfs_debug("Allocated %lli clusters.", (long long)nr);
1706 	/* Find the last run in the new runlist. */
1707 	for (; rl[1].length; rl++)
1708 		;
1709 	up_write(&mft_ni->runlist.lock);
1710 
1711 	/* Update the attribute record as well. */
1712 	mrec = map_mft_record(mft_ni);
1713 	if (IS_ERR(mrec)) {
1714 		ntfs_error(vol->sb, "Failed to map mft record.");
1715 		ret = PTR_ERR(mrec);
1716 		down_write(&mft_ni->runlist.lock);
1717 		goto undo_alloc;
1718 	}
1719 	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1720 	if (unlikely(!ctx)) {
1721 		ntfs_error(vol->sb, "Failed to get search context.");
1722 		ret = -ENOMEM;
1723 		goto undo_alloc;
1724 	}
1725 	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1726 			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
1727 	if (unlikely(ret)) {
1728 		ntfs_error(vol->sb, "Failed to find last attribute extent of mft data attribute.");
1729 		if (ret == -ENOENT)
1730 			ret = -EIO;
1731 		goto undo_alloc;
1732 	}
1733 	a = ctx->attr;
1734 	ll = le64_to_cpu(a->data.non_resident.lowest_vcn);
1735 
1736 	down_write(&mft_ni->runlist.lock);
1737 	/* Search back for the previous last allocated cluster of mft data. */
1738 	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
1739 		if (ll >= rl2->vcn)
1740 			break;
1741 	}
1742 	WARN_ON(ll < rl2->vcn);
1743 	WARN_ON(ll >= rl2->vcn + rl2->length);
1744 	/* Get the size for the new mapping pairs array for this extent. */
1745 	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1);
1746 	if (unlikely(mp_size <= 0)) {
1747 		ntfs_error(vol->sb,
1748 			"Get size for mapping pairs failed for mft data attribute extent.");
1749 		ret = mp_size;
1750 		if (!ret)
1751 			ret = -EIO;
1752 		up_write(&mft_ni->runlist.lock);
1753 		goto undo_alloc;
1754 	}
1755 	up_write(&mft_ni->runlist.lock);
1756 
1757 	/* Expand the attribute record if necessary. */
1758 	old_alen = le32_to_cpu(a->length);
1759 	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1760 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1761 	if (unlikely(ret)) {
1762 		ret = ntfs_mft_attr_extend(mft_ni);
1763 		if (!ret)
1764 			goto extended_ok;
1765 		if (ret != -EAGAIN)
1766 			mp_extended = true;
1767 		goto undo_alloc;
1768 	}
1769 	mp_rebuilt = true;
1770 	/* Generate the mapping pairs array directly into the attr record. */
1771 	ret = ntfs_mapping_pairs_build(vol, (u8 *)a +
1772 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1773 			mp_size, rl2, ll, -1, NULL, NULL, NULL);
1774 	if (unlikely(ret)) {
1775 		ntfs_error(vol->sb, "Failed to build mapping pairs array of mft data attribute.");
1776 		goto undo_alloc;
1777 	}
1778 	/* Update the highest_vcn. */
1779 	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1);
1780 	/*
1781 	 * We now have extended the mft data allocated_size by nr clusters.
1782 	 * Reflect this in the struct ntfs_inode structure and the attribute record.
1783 	 * @rl is the last (non-terminator) runlist element of mft data
1784 	 * attribute.
1785 	 */
1786 	if (a->data.non_resident.lowest_vcn) {
1787 		/*
1788 		 * We are not in the first attribute extent, switch to it, but
1789 		 * first ensure the changes will make it to disk later.
1790 		 */
1791 		mark_mft_record_dirty(ctx->ntfs_ino);
1792 extended_ok:
1793 		ntfs_attr_reinit_search_ctx(ctx);
1794 		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
1795 				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
1796 				ctx);
1797 		if (unlikely(ret)) {
1798 			ntfs_error(vol->sb,
1799 				"Failed to find first attribute extent of mft data attribute.");
1800 			goto restore_undo_alloc;
1801 		}
1802 		a = ctx->attr;
1803 	}
1804 
1805 	write_lock_irqsave(&mft_ni->size_lock, flags);
1806 	mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr);
1807 	a->data.non_resident.allocated_size =
1808 			cpu_to_le64(mft_ni->allocated_size);
1809 	write_unlock_irqrestore(&mft_ni->size_lock, flags);
1810 	/* Ensure the changes make it to disk. */
1811 	mark_mft_record_dirty(ctx->ntfs_ino);
1812 	ntfs_attr_put_search_ctx(ctx);
1813 	unmap_mft_record(mft_ni);
1814 	ntfs_debug("Done.");
1815 	return 0;
1816 restore_undo_alloc:
1817 	ntfs_attr_reinit_search_ctx(ctx);
1818 	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1819 			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
1820 		ntfs_error(vol->sb,
1821 			"Failed to find last attribute extent of mft data attribute.%s", es);
1822 		write_lock_irqsave(&mft_ni->size_lock, flags);
1823 		mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr);
1824 		write_unlock_irqrestore(&mft_ni->size_lock, flags);
1825 		ntfs_attr_put_search_ctx(ctx);
1826 		unmap_mft_record(mft_ni);
1827 		up_write(&mft_ni->runlist.lock);
1828 		/*
1829 		 * The only thing that is now wrong is ->allocated_size of the
1830 		 * base attribute extent which chkdsk should be able to fix.
1831 		 */
1832 		NVolSetErrors(vol);
1833 		return ret;
1834 	}
1835 	ctx->attr->data.non_resident.highest_vcn =
1836 			cpu_to_le64(old_last_vcn - 1);
1837 undo_alloc:
1838 	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
1839 		ntfs_error(vol->sb, "Failed to free clusters from mft data attribute.%s", es);
1840 		NVolSetErrors(vol);
1841 	}
1842 
1843 	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1844 		ntfs_error(vol->sb, "Failed to truncate mft data attribute runlist.%s", es);
1845 		NVolSetErrors(vol);
1846 	}
1847 	if (mp_extended && ntfs_attr_update_mapping_pairs(mft_ni, 0)) {
1848 		ntfs_error(vol->sb, "Failed to restore mapping pairs.%s",
1849 			   es);
1850 		NVolSetErrors(vol);
1851 	}
1852 	if (ctx) {
1853 		a = ctx->attr;
1854 		if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
1855 			if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
1856 				a->data.non_resident.mapping_pairs_offset),
1857 				old_alen - le16_to_cpu(
1858 					a->data.non_resident.mapping_pairs_offset),
1859 				rl2, ll, -1, NULL, NULL, NULL)) {
1860 				ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es);
1861 				NVolSetErrors(vol);
1862 			}
1863 			if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1864 				ntfs_error(vol->sb, "Failed to restore attribute record.%s", es);
1865 				NVolSetErrors(vol);
1866 			}
1867 			mark_mft_record_dirty(ctx->ntfs_ino);
1868 		} else if (IS_ERR(ctx->mrec)) {
1869 			ntfs_error(vol->sb, "Failed to restore attribute search context.%s", es);
1870 			NVolSetErrors(vol);
1871 		}
1872 		ntfs_attr_put_search_ctx(ctx);
1873 	}
1874 	if (!IS_ERR(mrec))
1875 		unmap_mft_record(mft_ni);
1876 	return ret;
1877 }
1878 
1879 /*
1880  * ntfs_mft_record_layout - layout an mft record into a memory buffer
1881  * @vol:	volume to which the mft record will belong
1882  * @mft_no:	mft reference specifying the mft record number
1883  * @m:		destination buffer of size >= @vol->mft_record_size bytes
1884  *
1885  * Layout an empty, unused mft record with the mft record number @mft_no into
1886  * the buffer @m.  The volume @vol is needed because the mft record structure
1887  * was modified in NTFS 3.1 so we need to know which volume version this mft
1888  * record will be used on.
1889  *
1890  * Return 0 on success and -errno on error.
1891  */
1892 static int ntfs_mft_record_layout(const struct ntfs_volume *vol, const s64 mft_no,
1893 		struct mft_record *m)
1894 {
1895 	struct attr_record *a;
1896 
1897 	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
1898 	if (mft_no >= (1ll << 32)) {
1899 		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds maximum of 2^32.",
1900 				(long long)mft_no);
1901 		return -ERANGE;
1902 	}
1903 	/* Start by clearing the whole mft record to gives us a clean slate. */
1904 	memset(m, 0, vol->mft_record_size);
1905 	/* Aligned to 2-byte boundary. */
1906 	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
1907 		m->usa_ofs = cpu_to_le16((sizeof(struct mft_record_old) + 1) & ~1);
1908 	else {
1909 		m->usa_ofs = cpu_to_le16((sizeof(struct mft_record) + 1) & ~1);
1910 		/*
1911 		 * Set the NTFS 3.1+ specific fields while we know that the
1912 		 * volume version is 3.1+.
1913 		 */
1914 		m->reserved = 0;
1915 		m->mft_record_number = cpu_to_le32((u32)mft_no);
1916 	}
1917 	m->magic = magic_FILE;
1918 	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
1919 		m->usa_count = cpu_to_le16(vol->mft_record_size /
1920 				NTFS_BLOCK_SIZE + 1);
1921 	else {
1922 		m->usa_count = cpu_to_le16(1);
1923 		ntfs_warning(vol->sb,
1924 			"Sector size is bigger than mft record size.  Setting usa_count to 1.  If chkdsk reports this as corruption");
1925 	}
1926 	/* Set the update sequence number to 1. */
1927 	*(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
1928 	m->lsn = 0;
1929 	m->sequence_number = cpu_to_le16(1);
1930 	m->link_count = 0;
1931 	/*
1932 	 * Place the attributes straight after the update sequence array,
1933 	 * aligned to 8-byte boundary.
1934 	 */
1935 	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
1936 			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
1937 	m->flags = 0;
1938 	/*
1939 	 * Using attrs_offset plus eight bytes (for the termination attribute).
1940 	 * attrs_offset is already aligned to 8-byte boundary, so no need to
1941 	 * align again.
1942 	 */
1943 	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
1944 	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
1945 	m->base_mft_record = 0;
1946 	m->next_attr_instance = 0;
1947 	/* Add the termination attribute. */
1948 	a = (struct attr_record *)((u8 *)m + le16_to_cpu(m->attrs_offset));
1949 	a->type = AT_END;
1950 	a->length = 0;
1951 	ntfs_debug("Done.");
1952 	return 0;
1953 }
1954 
1955 /*
1956  * ntfs_mft_record_format - format an mft record on an ntfs volume
1957  * @vol:	volume on which to format the mft record
1958  * @mft_no:	mft record number to format
1959  *
1960  * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
1961  * mft record into the appropriate place of the mft data attribute.  This is
1962  * used when extending the mft data attribute.
1963  *
1964  * Return 0 on success and -errno on error.
1965  */
1966 static int ntfs_mft_record_format(const struct ntfs_volume *vol, const s64 mft_no)
1967 {
1968 	loff_t i_size;
1969 	struct inode *mft_vi = vol->mft_ino;
1970 	struct folio *folio;
1971 	struct mft_record *m;
1972 	pgoff_t index, end_index;
1973 	unsigned int ofs;
1974 	int err;
1975 
1976 	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
1977 	/*
1978 	 * The index into the page cache and the offset within the page cache
1979 	 * page of the wanted mft record.
1980 	 */
1981 	index = NTFS_MFT_NR_TO_PIDX(vol, mft_no);
1982 	ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no);
1983 	/* The maximum valid index into the page cache for $MFT's data. */
1984 	i_size = i_size_read(mft_vi);
1985 	end_index = i_size >> PAGE_SHIFT;
1986 	if (unlikely(index >= end_index)) {
1987 		if (unlikely(index > end_index ||
1988 			     ofs + vol->mft_record_size > (i_size & ~PAGE_MASK))) {
1989 			ntfs_error(vol->sb, "Tried to format non-existing mft record 0x%llx.",
1990 					(long long)mft_no);
1991 			return -ENOENT;
1992 		}
1993 	}
1994 
1995 	/* Read, map, and pin the folio containing the mft record. */
1996 	folio = read_mapping_folio(mft_vi->i_mapping, index, NULL);
1997 	if (IS_ERR(folio)) {
1998 		ntfs_error(vol->sb, "Failed to map page containing mft record to format 0x%llx.",
1999 				(long long)mft_no);
2000 		return PTR_ERR(folio);
2001 	}
2002 	folio_lock(folio);
2003 	folio_clear_uptodate(folio);
2004 	m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs);
2005 	err = ntfs_mft_record_layout(vol, mft_no, m);
2006 	if (unlikely(err)) {
2007 		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
2008 				(long long)mft_no);
2009 		folio_mark_uptodate(folio);
2010 		folio_unlock(folio);
2011 		kunmap_local(m);
2012 		folio_put(folio);
2013 		return err;
2014 	}
2015 	pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size);
2016 	folio_mark_uptodate(folio);
2017 	/*
2018 	 * Make sure the mft record is written out to disk.  We could use
2019 	 * ilookup5() to check if an inode is in icache and so on but this is
2020 	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
2021 	 */
2022 	ntfs_mft_mark_dirty(folio);
2023 	folio_unlock(folio);
2024 	kunmap_local(m);
2025 	folio_put(folio);
2026 	ntfs_debug("Done.");
2027 	return 0;
2028 }
2029 
2030 /*
2031  * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2032  * @vol:	[IN]  volume on which to allocate the mft record
2033  * @mode:	[IN]  mode if want a file or directory, i.e. base inode or 0
2034  * @ni:		[OUT] on success, set to the allocated ntfs inode
2035  * @base_ni:	[IN]  open base inode if allocating an extent mft record or NULL
2036  * @ni_mrec:	[OUT] on successful return this is the mapped mft record
2037  *
2038  * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2039  *
2040  * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
2041  * directory inode, and allocate it at the default allocator position.  In
2042  * this case @mode is the file mode as given to us by the caller.  We in
2043  * particular use @mode to distinguish whether a file or a directory is being
2044  * created (S_ISREG(mode) and S_ISDIR(mode), respectively).
2045  *
2046  * If @base_ni is not NULL make the allocated mft record an extent record,
2047  * allocate it starting at the mft record after the base mft record and attach
2048  * the allocated and opened ntfs inode to the base inode @base_ni.  In this
2049  * case @mode must be 0 as it is meaningless for extent inodes.
2050  *
2051  * The function returns an int rather than a pointer, so do not use IS_ERR()
2052  * on the return value.  A negative return value is an error code; in that
2053  * case the function failed and *@ni_mrec is undefined.  On success, *@ni is
2054  * the now opened ntfs inode of the allocated mft record and, if @ni_mrec is
2055  * not NULL, *@ni_mrec is set to the allocated mft record (the ->mrec of that
2056  * inode).
2057  *
2058  * Allocation strategy:
2059  *
2060  * To find a free mft record, we scan the mft bitmap for a zero bit.  To
2061  * optimize this we start scanning at the place specified by @base_ni or if
2062  * @base_ni is NULL we start where we last stopped and we perform wrap around
2063  * when we reach the end.  Note, we do not try to allocate mft records below
2064  * number 64 because numbers 0 to 15 are the defined system files anyway and 16
2065  * to 64 are special in that they are used for storing extension mft records
2066  * for the $DATA attribute of $MFT.  This is required to avoid the possibility
2067  * of creating a runlist with a circular dependency which once written to disk
2068  * can never be read in again.  Windows will only use records 16 to 24 for
2069  * normal files if the volume is completely out of space.  We never use them
2070  * which means that when the volume is really out of space we cannot create any
2071  * more files while Windows can still create up to 8 small files.  We can start
2072  * doing this at some later time, it does not matter much for now.
2073  *
2074  * When scanning the mft bitmap, we only search up to the last allocated mft
2075  * record.  If there are no free records left in the range 64 to number of
2076  * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2077  * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
2078  * records at a time or one cluster, if cluster size is above 16kiB.  If there
2079  * is not sufficient space to do this, we try to extend by a single mft record
2080  * or one cluster, if cluster size is above the mft record size.
2081  *
2082  * No matter how many mft records we allocate, we initialize only the first
2083  * allocated mft record, incrementing mft data size and initialized size
2084  * accordingly, open a struct ntfs_inode for it and return it to the caller, unless
2085  * there are less than 64 mft records, in which case we allocate and initialize
2086  * mft records until we reach record 64 which we consider as the first free mft
2087  * record for use by normal files.
2088  *
2089  * If during any stage we overflow the initialized data in the mft bitmap, we
2090  * extend the initialized size (and data size) by 8 bytes, allocating another
2091  * cluster if required.  The bitmap data size has to be at least equal to the
2092  * number of mft records in the mft, but it can be bigger, in which case the
2093  * superfluous bits are padded with zeroes.
2094  *
2095  * Thus, when we return successfully, we will have:
2096  *	- initialized / extended the mft bitmap if necessary,
2097  *	- initialized / extended the mft data if necessary,
2098  *	- set the bit corresponding to the mft record being allocated in the
2099  *	  mft bitmap,
2100  *	- opened a struct ntfs_inode for the allocated mft record, and we will have
2101  *	- returned the struct ntfs_inode as well as the allocated mapped, pinned, and
2102  *	  locked mft record.
2103  *
2104  * On error, the volume will be left in a consistent state and no record will
2105  * be allocated.  If rolling back a partial operation fails, we may leave some
2106  * inconsistent metadata in which case we set NVolErrors() so the volume is
2107  * left dirty when unmounted.
2108  *
2109  * Note, this function cannot make use of most of the normal functions, like
2110  * for example for attribute resizing, etc, because when the run list overflows
2111  * the base mft record and an attribute list is used, it is very important that
2112  * the extension mft records used to store the $DATA attribute of $MFT can be
2113  * reached without having to read the information contained inside them, as
2114  * this would make it impossible to find them in the first place after the
2115  * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
2116  * rule because the bitmap is not essential for finding the mft records, but on
2117  * the other hand, handling the bitmap in this special way would make life
2118  * easier because otherwise there might be circular invocations of functions
2119  * when reading the bitmap.
2120  */
2121 int ntfs_mft_record_alloc(struct ntfs_volume *vol, const int mode,
2122 			  struct ntfs_inode **ni, struct ntfs_inode *base_ni,
2123 			  struct mft_record **ni_mrec)
2124 {
2125 	s64 ll, bit, old_data_initialized, old_data_size;
2126 	unsigned long flags;
2127 	struct folio *folio;
2128 	struct ntfs_inode *mft_ni, *mftbmp_ni;
2129 	struct ntfs_attr_search_ctx *ctx;
2130 	struct mft_record *m = NULL;
2131 	struct attr_record *a;
2132 	pgoff_t index;
2133 	unsigned int ofs;
2134 	int err;
2135 	__le16 seq_no, usn;
2136 	bool record_formatted = false;
2137 	unsigned int memalloc_flags;
2138 
2139 	if (base_ni && *ni)
2140 		return -EINVAL;
2141 
2142 	/* @mode and @base_ni are mutually exclusive. */
2143 	if (mode && base_ni)
2144 		return -EINVAL;
2145 
2146 	if (base_ni)
2147 		ntfs_debug("Entering (allocating an extent mft record for base mft record 0x%llx).",
2148 				(long long)base_ni->mft_no);
2149 	else
2150 		ntfs_debug("Entering (allocating a base mft record).");
2151 
2152 	memalloc_flags = memalloc_nofs_save();
2153 
2154 	mft_ni = NTFS_I(vol->mft_ino);
2155 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2156 		mutex_lock(&mft_ni->mrec_lock);
2157 	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
2158 search_free_rec:
2159 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2160 		down_write(&vol->mftbmp_lock);
2161 	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
2162 	if (bit >= 0) {
2163 		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2164 				(long long)bit);
2165 		goto have_alloc_rec;
2166 	}
2167 	if (bit != -ENOSPC) {
2168 		if (!base_ni || base_ni->mft_no != FILE_MFT) {
2169 			up_write(&vol->mftbmp_lock);
2170 			mutex_unlock(&mft_ni->mrec_lock);
2171 		}
2172 		memalloc_nofs_restore(memalloc_flags);
2173 		return bit;
2174 	}
2175 
2176 	if (base_ni && base_ni->mft_no == FILE_MFT) {
2177 		memalloc_nofs_restore(memalloc_flags);
2178 		return bit;
2179 	}
2180 
2181 	/*
2182 	 * No free mft records left.  If the mft bitmap already covers more
2183 	 * than the currently used mft records, the next records are all free,
2184 	 * so we can simply allocate the first unused mft record.
2185 	 * Note: We also have to make sure that the mft bitmap at least covers
2186 	 * the first 24 mft records as they are special and whilst they may not
2187 	 * be in use, we do not allocate from them.
2188 	 */
2189 	read_lock_irqsave(&mft_ni->size_lock, flags);
2190 	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
2191 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2192 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2193 	old_data_initialized = mftbmp_ni->initialized_size;
2194 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2195 	if (old_data_initialized << 3 > ll &&
2196 	    old_data_initialized > RESERVED_MFT_RECORDS / 8) {
2197 		bit = ll;
2198 		if (bit < RESERVED_MFT_RECORDS)
2199 			bit = RESERVED_MFT_RECORDS;
2200 		if (unlikely(bit >= (1ll << 32)))
2201 			goto max_err_out;
2202 		ntfs_debug("Found free record (#2), bit 0x%llx.",
2203 				(long long)bit);
2204 		goto found_free_rec;
2205 	}
2206 	/*
2207 	 * The mft bitmap needs to be expanded until it covers the first unused
2208 	 * mft record that we can allocate.
2209 	 * Note: The smallest mft record we allocate is mft record 24.
2210 	 */
2211 	bit = old_data_initialized << 3;
2212 	if (unlikely(bit >= (1ll << 32)))
2213 		goto max_err_out;
2214 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2215 	old_data_size = mftbmp_ni->allocated_size;
2216 	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2217 			old_data_size, i_size_read(vol->mftbmp_ino),
2218 			old_data_initialized);
2219 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2220 	if (old_data_initialized + 8 > old_data_size) {
2221 		/* Need to extend bitmap by one more cluster. */
2222 		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2223 		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2224 		if (err == -EAGAIN)
2225 			err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2226 
2227 		if (unlikely(err)) {
2228 			if (!base_ni || base_ni->mft_no != FILE_MFT)
2229 				up_write(&vol->mftbmp_lock);
2230 			goto err_out;
2231 		}
2232 #ifdef DEBUG
2233 		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2234 		ntfs_debug("Status of mftbmp after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2235 				mftbmp_ni->allocated_size,
2236 				i_size_read(vol->mftbmp_ino),
2237 				mftbmp_ni->initialized_size);
2238 		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2239 #endif /* DEBUG */
2240 	}
2241 	/*
2242 	 * We now have sufficient allocated space, extend the initialized_size
2243 	 * as well as the data_size if necessary and fill the new space with
2244 	 * zeroes.
2245 	 */
2246 	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2247 	if (unlikely(err)) {
2248 		if (!base_ni || base_ni->mft_no != FILE_MFT)
2249 			up_write(&vol->mftbmp_lock);
2250 		goto err_out;
2251 	}
2252 #ifdef DEBUG
2253 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2254 	ntfs_debug("Status of mftbmp after initialized extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2255 			mftbmp_ni->allocated_size,
2256 			i_size_read(vol->mftbmp_ino),
2257 			mftbmp_ni->initialized_size);
2258 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2259 #endif /* DEBUG */
2260 	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
2261 found_free_rec:
2262 	/* @bit is the found free mft record, allocate it in the mft bitmap. */
2263 	ntfs_debug("At found_free_rec.");
2264 	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
2265 	if (unlikely(err)) {
2266 		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
2267 		if (!base_ni || base_ni->mft_no != FILE_MFT)
2268 			up_write(&vol->mftbmp_lock);
2269 		goto err_out;
2270 	}
2271 	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
2272 have_alloc_rec:
2273 	/*
2274 	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
2275 	 * Note, we keep hold of the mft bitmap lock for writing until all
2276 	 * modifications to the mft data attribute are complete, too, as they
2277 	 * will impact decisions for mft bitmap and mft record allocation done
2278 	 * by a parallel allocation and if the lock is not maintained a
2279 	 * parallel allocation could allocate the same mft record as this one.
2280 	 */
2281 	ll = (bit + 1) << vol->mft_record_size_bits;
2282 	read_lock_irqsave(&mft_ni->size_lock, flags);
2283 	old_data_initialized = mft_ni->initialized_size;
2284 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2285 	if (ll <= old_data_initialized) {
2286 		ntfs_debug("Allocated mft record already initialized.");
2287 		goto mft_rec_already_initialized;
2288 	}
2289 	ntfs_debug("Initializing allocated mft record.");
2290 	/*
2291 	 * The mft record is outside the initialized data.  Extend the mft data
2292 	 * attribute until it covers the allocated record.  The loop is only
2293 	 * actually traversed more than once when a freshly formatted volume is
2294 	 * first written to so it optimizes away nicely in the common case.
2295 	 */
2296 	if (!base_ni || base_ni->mft_no != FILE_MFT) {
2297 		read_lock_irqsave(&mft_ni->size_lock, flags);
2298 		ntfs_debug("Status of mft data before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2299 				mft_ni->allocated_size, i_size_read(vol->mft_ino),
2300 				mft_ni->initialized_size);
2301 		while (ll > mft_ni->allocated_size) {
2302 			read_unlock_irqrestore(&mft_ni->size_lock, flags);
2303 			err = ntfs_mft_data_extend_allocation_nolock(vol);
2304 			if (err == -EAGAIN)
2305 				err = ntfs_mft_data_extend_allocation_nolock(vol);
2306 
2307 			if (unlikely(err)) {
2308 				ntfs_error(vol->sb, "Failed to extend mft data allocation.");
2309 				goto undo_mftbmp_alloc_nolock;
2310 			}
2311 			read_lock_irqsave(&mft_ni->size_lock, flags);
2312 			ntfs_debug("Status of mft data after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2313 					mft_ni->allocated_size, i_size_read(vol->mft_ino),
2314 					mft_ni->initialized_size);
2315 		}
2316 		read_unlock_irqrestore(&mft_ni->size_lock, flags);
2317 	} else if (ll > mft_ni->allocated_size) {
2318 		err = -ENOSPC;
2319 		goto undo_mftbmp_alloc_nolock;
2320 	}
2321 	/*
2322 	 * Extend mft data initialized size (and data size of course) to reach
2323 	 * the allocated mft record, formatting the mft records along the way.
2324 	 * Note: We only modify the struct ntfs_inode structure as that is all that is
2325 	 * needed by ntfs_mft_record_format().  We will update the attribute
2326 	 * record itself in one fell swoop later on.
2327 	 */
2328 	write_lock_irqsave(&mft_ni->size_lock, flags);
2329 	old_data_initialized = mft_ni->initialized_size;
2330 	old_data_size = vol->mft_ino->i_size;
2331 	while (ll > mft_ni->initialized_size) {
2332 		s64 new_initialized_size, mft_no;
2333 
2334 		new_initialized_size = mft_ni->initialized_size +
2335 				vol->mft_record_size;
2336 		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
2337 		if (new_initialized_size > i_size_read(vol->mft_ino))
2338 			i_size_write(vol->mft_ino, new_initialized_size);
2339 		write_unlock_irqrestore(&mft_ni->size_lock, flags);
2340 		ntfs_debug("Initializing mft record 0x%llx.",
2341 				(long long)mft_no);
2342 		err = ntfs_mft_record_format(vol, mft_no);
2343 		if (unlikely(err)) {
2344 			ntfs_error(vol->sb, "Failed to format mft record.");
2345 			goto undo_data_init;
2346 		}
2347 		write_lock_irqsave(&mft_ni->size_lock, flags);
2348 		mft_ni->initialized_size = new_initialized_size;
2349 	}
2350 	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2351 	record_formatted = true;
2352 	/* Update the mft data attribute record to reflect the new sizes. */
2353 	m = map_mft_record(mft_ni);
2354 	if (IS_ERR(m)) {
2355 		ntfs_error(vol->sb, "Failed to map mft record.");
2356 		err = PTR_ERR(m);
2357 		goto undo_data_init;
2358 	}
2359 	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
2360 	if (unlikely(!ctx)) {
2361 		ntfs_error(vol->sb, "Failed to get search context.");
2362 		err = -ENOMEM;
2363 		unmap_mft_record(mft_ni);
2364 		goto undo_data_init;
2365 	}
2366 	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
2367 			CASE_SENSITIVE, 0, NULL, 0, ctx);
2368 	if (unlikely(err)) {
2369 		ntfs_error(vol->sb, "Failed to find first attribute extent of mft data attribute.");
2370 		ntfs_attr_put_search_ctx(ctx);
2371 		unmap_mft_record(mft_ni);
2372 		goto undo_data_init;
2373 	}
2374 	a = ctx->attr;
2375 	read_lock_irqsave(&mft_ni->size_lock, flags);
2376 	a->data.non_resident.initialized_size =
2377 			cpu_to_le64(mft_ni->initialized_size);
2378 	a->data.non_resident.data_size =
2379 			cpu_to_le64(i_size_read(vol->mft_ino));
2380 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2381 	/* Ensure the changes make it to disk. */
2382 	mark_mft_record_dirty(ctx->ntfs_ino);
2383 	ntfs_attr_put_search_ctx(ctx);
2384 	unmap_mft_record(mft_ni);
2385 	read_lock_irqsave(&mft_ni->size_lock, flags);
2386 	ntfs_debug("Status of mft data after mft record initialization: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
2387 			mft_ni->allocated_size,	i_size_read(vol->mft_ino),
2388 			mft_ni->initialized_size);
2389 	WARN_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
2390 	WARN_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
2391 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2392 mft_rec_already_initialized:
2393 	/*
2394 	 * We can finally drop the mft bitmap lock as the mft data attribute
2395 	 * has been fully updated.  The only disparity left is that the
2396 	 * allocated mft record still needs to be marked as in use to match the
2397 	 * set bit in the mft bitmap but this is actually not a problem since
2398 	 * this mft record is not referenced from anywhere yet and the fact
2399 	 * that it is allocated in the mft bitmap means that no-one will try to
2400 	 * allocate it either.
2401 	 */
2402 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2403 		up_write(&vol->mftbmp_lock);
2404 	/*
2405 	 * We now have allocated and initialized the mft record.  Calculate the
2406 	 * index of and the offset within the page cache page the record is in.
2407 	 */
2408 	index = NTFS_MFT_NR_TO_PIDX(vol, bit);
2409 	ofs = NTFS_MFT_NR_TO_POFS(vol, bit);
2410 	/* Read, map, and pin the folio containing the mft record. */
2411 	folio = read_mapping_folio(vol->mft_ino->i_mapping, index, NULL);
2412 	if (IS_ERR(folio)) {
2413 		ntfs_error(vol->sb, "Failed to map page containing allocated mft record 0x%llx.",
2414 				bit);
2415 		err = PTR_ERR(folio);
2416 		goto undo_mftbmp_alloc;
2417 	}
2418 	folio_lock(folio);
2419 	folio_clear_uptodate(folio);
2420 	m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs);
2421 	/* If we just formatted the mft record no need to do it again. */
2422 	if (!record_formatted) {
2423 		/* Sanity check that the mft record is really not in use. */
2424 		if (ntfs_is_file_record(m->magic) &&
2425 				(m->flags & MFT_RECORD_IN_USE)) {
2426 			ntfs_warning(vol->sb,
2427 				"Mft record 0x%llx was marked free in mft bitmap but is marked used itself. Unmount and run chkdsk.",
2428 				bit);
2429 			folio_mark_uptodate(folio);
2430 			folio_unlock(folio);
2431 			kunmap_local(m);
2432 			folio_put(folio);
2433 			NVolSetErrors(vol);
2434 			goto search_free_rec;
2435 		}
2436 		/*
2437 		 * We need to (re-)format the mft record, preserving the
2438 		 * sequence number if it is not zero as well as the update
2439 		 * sequence number if it is not zero or -1 (0xffff).  This
2440 		 * means we do not need to care whether or not something went
2441 		 * wrong with the previous mft record.
2442 		 */
2443 		seq_no = m->sequence_number;
2444 		usn = *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs));
2445 		err = ntfs_mft_record_layout(vol, bit, m);
2446 		if (unlikely(err)) {
2447 			ntfs_error(vol->sb, "Failed to layout allocated mft record 0x%llx.",
2448 					bit);
2449 			folio_mark_uptodate(folio);
2450 			folio_unlock(folio);
2451 			kunmap_local(m);
2452 			folio_put(folio);
2453 			goto undo_mftbmp_alloc;
2454 		}
2455 		if (seq_no)
2456 			m->sequence_number = seq_no;
2457 		if (usn && le16_to_cpu(usn) != 0xffff)
2458 			*(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = usn;
2459 		pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size);
2460 	}
2461 	/* Set the mft record itself in use. */
2462 	m->flags |= MFT_RECORD_IN_USE;
2463 	if (S_ISDIR(mode))
2464 		m->flags |= MFT_RECORD_IS_DIRECTORY;
2465 	folio_mark_uptodate(folio);
2466 	if (base_ni) {
2467 		struct mft_record *m_tmp;
2468 
2469 		/*
2470 		 * Setup the base mft record in the extent mft record.  This
2471 		 * completes initialization of the allocated extent mft record
2472 		 * and we can simply use it with map_extent_mft_record().
2473 		 */
2474 		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2475 				base_ni->seq_no);
2476 		/*
2477 		 * Allocate an extent inode structure for the new mft record,
2478 		 * attach it to the base inode @base_ni and map, pin, and lock
2479 		 * its, i.e. the allocated, mft record.
2480 		 */
2481 		m_tmp = map_extent_mft_record(base_ni,
2482 					      MK_MREF(bit, le16_to_cpu(m->sequence_number)),
2483 					      ni);
2484 		if (IS_ERR(m_tmp)) {
2485 			ntfs_error(vol->sb, "Failed to map allocated extent mft record 0x%llx.",
2486 					bit);
2487 			err = PTR_ERR(m_tmp);
2488 			/* Set the mft record itself not in use. */
2489 			m->flags &= cpu_to_le16(
2490 					~le16_to_cpu(MFT_RECORD_IN_USE));
2491 			/* Make sure the mft record is written out to disk. */
2492 			ntfs_mft_mark_dirty(folio);
2493 			folio_unlock(folio);
2494 			kunmap_local(m);
2495 			folio_put(folio);
2496 			goto undo_mftbmp_alloc;
2497 		}
2498 
2499 		/*
2500 		 * Make sure the allocated mft record is written out to disk.
2501 		 * No need to set the inode dirty because the caller is going
2502 		 * to do that anyway after finishing with the new extent mft
2503 		 * record (e.g. at a minimum a new attribute will be added to
2504 		 * the mft record.
2505 		 */
2506 		ntfs_mft_mark_dirty(folio);
2507 		folio_unlock(folio);
2508 		/*
2509 		 * Need to unmap the page since map_extent_mft_record() mapped
2510 		 * it as well so we have it mapped twice at the moment.
2511 		 */
2512 		kunmap_local(m);
2513 		folio_put(folio);
2514 	} else {
2515 		/*
2516 		 * Manually map, pin, and lock the mft record as we already
2517 		 * have its page mapped and it is very easy to do.
2518 		 */
2519 		(*ni)->seq_no = le16_to_cpu(m->sequence_number);
2520 		/*
2521 		 * Make sure the allocated mft record is written out to disk.
2522 		 * NOTE: We do not set the ntfs inode dirty because this would
2523 		 * fail in ntfs_write_inode() because the inode does not have a
2524 		 * standard information attribute yet.  Also, there is no need
2525 		 * to set the inode dirty because the caller is going to do
2526 		 * that anyway after finishing with the new mft record (e.g. at
2527 		 * a minimum some new attributes will be added to the mft
2528 		 * record.
2529 		 */
2530 
2531 		(*ni)->mrec = kmalloc(vol->mft_record_size, GFP_NOFS);
2532 		if (!(*ni)->mrec) {
2533 			folio_unlock(folio);
2534 			kunmap_local(m);
2535 			folio_put(folio);
2536 			err = -ENOMEM;
2537 			goto undo_mftbmp_alloc;
2538 		}
2539 
2540 		memcpy((*ni)->mrec, m, vol->mft_record_size);
2541 		post_read_mst_fixup((struct ntfs_record *)(*ni)->mrec, vol->mft_record_size);
2542 		ntfs_mft_mark_dirty(folio);
2543 		folio_unlock(folio);
2544 		(*ni)->folio = folio;
2545 		(*ni)->folio_ofs = ofs;
2546 		atomic_inc(&(*ni)->count);
2547 		/* Update the default mft allocation position. */
2548 		vol->mft_data_pos = bit + 1;
2549 	}
2550 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2551 		mutex_unlock(&mft_ni->mrec_lock);
2552 	memalloc_nofs_restore(memalloc_flags);
2553 
2554 	/*
2555 	 * Return the opened, allocated inode of the allocated mft record as
2556 	 * well as the mapped, pinned, and locked mft record.
2557 	 */
2558 	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
2559 			base_ni ? "extent " : "", bit);
2560 	(*ni)->mft_no = bit;
2561 	if (ni_mrec)
2562 		*ni_mrec = (*ni)->mrec;
2563 	ntfs_dec_free_mft_records(vol, 1);
2564 	return 0;
2565 undo_data_init:
2566 	write_lock_irqsave(&mft_ni->size_lock, flags);
2567 	mft_ni->initialized_size = old_data_initialized;
2568 	i_size_write(vol->mft_ino, old_data_size);
2569 	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2570 	goto undo_mftbmp_alloc_nolock;
2571 undo_mftbmp_alloc:
2572 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2573 		down_write(&vol->mftbmp_lock);
2574 undo_mftbmp_alloc_nolock:
2575 	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
2576 		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2577 		NVolSetErrors(vol);
2578 	}
2579 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2580 		up_write(&vol->mftbmp_lock);
2581 err_out:
2582 	if (!base_ni || base_ni->mft_no != FILE_MFT)
2583 		mutex_unlock(&mft_ni->mrec_lock);
2584 	memalloc_nofs_restore(memalloc_flags);
2585 	return err;
2586 max_err_out:
2587 	ntfs_warning(vol->sb,
2588 		"Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached.");
2589 	if (!base_ni || base_ni->mft_no != FILE_MFT) {
2590 		up_write(&vol->mftbmp_lock);
2591 		mutex_unlock(&mft_ni->mrec_lock);
2592 	}
2593 	memalloc_nofs_restore(memalloc_flags);
2594 	return -ENOSPC;
2595 }
2596 
2597 /*
2598  * ntfs_mft_record_free - free an mft record on an ntfs volume
2599  * @vol:	volume on which to free the mft record
2600  * @ni:		open ntfs inode of the mft record to free
2601  *
2602  * Free the mft record of the open inode @ni on the mounted ntfs volume @vol.
2603  * Note that this function calls ntfs_inode_close() internally and hence you
2604  * cannot use the pointer @ni any more after this function returns success.
2605  *
2606  * On success return 0 and on error return -1 with errno set to the error code.
2607  */
2608 int ntfs_mft_record_free(struct ntfs_volume *vol, struct ntfs_inode *ni)
2609 {
2610 	u64 mft_no;
2611 	int err;
2612 	u16 seq_no;
2613 	__le16 old_seq_no;
2614 	struct mft_record *ni_mrec;
2615 	unsigned int memalloc_flags;
2616 	struct ntfs_inode *base_ni;
2617 
2618 	if (!vol || !ni)
2619 		return -EINVAL;
2620 
2621 	ntfs_debug("Entering for inode 0x%llx.\n", (long long)ni->mft_no);
2622 
2623 	ni_mrec = map_mft_record(ni);
2624 	if (IS_ERR(ni_mrec))
2625 		return -EIO;
2626 
2627 	/* Cache the mft reference for later. */
2628 	mft_no = ni->mft_no;
2629 
2630 	/* Mark the mft record as not in use. */
2631 	ni_mrec->flags &= ~MFT_RECORD_IN_USE;
2632 
2633 	/* Increment the sequence number, skipping zero, if it is not zero. */
2634 	old_seq_no = ni_mrec->sequence_number;
2635 	seq_no = le16_to_cpu(old_seq_no);
2636 	if (seq_no == 0xffff)
2637 		seq_no = 1;
2638 	else if (seq_no)
2639 		seq_no++;
2640 	ni_mrec->sequence_number = cpu_to_le16(seq_no);
2641 
2642 	down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
2643 	err = ntfs_get_block_mft_record(NTFS_I(vol->mft_ino), ni);
2644 	up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
2645 	if (err) {
2646 		unmap_mft_record(ni);
2647 		return err;
2648 	}
2649 
2650 	/*
2651 	 * Set the ntfs inode dirty and write it out.  We do not need to worry
2652 	 * about the base inode here since whatever caused the extent mft
2653 	 * record to be freed is guaranteed to do it already.
2654 	 */
2655 	NInoSetDirty(ni);
2656 	err = write_mft_record(ni, ni_mrec, 0);
2657 	if (err)
2658 		goto sync_rollback;
2659 
2660 	if (likely(ni->nr_extents >= 0))
2661 		base_ni = ni;
2662 	else
2663 		base_ni = ni->ext.base_ntfs_ino;
2664 
2665 	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
2666 	memalloc_flags = memalloc_nofs_save();
2667 	if (base_ni->mft_no != FILE_MFT)
2668 		down_write(&vol->mftbmp_lock);
2669 	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
2670 	if (base_ni->mft_no != FILE_MFT)
2671 		up_write(&vol->mftbmp_lock);
2672 	memalloc_nofs_restore(memalloc_flags);
2673 	if (err)
2674 		goto bitmap_rollback;
2675 
2676 	unmap_mft_record(ni);
2677 	ntfs_inc_free_mft_records(vol, 1);
2678 	return 0;
2679 
2680 	/* Rollback what we did... */
2681 bitmap_rollback:
2682 	memalloc_flags = memalloc_nofs_save();
2683 	if (base_ni->mft_no != FILE_MFT)
2684 		down_write(&vol->mftbmp_lock);
2685 	if (ntfs_bitmap_set_bit(vol->mftbmp_ino, mft_no))
2686 		ntfs_error(vol->sb, "ntfs_bitmap_set_bit failed in bitmap_rollback\n");
2687 	if (base_ni->mft_no != FILE_MFT)
2688 		up_write(&vol->mftbmp_lock);
2689 	memalloc_nofs_restore(memalloc_flags);
2690 sync_rollback:
2691 	ntfs_error(vol->sb,
2692 		"Eeek! Rollback failed in %s. Leaving inconsistent metadata!\n", __func__);
2693 	ni_mrec->flags |= MFT_RECORD_IN_USE;
2694 	ni_mrec->sequence_number = old_seq_no;
2695 	NInoSetDirty(ni);
2696 	write_mft_record(ni, ni_mrec, 0);
2697 	unmap_mft_record(ni);
2698 	return err;
2699 }
2700 
2701 static s64 lcn_from_index(struct ntfs_volume *vol, struct ntfs_inode *ni,
2702 		unsigned long index)
2703 {
2704 	s64 vcn;
2705 	s64 lcn;
2706 
2707 	vcn = ntfs_pidx_to_cluster(vol, index);
2708 
2709 	down_read(&ni->runlist.lock);
2710 	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, false);
2711 	up_read(&ni->runlist.lock);
2712 
2713 	return lcn;
2714 }
2715 
2716 /*
2717  * ntfs_write_mft_block - Write back a folio containing MFT records
2718  * @folio:	The folio to write back (contains one or more MFT records)
2719  * @wbc:	Writeback control structure
2720  *
2721  * This function is called as part of the address_space_operations
2722  * .writepages implementation for the $MFT inode (or $MFTMirr).
2723  * It handles writing one folio (normally 4KiB page) worth of MFT records
2724  * to the underlying block device.
2725  *
2726  * Return: 0 on success, or -errno on error.
2727  */
2728 static int ntfs_write_mft_block(struct folio *folio, struct writeback_control *wbc)
2729 {
2730 	struct address_space *mapping = folio->mapping;
2731 	struct inode *vi = mapping->host;
2732 	struct ntfs_inode *ni = NTFS_I(vi);
2733 	struct ntfs_volume *vol = ni->vol;
2734 	u8 *kaddr;
2735 	struct ntfs_inode **locked_nis __free(kfree) = kmalloc_array(PAGE_SIZE / NTFS_BLOCK_SIZE,
2736 							sizeof(struct ntfs_inode *), GFP_NOFS);
2737 	int nr_locked_nis = 0, err = 0, mft_ofs, prev_mft_ofs;
2738 	struct inode **ref_inos __free(kfree) = kmalloc_array(PAGE_SIZE / NTFS_BLOCK_SIZE,
2739 							      sizeof(struct inode *), GFP_NOFS);
2740 	int nr_ref_inos = 0;
2741 	struct bio *bio = NULL;
2742 	u64 mft_no;
2743 	struct ntfs_inode *tni;
2744 	s64 lcn;
2745 	s64 vcn = ntfs_pidx_to_cluster(vol, folio->index);
2746 	s64 end_vcn = ntfs_bytes_to_cluster(vol, ni->allocated_size);
2747 	unsigned int folio_sz;
2748 	struct runlist_element *rl = NULL;
2749 	loff_t i_size = i_size_read(vi);
2750 
2751 	ntfs_debug("Entering for inode 0x%llx, attribute type 0x%x, folio index 0x%lx.",
2752 			ni->mft_no, ni->type, folio->index);
2753 
2754 	if (!locked_nis || !ref_inos) {
2755 		folio_redirty_for_writepage(wbc, folio);
2756 		folio_unlock(folio);
2757 		return -ENOMEM;
2758 	}
2759 
2760 	/* We have to zero every time due to mmap-at-end-of-file. */
2761 	if (folio->index >= (i_size >> folio_shift(folio)))
2762 		/* The page straddles i_size. */
2763 		folio_zero_segment(folio,
2764 				   offset_in_folio(folio, i_size),
2765 				   folio_size(folio));
2766 
2767 	lcn = lcn_from_index(vol, ni, folio->index);
2768 	if (lcn <= LCN_HOLE) {
2769 		folio_start_writeback(folio);
2770 		folio_unlock(folio);
2771 		folio_end_writeback(folio);
2772 		return -EIO;
2773 	}
2774 
2775 	/* Map folio so we can access its contents. */
2776 	kaddr = kmap_local_folio(folio, 0);
2777 	/* Clear the page uptodate flag whilst the mst fixups are applied. */
2778 	folio_clear_uptodate(folio);
2779 
2780 	for (mft_ofs = 0; mft_ofs < PAGE_SIZE && vcn < end_vcn;
2781 	     mft_ofs += vol->mft_record_size) {
2782 		/* Get the mft record number. */
2783 		mft_no = (((s64)folio->index << PAGE_SHIFT) + mft_ofs) >>
2784 			vol->mft_record_size_bits;
2785 		vcn = ntfs_mft_no_to_cluster(vol, mft_no);
2786 		/* Check whether to write this mft record. */
2787 		tni = NULL;
2788 		if (ntfs_may_write_mft_record(vol, mft_no,
2789 					(struct mft_record *)(kaddr + mft_ofs),
2790 					&tni, &ref_inos[nr_ref_inos])) {
2791 			unsigned int mft_record_off = 0;
2792 			s64 vcn_off = vcn;
2793 
2794 			/*
2795 			 * Skip $MFT extent mft records and let them being written
2796 			 * by writeback to avioid deadlocks. the $MFT runlist
2797 			 * lock must be taken before $MFT extent mrec_lock is taken.
2798 			 */
2799 			if (tni && tni->nr_extents < 0 &&
2800 				tni->ext.base_ntfs_ino == NTFS_I(vol->mft_ino)) {
2801 				mutex_unlock(&tni->mrec_lock);
2802 				atomic_dec(&tni->count);
2803 				iput(vol->mft_ino);
2804 				continue;
2805 			}
2806 
2807 			/*
2808 			 * The record should be written.  If a locked ntfs
2809 			 * inode was returned, add it to the array of locked
2810 			 * ntfs inodes.
2811 			 */
2812 			if (tni)
2813 				locked_nis[nr_locked_nis++] = tni;
2814 			else if (ref_inos[nr_ref_inos])
2815 				nr_ref_inos++;
2816 
2817 			if (bio && (mft_ofs != prev_mft_ofs + vol->mft_record_size)) {
2818 flush_bio:
2819 				bio->bi_end_io = ntfs_bio_end_io;
2820 				submit_bio(bio);
2821 				bio = NULL;
2822 			}
2823 
2824 			if (vol->cluster_size < folio_size(folio)) {
2825 				down_write(&ni->runlist.lock);
2826 				rl = ntfs_attr_vcn_to_rl(ni, vcn_off, &lcn);
2827 				up_write(&ni->runlist.lock);
2828 				if (IS_ERR(rl) || lcn < 0) {
2829 					err = -EIO;
2830 					goto unm_done;
2831 				}
2832 
2833 				if (bio &&
2834 				   (bio_end_sector(bio) >> (vol->cluster_size_bits - 9)) !=
2835 				    lcn) {
2836 					bio->bi_end_io = ntfs_bio_end_io;
2837 					submit_bio(bio);
2838 					bio = NULL;
2839 				}
2840 			}
2841 
2842 			if (!bio) {
2843 				unsigned int off;
2844 
2845 				off = ((mft_no << vol->mft_record_size_bits) +
2846 				       mft_record_off) & vol->cluster_size_mask;
2847 
2848 				bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE,
2849 						GFP_NOIO);
2850 				bio->bi_iter.bi_sector =
2851 					ntfs_bytes_to_sector(vol,
2852 							ntfs_cluster_to_bytes(vol, lcn) + off);
2853 			}
2854 
2855 			if (vol->cluster_size == NTFS_BLOCK_SIZE &&
2856 			    (mft_record_off ||
2857 			     (rl && rl->length - (vcn_off - rl->vcn) == 1) ||
2858 			     mft_ofs + NTFS_BLOCK_SIZE >= PAGE_SIZE))
2859 				folio_sz = NTFS_BLOCK_SIZE;
2860 			else
2861 				folio_sz = vol->mft_record_size;
2862 			if (!bio_add_folio(bio, folio, folio_sz,
2863 					   mft_ofs + mft_record_off)) {
2864 				err = -EIO;
2865 				bio_put(bio);
2866 				goto unm_done;
2867 			}
2868 			mft_record_off += folio_sz;
2869 
2870 			if (mft_record_off != vol->mft_record_size) {
2871 				vcn_off++;
2872 				goto flush_bio;
2873 			}
2874 			prev_mft_ofs = mft_ofs;
2875 
2876 			if (mft_no < vol->mftmirr_size) {
2877 				int sub_err = ntfs_sync_mft_mirror(vol, mft_no,
2878 						(struct mft_record *)(kaddr + mft_ofs));
2879 
2880 				if (unlikely(sub_err) && !err)
2881 					err = sub_err;
2882 			}
2883 		} else if (ref_inos[nr_ref_inos])
2884 			nr_ref_inos++;
2885 	}
2886 
2887 	if (bio) {
2888 		bio->bi_end_io = ntfs_bio_end_io;
2889 		submit_bio(bio);
2890 	}
2891 unm_done:
2892 	folio_mark_uptodate(folio);
2893 	kunmap_local(kaddr);
2894 
2895 	folio_start_writeback(folio);
2896 	folio_unlock(folio);
2897 	folio_end_writeback(folio);
2898 
2899 	/* Unlock any locked inodes. */
2900 	while (nr_locked_nis-- > 0) {
2901 		struct ntfs_inode *base_tni;
2902 
2903 		tni = locked_nis[nr_locked_nis];
2904 		mutex_unlock(&tni->mrec_lock);
2905 
2906 		/* Get the base inode. */
2907 		mutex_lock(&tni->extent_lock);
2908 		if (tni->nr_extents >= 0)
2909 			base_tni = tni;
2910 		else
2911 			base_tni = tni->ext.base_ntfs_ino;
2912 		mutex_unlock(&tni->extent_lock);
2913 		ntfs_debug("Unlocking %s inode 0x%llx.",
2914 				tni == base_tni ? "base" : "extent",
2915 				tni->mft_no);
2916 		atomic_dec(&tni->count);
2917 		iput(VFS_I(base_tni));
2918 	}
2919 
2920 	/* Dropping deferred references */
2921 	while (nr_ref_inos-- > 0) {
2922 		if (ref_inos[nr_ref_inos])
2923 			iput(ref_inos[nr_ref_inos]);
2924 	}
2925 
2926 	if (unlikely(err && err != -ENOMEM))
2927 		NVolSetErrors(vol);
2928 	if (likely(!err))
2929 		ntfs_debug("Done.");
2930 	return err;
2931 }
2932 
2933 /*
2934  * ntfs_mft_writepages - Write back dirty folios for the $MFT inode
2935  * @mapping:	address space of the $MFT inode
2936  * @wbc:	writeback control
2937  *
2938  * Writeback iterator for MFT records. Iterates over dirty folios and
2939  * delegates actual writing to ntfs_write_mft_block() for each folio.
2940  * Called from the address_space_operations .writepages vector of the
2941  * $MFT inode.
2942  *
2943  * Returns 0 on success, or the first error encountered.
2944  */
2945 int ntfs_mft_writepages(struct address_space *mapping,
2946 		struct writeback_control *wbc)
2947 {
2948 	struct folio *folio = NULL;
2949 	int error;
2950 
2951 	if (NVolShutdown(NTFS_I(mapping->host)->vol))
2952 		return -EIO;
2953 
2954 	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
2955 		error = ntfs_write_mft_block(folio, wbc);
2956 	return error;
2957 }
2958 
2959 void ntfs_mft_mark_dirty(struct folio *folio)
2960 {
2961 	iomap_dirty_folio(folio->mapping, folio);
2962 }
2963