1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NTFS kernel mft record operations.
4 * Part of this file is based on code from the NTFS-3G.
5 *
6 * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
7 * Copyright (c) 2002 Richard Russon
8 * Copyright (c) 2025 LG Electronics Co., Ltd.
9 */
10
11 #include <linux/writeback.h>
12 #include <linux/bio.h>
13 #include <linux/iomap.h>
14
15 #include "bitmap.h"
16 #include "lcnalloc.h"
17 #include "mft.h"
18 #include "ntfs.h"
19
20 /*
21 * ntfs_mft_record_check - Check the consistency of an MFT record
22 *
23 * Make sure its general fields are safe, then examine all its
24 * attributes and apply generic checks to them.
25 *
26 * Returns 0 if the checks are successful. If not, return -EIO.
27 */
ntfs_mft_record_check(const struct ntfs_volume * vol,struct mft_record * m,u64 mft_no)28 int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m,
29 u64 mft_no)
30 {
31 struct attr_record *a;
32 struct super_block *sb = vol->sb;
33
34 if (!ntfs_is_file_record(m->magic)) {
35 ntfs_error(sb, "Record %llu has no FILE magic (0x%x)\n",
36 mft_no, le32_to_cpu(*(__le32 *)m));
37 goto err_out;
38 }
39
40 if (le16_to_cpu(m->usa_ofs) & 0x1 ||
41 (vol->mft_record_size >> NTFS_BLOCK_SIZE_BITS) + 1 != le16_to_cpu(m->usa_count) ||
42 le16_to_cpu(m->usa_ofs) + le16_to_cpu(m->usa_count) * 2 > vol->mft_record_size) {
43 ntfs_error(sb, "Record %llu has corrupt fix-up values fields\n",
44 mft_no);
45 goto err_out;
46 }
47
48 if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
49 ntfs_error(sb, "Record %llu has corrupt allocation size (%u <> %u)\n",
50 mft_no, vol->mft_record_size,
51 le32_to_cpu(m->bytes_allocated));
52 goto err_out;
53 }
54
55 if (le32_to_cpu(m->bytes_in_use) > vol->mft_record_size) {
56 ntfs_error(sb, "Record %llu has corrupt in-use size (%u > %u)\n",
57 mft_no, le32_to_cpu(m->bytes_in_use),
58 vol->mft_record_size);
59 goto err_out;
60 }
61
62 if (le16_to_cpu(m->attrs_offset) & 7) {
63 ntfs_error(sb, "Attributes badly aligned in record %llu\n",
64 mft_no);
65 goto err_out;
66 }
67
68 a = (struct attr_record *)((char *)m + le16_to_cpu(m->attrs_offset));
69 if ((char *)a < (char *)m || (char *)a > (char *)m + vol->mft_record_size) {
70 ntfs_error(sb, "Record %llu is corrupt\n", mft_no);
71 goto err_out;
72 }
73
74 return 0;
75
76 err_out:
77 return -EIO;
78 }
79
/*
 * map_mft_record_folio - map the folio in which a specific mft record resides
 * @ni: ntfs inode whose mft record page to map
 *
 * This maps the folio in which the mft record of the ntfs inode @ni is
 * situated.
 *
 * This allocates a new buffer (@ni->mrec), copies the MFT record data from
 * the mapped folio into this buffer, and applies the MST (Multi Sector
 * Transfer) fixups on the copy.
 *
 * The folio is pinned (referenced) in @ni->folio to ensure the data remains
 * valid in the page cache, but the returned pointer is the allocated copy.
 *
 * Return: A pointer to the allocated and fixed-up mft record (@ni->mrec).
 * The return value needs to be checked with IS_ERR(). If it is true,
 * PTR_ERR() contains the negative error code.
 */
static inline struct mft_record *map_mft_record_folio(struct ntfs_inode *ni)
{
	loff_t i_size;
	struct ntfs_volume *vol = ni->vol;
	struct inode *mft_vi = vol->mft_ino;
	struct folio *folio;
	unsigned long index, end_index;
	unsigned int ofs;

	/* The record must not already be mapped for this inode. */
	WARN_ON(ni->folio);
	/*
	 * The index into the page cache and the offset within the page cache
	 * page of the wanted mft record.
	 */
	index = NTFS_MFT_NR_TO_PIDX(vol, ni->mft_no);
	ofs = NTFS_MFT_NR_TO_POFS(vol, ni->mft_no);

	i_size = i_size_read(mft_vi);
	/* The maximum valid index into the page cache for $MFT's data. */
	end_index = i_size >> PAGE_SHIFT;

	/* If the wanted index is out of bounds the mft record doesn't exist. */
	if (unlikely(index >= end_index)) {
		/*
		 * index == end_index is still acceptable provided the whole
		 * record fits inside the final, partial page of $MFT's data.
		 */
		if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
				vol->mft_record_size) {
			folio = ERR_PTR(-ENOENT);
			ntfs_error(vol->sb,
				   "Attempt to read mft record 0x%llx, which is beyond the end of the mft. This is probably a bug in the ntfs driver.",
				   ni->mft_no);
			goto err_out;
		}
	}

	/* Read, map, and pin the folio. */
	folio = read_mapping_folio(mft_vi->i_mapping, index, NULL);
	if (!IS_ERR(folio)) {
		u8 *addr;

		/* Private buffer holding the fixed-up copy of the record. */
		ni->mrec = kmalloc(vol->mft_record_size, GFP_NOFS);
		if (!ni->mrec) {
			folio_put(folio);
			folio = ERR_PTR(-ENOMEM);
			goto err_out;
		}

		addr = kmap_local_folio(folio, 0);
		memcpy(ni->mrec, addr + ofs, vol->mft_record_size);
		/* De-protect the copy; the page cache copy stays protected. */
		post_read_mst_fixup((struct ntfs_record *)ni->mrec, vol->mft_record_size);

		/* Catch multi sector transfer fixup errors. */
		if (!ntfs_mft_record_check(vol, (struct mft_record *)ni->mrec, ni->mft_no)) {
			/* Success: keep the folio pinned for later writeback. */
			kunmap_local(addr);
			ni->folio = folio;
			ni->folio_ofs = ofs;
			return ni->mrec;
		}
		/* Record is corrupt: undo the mapping and the allocation. */
		kunmap_local(addr);
		folio_put(folio);
		kfree(ni->mrec);
		ni->mrec = NULL;
		folio = ERR_PTR(-EIO);
		NVolSetErrors(vol);
	}
err_out:
	/* On failure @folio holds an ERR_PTR() encoded error code. */
	ni->folio = NULL;
	ni->folio_ofs = 0;
	return (struct mft_record *)folio;
}
166
167 /*
168 * map_mft_record - map and pin an mft record
169 * @ni: ntfs inode whose MFT record to map
170 *
171 * This function ensures the MFT record for the given inode is mapped and
172 * accessible.
173 *
174 * It increments the reference count of the ntfs inode. If the record is
175 * already mapped (@ni->folio is set), it returns the cached record
176 * immediately.
177 *
178 * Otherwise, it calls map_mft_record_folio() to read the folio from disk
179 * (if necessary via read_mapping_folio), allocate a buffer, and copy the
180 * record data.
181 *
182 * Return: A pointer to the mft record. You need to check the returned
183 * pointer with IS_ERR().
184 */
map_mft_record(struct ntfs_inode * ni)185 struct mft_record *map_mft_record(struct ntfs_inode *ni)
186 {
187 struct mft_record *m;
188
189 if (!ni)
190 return ERR_PTR(-EINVAL);
191
192 ntfs_debug("Entering for mft_no 0x%llx.", ni->mft_no);
193
194 /* Make sure the ntfs inode doesn't go away. */
195 atomic_inc(&ni->count);
196
197 if (ni->folio)
198 return (struct mft_record *)ni->mrec;
199
200 m = map_mft_record_folio(ni);
201 if (!IS_ERR(m))
202 return m;
203
204 atomic_dec(&ni->count);
205 ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
206 return m;
207 }
208
/*
 * unmap_mft_record - release a reference to a mapped mft record
 * @ni: ntfs inode whose MFT record to unmap
 *
 * This decrements the reference count of the ntfs inode.
 *
 * It releases the caller's hold on the inode. If the reference count indicates
 * that there are still other users (count > 1), the function returns
 * immediately, keeping the resources (folio and mrec buffer) pinned for
 * those users.
 *
 * NOTE: If caller has modified the mft record, it is imperative to set the mft
 * record dirty BEFORE calling unmap_mft_record().
 */
void unmap_mft_record(struct ntfs_inode *ni)
{
	struct folio *folio;

	if (!ni)
		return;

	ntfs_debug("Entering for mft_no 0x%llx.", ni->mft_no);

	folio = ni->folio;
	/*
	 * If at least two references remain after the decrement, other users
	 * still need the mapping; leave @ni->folio and @ni->mrec in place.
	 */
	if (atomic_dec_return(&ni->count) > 1)
		return;
	/*
	 * The record must still be mapped at this point.
	 * NOTE(review): nothing is actually released here; presumably the
	 * folio pin and the @ni->mrec buffer are freed on inode teardown —
	 * confirm against the eviction path.
	 */
	WARN_ON(!folio);
}
237
238 /*
239 * map_extent_mft_record - load an extent inode and attach it to its base
240 * @base_ni: base ntfs inode
241 * @mref: mft reference of the extent inode to load
242 * @ntfs_ino: on successful return, pointer to the struct ntfs_inode structure
243 *
244 * Load the extent mft record @mref and attach it to its base inode @base_ni.
245 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
246 * PTR_ERR(result) gives the negative error code.
247 *
248 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
249 * structure of the mapped extent inode.
250 */
map_extent_mft_record(struct ntfs_inode * base_ni,u64 mref,struct ntfs_inode ** ntfs_ino)251 struct mft_record *map_extent_mft_record(struct ntfs_inode *base_ni, u64 mref,
252 struct ntfs_inode **ntfs_ino)
253 {
254 struct mft_record *m;
255 struct ntfs_inode *ni = NULL;
256 struct ntfs_inode **extent_nis = NULL;
257 int i;
258 u64 mft_no = MREF(mref);
259 u16 seq_no = MSEQNO(mref);
260 bool destroy_ni = false;
261
262 ntfs_debug("Mapping extent mft record 0x%llx (base mft record 0x%llx).",
263 mft_no, base_ni->mft_no);
264 /* Make sure the base ntfs inode doesn't go away. */
265 atomic_inc(&base_ni->count);
266 /*
267 * Check if this extent inode has already been added to the base inode,
268 * in which case just return it. If not found, add it to the base
269 * inode before returning it.
270 */
271 retry:
272 mutex_lock(&base_ni->extent_lock);
273 if (base_ni->nr_extents > 0) {
274 extent_nis = base_ni->ext.extent_ntfs_inos;
275 for (i = 0; i < base_ni->nr_extents; i++) {
276 if (mft_no != extent_nis[i]->mft_no)
277 continue;
278 ni = extent_nis[i];
279 /* Make sure the ntfs inode doesn't go away. */
280 atomic_inc(&ni->count);
281 break;
282 }
283 }
284 if (likely(ni != NULL)) {
285 mutex_unlock(&base_ni->extent_lock);
286 atomic_dec(&base_ni->count);
287 /* We found the record; just have to map and return it. */
288 m = map_mft_record(ni);
289 /* map_mft_record() has incremented this on success. */
290 atomic_dec(&ni->count);
291 if (!IS_ERR(m)) {
292 /* Verify the sequence number. */
293 if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
294 ntfs_debug("Done 1.");
295 *ntfs_ino = ni;
296 return m;
297 }
298 unmap_mft_record(ni);
299 ntfs_error(base_ni->vol->sb,
300 "Found stale extent mft reference! Corrupt filesystem. Run chkdsk.");
301 return ERR_PTR(-EIO);
302 }
303 map_err_out:
304 ntfs_error(base_ni->vol->sb,
305 "Failed to map extent mft record, error code %ld.",
306 -PTR_ERR(m));
307 return m;
308 }
309 mutex_unlock(&base_ni->extent_lock);
310
311 /* Record wasn't there. Get a new ntfs inode and initialize it. */
312 ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
313 if (unlikely(!ni)) {
314 atomic_dec(&base_ni->count);
315 return ERR_PTR(-ENOMEM);
316 }
317 ni->vol = base_ni->vol;
318 ni->seq_no = seq_no;
319 ni->nr_extents = -1;
320 ni->ext.base_ntfs_ino = base_ni;
321 /* Now map the record. */
322 m = map_mft_record(ni);
323 if (IS_ERR(m)) {
324 atomic_dec(&base_ni->count);
325 ntfs_clear_extent_inode(ni);
326 goto map_err_out;
327 }
328 /* Verify the sequence number if it is present. */
329 if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
330 ntfs_error(base_ni->vol->sb,
331 "Found stale extent mft reference! Corrupt filesystem. Run chkdsk.");
332 destroy_ni = true;
333 m = ERR_PTR(-EIO);
334 goto unm_nolock_err_out;
335 }
336
337 mutex_lock(&base_ni->extent_lock);
338 for (i = 0; i < base_ni->nr_extents; i++) {
339 if (mft_no == extent_nis[i]->mft_no) {
340 mutex_unlock(&base_ni->extent_lock);
341 ntfs_clear_extent_inode(ni);
342 goto retry;
343 }
344 }
345 /* Attach extent inode to base inode, reallocating memory if needed. */
346 if (!(base_ni->nr_extents & 3)) {
347 struct ntfs_inode **tmp;
348 int new_size = (base_ni->nr_extents + 4) * sizeof(struct ntfs_inode *);
349
350 tmp = kvzalloc(new_size, GFP_NOFS);
351 if (unlikely(!tmp)) {
352 ntfs_error(base_ni->vol->sb, "Failed to allocate internal buffer.");
353 destroy_ni = true;
354 m = ERR_PTR(-ENOMEM);
355 goto unm_err_out;
356 }
357 if (base_ni->nr_extents) {
358 WARN_ON(!base_ni->ext.extent_ntfs_inos);
359 memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
360 4 * sizeof(struct ntfs_inode *));
361 kvfree(base_ni->ext.extent_ntfs_inos);
362 }
363 base_ni->ext.extent_ntfs_inos = tmp;
364 }
365 base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
366 mutex_unlock(&base_ni->extent_lock);
367 atomic_dec(&base_ni->count);
368 ntfs_debug("Done 2.");
369 *ntfs_ino = ni;
370 return m;
371 unm_err_out:
372 mutex_unlock(&base_ni->extent_lock);
373 unm_nolock_err_out:
374 unmap_mft_record(ni);
375 atomic_dec(&base_ni->count);
376 /*
377 * If the extent inode was not attached to the base inode we need to
378 * release it or we will leak memory.
379 */
380 if (destroy_ni)
381 ntfs_clear_extent_inode(ni);
382 return m;
383 }
384
385 /*
386 * __mark_mft_record_dirty - mark the base vfs inode dirty
387 * @ni: ntfs inode describing the mapped mft record
388 *
389 * Internal function. Users should call mark_mft_record_dirty() instead.
390 *
391 * This function determines the base ntfs inode (in case @ni is an extent
392 * inode) and marks the corresponding VFS inode dirty.
393 *
394 * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
395 * on the base vfs inode, because even though file data may have been modified,
396 * it is dirty in the inode meta data rather than the data page cache of the
397 * inode, and thus there are no data pages that need writing out. Therefore, a
398 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
399 * other hand, is not sufficient, because ->write_inode needs to be called even
400 * in case of fdatasync. This needs to happen or the file data would not
401 * necessarily hit the device synchronously, even though the vfs inode has the
402 * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
403 * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
404 * which is not what I_DIRTY_SYNC on its own would suggest.
405 */
__mark_mft_record_dirty(struct ntfs_inode * ni)406 void __mark_mft_record_dirty(struct ntfs_inode *ni)
407 {
408 struct ntfs_inode *base_ni;
409
410 ntfs_debug("Entering for inode 0x%llx.", ni->mft_no);
411 WARN_ON(NInoAttr(ni));
412 /* Determine the base vfs inode and mark it dirty, too. */
413 if (likely(ni->nr_extents >= 0))
414 base_ni = ni;
415 else
416 base_ni = ni->ext.base_ntfs_ino;
417 __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
418 }
419
420 /*
421 * ntfs_bio_end_io - bio completion callback for MFT record writes
422 *
423 * Decrements the folio reference count that was incremented before
424 * submit_bio(). This prevents a race condition where umount could
425 * evict the inode and release the folio while I/O is still in flight,
426 * potentially causing data corruption or use-after-free.
427 */
ntfs_bio_end_io(struct bio * bio)428 static void ntfs_bio_end_io(struct bio *bio)
429 {
430 if (bio->bi_private)
431 folio_put((struct folio *)bio->bi_private);
432 bio_put(bio);
433 }
434
/*
 * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
 * @vol: ntfs volume on which the mft record to synchronize resides
 * @mft_no: mft record number of mft record to synchronize
 * @m: mapped, mst protected (extent) mft record to synchronize
 *
 * Write the mapped, mst protected (extent) mft record @m with mft record
 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
 *
 * On success return 0. On error return -errno and set the volume errors flag
 * in the ntfs volume @vol.
 *
 * NOTE(review): the bio is submitted without waiting for completion and
 * without taking an extra folio reference (bi_private is left NULL, so the
 * completion handler drops no reference) — the folio_put() near the end may
 * therefore overlap the in-flight write; confirm this is intended.
 */
int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const u64 mft_no,
		struct mft_record *m)
{
	u8 *kmirr = NULL;
	struct folio *folio;
	unsigned int folio_ofs, lcn_folio_off = 0;
	int err = 0;
	struct bio *bio;

	ntfs_debug("Entering for inode 0x%llx.", mft_no);

	if (unlikely(!vol->mftmirr_ino)) {
		/* This could happen during umount... */
		err = -EIO;
		goto err_out;
	}
	/* Get the page containing the mirror copy of the mft record @m. */
	folio = read_mapping_folio(vol->mftmirr_ino->i_mapping,
				   NTFS_MFT_NR_TO_PIDX(vol, mft_no), NULL);
	if (IS_ERR(folio)) {
		ntfs_error(vol->sb, "Failed to map mft mirror page.");
		err = PTR_ERR(folio);
		goto err_out;
	}

	folio_lock(folio);
	folio_clear_uptodate(folio);
	/* Offset of the mft mirror record inside the page. */
	folio_ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no);
	/* The address in the page of the mirror copy of the mft record @m. */
	kmirr = kmap_local_folio(folio, 0) + folio_ofs;
	/* Copy the mst protected mft record to the mirror. */
	memcpy(kmirr, m, vol->mft_record_size);

	/* Clusters larger than a page: byte offset of this page in its cluster. */
	if (vol->cluster_size_bits > PAGE_SHIFT) {
		lcn_folio_off = folio->index << PAGE_SHIFT;
		lcn_folio_off &= vol->cluster_size_mask;
	}

	/* Target sector inside the $MFTMirr run. */
	bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO);
	bio->bi_iter.bi_sector =
		NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, vol->mftmirr_lcn) +
				 lcn_folio_off + folio_ofs);

	if (!bio_add_folio(bio, folio, vol->mft_record_size, folio_ofs)) {
		err = -EIO;
		bio_put(bio);
		goto unlock_folio;
	}

	bio->bi_end_io = ntfs_bio_end_io;
	submit_bio(bio);
	/* Current state: all buffers are clean, unlocked, and uptodate. */
	folio_mark_uptodate(folio);

unlock_folio:
	folio_unlock(folio);
	kunmap_local(kmirr);
	folio_put(folio);
	/*
	 * @err is never updated after submit_bio(), so the error branch below
	 * is only reached for the bio_add_folio() failure above.
	 */
	if (likely(!err)) {
		ntfs_debug("Done.");
	} else {
		ntfs_error(vol->sb, "I/O error while writing mft mirror record 0x%llx!", mft_no);
err_out:
		ntfs_error(vol->sb,
			   "Failed to synchronize $MFTMirr (error code %i). Volume will be left marked dirty on umount. Run chkdsk on the partition after umounting to correct this.",
			   err);
		NVolSetErrors(vol);
	}
	return err;
}
520
521 /*
522 * write_mft_record_nolock - write out a mapped (extent) mft record
523 * @ni: ntfs inode describing the mapped (extent) mft record
524 * @m: mapped (extent) mft record to write
525 * @sync: if true, wait for i/o completion
526 *
527 * Write the mapped (extent) mft record @m described by the (regular or extent)
528 * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
529 * the mft mirror, that is also updated.
530 *
531 * We only write the mft record if the ntfs inode @ni is dirty.
532 *
533 * On success, clean the mft record and return 0.
534 * On error (specifically ENOMEM), we redirty the record so it can be retried.
535 * For other errors, we mark the volume with errors.
536 */
write_mft_record_nolock(struct ntfs_inode * ni,struct mft_record * m,int sync)537 int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int sync)
538 {
539 struct ntfs_volume *vol = ni->vol;
540 struct folio *folio = ni->folio;
541 int err = 0, i = 0;
542 u8 *kaddr;
543 struct mft_record *fixup_m;
544 struct bio *bio;
545 unsigned int offset = 0, folio_size;
546
547 ntfs_debug("Entering for inode 0x%llx.", ni->mft_no);
548
549 WARN_ON(NInoAttr(ni));
550 WARN_ON(!folio_test_locked(folio));
551
552 /*
553 * If the struct ntfs_inode is clean no need to do anything. If it is dirty,
554 * mark it as clean now so that it can be redirtied later on if needed.
555 * There is no danger of races since the caller is holding the locks
556 * for the mft record @m and the page it is in.
557 */
558 if (!NInoTestClearDirty(ni))
559 goto done;
560
561 kaddr = kmap_local_folio(folio, 0);
562 fixup_m = (struct mft_record *)(kaddr + ni->folio_ofs);
563 memcpy(fixup_m, m, vol->mft_record_size);
564
565 /* Apply the mst protection fixups. */
566 err = pre_write_mst_fixup((struct ntfs_record *)fixup_m, vol->mft_record_size);
567 if (err) {
568 ntfs_error(vol->sb, "Failed to apply mst fixups!");
569 goto err_out;
570 }
571
572 folio_size = vol->mft_record_size / ni->mft_lcn_count;
573 while (i < ni->mft_lcn_count) {
574 unsigned int clu_off;
575
576 clu_off = (unsigned int)((s64)ni->mft_no * vol->mft_record_size + offset) &
577 vol->cluster_size_mask;
578
579 bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOIO);
580 bio->bi_iter.bi_sector =
581 NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, ni->mft_lcn[i]) +
582 clu_off);
583
584 if (!bio_add_folio(bio, folio, folio_size,
585 ni->folio_ofs + offset)) {
586 err = -EIO;
587 goto put_bio_out;
588 }
589
590 /* Synchronize the mft mirror now if not @sync. */
591 if (!sync && ni->mft_no < vol->mftmirr_size)
592 ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
593
594 folio_get(folio);
595 bio->bi_private = folio;
596 bio->bi_end_io = ntfs_bio_end_io;
597 submit_bio(bio);
598 offset += vol->cluster_size;
599 i++;
600 }
601
602 /* If @sync, now synchronize the mft mirror. */
603 if (sync && ni->mft_no < vol->mftmirr_size)
604 ntfs_sync_mft_mirror(vol, ni->mft_no, fixup_m);
605 kunmap_local(kaddr);
606 if (unlikely(err)) {
607 /* I/O error during writing. This is really bad! */
608 ntfs_error(vol->sb,
609 "I/O error while writing mft record 0x%llx! Marking base inode as bad. You should unmount the volume and run chkdsk.",
610 ni->mft_no);
611 goto err_out;
612 }
613 done:
614 ntfs_debug("Done.");
615 return 0;
616 put_bio_out:
617 bio_put(bio);
618 err_out:
619 /*
620 * Current state: all buffers are clean, unlocked, and uptodate.
621 * The caller should mark the base inode as bad so that no more i/o
622 * happens. ->drop_inode() will still be invoked so all extent inodes
623 * and other allocated memory will be freed.
624 */
625 if (err == -ENOMEM) {
626 ntfs_error(vol->sb,
627 "Not enough memory to write mft record. Redirtying so the write is retried later.");
628 mark_mft_record_dirty(ni);
629 err = 0;
630 } else
631 NVolSetErrors(vol);
632 return err;
633 }
634
/*
 * ntfs_test_inode_wb - inode match callback for find_inode_nowait()
 * @vi: candidate vfs inode from the inode hash chain
 * @ino: inode number being searched for (matching is done via @data)
 * @data: struct ntfs_attr describing the wanted inode
 *
 * Return 1 (with a reference grabbed) if @vi is the wanted inode and is safe
 * to use for writeback, 0 to keep searching, or -1 to abort the search. On
 * abort, @na->state records the reason (NI_BeingDeleted/NI_BeingCreated).
 */
static int ntfs_test_inode_wb(struct inode *vi, u64 ino, void *data)
{
	struct ntfs_attr *na = data;

	if (!ntfs_test_inode(vi, na))
		return 0;

	/*
	 * Without this, ntfs_write_mst_block() could call iput_final(),
	 * ntfs_evict_big_inode() could then try to unlink this inode, and
	 * the context could block forever in map_mft_record().
	 */
	if (NInoBeingDeleted(NTFS_I(vi))) {
		na->state = NI_BeingDeleted;
		return -1;
	}

	/*
	 * This condition prevents ntfs_write_mst_block() from applying or
	 * undoing fixups while ntfs_create() is still setting the inode up.
	 */
	spin_lock(&vi->i_lock);
	if (inode_state_read_once(vi) & I_CREATING) {
		spin_unlock(&vi->i_lock);
		na->state = NI_BeingCreated;
		return -1;
	}
	spin_unlock(&vi->i_lock);

	/* igrab() fails if the inode is being freed; treat that as abort. */
	return igrab(vi) ? 1 : -1;
}
667
668 /*
669 * ntfs_may_write_mft_record - check if an mft record may be written out
670 * @vol: [IN] ntfs volume on which the mft record to check resides
671 * @mft_no: [IN] mft record number of the mft record to check
672 * @m: [IN] mapped mft record to check
673 * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
674 * @ref_vi: [OUT] caller has to drop this vfs inode if one is returned
675 *
676 * Check if the mapped (base or extent) mft record @m with mft record number
677 * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
678 * and possible the ntfs inode of the mft record is locked and the base vfs
679 * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
680 * caller is responsible for unlocking the ntfs inode and unpinning the base
681 * vfs inode.
682 *
683 * To avoid deadlock when the caller holds a folio lock, if the function
684 * returns @ref_vi it defers dropping the vfs inode reference by returning
685 * it in @ref_vi instead of calling iput() directly. The caller must call
686 * iput() on @ref_vi after releasing the folio lock.
687 *
688 * Return 'true' if the mft record may be written out and 'false' if not.
689 *
690 * The caller has locked the page and cleared the uptodate flag on it which
691 * means that we can safely write out any dirty mft records that do not have
692 * their inodes in icache as determined by find_inode_nowait().
693 *
694 * Here is a description of the tests we perform:
695 *
696 * If the inode is found in icache we know the mft record must be a base mft
697 * record. If it is dirty, we do not write it and return 'false' as the vfs
698 * inode write paths will result in the access times being updated which would
699 * cause the base mft record to be redirtied and written out again.
700 *
701 * If the inode is in icache and not dirty, we attempt to lock the mft record
702 * and if we find the lock was already taken, it is not safe to write the mft
703 * record and we return 'false'.
704 *
705 * If we manage to obtain the lock we have exclusive access to the mft record,
706 * which also allows us safe writeout of the mft record. We then set
707 * @locked_ni to the locked ntfs inode and return 'true'.
708 *
709 * Note we cannot just lock the mft record and sleep while waiting for the lock
710 * because this would deadlock due to lock reversal.
711 *
712 * If the inode is not in icache we need to perform further checks.
713 *
714 * If the mft record is not a FILE record or it is a base mft record, we can
715 * safely write it and return 'true'.
716 *
717 * We now know the mft record is an extent mft record. We check if the inode
718 * corresponding to its base mft record is in icache. If it is not, we cannot
719 * safely determine the state of the extent inode, so we return 'false'.
720 *
721 * We now have the base inode for the extent mft record. We check if it has an
722 * ntfs inode for the extent mft record attached. If not, it is safe to write
723 * the extent mft record and we return 'true'.
724 *
725 * If the extent inode is attached, we check if it is dirty. If so, we return
726 * 'false' (letting the standard write_inode path handle it).
727 *
728 * If it is not dirty, we attempt to lock the extent mft record. If the lock
729 * was already taken, it is not safe to write and we return 'false'.
730 *
731 * If we manage to obtain the lock we have exclusive access to the extent mft
732 * record. We set @locked_ni to the now locked ntfs inode and return 'true'.
733 */
ntfs_may_write_mft_record(struct ntfs_volume * vol,const u64 mft_no,const struct mft_record * m,struct ntfs_inode ** locked_ni,struct inode ** ref_vi)734 static bool ntfs_may_write_mft_record(struct ntfs_volume *vol, const u64 mft_no,
735 const struct mft_record *m, struct ntfs_inode **locked_ni,
736 struct inode **ref_vi)
737 {
738 struct super_block *sb = vol->sb;
739 struct inode *mft_vi = vol->mft_ino;
740 struct inode *vi;
741 struct ntfs_inode *ni, *eni, **extent_nis;
742 int i;
743 struct ntfs_attr na = {0};
744
745 ntfs_debug("Entering for inode 0x%llx.", mft_no);
746 /*
747 * Normally we do not return a locked inode so set @locked_ni to NULL.
748 */
749 *locked_ni = NULL;
750 *ref_vi = NULL;
751
752 /*
753 * Check if the inode corresponding to this mft record is in the VFS
754 * inode cache and obtain a reference to it if it is.
755 */
756 ntfs_debug("Looking for inode 0x%llx in icache.", mft_no);
757 na.mft_no = mft_no;
758 na.type = AT_UNUSED;
759 /*
760 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
761 * we get here for it rather often.
762 */
763 if (!mft_no) {
764 /* Balance the below iput(). */
765 vi = igrab(mft_vi);
766 WARN_ON(vi != mft_vi);
767 } else {
768 /*
769 * Have to use find_inode_nowait() since ilookup5_nowait()
770 * waits for inode with I_FREEING, which causes ntfs to deadlock
771 * when inodes are unlinked concurrently
772 */
773 vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na);
774 if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated)
775 return false;
776 }
777 if (vi) {
778 ntfs_debug("Base inode 0x%llx is in icache.", mft_no);
779 /* The inode is in icache. */
780 ni = NTFS_I(vi);
781 /* Take a reference to the ntfs inode. */
782 atomic_inc(&ni->count);
783 /* If the inode is dirty, do not write this record. */
784 if (NInoDirty(ni)) {
785 ntfs_debug("Inode 0x%llx is dirty, do not write it.",
786 mft_no);
787 atomic_dec(&ni->count);
788 *ref_vi = vi;
789 return false;
790 }
791 ntfs_debug("Inode 0x%llx is not dirty.", mft_no);
792 /* The inode is not dirty, try to take the mft record lock. */
793 if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
794 ntfs_debug("Mft record 0x%llx is already locked, do not write it.", mft_no);
795 atomic_dec(&ni->count);
796 *ref_vi = vi;
797 return false;
798 }
799 ntfs_debug("Managed to lock mft record 0x%llx, write it.",
800 mft_no);
801 /*
802 * The write has to occur while we hold the mft record lock so
803 * return the locked ntfs inode.
804 */
805 *locked_ni = ni;
806 return true;
807 }
808 ntfs_debug("Inode 0x%llx is not in icache.", mft_no);
809 /* The inode is not in icache. */
810 /* Write the record if it is not a mft record (type "FILE"). */
811 if (!ntfs_is_mft_record(m->magic)) {
812 ntfs_debug("Mft record 0x%llx is not a FILE record, write it.",
813 mft_no);
814 return true;
815 }
816 /* Write the mft record if it is a base inode. */
817 if (!m->base_mft_record) {
818 ntfs_debug("Mft record 0x%llx is a base record, write it.",
819 mft_no);
820 return true;
821 }
822 /*
823 * This is an extent mft record. Check if the inode corresponding to
824 * its base mft record is in icache and obtain a reference to it if it
825 * is.
826 */
827 na.mft_no = MREF_LE(m->base_mft_record);
828 na.state = 0;
829 ntfs_debug("Mft record 0x%llx is an extent record. Looking for base inode 0x%llx in icache.",
830 mft_no, na.mft_no);
831 if (!na.mft_no) {
832 /* Balance the below iput(). */
833 vi = igrab(mft_vi);
834 WARN_ON(vi != mft_vi);
835 } else {
836 vi = find_inode_nowait(sb, mft_no, ntfs_test_inode_wb, &na);
837 if (na.state == NI_BeingDeleted || na.state == NI_BeingCreated)
838 return false;
839 }
840
841 if (!vi)
842 return false;
843 ntfs_debug("Base inode 0x%llx is in icache.", na.mft_no);
844 /*
845 * The base inode is in icache. Check if it has the extent inode
846 * corresponding to this extent mft record attached.
847 */
848 ni = NTFS_I(vi);
849 mutex_lock(&ni->extent_lock);
850 if (ni->nr_extents <= 0) {
851 /*
852 * The base inode has no attached extent inodes, write this
853 * extent mft record.
854 */
855 mutex_unlock(&ni->extent_lock);
856 *ref_vi = vi;
857 ntfs_debug("Base inode 0x%llx has no attached extent inodes, write the extent record.",
858 na.mft_no);
859 return true;
860 }
861 /* Iterate over the attached extent inodes. */
862 extent_nis = ni->ext.extent_ntfs_inos;
863 for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
864 if (mft_no == extent_nis[i]->mft_no) {
865 /*
866 * Found the extent inode corresponding to this extent
867 * mft record.
868 */
869 eni = extent_nis[i];
870 break;
871 }
872 }
873 /*
874 * If the extent inode was not attached to the base inode, write this
875 * extent mft record.
876 */
877 if (!eni) {
878 mutex_unlock(&ni->extent_lock);
879 *ref_vi = vi;
880 ntfs_debug("Extent inode 0x%llx is not attached to its base inode 0x%llx, write the extent record.",
881 mft_no, na.mft_no);
882 return true;
883 }
884 ntfs_debug("Extent inode 0x%llx is attached to its base inode 0x%llx.",
885 mft_no, na.mft_no);
886 /* Take a reference to the extent ntfs inode. */
887 atomic_inc(&eni->count);
888 mutex_unlock(&ni->extent_lock);
889
890 /* if extent inode is dirty, write_inode will write it */
891 if (NInoDirty(eni)) {
892 atomic_dec(&eni->count);
893 *ref_vi = vi;
894 return false;
895 }
896
897 /*
898 * Found the extent inode coresponding to this extent mft record.
899 * Try to take the mft record lock.
900 */
901 if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
902 atomic_dec(&eni->count);
903 *ref_vi = vi;
904 ntfs_debug("Extent mft record 0x%llx is already locked, do not write it.",
905 mft_no);
906 return false;
907 }
908 ntfs_debug("Managed to lock extent mft record 0x%llx, write it.",
909 mft_no);
910 /*
911 * The write has to occur while we hold the mft record lock so return
912 * the locked extent ntfs inode.
913 */
914 *locked_ni = eni;
915 return true;
916 }
917
/* Common suffix appended to several corruption error messages below. */
static const char *es = " Leaving inconsistent metadata. Unmount and run chkdsk.";

/*
 * Mft records below this number are skipped by the free-record search
 * (see the data_pos clamp in the allocator below).
 */
#define RESERVED_MFT_RECORDS 64
921
922 /*
923 * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
924 * @vol: volume on which to search for a free mft record
925 * @base_ni: open base inode if allocating an extent mft record or NULL
926 *
927 * Search for a free mft record in the mft bitmap attribute on the ntfs volume
928 * @vol.
929 *
930 * If @base_ni is NULL start the search at the default allocator position.
931 *
932 * If @base_ni is not NULL start the search at the mft record after the base
933 * mft record @base_ni.
934 *
935 * Return the free mft record on success and -errno on error. An error code of
936 * -ENOSPC means that there are no free mft records in the currently
937 * initialized mft bitmap.
938 *
939 * Locking: Caller must hold vol->mftbmp_lock for writing.
940 */
static s64 ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(struct ntfs_volume *vol,
		struct ntfs_inode *base_ni)
{
	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
	unsigned long flags;
	struct address_space *mftbmp_mapping;
	u8 *buf = NULL, *byte;
	struct folio *folio;
	unsigned int folio_ofs, size;
	u8 pass, b;

	ntfs_debug("Searching for free mft record in the currently initialized mft bitmap.");
	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
	/*
	 * Set the end of the pass making sure we do not overflow the mft
	 * bitmap.  The search space is bounded both by the number of mft
	 * records that fit in $MFT's allocated size and by the number of
	 * bits in the initialized part of the mft bitmap.
	 */
	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
			vol->mft_record_size_bits;
	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
	if (pass_end > ll)
		pass_end = ll;
	pass = 1;
	/*
	 * Pass 1 searches from the starting position to pass_end; pass 2
	 * wraps around and searches from RESERVED_MFT_RECORDS up to the
	 * original starting position.
	 */
	if (!base_ni)
		data_pos = vol->mft_data_pos;
	else
		data_pos = base_ni->mft_no + 1;
	/* The first RESERVED_MFT_RECORDS records are reserved for metadata. */
	if (data_pos < RESERVED_MFT_RECORDS)
		data_pos = RESERVED_MFT_RECORDS;
	if (data_pos >= pass_end) {
		data_pos = RESERVED_MFT_RECORDS;
		pass = 2;
		/* This happens on a freshly formatted volume. */
		if (data_pos >= pass_end)
			return -ENOSPC;
	}

	/*
	 * When allocating an extent for $MFT itself, scan from the very
	 * start in a single pass (pass 2 never wraps again).
	 */
	if (base_ni && base_ni->mft_no == FILE_MFT) {
		data_pos = 0;
		pass = 2;
	}

	pass_start = data_pos;
	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, pass_end 0x%llx, data_pos 0x%llx.",
			pass, pass_start, pass_end, data_pos);
	/* Loop until a free mft record is found. */
	for (; pass <= 2;) {
		/* Cap size to pass_end. */
		ofs = data_pos >> 3;	/* byte offset into the bitmap */
		folio_ofs = ofs & ~PAGE_MASK;
		size = PAGE_SIZE - folio_ofs;
		ll = ((pass_end + 7) >> 3) - ofs;
		if (size > ll)
			size = ll;
		/* Convert @size from bytes to bits for the inner scan. */
		size <<= 3;
		/*
		 * If we are still within the active pass, search the next page
		 * for a zero bit.
		 */
		if (size) {
			folio = read_mapping_folio(mftbmp_mapping,
					ofs >> PAGE_SHIFT, NULL);
			if (IS_ERR(folio)) {
				ntfs_error(vol->sb, "Failed to read mft bitmap, aborting.");
				return PTR_ERR(folio);
			}
			folio_lock(folio);
			buf = (u8 *)kmap_local_folio(folio, 0) + folio_ofs;
			/* Re-align to a byte boundary, remembering the bit. */
			bit = data_pos & 7;
			data_pos &= ~7ull;
			ntfs_debug("Before inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx",
					size, data_pos, bit);
			/* Scan one bitmap byte per iteration. */
			for (; bit < size && data_pos + bit < pass_end;
					bit &= ~7ull, bit += 8) {
				/*
				 * If we're extending $MFT and running out of the first
				 * mft record (base record) then give up searching since
				 * no guarantee that the found record will be accessible.
				 * NOTE(review): 400 appears to be a heuristic bound -
				 * confirm against the base record's attribute capacity.
				 */
				if (base_ni && base_ni->mft_no == FILE_MFT && bit > 400) {
					folio_unlock(folio);
					kunmap_local(buf);
					folio_put(folio);
					return -ENOSPC;
				}

				byte = buf + (bit >> 3);
				if (*byte == 0xff)
					continue;
				/* Find the first zero bit in this byte. */
				b = ffz((unsigned long)*byte);
				if (b < 8 && b >= (bit & 7)) {
					ll = data_pos + (bit & ~7ull) + b;
					/* Mft record numbers must fit in 32 bits. */
					if (unlikely(ll > (1ll << 32))) {
						folio_unlock(folio);
						kunmap_local(buf);
						folio_put(folio);
						return -ENOSPC;
					}
					/* Claim the bit and persist the bitmap. */
					*byte |= 1 << b;
					folio_mark_dirty(folio);
					folio_unlock(folio);
					kunmap_local(buf);
					folio_put(folio);
					ntfs_debug("Done. (Found and allocated mft record 0x%llx.)",
							ll);
					return ll;
				}
			}
			ntfs_debug("After inner for loop: size 0x%x, data_pos 0x%llx, bit 0x%llx",
					size, data_pos, bit);
			data_pos += size;
			folio_unlock(folio);
			kunmap_local(buf);
			folio_put(folio);
			/*
			 * If the end of the pass has not been reached yet,
			 * continue searching the mft bitmap for a zero bit.
			 */
			if (data_pos < pass_end)
				continue;
		}
		/* Do the next pass. */
		if (++pass == 2) {
			/*
			 * Starting the second pass, in which we scan the first
			 * part of the zone which we omitted earlier.
			 */
			pass_end = pass_start;
			data_pos = pass_start = RESERVED_MFT_RECORDS;
			ntfs_debug("pass %i, pass_start 0x%llx, pass_end 0x%llx.",
					pass, pass_start, pass_end);
			if (data_pos >= pass_end)
				break;
		}
	}
	/* No free mft records in currently initialized mft bitmap. */
	ntfs_debug("Done. (No free mft records left in currently initialized mft bitmap.)");
	return -ENOSPC;
}
1084
ntfs_mft_attr_extend(struct ntfs_inode * ni)1085 static int ntfs_mft_attr_extend(struct ntfs_inode *ni)
1086 {
1087 int ret = 0;
1088 struct ntfs_inode *base_ni;
1089
1090 if (NInoAttr(ni))
1091 base_ni = ni->ext.base_ntfs_ino;
1092 else
1093 base_ni = ni;
1094
1095 if (!NInoAttrList(base_ni)) {
1096 ret = ntfs_inode_add_attrlist(base_ni);
1097 if (ret) {
1098 pr_err("Can not add attrlist\n");
1099 goto out;
1100 } else {
1101 ret = -EAGAIN;
1102 goto out;
1103 }
1104 }
1105
1106 ret = ntfs_attr_update_mapping_pairs(ni, 0);
1107 if (ret)
1108 pr_err("MP update failed\n");
1109
1110 out:
1111 return ret;
1112 }
1113
1114 /*
1115 * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
1116 * @vol: volume on which to extend the mft bitmap attribute
1117 *
1118 * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
1119 *
1120 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1121 * data_size.
1122 *
1123 * Return 0 on success and -errno on error.
1124 *
1125 * Locking: - Caller must hold vol->mftbmp_lock for writing.
1126 * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
1127 * writing and releases it before returning.
1128 * - This function takes vol->lcnbmp_lock for writing and releases it
1129 * before returning.
1130 */
static int ntfs_mft_bitmap_extend_allocation_nolock(struct ntfs_volume *vol)
{
	s64 lcn;
	s64 ll;
	unsigned long flags;
	struct folio *folio;
	struct ntfs_inode *mft_ni, *mftbmp_ni;
	struct runlist_element *rl, *rl2 = NULL;
	struct ntfs_attr_search_ctx *ctx = NULL;
	struct mft_record *mrec;
	struct attr_record *a = NULL;
	int ret, mp_size;
	u32 old_alen = 0;
	u8 *b, tb;
	/* Progress flags so the error path can undo exactly what was done. */
	struct {
		u8 added_cluster:1;	/* extended the last run in place */
		u8 added_run:1;		/* appended a new run to the runlist */
		u8 mp_rebuilt:1;	/* attr record resized, mp overwritten */
		u8 mp_extended:1;	/* ntfs_mft_attr_extend() changed layout */
	} status = { 0, 0, 0, 0 };
	size_t new_rl_count;

	ntfs_debug("Extending mft bitmap allocation.");
	mft_ni = NTFS_I(vol->mft_ino);
	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
	/*
	 * Determine the last lcn of the mft bitmap. The allocated size of the
	 * mft bitmap cannot be zero so we are ok to do this.
	 */
	down_write(&mftbmp_ni->runlist.lock);
	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
	ll = mftbmp_ni->allocated_size;
	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
			NTFS_B_TO_CLU(vol, ll - 1), NULL);
	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
		up_write(&mftbmp_ni->runlist.lock);
		ntfs_error(vol->sb,
				"Failed to determine last allocated cluster of mft bitmap attribute.");
		if (!IS_ERR(rl))
			ret = -EIO;
		else
			ret = PTR_ERR(rl);
		return ret;
	}
	lcn = rl->lcn + rl->length;
	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
			(long long)lcn);
	/*
	 * Attempt to get the cluster following the last allocated cluster by
	 * hand as it may be in the MFT zone so the allocator would not give it
	 * to us.
	 */
	ll = lcn >> 3;	/* byte in the lcn bitmap holding that cluster's bit */
	folio = read_mapping_folio(vol->lcnbmp_ino->i_mapping,
			ll >> PAGE_SHIFT, NULL);
	if (IS_ERR(folio)) {
		up_write(&mftbmp_ni->runlist.lock);
		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
		return PTR_ERR(folio);
	}

	down_write(&vol->lcnbmp_lock);
	folio_lock(folio);
	b = (u8 *)kmap_local_folio(folio, 0) + (ll & ~PAGE_MASK);
	tb = 1 << (lcn & 7ull);
	if (*b != 0xff && !(*b & tb)) {
		/* Next cluster is free, allocate it. */
		*b |= tb;
		folio_mark_dirty(folio);
		folio_unlock(folio);
		kunmap_local(b);
		folio_put(folio);
		up_write(&vol->lcnbmp_lock);
		/* Update the mft bitmap runlist: grow the last run by one. */
		rl->length++;
		rl[1].vcn++;
		status.added_cluster = 1;
		ntfs_debug("Appending one cluster to mft bitmap.");
	} else {
		folio_unlock(folio);
		kunmap_local(b);
		folio_put(folio);
		up_write(&vol->lcnbmp_lock);
		/* Allocate a cluster from the DATA_ZONE. */
		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
				true, false, false);
		if (IS_ERR(rl2)) {
			up_write(&mftbmp_ni->runlist.lock);
			ntfs_error(vol->sb,
					"Failed to allocate a cluster for the mft bitmap.");
			return PTR_ERR(rl2);
		}
		rl = ntfs_runlists_merge(&mftbmp_ni->runlist, rl2, 0, &new_rl_count);
		if (IS_ERR(rl)) {
			up_write(&mftbmp_ni->runlist.lock);
			ntfs_error(vol->sb, "Failed to merge runlists for mft bitmap.");
			/* Merge failed: give the freshly allocated cluster back. */
			if (ntfs_cluster_free_from_rl(vol, rl2)) {
				ntfs_error(vol->sb, "Failed to deallocate allocated cluster.%s",
						es);
				NVolSetErrors(vol);
			}
			kvfree(rl2);
			return PTR_ERR(rl);
		}
		mftbmp_ni->runlist.rl = rl;
		mftbmp_ni->runlist.count = new_rl_count;
		status.added_run = 1;
		ntfs_debug("Adding one run to mft bitmap.");
		/* Find the last run in the new runlist. */
		for (; rl[1].length; rl++)
			;
	}
	/*
	 * Update the attribute record as well. Note: @rl is the last
	 * (non-terminator) runlist element of mft bitmap.
	 */
	mrec = map_mft_record(mft_ni);
	if (IS_ERR(mrec)) {
		ntfs_error(vol->sb, "Failed to map mft record.");
		ret = PTR_ERR(mrec);
		goto undo_alloc;
	}
	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
	if (unlikely(!ctx)) {
		ntfs_error(vol->sb, "Failed to get search context.");
		ret = -ENOMEM;
		goto undo_alloc;
	}
	/* Look up the attribute extent containing the last vcn. */
	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
			0, ctx);
	if (unlikely(ret)) {
		ntfs_error(vol->sb,
				"Failed to find last attribute extent of mft bitmap attribute.");
		if (ret == -ENOENT)
			ret = -EIO;
		goto undo_alloc;
	}
	a = ctx->attr;
	ll = le64_to_cpu(a->data.non_resident.lowest_vcn);
	/* Search back for the previous last allocated cluster of mft bitmap. */
	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
		if (ll >= rl2->vcn)
			break;
	}
	WARN_ON(ll < rl2->vcn);
	WARN_ON(ll >= rl2->vcn + rl2->length);
	/* Get the size for the new mapping pairs array for this extent. */
	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1);
	if (unlikely(mp_size <= 0)) {
		ntfs_error(vol->sb,
				"Get size for mapping pairs failed for mft bitmap attribute extent.");
		ret = mp_size;
		if (!ret)
			ret = -EIO;
		goto undo_alloc;
	}
	/* Expand the attribute record if necessary. */
	old_alen = le32_to_cpu(a->length);
	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
	if (unlikely(ret)) {
		/*
		 * Not enough room in the mft record: try to restructure the
		 * attribute (possibly adding an attribute list) instead.
		 */
		ret = ntfs_mft_attr_extend(mftbmp_ni);
		if (!ret)
			goto extended_ok;
		if (ret != -EAGAIN)
			status.mp_extended = 1;
		goto undo_alloc;
	}
	status.mp_rebuilt = 1;
	/* Generate the mapping pairs array directly into the attr record. */
	ret = ntfs_mapping_pairs_build(vol, (u8 *)a +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
			mp_size, rl2, ll, -1, NULL, NULL, NULL);
	if (unlikely(ret)) {
		ntfs_error(vol->sb,
				"Failed to build mapping pairs array for mft bitmap attribute.");
		goto undo_alloc;
	}
	/* Update the highest_vcn. */
	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1);
	/*
	 * We now have extended the mft bitmap allocated_size by one cluster.
	 * Reflect this in the struct ntfs_inode structure and the attribute record.
	 */
	if (a->data.non_resident.lowest_vcn) {
		/*
		 * We are not in the first attribute extent, switch to it, but
		 * first ensure the changes will make it to disk later.
		 * Note: the extended_ok label below is jumped into from the
		 * resize-failure path above; in that case the lookup is redone
		 * because ntfs_mft_attr_extend() may have moved the attribute.
		 */
		mark_mft_record_dirty(ctx->ntfs_ino);
extended_ok:
		ntfs_attr_reinit_search_ctx(ctx);
		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
				0, ctx);
		if (unlikely(ret)) {
			ntfs_error(vol->sb,
					"Failed to find first attribute extent of mft bitmap attribute.");
			goto restore_undo_alloc;
		}
		a = ctx->attr;
	}

	/* Publish the new allocated size in the inode and the base extent. */
	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
	mftbmp_ni->allocated_size += vol->cluster_size;
	a->data.non_resident.allocated_size =
			cpu_to_le64(mftbmp_ni->allocated_size);
	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	/* Ensure the changes make it to disk. */
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(mft_ni);
	up_write(&mftbmp_ni->runlist.lock);
	ntfs_debug("Done.");
	return 0;

restore_undo_alloc:
	/* Re-find the last extent so the undo path can operate on it. */
	ntfs_attr_reinit_search_ctx(ctx);
	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
			0, ctx)) {
		ntfs_error(vol->sb,
				"Failed to find last attribute extent of mft bitmap attribute.%s", es);
		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
		mftbmp_ni->allocated_size += vol->cluster_size;
		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(mft_ni);
		up_write(&mftbmp_ni->runlist.lock);
		/*
		 * The only thing that is now wrong is ->allocated_size of the
		 * base attribute extent which chkdsk should be able to fix.
		 */
		NVolSetErrors(vol);
		return ret;
	}
	a = ctx->attr;
	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 2);
undo_alloc:
	if (status.added_cluster) {
		/* Truncate the last run in the runlist by one cluster. */
		rl->length--;
		rl[1].vcn--;
	} else if (status.added_run) {
		lcn = rl->lcn;
		/* Remove the last run from the runlist. */
		rl->lcn = rl[1].lcn;
		rl->length = 0;
		mftbmp_ni->runlist.count--;
	}
	/* Deallocate the cluster. */
	down_write(&vol->lcnbmp_lock);
	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
		NVolSetErrors(vol);
	} else
		ntfs_inc_free_clusters(vol, 1);
	up_write(&vol->lcnbmp_lock);
	if (status.mp_rebuilt) {
		/* Restore the old mapping pairs array and record size. */
		if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset),
				old_alen - le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset),
				rl2, ll, -1, NULL, NULL, NULL)) {
			ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es);
			NVolSetErrors(vol);
		}
		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
			ntfs_error(vol->sb, "Failed to restore attribute record.%s", es);
			NVolSetErrors(vol);
		}
		mark_mft_record_dirty(ctx->ntfs_ino);
	} else if (status.mp_extended && ntfs_attr_update_mapping_pairs(mftbmp_ni, 0)) {
		ntfs_error(vol->sb, "Failed to restore mapping pairs.%s", es);
		NVolSetErrors(vol);
	}
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (!IS_ERR(mrec))
		unmap_mft_record(mft_ni);
	up_write(&mftbmp_ni->runlist.lock);
	return ret;
}
1416
1417 /*
1418 * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1419 * @vol: volume on which to extend the mft bitmap attribute
1420 *
1421 * Extend the initialized portion of the mft bitmap attribute on the ntfs
1422 * volume @vol by 8 bytes.
1423 *
1424 * Note: Only changes initialized_size and data_size, i.e. requires that
1425 * allocated_size is big enough to fit the new initialized_size.
1426 *
1427 * Return 0 on success and -error on error.
1428 *
1429 * Locking: Caller must hold vol->mftbmp_lock for writing.
1430 */
ntfs_mft_bitmap_extend_initialized_nolock(struct ntfs_volume * vol)1431 static int ntfs_mft_bitmap_extend_initialized_nolock(struct ntfs_volume *vol)
1432 {
1433 s64 old_data_size, old_initialized_size;
1434 unsigned long flags;
1435 struct inode *mftbmp_vi;
1436 struct ntfs_inode *mft_ni, *mftbmp_ni;
1437 struct ntfs_attr_search_ctx *ctx;
1438 struct mft_record *mrec;
1439 struct attr_record *a;
1440 int ret;
1441
1442 ntfs_debug("Extending mft bitmap initialized (and data) size.");
1443 mft_ni = NTFS_I(vol->mft_ino);
1444 mftbmp_vi = vol->mftbmp_ino;
1445 mftbmp_ni = NTFS_I(mftbmp_vi);
1446 /* Get the attribute record. */
1447 mrec = map_mft_record(mft_ni);
1448 if (IS_ERR(mrec)) {
1449 ntfs_error(vol->sb, "Failed to map mft record.");
1450 return PTR_ERR(mrec);
1451 }
1452 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1453 if (unlikely(!ctx)) {
1454 ntfs_error(vol->sb, "Failed to get search context.");
1455 ret = -ENOMEM;
1456 goto unm_err_out;
1457 }
1458 ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1459 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
1460 if (unlikely(ret)) {
1461 ntfs_error(vol->sb,
1462 "Failed to find first attribute extent of mft bitmap attribute.");
1463 if (ret == -ENOENT)
1464 ret = -EIO;
1465 goto put_err_out;
1466 }
1467 a = ctx->attr;
1468 write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1469 old_data_size = i_size_read(mftbmp_vi);
1470 old_initialized_size = mftbmp_ni->initialized_size;
1471 /*
1472 * We can simply update the initialized_size before filling the space
1473 * with zeroes because the caller is holding the mft bitmap lock for
1474 * writing which ensures that no one else is trying to access the data.
1475 */
1476 mftbmp_ni->initialized_size += 8;
1477 a->data.non_resident.initialized_size =
1478 cpu_to_le64(mftbmp_ni->initialized_size);
1479 if (mftbmp_ni->initialized_size > old_data_size) {
1480 i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
1481 a->data.non_resident.data_size =
1482 cpu_to_le64(mftbmp_ni->initialized_size);
1483 }
1484 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1485 /* Ensure the changes make it to disk. */
1486 mark_mft_record_dirty(ctx->ntfs_ino);
1487 ntfs_attr_put_search_ctx(ctx);
1488 unmap_mft_record(mft_ni);
1489 /* Initialize the mft bitmap attribute value with zeroes. */
1490 ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1491 if (likely(!ret)) {
1492 ntfs_debug("Done. (Wrote eight initialized bytes to mft bitmap.");
1493 ntfs_inc_free_mft_records(vol, 8 * 8);
1494 return 0;
1495 }
1496 ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1497 /* Try to recover from the error. */
1498 mrec = map_mft_record(mft_ni);
1499 if (IS_ERR(mrec)) {
1500 ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1501 NVolSetErrors(vol);
1502 return ret;
1503 }
1504 ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1505 if (unlikely(!ctx)) {
1506 ntfs_error(vol->sb, "Failed to get search context.%s", es);
1507 NVolSetErrors(vol);
1508 goto unm_err_out;
1509 }
1510 if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1511 mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
1512 ntfs_error(vol->sb,
1513 "Failed to find first attribute extent of mft bitmap attribute.%s", es);
1514 NVolSetErrors(vol);
1515 put_err_out:
1516 ntfs_attr_put_search_ctx(ctx);
1517 unm_err_out:
1518 unmap_mft_record(mft_ni);
1519 goto err_out;
1520 }
1521 a = ctx->attr;
1522 write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1523 mftbmp_ni->initialized_size = old_initialized_size;
1524 a->data.non_resident.initialized_size =
1525 cpu_to_le64(old_initialized_size);
1526 if (i_size_read(mftbmp_vi) != old_data_size) {
1527 i_size_write(mftbmp_vi, old_data_size);
1528 a->data.non_resident.data_size = cpu_to_le64(old_data_size);
1529 }
1530 write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1531 mark_mft_record_dirty(ctx->ntfs_ino);
1532 ntfs_attr_put_search_ctx(ctx);
1533 unmap_mft_record(mft_ni);
1534 #ifdef DEBUG
1535 read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1536 ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
1537 mftbmp_ni->allocated_size, i_size_read(mftbmp_vi),
1538 mftbmp_ni->initialized_size);
1539 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1540 #endif /* DEBUG */
1541 err_out:
1542 return ret;
1543 }
1544
1545 /*
1546 * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1547 * @vol: volume on which to extend the mft data attribute
1548 *
1549 * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1550 * worth of clusters or if not enough space for this by one mft record worth
1551 * of clusters.
1552 *
1553 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1554 * data_size.
1555 *
1556 * Return 0 on success and -errno on error.
1557 *
1558 * Locking: - Caller must hold vol->mftbmp_lock for writing.
1559 * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1560 * writing and releases it before returning.
1561 * - This function calls functions which take vol->lcnbmp_lock for
1562 * writing and release it before returning.
1563 */
static int ntfs_mft_data_extend_allocation_nolock(struct ntfs_volume *vol)
{
	s64 lcn;
	s64 old_last_vcn;
	s64 min_nr, nr, ll;
	unsigned long flags;
	struct ntfs_inode *mft_ni;
	struct runlist_element *rl, *rl2;
	struct ntfs_attr_search_ctx *ctx = NULL;
	struct mft_record *mrec;
	struct attr_record *a = NULL;
	int ret, mp_size;
	u32 old_alen = 0;
	bool mp_rebuilt = false, mp_extended = false;
	size_t new_rl_count;

	ntfs_debug("Extending mft data allocation.");
	mft_ni = NTFS_I(vol->mft_ino);
	/*
	 * Determine the preferred allocation location, i.e. the last lcn of
	 * the mft data attribute. The allocated size of the mft data
	 * attribute cannot be zero so we are ok to do this.
	 */
	down_write(&mft_ni->runlist.lock);
	read_lock_irqsave(&mft_ni->size_lock, flags);
	ll = mft_ni->allocated_size;
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
	rl = ntfs_attr_find_vcn_nolock(mft_ni,
			NTFS_B_TO_CLU(vol, ll - 1), NULL);
	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
		up_write(&mft_ni->runlist.lock);
		ntfs_error(vol->sb,
				"Failed to determine last allocated cluster of mft data attribute.");
		if (!IS_ERR(rl))
			ret = -EIO;
		else
			ret = PTR_ERR(rl);
		return ret;
	}
	lcn = rl->lcn + rl->length;
	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", lcn);
	/* Minimum allocation is one mft record worth of clusters. */
	min_nr = NTFS_B_TO_CLU(vol, vol->mft_record_size);
	if (!min_nr)
		min_nr = 1;
	/* Want to allocate 16 mft records worth of clusters. */
	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
	if (!nr)
		nr = min_nr;
	/* Ensure we do not go above 2^32-1 mft records. */
	read_lock_irqsave(&mft_ni->size_lock, flags);
	ll = mft_ni->allocated_size;
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
	if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >>
			vol->mft_record_size_bits >= (1ll << 32))) {
		/* Fall back to the minimal allocation before giving up. */
		nr = min_nr;
		if (unlikely((ll + NTFS_CLU_TO_B(vol, nr)) >>
				vol->mft_record_size_bits >= (1ll << 32))) {
			ntfs_warning(vol->sb,
					"Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached.");
			up_write(&mft_ni->runlist.lock);
			return -ENOSPC;
		}
	}
	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
			nr > min_nr ? "default" : "minimal", (long long)nr);
	old_last_vcn = rl[1].vcn;
	/*
	 * We can release the mft_ni runlist lock, because this function is
	 * the only one that extends the $MFT data attribute and is called with
	 * mft_ni->mrec_lock.
	 * This is required for the lock order, vol->lcnbmp_lock =>
	 * mft_ni->runlist.lock.
	 */
	up_write(&mft_ni->runlist.lock);

	/* Try the full allocation first, retry once with the minimum. */
	do {
		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
				true, false, false);
		if (!IS_ERR(rl2))
			break;
		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
			ntfs_error(vol->sb,
					"Failed to allocate the minimal number of clusters (%lli) for the mft data attribute.",
					nr);
			return PTR_ERR(rl2);
		}
		/*
		 * There is not enough space to do the allocation, but there
		 * might be enough space to do a minimal allocation so try that
		 * before failing.
		 */
		nr = min_nr;
		ntfs_debug("Retrying mft data allocation with minimal cluster count %lli.", nr);
	} while (1);

	down_write(&mft_ni->runlist.lock);
	rl = ntfs_runlists_merge(&mft_ni->runlist, rl2, 0, &new_rl_count);
	if (IS_ERR(rl)) {
		up_write(&mft_ni->runlist.lock);
		ntfs_error(vol->sb, "Failed to merge runlists for mft data attribute.");
		/* Merge failed: give the allocated clusters back. */
		if (ntfs_cluster_free_from_rl(vol, rl2)) {
			ntfs_error(vol->sb,
					"Failed to deallocate clusters from the mft data attribute.%s", es);
			NVolSetErrors(vol);
		}
		kvfree(rl2);
		return PTR_ERR(rl);
	}
	mft_ni->runlist.rl = rl;
	mft_ni->runlist.count = new_rl_count;
	ntfs_debug("Allocated %lli clusters.", (long long)nr);
	/* Find the last run in the new runlist. */
	for (; rl[1].length; rl++)
		;
	up_write(&mft_ni->runlist.lock);

	/* Update the attribute record as well. */
	mrec = map_mft_record(mft_ni);
	if (IS_ERR(mrec)) {
		ntfs_error(vol->sb, "Failed to map mft record.");
		ret = PTR_ERR(mrec);
		/*
		 * NOTE(review): this path re-takes runlist.lock before the
		 * undo, but the other goto undo_alloc paths below do not and
		 * undo_alloc never releases it - verify the intended lock
		 * state at undo_alloc.
		 */
		down_write(&mft_ni->runlist.lock);
		goto undo_alloc;
	}
	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
	if (unlikely(!ctx)) {
		ntfs_error(vol->sb, "Failed to get search context.");
		ret = -ENOMEM;
		goto undo_alloc;
	}
	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
	if (unlikely(ret)) {
		ntfs_error(vol->sb, "Failed to find last attribute extent of mft data attribute.");
		if (ret == -ENOENT)
			ret = -EIO;
		goto undo_alloc;
	}
	a = ctx->attr;
	ll = le64_to_cpu(a->data.non_resident.lowest_vcn);

	down_write(&mft_ni->runlist.lock);
	/* Search back for the previous last allocated cluster of mft data. */
	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
		if (ll >= rl2->vcn)
			break;
	}
	WARN_ON(ll < rl2->vcn);
	WARN_ON(ll >= rl2->vcn + rl2->length);
	/* Get the size for the new mapping pairs array for this extent. */
	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1, -1);
	if (unlikely(mp_size <= 0)) {
		ntfs_error(vol->sb,
				"Get size for mapping pairs failed for mft data attribute extent.");
		ret = mp_size;
		if (!ret)
			ret = -EIO;
		up_write(&mft_ni->runlist.lock);
		goto undo_alloc;
	}
	up_write(&mft_ni->runlist.lock);

	/* Expand the attribute record if necessary. */
	old_alen = le32_to_cpu(a->length);
	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
	if (unlikely(ret)) {
		/*
		 * Not enough room in the mft record: try to restructure the
		 * attribute (possibly adding an attribute list) instead.
		 */
		ret = ntfs_mft_attr_extend(mft_ni);
		if (!ret)
			goto extended_ok;
		if (ret != -EAGAIN)
			mp_extended = true;
		goto undo_alloc;
	}
	mp_rebuilt = true;
	/* Generate the mapping pairs array directly into the attr record. */
	ret = ntfs_mapping_pairs_build(vol, (u8 *)a +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
			mp_size, rl2, ll, -1, NULL, NULL, NULL);
	if (unlikely(ret)) {
		ntfs_error(vol->sb, "Failed to build mapping pairs array of mft data attribute.");
		goto undo_alloc;
	}
	/* Update the highest_vcn. */
	a->data.non_resident.highest_vcn = cpu_to_le64(rl[1].vcn - 1);
	/*
	 * We now have extended the mft data allocated_size by nr clusters.
	 * Reflect this in the struct ntfs_inode structure and the attribute record.
	 * @rl is the last (non-terminator) runlist element of mft data
	 * attribute.
	 */
	if (a->data.non_resident.lowest_vcn) {
		/*
		 * We are not in the first attribute extent, switch to it, but
		 * first ensure the changes will make it to disk later.
		 * Note: extended_ok below is also jumped into from the
		 * resize-failure path; the lookup is redone because
		 * ntfs_mft_attr_extend() may have moved the attribute.
		 */
		mark_mft_record_dirty(ctx->ntfs_ino);
extended_ok:
		ntfs_attr_reinit_search_ctx(ctx);
		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
				ctx);
		if (unlikely(ret)) {
			ntfs_error(vol->sb,
					"Failed to find first attribute extent of mft data attribute.");
			goto restore_undo_alloc;
		}
		a = ctx->attr;
	}

	/* Publish the new allocated size in the inode and the base extent. */
	write_lock_irqsave(&mft_ni->size_lock, flags);
	mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr);
	a->data.non_resident.allocated_size =
			cpu_to_le64(mft_ni->allocated_size);
	write_unlock_irqrestore(&mft_ni->size_lock, flags);
	/* Ensure the changes make it to disk. */
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(mft_ni);
	ntfs_debug("Done.");
	return 0;
restore_undo_alloc:
	ntfs_attr_reinit_search_ctx(ctx);
	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
		ntfs_error(vol->sb,
				"Failed to find last attribute extent of mft data attribute.%s", es);
		write_lock_irqsave(&mft_ni->size_lock, flags);
		mft_ni->allocated_size += NTFS_CLU_TO_B(vol, nr);
		write_unlock_irqrestore(&mft_ni->size_lock, flags);
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(mft_ni);
		/*
		 * NOTE(review): mft_ni->runlist.lock was already released
		 * above (after the mapping pairs size computation), so this
		 * up_write() looks unbalanced - verify against the lock
		 * discipline of the callers.
		 */
		up_write(&mft_ni->runlist.lock);
		/*
		 * The only thing that is now wrong is ->allocated_size of the
		 * base attribute extent which chkdsk should be able to fix.
		 */
		NVolSetErrors(vol);
		return ret;
	}
	ctx->attr->data.non_resident.highest_vcn =
			cpu_to_le64(old_last_vcn - 1);
undo_alloc:
	/* Free everything allocated beyond the original last vcn. */
	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
		ntfs_error(vol->sb, "Failed to free clusters from mft data attribute.%s", es);
		NVolSetErrors(vol);
	}

	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
		ntfs_error(vol->sb, "Failed to truncate mft data attribute runlist.%s", es);
		NVolSetErrors(vol);
	}
	if (mp_extended && ntfs_attr_update_mapping_pairs(mft_ni, 0)) {
		ntfs_error(vol->sb, "Failed to restore mapping pairs.%s",
				es);
		NVolSetErrors(vol);
	}
	if (ctx) {
		a = ctx->attr;
		if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
			/* Restore the old mapping pairs array and record size. */
			if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
					a->data.non_resident.mapping_pairs_offset),
					old_alen - le16_to_cpu(
					a->data.non_resident.mapping_pairs_offset),
					rl2, ll, -1, NULL, NULL, NULL)) {
				ntfs_error(vol->sb, "Failed to restore mapping pairs array.%s", es);
				NVolSetErrors(vol);
			}
			if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
				ntfs_error(vol->sb, "Failed to restore attribute record.%s", es);
				NVolSetErrors(vol);
			}
			mark_mft_record_dirty(ctx->ntfs_ino);
		} else if (IS_ERR(ctx->mrec)) {
			ntfs_error(vol->sb, "Failed to restore attribute search context.%s", es);
			NVolSetErrors(vol);
		}
		ntfs_attr_put_search_ctx(ctx);
	}
	if (!IS_ERR(mrec))
		unmap_mft_record(mft_ni);
	return ret;
}
1848
1849 /*
1850 * ntfs_mft_record_layout - layout an mft record into a memory buffer
1851 * @vol: volume to which the mft record will belong
1852 * @mft_no: mft reference specifying the mft record number
1853 * @m: destination buffer of size >= @vol->mft_record_size bytes
1854 *
1855 * Layout an empty, unused mft record with the mft record number @mft_no into
1856 * the buffer @m. The volume @vol is needed because the mft record structure
1857 * was modified in NTFS 3.1 so we need to know which volume version this mft
1858 * record will be used on.
1859 *
1860 * Return 0 on success and -errno on error.
1861 */
ntfs_mft_record_layout(const struct ntfs_volume * vol,const s64 mft_no,struct mft_record * m)1862 static int ntfs_mft_record_layout(const struct ntfs_volume *vol, const s64 mft_no,
1863 struct mft_record *m)
1864 {
1865 struct attr_record *a;
1866
1867 ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
1868 if (mft_no >= (1ll << 32)) {
1869 ntfs_error(vol->sb, "Mft record number 0x%llx exceeds maximum of 2^32.",
1870 (long long)mft_no);
1871 return -ERANGE;
1872 }
1873 /* Start by clearing the whole mft record to gives us a clean slate. */
1874 memset(m, 0, vol->mft_record_size);
1875 /* Aligned to 2-byte boundary. */
1876 if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
1877 m->usa_ofs = cpu_to_le16((sizeof(struct mft_record_old) + 1) & ~1);
1878 else {
1879 m->usa_ofs = cpu_to_le16((sizeof(struct mft_record) + 1) & ~1);
1880 /*
1881 * Set the NTFS 3.1+ specific fields while we know that the
1882 * volume version is 3.1+.
1883 */
1884 m->reserved = 0;
1885 m->mft_record_number = cpu_to_le32((u32)mft_no);
1886 }
1887 m->magic = magic_FILE;
1888 if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
1889 m->usa_count = cpu_to_le16(vol->mft_record_size /
1890 NTFS_BLOCK_SIZE + 1);
1891 else {
1892 m->usa_count = cpu_to_le16(1);
1893 ntfs_warning(vol->sb,
1894 "Sector size is bigger than mft record size. Setting usa_count to 1. If chkdsk reports this as corruption");
1895 }
1896 /* Set the update sequence number to 1. */
1897 *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
1898 m->lsn = 0;
1899 m->sequence_number = cpu_to_le16(1);
1900 m->link_count = 0;
1901 /*
1902 * Place the attributes straight after the update sequence array,
1903 * aligned to 8-byte boundary.
1904 */
1905 m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
1906 (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
1907 m->flags = 0;
1908 /*
1909 * Using attrs_offset plus eight bytes (for the termination attribute).
1910 * attrs_offset is already aligned to 8-byte boundary, so no need to
1911 * align again.
1912 */
1913 m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
1914 m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
1915 m->base_mft_record = 0;
1916 m->next_attr_instance = 0;
1917 /* Add the termination attribute. */
1918 a = (struct attr_record *)((u8 *)m + le16_to_cpu(m->attrs_offset));
1919 a->type = AT_END;
1920 a->length = 0;
1921 ntfs_debug("Done.");
1922 return 0;
1923 }
1924
1925 /*
1926 * ntfs_mft_record_format - format an mft record on an ntfs volume
1927 * @vol: volume on which to format the mft record
1928 * @mft_no: mft record number to format
1929 *
1930 * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
1931 * mft record into the appropriate place of the mft data attribute. This is
1932 * used when extending the mft data attribute.
1933 *
1934 * Return 0 on success and -errno on error.
1935 */
static int ntfs_mft_record_format(const struct ntfs_volume *vol, const s64 mft_no)
{
	loff_t i_size;
	struct inode *mft_vi = vol->mft_ino;
	struct folio *folio;
	struct mft_record *m;
	pgoff_t index, end_index;
	unsigned int ofs;
	int err;

	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
	/*
	 * The index into the page cache and the offset within the page cache
	 * page of the wanted mft record.
	 */
	index = NTFS_MFT_NR_TO_PIDX(vol, mft_no);
	ofs = NTFS_MFT_NR_TO_POFS(vol, mft_no);
	/* The maximum valid index into the page cache for $MFT's data. */
	i_size = i_size_read(mft_vi);
	end_index = i_size >> PAGE_SHIFT;
	/*
	 * Refuse to format a record that lies wholly or partially beyond the
	 * current data size of $MFT/$DATA: either its page is past the last
	 * full page, or the record overruns the valid bytes of the last,
	 * partial page.
	 */
	if (unlikely(index >= end_index)) {
		if (unlikely(index > end_index ||
				ofs + vol->mft_record_size > (i_size & ~PAGE_MASK))) {
			ntfs_error(vol->sb, "Tried to format non-existing mft record 0x%llx.",
					(long long)mft_no);
			return -ENOENT;
		}
	}

	/* Read, map, and pin the folio containing the mft record. */
	folio = read_mapping_folio(mft_vi->i_mapping, index, NULL);
	if (IS_ERR(folio)) {
		ntfs_error(vol->sb, "Failed to map page containing mft record to format 0x%llx.",
				(long long)mft_no);
		return PTR_ERR(folio);
	}
	/*
	 * Clear the uptodate flag while we rewrite the record in place so
	 * concurrent readers cannot observe a half-written record; it is set
	 * again below once the record is fully laid out (or restored on the
	 * error path so the folio stays usable).
	 */
	folio_lock(folio);
	folio_clear_uptodate(folio);
	m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs);
	err = ntfs_mft_record_layout(vol, mft_no, m);
	if (unlikely(err)) {
		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
				(long long)mft_no);
		folio_mark_uptodate(folio);
		folio_unlock(folio);
		kunmap_local(m);
		folio_put(folio);
		return err;
	}
	/* Apply the update sequence fixups so the on-disk image is valid. */
	pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size);
	folio_mark_uptodate(folio);
	/*
	 * Make sure the mft record is written out to disk. We could use
	 * ilookup5() to check if an inode is in icache and so on but this is
	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
	 */
	ntfs_mft_mark_dirty(folio);
	folio_unlock(folio);
	kunmap_local(m);
	folio_put(folio);
	ntfs_debug("Done.");
	return 0;
}
1999
2000 /*
2001 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2002 * @vol: [IN] volume on which to allocate the mft record
2003 * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
2004 * @ni: [OUT] on success, set to the allocated ntfs inode
2005 * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
2006 * @ni_mrec: [OUT] on successful return this is the mapped mft record
2007 *
2008 * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2009 *
2010 * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
 * directory inode, and allocate it at the default allocator position. In
2012 * this case @mode is the file mode as given to us by the caller. We in
2013 * particular use @mode to distinguish whether a file or a directory is being
2014 * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
2015 *
2016 * If @base_ni is not NULL make the allocated mft record an extent record,
2017 * allocate it starting at the mft record after the base mft record and attach
2018 * the allocated and opened ntfs inode to the base inode @base_ni. In this
2019 * case @mode must be 0 as it is meaningless for extent inodes.
2020 *
2021 * You need to check the return value with IS_ERR(). If false, the function
2022 * was successful and the return value is the now opened ntfs inode of the
2023 * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
2024 * and locked mft record. If IS_ERR() is true, the function failed and the
2025 * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
2026 * this case.
2027 *
2028 * Allocation strategy:
2029 *
2030 * To find a free mft record, we scan the mft bitmap for a zero bit. To
2031 * optimize this we start scanning at the place specified by @base_ni or if
2032 * @base_ni is NULL we start where we last stopped and we perform wrap around
2033 * when we reach the end. Note, we do not try to allocate mft records below
2034 * number 64 because numbers 0 to 15 are the defined system files anyway and 16
2035 * to 64 are special in that they are used for storing extension mft records
2036 * for the $DATA attribute of $MFT. This is required to avoid the possibility
2037 * of creating a runlist with a circular dependency which once written to disk
2038 * can never be read in again. Windows will only use records 16 to 24 for
2039 * normal files if the volume is completely out of space. We never use them
2040 * which means that when the volume is really out of space we cannot create any
2041 * more files while Windows can still create up to 8 small files. We can start
2042 * doing this at some later time, it does not matter much for now.
2043 *
2044 * When scanning the mft bitmap, we only search up to the last allocated mft
2045 * record. If there are no free records left in the range 64 to number of
2046 * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2047 * create free mft records. We extend the allocated size of $MFT/$DATA by 16
2048 * records at a time or one cluster, if cluster size is above 16kiB. If there
2049 * is not sufficient space to do this, we try to extend by a single mft record
2050 * or one cluster, if cluster size is above the mft record size.
2051 *
2052 * No matter how many mft records we allocate, we initialize only the first
2053 * allocated mft record, incrementing mft data size and initialized size
2054 * accordingly, open an struct ntfs_inode for it and return it to the caller, unless
2055 * there are less than 64 mft records, in which case we allocate and initialize
2056 * mft records until we reach record 64 which we consider as the first free mft
2057 * record for use by normal files.
2058 *
2059 * If during any stage we overflow the initialized data in the mft bitmap, we
2060 * extend the initialized size (and data size) by 8 bytes, allocating another
2061 * cluster if required. The bitmap data size has to be at least equal to the
2062 * number of mft records in the mft, but it can be bigger, in which case the
2063 * superfluous bits are padded with zeroes.
2064 *
2065 * Thus, when we return successfully (IS_ERR() is false), we will have:
2066 * - initialized / extended the mft bitmap if necessary,
2067 * - initialized / extended the mft data if necessary,
2068 * - set the bit corresponding to the mft record being allocated in the
2069 * mft bitmap,
2070 * - opened an struct ntfs_inode for the allocated mft record, and we will have
2071 * - returned the struct ntfs_inode as well as the allocated mapped, pinned, and
2072 * locked mft record.
2073 *
2074 * On error, the volume will be left in a consistent state and no record will
2075 * be allocated. If rolling back a partial operation fails, we may leave some
2076 * inconsistent metadata in which case we set NVolErrors() so the volume is
2077 * left dirty when unmounted.
2078 *
2079 * Note, this function cannot make use of most of the normal functions, like
2080 * for example for attribute resizing, etc, because when the run list overflows
2081 * the base mft record and an attribute list is used, it is very important that
2082 * the extension mft records used to store the $DATA attribute of $MFT can be
2083 * reached without having to read the information contained inside them, as
2084 * this would make it impossible to find them in the first place after the
2085 * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
2086 * rule because the bitmap is not essential for finding the mft records, but on
2087 * the other hand, handling the bitmap in this special way would make life
2088 * easier because otherwise there might be circular invocations of functions
2089 * when reading the bitmap.
2090 */
int ntfs_mft_record_alloc(struct ntfs_volume *vol, const int mode,
		struct ntfs_inode **ni, struct ntfs_inode *base_ni,
		struct mft_record **ni_mrec)
{
	s64 ll, bit, old_data_initialized, old_data_size;
	unsigned long flags;
	struct folio *folio;
	struct ntfs_inode *mft_ni, *mftbmp_ni;
	struct ntfs_attr_search_ctx *ctx;
	struct mft_record *m = NULL;
	struct attr_record *a;
	pgoff_t index;
	unsigned int ofs;
	int err;
	__le16 seq_no, usn;
	bool record_formatted = false;
	unsigned int memalloc_flags;

	/* When allocating an extent record, *@ni must not be preset. */
	if (base_ni && *ni)
		return -EINVAL;

	/* @mode and @base_ni are mutually exclusive. */
	if (mode && base_ni)
		return -EINVAL;

	if (base_ni)
		ntfs_debug("Entering (allocating an extent mft record for base mft record 0x%llx).",
				(long long)base_ni->mft_no);
	else
		ntfs_debug("Entering (allocating a base mft record).");

	/*
	 * Prevent recursion into the filesystem from memory reclaim while we
	 * hold mft locks; restored on every exit path.
	 */
	memalloc_flags = memalloc_nofs_save();

	/*
	 * NOTE(review): the FILE_MFT checks below look like they skip taking
	 * mrec_lock/mftbmp_lock when extending $MFT itself, presumably
	 * because the caller already holds them in that case — confirm
	 * against the callers.
	 */
	mft_ni = NTFS_I(vol->mft_ino);
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		mutex_lock(&mft_ni->mrec_lock);
	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
search_free_rec:
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		down_write(&vol->mftbmp_lock);
	/* First try to find and allocate a free bit in the mft bitmap. */
	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
	if (bit >= 0) {
		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
				(long long)bit);
		goto have_alloc_rec;
	}
	/* Any error other than "bitmap full" is fatal. */
	if (bit != -ENOSPC) {
		if (!base_ni || base_ni->mft_no != FILE_MFT) {
			up_write(&vol->mftbmp_lock);
			mutex_unlock(&mft_ni->mrec_lock);
		}
		memalloc_nofs_restore(memalloc_flags);
		return bit;
	}

	/* Do not extend $MFT while allocating an extent record for $MFT. */
	if (base_ni && base_ni->mft_no == FILE_MFT) {
		memalloc_nofs_restore(memalloc_flags);
		return bit;
	}

	/*
	 * No free mft records left. If the mft bitmap already covers more
	 * than the currently used mft records, the next records are all free,
	 * so we can simply allocate the first unused mft record.
	 * Note: We also have to make sure that the mft bitmap at least covers
	 * the first 24 mft records as they are special and whilst they may not
	 * be in use, we do not allocate from them.
	 */
	read_lock_irqsave(&mft_ni->size_lock, flags);
	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
	old_data_initialized = mftbmp_ni->initialized_size;
	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	if (old_data_initialized << 3 > ll &&
	    old_data_initialized > RESERVED_MFT_RECORDS / 8) {
		bit = ll;
		if (bit < RESERVED_MFT_RECORDS)
			bit = RESERVED_MFT_RECORDS;
		/* Mft record numbers must fit in 32 bits. */
		if (unlikely(bit >= (1ll << 32)))
			goto max_err_out;
		ntfs_debug("Found free record (#2), bit 0x%llx.",
				(long long)bit);
		goto found_free_rec;
	}
	/*
	 * The mft bitmap needs to be expanded until it covers the first unused
	 * mft record that we can allocate.
	 * Note: The smallest mft record we allocate is mft record 24.
	 */
	bit = old_data_initialized << 3;
	if (unlikely(bit >= (1ll << 32)))
		goto max_err_out;
	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
	old_data_size = mftbmp_ni->allocated_size;
	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
			old_data_size, i_size_read(vol->mftbmp_ino),
			old_data_initialized);
	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	if (old_data_initialized + 8 > old_data_size) {
		/* Need to extend bitmap by one more cluster. */
		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
		/* -EAGAIN means the extension must simply be retried once. */
		if (err == -EAGAIN)
			err = ntfs_mft_bitmap_extend_allocation_nolock(vol);

		if (unlikely(err)) {
			if (!base_ni || base_ni->mft_no != FILE_MFT)
				up_write(&vol->mftbmp_lock);
			goto err_out;
		}
#ifdef DEBUG
		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
		ntfs_debug("Status of mftbmp after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
				mftbmp_ni->allocated_size,
				i_size_read(vol->mftbmp_ino),
				mftbmp_ni->initialized_size);
		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
#endif /* DEBUG */
	}
	/*
	 * We now have sufficient allocated space, extend the initialized_size
	 * as well as the data_size if necessary and fill the new space with
	 * zeroes.
	 */
	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
	if (unlikely(err)) {
		if (!base_ni || base_ni->mft_no != FILE_MFT)
			up_write(&vol->mftbmp_lock);
		goto err_out;
	}
#ifdef DEBUG
	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
	ntfs_debug("Status of mftbmp after initialized extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
			mftbmp_ni->allocated_size,
			i_size_read(vol->mftbmp_ino),
			mftbmp_ni->initialized_size);
	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
#endif /* DEBUG */
	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
found_free_rec:
	/* @bit is the found free mft record, allocate it in the mft bitmap. */
	ntfs_debug("At found_free_rec.");
	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
	if (unlikely(err)) {
		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
		if (!base_ni || base_ni->mft_no != FILE_MFT)
			up_write(&vol->mftbmp_lock);
		goto err_out;
	}
	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
have_alloc_rec:
	/*
	 * The mft bitmap is now uptodate. Deal with mft data attribute now.
	 * Note, we keep hold of the mft bitmap lock for writing until all
	 * modifications to the mft data attribute are complete, too, as they
	 * will impact decisions for mft bitmap and mft record allocation done
	 * by a parallel allocation and if the lock is not maintained a
	 * parallel allocation could allocate the same mft record as this one.
	 */
	ll = (bit + 1) << vol->mft_record_size_bits;
	read_lock_irqsave(&mft_ni->size_lock, flags);
	old_data_initialized = mft_ni->initialized_size;
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
	if (ll <= old_data_initialized) {
		ntfs_debug("Allocated mft record already initialized.");
		goto mft_rec_already_initialized;
	}
	ntfs_debug("Initializing allocated mft record.");
	/*
	 * The mft record is outside the initialized data. Extend the mft data
	 * attribute until it covers the allocated record. The loop is only
	 * actually traversed more than once when a freshly formatted volume is
	 * first written to so it optimizes away nicely in the common case.
	 */
	if (!base_ni || base_ni->mft_no != FILE_MFT) {
		read_lock_irqsave(&mft_ni->size_lock, flags);
		ntfs_debug("Status of mft data before extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
				mft_ni->allocated_size, i_size_read(vol->mft_ino),
				mft_ni->initialized_size);
		while (ll > mft_ni->allocated_size) {
			read_unlock_irqrestore(&mft_ni->size_lock, flags);
			err = ntfs_mft_data_extend_allocation_nolock(vol);
			/* -EAGAIN means the extension must be retried once. */
			if (err == -EAGAIN)
				err = ntfs_mft_data_extend_allocation_nolock(vol);

			if (unlikely(err)) {
				ntfs_error(vol->sb, "Failed to extend mft data allocation.");
				goto undo_mftbmp_alloc_nolock;
			}
			read_lock_irqsave(&mft_ni->size_lock, flags);
			ntfs_debug("Status of mft data after allocation extension: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
					mft_ni->allocated_size, i_size_read(vol->mft_ino),
					mft_ni->initialized_size);
		}
		read_unlock_irqrestore(&mft_ni->size_lock, flags);
	} else if (ll > mft_ni->allocated_size) {
		/* Cannot extend $MFT while allocating a record for $MFT. */
		err = -ENOSPC;
		goto undo_mftbmp_alloc_nolock;
	}
	/*
	 * Extend mft data initialized size (and data size of course) to reach
	 * the allocated mft record, formatting the mft records along the way.
	 * Note: We only modify the struct ntfs_inode structure as that is all that is
	 * needed by ntfs_mft_record_format(). We will update the attribute
	 * record itself in one fell swoop later on.
	 */
	write_lock_irqsave(&mft_ni->size_lock, flags);
	old_data_initialized = mft_ni->initialized_size;
	old_data_size = vol->mft_ino->i_size;
	while (ll > mft_ni->initialized_size) {
		s64 new_initialized_size, mft_no;

		new_initialized_size = mft_ni->initialized_size +
				vol->mft_record_size;
		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
		if (new_initialized_size > i_size_read(vol->mft_ino))
			i_size_write(vol->mft_ino, new_initialized_size);
		/* Drop the lock across the format which may block on I/O. */
		write_unlock_irqrestore(&mft_ni->size_lock, flags);
		ntfs_debug("Initializing mft record 0x%llx.",
				(long long)mft_no);
		err = ntfs_mft_record_format(vol, mft_no);
		if (unlikely(err)) {
			ntfs_error(vol->sb, "Failed to format mft record.");
			goto undo_data_init;
		}
		write_lock_irqsave(&mft_ni->size_lock, flags);
		mft_ni->initialized_size = new_initialized_size;
	}
	write_unlock_irqrestore(&mft_ni->size_lock, flags);
	record_formatted = true;
	/* Update the mft data attribute record to reflect the new sizes. */
	m = map_mft_record(mft_ni);
	if (IS_ERR(m)) {
		ntfs_error(vol->sb, "Failed to map mft record.");
		err = PTR_ERR(m);
		goto undo_data_init;
	}
	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
	if (unlikely(!ctx)) {
		ntfs_error(vol->sb, "Failed to get search context.");
		err = -ENOMEM;
		unmap_mft_record(mft_ni);
		goto undo_data_init;
	}
	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		ntfs_error(vol->sb, "Failed to find first attribute extent of mft data attribute.");
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(mft_ni);
		goto undo_data_init;
	}
	a = ctx->attr;
	/* Copy the new sizes into the on-disk attribute record. */
	read_lock_irqsave(&mft_ni->size_lock, flags);
	a->data.non_resident.initialized_size =
			cpu_to_le64(mft_ni->initialized_size);
	a->data.non_resident.data_size =
			cpu_to_le64(i_size_read(vol->mft_ino));
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
	/* Ensure the changes make it to disk. */
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(mft_ni);
	read_lock_irqsave(&mft_ni->size_lock, flags);
	ntfs_debug("Status of mft data after mft record initialization: allocated_size 0x%llx, data_size 0x%llx, initialized_size 0x%llx.",
			mft_ni->allocated_size, i_size_read(vol->mft_ino),
			mft_ni->initialized_size);
	WARN_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
	WARN_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
	read_unlock_irqrestore(&mft_ni->size_lock, flags);
mft_rec_already_initialized:
	/*
	 * We can finally drop the mft bitmap lock as the mft data attribute
	 * has been fully updated. The only disparity left is that the
	 * allocated mft record still needs to be marked as in use to match the
	 * set bit in the mft bitmap but this is actually not a problem since
	 * this mft record is not referenced from anywhere yet and the fact
	 * that it is allocated in the mft bitmap means that no-one will try to
	 * allocate it either.
	 */
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		up_write(&vol->mftbmp_lock);
	/*
	 * We now have allocated and initialized the mft record. Calculate the
	 * index of and the offset within the page cache page the record is in.
	 */
	index = NTFS_MFT_NR_TO_PIDX(vol, bit);
	ofs = NTFS_MFT_NR_TO_POFS(vol, bit);
	/* Read, map, and pin the folio containing the mft record. */
	folio = read_mapping_folio(vol->mft_ino->i_mapping, index, NULL);
	if (IS_ERR(folio)) {
		ntfs_error(vol->sb, "Failed to map page containing allocated mft record 0x%llx.",
				bit);
		err = PTR_ERR(folio);
		goto undo_mftbmp_alloc;
	}
	folio_lock(folio);
	folio_clear_uptodate(folio);
	m = (struct mft_record *)((u8 *)kmap_local_folio(folio, 0) + ofs);
	/* If we just formatted the mft record no need to do it again. */
	if (!record_formatted) {
		/* Sanity check that the mft record is really not in use. */
		if (ntfs_is_file_record(m->magic) &&
		    (m->flags & MFT_RECORD_IN_USE)) {
			ntfs_warning(vol->sb,
				"Mft record 0x%llx was marked free in mft bitmap but is marked used itself. Unmount and run chkdsk.",
				bit);
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			kunmap_local(m);
			folio_put(folio);
			NVolSetErrors(vol);
			/* Leave this record allocated and look for another. */
			goto search_free_rec;
		}
		/*
		 * We need to (re-)format the mft record, preserving the
		 * sequence number if it is not zero as well as the update
		 * sequence number if it is not zero or -1 (0xffff). This
		 * means we do not need to care whether or not something went
		 * wrong with the previous mft record.
		 */
		seq_no = m->sequence_number;
		usn = *(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs));
		err = ntfs_mft_record_layout(vol, bit, m);
		if (unlikely(err)) {
			ntfs_error(vol->sb, "Failed to layout allocated mft record 0x%llx.",
					bit);
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			kunmap_local(m);
			folio_put(folio);
			goto undo_mftbmp_alloc;
		}
		if (seq_no)
			m->sequence_number = seq_no;
		if (usn && le16_to_cpu(usn) != 0xffff)
			*(__le16 *)((u8 *)m + le16_to_cpu(m->usa_ofs)) = usn;
		/* Apply the update sequence fixups for the on-disk image. */
		pre_write_mst_fixup((struct ntfs_record *)m, vol->mft_record_size);
	}
	/* Set the mft record itself in use. */
	m->flags |= MFT_RECORD_IN_USE;
	if (S_ISDIR(mode))
		m->flags |= MFT_RECORD_IS_DIRECTORY;
	folio_mark_uptodate(folio);
	if (base_ni) {
		struct mft_record *m_tmp;

		/*
		 * Setup the base mft record in the extent mft record. This
		 * completes initialization of the allocated extent mft record
		 * and we can simply use it with map_extent_mft_record().
		 */
		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
				base_ni->seq_no);
		/*
		 * Allocate an extent inode structure for the new mft record,
		 * attach it to the base inode @base_ni and map, pin, and lock
		 * its, i.e. the allocated, mft record.
		 */
		m_tmp = map_extent_mft_record(base_ni,
				MK_MREF(bit, le16_to_cpu(m->sequence_number)),
				ni);
		if (IS_ERR(m_tmp)) {
			ntfs_error(vol->sb, "Failed to map allocated extent mft record 0x%llx.",
					bit);
			err = PTR_ERR(m_tmp);
			/* Set the mft record itself not in use. */
			m->flags &= cpu_to_le16(
					~le16_to_cpu(MFT_RECORD_IN_USE));
			/* Make sure the mft record is written out to disk. */
			ntfs_mft_mark_dirty(folio);
			folio_unlock(folio);
			kunmap_local(m);
			folio_put(folio);
			goto undo_mftbmp_alloc;
		}

		/*
		 * Make sure the allocated mft record is written out to disk.
		 * No need to set the inode dirty because the caller is going
		 * to do that anyway after finishing with the new extent mft
		 * record (e.g. at a minimum a new attribute will be added to
		 * the mft record.
		 */
		ntfs_mft_mark_dirty(folio);
		folio_unlock(folio);
		/*
		 * Need to unmap the page since map_extent_mft_record() mapped
		 * it as well so we have it mapped twice at the moment.
		 */
		kunmap_local(m);
		folio_put(folio);
	} else {
		/*
		 * Manually map, pin, and lock the mft record as we already
		 * have its page mapped and it is very easy to do.
		 */
		(*ni)->seq_no = le16_to_cpu(m->sequence_number);
		/*
		 * Make sure the allocated mft record is written out to disk.
		 * NOTE: We do not set the ntfs inode dirty because this would
		 * fail in ntfs_write_inode() because the inode does not have a
		 * standard information attribute yet. Also, there is no need
		 * to set the inode dirty because the caller is going to do
		 * that anyway after finishing with the new mft record (e.g. at
		 * a minimum some new attributes will be added to the mft
		 * record.
		 */

		(*ni)->mrec = kmalloc(vol->mft_record_size, GFP_NOFS);
		if (!(*ni)->mrec) {
			folio_unlock(folio);
			kunmap_local(m);
			folio_put(folio);
			err = -ENOMEM;
			goto undo_mftbmp_alloc;
		}

		/*
		 * Keep a private, fixed-up copy of the record in the inode;
		 * the folio itself retains the on-disk (fixed-up) image.
		 */
		memcpy((*ni)->mrec, m, vol->mft_record_size);
		post_read_mst_fixup((struct ntfs_record *)(*ni)->mrec, vol->mft_record_size);
		ntfs_mft_mark_dirty(folio);
		folio_unlock(folio);
		(*ni)->folio = folio;
		(*ni)->folio_ofs = ofs;
		atomic_inc(&(*ni)->count);
		/* Update the default mft allocation position. */
		vol->mft_data_pos = bit + 1;
	}
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		mutex_unlock(&mft_ni->mrec_lock);
	memalloc_nofs_restore(memalloc_flags);

	/*
	 * Return the opened, allocated inode of the allocated mft record as
	 * well as the mapped, pinned, and locked mft record.
	 */
	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
			base_ni ? "extent " : "", bit);
	(*ni)->mft_no = bit;
	if (ni_mrec)
		*ni_mrec = (*ni)->mrec;
	ntfs_dec_free_mft_records(vol, 1);
	return 0;
undo_data_init:
	/* Roll back the $MFT size extensions done above. */
	write_lock_irqsave(&mft_ni->size_lock, flags);
	mft_ni->initialized_size = old_data_initialized;
	i_size_write(vol->mft_ino, old_data_size);
	write_unlock_irqrestore(&mft_ni->size_lock, flags);
	goto undo_mftbmp_alloc_nolock;
undo_mftbmp_alloc:
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		down_write(&vol->mftbmp_lock);
undo_mftbmp_alloc_nolock:
	/* Free the bit we allocated in the mft bitmap. */
	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
		NVolSetErrors(vol);
	}
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		up_write(&vol->mftbmp_lock);
err_out:
	if (!base_ni || base_ni->mft_no != FILE_MFT)
		mutex_unlock(&mft_ni->mrec_lock);
	memalloc_nofs_restore(memalloc_flags);
	return err;
max_err_out:
	ntfs_warning(vol->sb,
		"Cannot allocate mft record because the maximum number of inodes (2^32) has already been reached.");
	if (!base_ni || base_ni->mft_no != FILE_MFT) {
		up_write(&vol->mftbmp_lock);
		mutex_unlock(&mft_ni->mrec_lock);
	}
	memalloc_nofs_restore(memalloc_flags);
	return -ENOSPC;
}
2566
/*
 * ntfs_mft_record_free - free an mft record on an ntfs volume
 * @vol:	volume on which to free the mft record
 * @ni:		open ntfs inode of the mft record to free
 *
 * Free the mft record of the open inode @ni on the mounted ntfs volume @vol:
 * mark the record as not in use, bump its sequence number, write the record
 * back to disk and clear the record's bit in the $MFT/$BITMAP attribute.
 *
 * Note that this function only unmaps the mft record it mapped; disposing of
 * @ni itself remains the caller's responsibility.
 *
 * Return 0 on success and a negative error code on failure.  On failure a
 * best-effort rollback of the on-disk changes is attempted and, if that
 * fails too, the volume is left with inconsistent metadata (and an error is
 * logged).
 */
int ntfs_mft_record_free(struct ntfs_volume *vol, struct ntfs_inode *ni)
{
	u64 mft_no;
	int err;
	u16 seq_no;
	__le16 old_seq_no;
	struct mft_record *ni_mrec;
	unsigned int memalloc_flags;
	struct ntfs_inode *base_ni;

	if (!vol || !ni)
		return -EINVAL;

	ntfs_debug("Entering for inode 0x%llx.\n", (long long)ni->mft_no);

	ni_mrec = map_mft_record(ni);
	if (IS_ERR(ni_mrec))
		return -EIO;

	/* Cache the mft record number: it is the bit index in the bitmap. */
	mft_no = ni->mft_no;

	/* Mark the mft record as not in use. */
	ni_mrec->flags &= ~MFT_RECORD_IN_USE;

	/*
	 * Increment the sequence number, skipping zero, if it is not zero:
	 * 0xffff wraps around to 1 and a stored value of 0 is left alone.
	 * Keep the old little-endian value around for rollback.
	 */
	old_seq_no = ni_mrec->sequence_number;
	seq_no = le16_to_cpu(old_seq_no);
	if (seq_no == 0xffff)
		seq_no = 1;
	else if (seq_no)
		seq_no++;
	ni_mrec->sequence_number = cpu_to_le16(seq_no);

	/*
	 * Resolve the on-disk location of this mft record; the $MFT runlist
	 * lock is needed around the lookup.
	 */
	down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
	err = ntfs_get_block_mft_record(NTFS_I(vol->mft_ino), ni);
	up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
	if (err) {
		unmap_mft_record(ni);
		return err;
	}

	/*
	 * Set the ntfs inode dirty and write it out. We do not need to worry
	 * about the base inode here since whatever caused the extent mft
	 * record to be freed is guaranteed to do it already.
	 */
	NInoSetDirty(ni);
	err = write_mft_record(ni, ni_mrec, 0);
	if (err)
		goto sync_rollback;

	/* A negative nr_extents marks @ni as an extent inode. */
	if (likely(ni->nr_extents >= 0))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;

	/*
	 * Clear the bit in the $MFT/$BITMAP corresponding to this record.
	 * NOTE(review): mftbmp_lock is skipped when the base inode is $MFT
	 * itself - presumably the caller already holds it on that path;
	 * confirm against the call sites.
	 */
	memalloc_flags = memalloc_nofs_save();
	if (base_ni->mft_no != FILE_MFT)
		down_write(&vol->mftbmp_lock);
	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
	if (base_ni->mft_no != FILE_MFT)
		up_write(&vol->mftbmp_lock);
	memalloc_nofs_restore(memalloc_flags);
	if (err)
		goto bitmap_rollback;

	unmap_mft_record(ni);
	ntfs_inc_free_mft_records(vol, 1);
	return 0;

	/* Rollback what we did... */
bitmap_rollback:
	/* Try to restore the bitmap bit before undoing the record changes. */
	memalloc_flags = memalloc_nofs_save();
	if (base_ni->mft_no != FILE_MFT)
		down_write(&vol->mftbmp_lock);
	if (ntfs_bitmap_set_bit(vol->mftbmp_ino, mft_no))
		ntfs_error(vol->sb, "ntfs_bitmap_set_bit failed in bitmap_rollback\n");
	if (base_ni->mft_no != FILE_MFT)
		up_write(&vol->mftbmp_lock);
	memalloc_nofs_restore(memalloc_flags);
sync_rollback:
	/*
	 * Restore the in-use flag and old sequence number and write the
	 * record back out.  The error from write_mft_record() is ignored
	 * here: we are already on a failure path and return the original
	 * @err to the caller.
	 */
	ntfs_error(vol->sb,
		   "Eeek! Rollback failed in %s. Leaving inconsistent metadata!\n", __func__);
	ni_mrec->flags |= MFT_RECORD_IN_USE;
	ni_mrec->sequence_number = old_seq_no;
	NInoSetDirty(ni);
	write_mft_record(ni, ni_mrec, 0);
	unmap_mft_record(ni);
	return err;
}
2670
lcn_from_index(struct ntfs_volume * vol,struct ntfs_inode * ni,unsigned long index)2671 static s64 lcn_from_index(struct ntfs_volume *vol, struct ntfs_inode *ni,
2672 unsigned long index)
2673 {
2674 s64 vcn;
2675 s64 lcn;
2676
2677 vcn = ntfs_pidx_to_cluster(vol, index);
2678
2679 down_read(&ni->runlist.lock);
2680 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn, false);
2681 up_read(&ni->runlist.lock);
2682
2683 return lcn;
2684 }
2685
/*
 * ntfs_write_mft_block - Write back a folio containing MFT records
 * @folio: The folio to write back (contains one or more MFT records)
 * @wbc: Writeback control structure
 *
 * This function is called as part of the address_space_operations
 * .writepages implementation for the $MFT inode (or $MFTMirr).
 * It handles writing one folio (normally 4KiB page) worth of MFT records
 * to the underlying block device.
 *
 * Return: 0 on success, or -errno on error.
 */
static int ntfs_write_mft_block(struct folio *folio, struct writeback_control *wbc)
{
	struct address_space *mapping = folio->mapping;
	struct inode *vi = mapping->host;
	struct ntfs_inode *ni = NTFS_I(vi);
	struct ntfs_volume *vol = ni->vol;
	u8 *kaddr;
	/* One slot per NTFS block in the page is an upper bound on records. */
	struct ntfs_inode **locked_nis __free(kfree) = kmalloc_array(PAGE_SIZE / NTFS_BLOCK_SIZE,
			sizeof(struct ntfs_inode *), GFP_NOFS);
	int nr_locked_nis = 0, err = 0, mft_ofs, prev_mft_ofs;
	/* Inodes whose references must be dropped after the write completes. */
	struct inode **ref_inos __free(kfree) = kmalloc_array(PAGE_SIZE / NTFS_BLOCK_SIZE,
			sizeof(struct inode *), GFP_NOFS);
	int nr_ref_inos = 0;
	struct bio *bio = NULL;
	u64 mft_no;
	struct ntfs_inode *tni;
	s64 lcn;
	s64 vcn = ntfs_pidx_to_cluster(vol, folio->index);
	s64 end_vcn = ntfs_bytes_to_cluster(vol, ni->allocated_size);
	unsigned int folio_sz;
	struct runlist_element *rl = NULL;
	loff_t i_size = i_size_read(vi);

	ntfs_debug("Entering for inode 0x%llx, attribute type 0x%x, folio index 0x%lx.",
		   ni->mft_no, ni->type, folio->index);

	if (!locked_nis || !ref_inos)
		return -ENOMEM;

	/* We have to zero every time due to mmap-at-end-of-file. */
	if (folio->index >= (i_size >> folio_shift(folio)))
		/* The page straddles i_size. */
		folio_zero_segment(folio,
				   offset_in_folio(folio, i_size),
				   folio_size(folio));

	/* A hole or an error here means the $MFT mapping is corrupt. */
	lcn = lcn_from_index(vol, ni, folio->index);
	if (lcn <= LCN_HOLE) {
		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);
		return -EIO;
	}

	/* Map folio so we can access its contents. */
	kaddr = kmap_local_folio(folio, 0);
	/* Clear the page uptodate flag whilst the mst fixups are applied. */
	folio_clear_uptodate(folio);

	/* Walk the folio one mft record at a time. */
	for (mft_ofs = 0; mft_ofs < PAGE_SIZE && vcn < end_vcn;
	     mft_ofs += vol->mft_record_size) {
		/* Get the mft record number. */
		mft_no = (((s64)folio->index << PAGE_SHIFT) + mft_ofs) >>
				vol->mft_record_size_bits;
		vcn = ntfs_mft_no_to_cluster(vol, mft_no);
		/* Check whether to write this mft record. */
		tni = NULL;
		if (ntfs_may_write_mft_record(vol, mft_no,
					      (struct mft_record *)(kaddr + mft_ofs),
					      &tni, &ref_inos[nr_ref_inos])) {
			unsigned int mft_record_off = 0;
			s64 vcn_off = vcn;

			/*
			 * Skip $MFT extent mft records and let them be written
			 * by writeback to avoid deadlocks: the $MFT runlist
			 * lock must be taken before $MFT extent mrec_lock is taken.
			 */
			if (tni && tni->nr_extents < 0 &&
			    tni->ext.base_ntfs_ino == NTFS_I(vol->mft_ino)) {
				mutex_unlock(&tni->mrec_lock);
				atomic_dec(&tni->count);
				iput(vol->mft_ino);
				continue;
			}

			/*
			 * The record should be written. If a locked ntfs
			 * inode was returned, add it to the array of locked
			 * ntfs inodes.
			 */
			if (tni)
				locked_nis[nr_locked_nis++] = tni;
			else if (ref_inos[nr_ref_inos])
				nr_ref_inos++;

			/*
			 * Flush the pending bio if this record is not
			 * contiguous (in the folio) with the previous one.
			 * prev_mft_ofs is only read when bio != NULL, i.e.
			 * after a previous record set it below.
			 */
			if (bio && (mft_ofs != prev_mft_ofs + vol->mft_record_size)) {
				/*
				 * Also reached via "goto flush_bio" below when
				 * a record straddles a cluster boundary.
				 */
flush_bio:
				bio->bi_end_io = ntfs_bio_end_io;
				submit_bio(bio);
				bio = NULL;
			}

			/*
			 * With clusters smaller than the folio, each record
			 * piece needs its own lcn lookup and the bio must be
			 * flushed whenever the target cluster changes.
			 */
			if (vol->cluster_size < folio_size(folio)) {
				down_write(&ni->runlist.lock);
				rl = ntfs_attr_vcn_to_rl(ni, vcn_off, &lcn);
				up_write(&ni->runlist.lock);
				if (IS_ERR(rl) || lcn < 0) {
					err = -EIO;
					goto unm_done;
				}

				if (bio &&
				    (bio_end_sector(bio) >> (vol->cluster_size_bits - 9)) !=
				    lcn) {
					bio->bi_end_io = ntfs_bio_end_io;
					submit_bio(bio);
					bio = NULL;
				}
			}

			/* Start a new bio at the record's on-disk position. */
			if (!bio) {
				unsigned int off;

				off = ((mft_no << vol->mft_record_size_bits) +
				       mft_record_off) & vol->cluster_size_mask;

				bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE,
						GFP_NOIO);
				bio->bi_iter.bi_sector =
					ntfs_bytes_to_sector(vol,
						ntfs_cluster_to_bytes(vol, lcn) + off);
			}

			/*
			 * Decide how much of the record to add: a single NTFS
			 * block when the record may be split across clusters
			 * or run off the page, otherwise the whole record.
			 */
			if (vol->cluster_size == NTFS_BLOCK_SIZE &&
			    (mft_record_off ||
			     (rl && rl->length - (vcn_off - rl->vcn) == 1) ||
			     mft_ofs + NTFS_BLOCK_SIZE >= PAGE_SIZE))
				folio_sz = NTFS_BLOCK_SIZE;
			else
				folio_sz = vol->mft_record_size;
			if (!bio_add_folio(bio, folio, folio_sz,
					   mft_ofs + mft_record_off)) {
				err = -EIO;
				bio_put(bio);
				goto unm_done;
			}
			mft_record_off += folio_sz;

			/*
			 * Record not fully queued yet: advance to the next
			 * vcn and loop back to flush and continue with the
			 * remainder of this same record.
			 */
			if (mft_record_off != vol->mft_record_size) {
				vcn_off++;
				goto flush_bio;
			}
			prev_mft_ofs = mft_ofs;

			/* Keep the $MFTMirr copy of the first records in sync. */
			if (mft_no < vol->mftmirr_size)
				ntfs_sync_mft_mirror(vol, mft_no,
					(struct mft_record *)(kaddr + mft_ofs));
		} else if (ref_inos[nr_ref_inos])
			nr_ref_inos++;
	}

	/* Submit whatever is still pending. */
	if (bio) {
		bio->bi_end_io = ntfs_bio_end_io;
		submit_bio(bio);
	}
unm_done:
	folio_mark_uptodate(folio);
	kunmap_local(kaddr);

	folio_start_writeback(folio);
	folio_unlock(folio);
	folio_end_writeback(folio);

	/* Unlock any locked inodes. */
	while (nr_locked_nis-- > 0) {
		struct ntfs_inode *base_tni;

		tni = locked_nis[nr_locked_nis];
		mutex_unlock(&tni->mrec_lock);

		/* Get the base inode. */
		mutex_lock(&tni->extent_lock);
		if (tni->nr_extents >= 0)
			base_tni = tni;
		else
			base_tni = tni->ext.base_ntfs_ino;
		mutex_unlock(&tni->extent_lock);
		ntfs_debug("Unlocking %s inode 0x%llx.",
			   tni == base_tni ? "base" : "extent",
			   tni->mft_no);
		atomic_dec(&tni->count);
		iput(VFS_I(base_tni));
	}

	/* Dropping deferred references */
	while (nr_ref_inos-- > 0) {
		if (ref_inos[nr_ref_inos])
			iput(ref_inos[nr_ref_inos]);
	}

	/* -ENOMEM is transient; anything else marks the volume dirty. */
	if (unlikely(err && err != -ENOMEM))
		NVolSetErrors(vol);
	if (likely(!err))
		ntfs_debug("Done.");
	return err;
}
2895
2896 /*
2897 * ntfs_mft_writepages - Write back dirty folios for the $MFT inode
2898 * @mapping: address space of the $MFT inode
2899 * @wbc: writeback control
2900 *
2901 * Writeback iterator for MFT records. Iterates over dirty folios and
2902 * delegates actual writing to ntfs_write_mft_block() for each folio.
2903 * Called from the address_space_operations .writepages vector of the
2904 * $MFT inode.
2905 *
2906 * Returns 0 on success, or the first error encountered.
2907 */
ntfs_mft_writepages(struct address_space * mapping,struct writeback_control * wbc)2908 int ntfs_mft_writepages(struct address_space *mapping,
2909 struct writeback_control *wbc)
2910 {
2911 struct folio *folio = NULL;
2912 int error;
2913
2914 if (NVolShutdown(NTFS_I(mapping->host)->vol))
2915 return -EIO;
2916
2917 while ((folio = writeback_iter(mapping, wbc, folio, &error)))
2918 error = ntfs_write_mft_block(folio, wbc);
2919 return error;
2920 }
2921
/*
 * ntfs_mft_mark_dirty - dirty a folio of the $MFT address space
 * @folio: folio to mark dirty
 *
 * Thin wrapper delegating to iomap_dirty_folio().
 * NOTE(review): presumably wired up as the .dirty_folio address_space
 * operation for the $MFT inode - confirm against the aops table.
 */
void ntfs_mft_mark_dirty(struct folio *folio)
{
	iomap_dirty_folio(folio->mapping, folio);
}
2926