xref: /linux/fs/ntfs/lcnalloc.c (revision cdd4dc3aebeab43a72ce0bc2b5bab6f0a80b97a5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Cluster (de)allocation code.
4  *
5  * Copyright (c) 2004-2005 Anton Altaparmakov
6  * Copyright (c) 2025 LG Electronics Co., Ltd.
7  *
 * Part of this file is based on code from the NTFS-3G project
 * and is copyrighted by the respective authors below:
10  * Copyright (c) 2002-2004 Anton Altaparmakov
11  * Copyright (c) 2004 Yura Pakhuchiy
12  * Copyright (c) 2004-2008 Szabolcs Szakacsits
13  * Copyright (c) 2008-2009 Jean-Pierre Andre
14  */
15 
16 #include <linux/blkdev.h>
17 
18 #include "lcnalloc.h"
19 #include "bitmap.h"
20 #include "ntfs.h"
21 
22 /*
23  * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
24  * @vol:	mounted ntfs volume on which to free the clusters
25  * @rl:		runlist describing the clusters to free
26  *
27  * Free all the clusters described by the runlist @rl on the volume @vol.  In
28  * the case of an error being returned, at least some of the clusters were not
29  * freed.
30  *
31  * Return 0 on success and -errno on error.
32  *
33  * Locking: - The volume lcn bitmap must be locked for writing on entry and is
34  *	      left locked on return.
35  */
36 int ntfs_cluster_free_from_rl_nolock(struct ntfs_volume *vol,
37 		const struct runlist_element *rl)
38 {
39 	struct inode *lcnbmp_vi = vol->lcnbmp_ino;
40 	int ret = 0;
41 	s64 nr_freed = 0;
42 
43 	ntfs_debug("Entering.");
44 	if (!rl)
45 		return 0;
46 
47 	if (!NVolFreeClusterKnown(vol))
48 		wait_event(vol->free_waitq, NVolFreeClusterKnown(vol));
49 
50 	for (; rl->length; rl++) {
51 		int err;
52 
53 		if (rl->lcn < 0)
54 			continue;
55 		err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
56 		if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
57 			ret = err;
58 		else
59 			nr_freed += rl->length;
60 	}
61 	ntfs_inc_free_clusters(vol, nr_freed);
62 	ntfs_debug("Done.");
63 	return ret;
64 }
65 
66 static s64 max_empty_bit_range(unsigned char *buf, int size)
67 {
68 	int i, j, run = 0;
69 	int max_range = 0;
70 	s64 start_pos = -1;
71 
72 	ntfs_debug("Entering\n");
73 
74 	i = 0;
75 	while (i < size) {
76 		switch (*buf) {
77 		case 0:
78 			do {
79 				buf++;
80 				run += 8;
81 				i++;
82 			} while ((i < size) && !*buf);
83 			break;
84 		case 255:
85 			if (run > max_range) {
86 				max_range = run;
87 				start_pos = (s64)i * 8 - run;
88 			}
89 			run = 0;
90 			do {
91 				buf++;
92 				i++;
93 			} while ((i < size) && (*buf == 255));
94 			break;
95 		default:
96 			for (j = 0; j < 8; j++) {
97 				int bit = *buf & (1 << j);
98 
99 				if (bit) {
100 					if (run > max_range) {
101 						max_range = run;
102 						start_pos = (s64)i * 8 + (j - run);
103 					}
104 					run = 0;
105 				} else
106 					run++;
107 			}
108 			i++;
109 			buf++;
110 		}
111 	}
112 
113 	if (run > max_range)
114 		start_pos = (s64)i * 8 - run;
115 
116 	return start_pos;
117 }
118 
119 /*
120  * ntfs_cluster_alloc - allocate clusters on an ntfs volume
121  * @vol:		mounted ntfs volume on which to allocate clusters
122  * @start_vcn:		vcn of the first allocated cluster
123  * @count:		number of clusters to allocate
124  * @start_lcn:		starting lcn at which to allocate the clusters or -1 if none
125  * @zone:		zone from which to allocate (MFT_ZONE or DATA_ZONE)
126  * @is_extension:	if true, the caller is extending an attribute
127  * @is_contig:		if true, require contiguous allocation
128  * @is_dealloc:		if true, the allocation is for deallocation purposes
129  *
130  * Allocate @count clusters preferably starting at cluster @start_lcn or at the
131  * current allocator position if @start_lcn is -1, on the mounted ntfs volume
132  * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
133  * MFT_ZONE for allocation of clusters for the master file table, i.e. the
134  * $MFT/$DATA attribute.
135  *
136  * @start_vcn specifies the vcn of the first allocated cluster.  This makes
137  * merging the resulting runlist with the old runlist easier.
138  *
139  * If @is_extension is 'true', the caller is allocating clusters to extend an
140  * attribute and if it is 'false', the caller is allocating clusters to fill a
141  * hole in an attribute.  Practically the difference is that if @is_extension
142  * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
143  * @is_extension is 'false' the runlist will be terminated with
144  * LCN_RL_NOT_MAPPED.
145  *
146  * You need to check the return value with IS_ERR().  If this is false, the
147  * function was successful and the return value is a runlist describing the
148  * allocated cluster(s).  If IS_ERR() is true, the function failed and
149  * PTR_ERR() gives you the error code.
150  *
151  * Notes on the allocation algorithm
152  * =================================
153  *
154  * There are two data zones.  First is the area between the end of the mft zone
155  * and the end of the volume, and second is the area between the start of the
156  * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
157  * volumes, the second data zone does not exist due to the mft zone being
158  * expanded to cover the start of the volume in order to reserve space for the
159  * mft bitmap attribute.
160  *
161  * This is not the prettiest function but the complexity stems from the need of
162  * implementing the mft vs data zoned approach and from the fact that we have
163  * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
164  * need to cope with crossing over boundaries of two buffers.  Further, the
165  * fact that the allocator allows for caller supplied hints as to the location
166  * of where allocation should begin and the fact that the allocator keeps track
167  * of where in the data zones the next natural allocation should occur,
168  * contribute to the complexity of the function.  But it should all be
169  * worthwhile, because this allocator should: 1) be a full implementation of
170  * the MFT zone approach used by Windows NT, 2) cause reduction in
171  * fragmentation, and 3) be speedy in allocations (the code is not optimized
172  * for speed, but the algorithm is, so further speed improvements are probably
173  * possible).
174  *
175  * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
176  *	      on return.
177  *	    - This function takes the volume lcn bitmap lock for writing and
178  *	      modifies the bitmap contents.
179  *
180  * Return: Runlist describing the allocated cluster(s) on success, error pointer
181  *         on failure.
182  */
struct runlist_element *ntfs_cluster_alloc(struct ntfs_volume *vol, const s64 start_vcn,
		const s64 count, const s64 start_lcn,
		const int zone,
		const bool is_extension,
		const bool is_contig,
		const bool is_dealloc)
{
	s64 zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
	s64 prev_lcn = 0, prev_run_len = 0, mft_zone_size;
	s64 clusters, free_clusters;
	loff_t i_size;
	struct inode *lcnbmp_vi;
	struct runlist_element *rl = NULL;
	struct address_space *mapping;
	struct folio *folio = NULL;
	u8 *buf = NULL, *byte;
	int err = 0, rlpos, rlsize, buf_size, pg_off;
	u8 pass, done_zones, search_zone, need_writeback = 0, bit;
	unsigned int memalloc_flags;
	u8 has_guess, used_zone_pos;
	pgoff_t index;

	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn 0x%llx, zone %s_ZONE.",
			start_vcn, count, start_lcn,
			zone == MFT_ZONE ? "MFT" : "DATA");

	lcnbmp_vi = vol->lcnbmp_ino;
	if (start_vcn < 0 || start_lcn < LCN_HOLE ||
	    zone < FIRST_ZONE || zone > LAST_ZONE)
		return ERR_PTR(-EINVAL);

	/* A negative or zero @count is invalid. */
	if (count < 0 || !count)
		return ERR_PTR(-EINVAL);

	/* No fs reentry while we hold the bitmap lock below. */
	memalloc_flags = memalloc_nofs_save();

	/* Wait until the volume's free cluster count has been computed. */
	if (!NVolFreeClusterKnown(vol))
		wait_event(vol->free_waitq, NVolFreeClusterKnown(vol));
	free_clusters = atomic64_read(&vol->free_clusters);

	/* Take the lcnbmp lock for writing. */
	down_write(&vol->lcnbmp_lock);
	/*
	 * Clusters already reserved as dirty are presumably not available to
	 * ordinary allocations.  NOTE(review): confirm dirty_clusters
	 * semantics against its other users.
	 */
	if (is_dealloc == false)
		free_clusters -= atomic64_read(&vol->dirty_clusters);

	if (free_clusters < count) {
		err = -ENOSPC;
		goto out_restore;
	}

	/*
	 * If no specific @start_lcn was requested, use the current data zone
	 * position, otherwise use the requested @start_lcn but make sure it
	 * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
	 * and pass depending on whether we are starting inside a zone (1) or
	 * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
	 * we either start at the current position within the mft zone or at
	 * the specified position.  If the latter is out of bounds then we start
	 * at the beginning of the MFT_ZONE.
	 */
	done_zones = 0;
	pass = 1;
	/*
	 * zone_start and zone_end are the current search range.  search_zone
	 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
	 * volume) and 4 for data zone 2 (start of volume till start of mft
	 * zone).
	 */
	has_guess = 1;
	zone_start = start_lcn;

	if (zone_start < 0) {
		if (zone == DATA_ZONE)
			zone_start = vol->data1_zone_pos;
		else
			zone_start = vol->mft_zone_pos;
		if (!zone_start) {
			/*
			 * Zone starts at beginning of volume which means a
			 * single pass is sufficient.
			 */
			pass = 2;
		}
		has_guess = 0;
	}

	used_zone_pos = has_guess ? 0 : 1;

	if (!zone_start || zone_start == vol->mft_zone_start ||
			zone_start == vol->mft_zone_end)
		pass = 2;

	if (zone_start < vol->mft_zone_start) {
		zone_end = vol->mft_zone_start;
		search_zone = 4;
		/* Skip searching the mft zone. */
		done_zones |= 1;
	} else if (zone_start < vol->mft_zone_end) {
		zone_end = vol->mft_zone_end;
		search_zone = 1;
	} else {
		zone_end = vol->nr_clusters;
		search_zone = 2;
		/* Skip searching the mft zone. */
		done_zones |= 1;
	}

	/*
	 * bmp_pos is the current bit position inside the bitmap.  We use
	 * bmp_initial_pos to determine whether or not to do a zone switch.
	 */
	bmp_pos = bmp_initial_pos = zone_start;

	/* Loop until all clusters are allocated, i.e. clusters == 0. */
	clusters = count;
	rlpos = rlsize = 0;
	mapping = lcnbmp_vi->i_mapping;
	i_size = i_size_read(lcnbmp_vi);
	while (1) {
		ntfs_debug("Start of outer while loop: done_zones 0x%x, search_zone %i, pass %i, zone_start 0x%llx, zone_end 0x%llx, bmp_initial_pos 0x%llx, bmp_pos 0x%llx, rlpos %i, rlsize %i.",
				done_zones, search_zone, pass,
				zone_start, zone_end, bmp_initial_pos,
				bmp_pos, rlpos, rlsize);
		/* Loop until we run out of free clusters. */
		last_read_pos = bmp_pos >> 3;
		ntfs_debug("last_read_pos 0x%llx.", last_read_pos);
		if (last_read_pos >= i_size) {
			ntfs_debug("End of attribute reached. Skipping to zone_pass_done.");
			goto zone_pass_done;
		}
		/* Release the previously mapped bitmap page, if any. */
		if (likely(folio)) {
			if (need_writeback) {
				ntfs_debug("Marking page dirty.");
				folio_mark_dirty(folio);
				need_writeback = 0;
			}
			folio_unlock(folio);
			kunmap_local(buf);
			folio_put(folio);
			folio = NULL;
		}

		index = last_read_pos >> PAGE_SHIFT;
		pg_off = last_read_pos & ~PAGE_MASK;
		buf_size = PAGE_SIZE - pg_off;
		if (unlikely(last_read_pos + buf_size > i_size))
			buf_size = i_size - last_read_pos;
		/* Convert the buffer size from bytes to bits. */
		buf_size <<= 3;
		/* Split bmp_pos into a bit offset (lcn) and a byte-aligned base. */
		lcn = bmp_pos & 7;
		bmp_pos &= ~(s64)7;

		/* Skip bitmap pages known to contain no zero bits. */
		if (vol->lcn_empty_bits_per_page[index] == 0)
			goto next_bmp_pos;

		folio = read_mapping_folio(mapping, index, NULL);
		if (IS_ERR(folio)) {
			err = PTR_ERR(folio);
			ntfs_error(vol->sb, "Failed to map page.");
			goto out;
		}

		folio_lock(folio);
		buf = kmap_local_folio(folio, 0) + pg_off;
		ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i.",
				buf_size, lcn, bmp_pos, need_writeback);
		while (lcn < buf_size && lcn + bmp_pos < zone_end) {
			byte = buf + (lcn >> 3);
			ntfs_debug("In inner while loop: buf_size %i, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i, byte ofs 0x%x, *byte 0x%x.",
					buf_size, lcn, bmp_pos, need_writeback,
					(unsigned int)(lcn >> 3),
					(unsigned int)*byte);
			bit = 1 << (lcn & 7);
			ntfs_debug("bit 0x%x.", bit);

			if (has_guess) {
				if (*byte & bit) {
					/*
					 * Hint position is in use: for a
					 * contiguous allocation that already
					 * has a run, stop here.
					 */
					if (is_contig == true && prev_run_len > 0)
						goto done;

					has_guess = 0;
					break;
				}
			} else {
				/*
				 * No hint: jump to the largest empty bit
				 * range in this buffer.
				 */
				lcn = max_empty_bit_range(buf, buf_size >> 3);
				if (lcn < 0)
					break;
				has_guess = 1;
				continue;
			}
			/*
			 * Allocate more memory if needed, including space for
			 * the terminator element.
			 * kvzalloc() operates on whole pages only.
			 */
			if ((rlpos + 2) * sizeof(*rl) > rlsize) {
				struct runlist_element *rl2;

				ntfs_debug("Reallocating memory.");
				if (!rl)
					ntfs_debug("First free bit is at s64 0x%llx.",
							lcn + bmp_pos);
				rl2 = kvzalloc(rlsize + PAGE_SIZE, GFP_NOFS);
				if (unlikely(!rl2)) {
					err = -ENOMEM;
					ntfs_error(vol->sb, "Failed to allocate memory.");
					goto out;
				}
				memcpy(rl2, rl, rlsize);
				kvfree(rl);
				rl = rl2;
				rlsize += PAGE_SIZE;
				ntfs_debug("Reallocated memory, rlsize 0x%x.",
						rlsize);
			}
			/* Allocate the bitmap bit. */
			*byte |= bit;
			/* We need to write this bitmap page to disk. */
			need_writeback = 1;
			ntfs_debug("*byte 0x%x, need_writeback is set.",
					(unsigned int)*byte);
			/* Keep the in-memory free space accounting in sync. */
			ntfs_dec_free_clusters(vol, 1);
			ntfs_set_lcn_empty_bits(vol, index, 1, 1);

			/*
			 * Coalesce with previous run if adjacent LCNs.
			 * Otherwise, append a new run.
			 */
			ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), prev_lcn 0x%llx, lcn 0x%llx, bmp_pos 0x%llx, prev_run_len 0x%llx, rlpos %i.",
					lcn + bmp_pos, 1ULL, prev_lcn,
					lcn, bmp_pos, prev_run_len, rlpos);
			if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
				ntfs_debug("Coalescing to run (lcn 0x%llx, len 0x%llx).",
						rl[rlpos - 1].lcn,
						rl[rlpos - 1].length);
				rl[rlpos - 1].length = ++prev_run_len;
				ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), prev_run_len 0x%llx.",
						rl[rlpos - 1].lcn,
						rl[rlpos - 1].length,
						prev_run_len);
			} else {
				if (likely(rlpos)) {
					ntfs_debug("Adding new run, (previous run lcn 0x%llx, len 0x%llx).",
							rl[rlpos - 1].lcn, rl[rlpos - 1].length);
					rl[rlpos].vcn = rl[rlpos - 1].vcn +
							prev_run_len;
				} else {
					ntfs_debug("Adding new run, is first run.");
					rl[rlpos].vcn = start_vcn;
				}
				rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
				rl[rlpos].length = prev_run_len = 1;
				rlpos++;
			}
			/* Done? */
			if (!--clusters) {
				s64 tc;
done:
				if (!used_zone_pos)
					goto out;
				/*
				 * Update the current zone position.  Positions
				 * of already scanned zones have been updated
				 * during the respective zone switches.
				 */
				tc = lcn + bmp_pos + 1;
				ntfs_debug("Done. Updating current zone position, tc 0x%llx, search_zone %i.",
						tc, search_zone);
				switch (search_zone) {
				case 1:
					ntfs_debug("Before checks, vol->mft_zone_pos 0x%llx.",
							vol->mft_zone_pos);
					if (tc >= vol->mft_zone_end) {
						vol->mft_zone_pos =
								vol->mft_lcn;
						if (!vol->mft_zone_end)
							vol->mft_zone_pos = 0;
					} else if ((bmp_initial_pos >=
							vol->mft_zone_pos ||
							tc > vol->mft_zone_pos)
							&& tc >= vol->mft_lcn)
						vol->mft_zone_pos = tc;
					ntfs_debug("After checks, vol->mft_zone_pos 0x%llx.",
							vol->mft_zone_pos);
					break;
				case 2:
					ntfs_debug("Before checks, vol->data1_zone_pos 0x%llx.",
							vol->data1_zone_pos);
					if (tc >= vol->nr_clusters)
						vol->data1_zone_pos =
							     vol->mft_zone_end;
					else if ((bmp_initial_pos >=
						    vol->data1_zone_pos ||
						    tc > vol->data1_zone_pos)
						    && tc >= vol->mft_zone_end)
						vol->data1_zone_pos = tc;
					ntfs_debug("After checks, vol->data1_zone_pos 0x%llx.",
							vol->data1_zone_pos);
					break;
				case 4:
					ntfs_debug("Before checks, vol->data2_zone_pos 0x%llx.",
							vol->data2_zone_pos);
					if (tc >= vol->mft_zone_start)
						vol->data2_zone_pos = 0;
					else if (bmp_initial_pos >=
						      vol->data2_zone_pos ||
						      tc > vol->data2_zone_pos)
						vol->data2_zone_pos = tc;
					ntfs_debug("After checks, vol->data2_zone_pos 0x%llx.",
							vol->data2_zone_pos);
					break;
				default:
					WARN_ON(1);
				}
				ntfs_debug("Finished.  Going to out.");
				goto out;
			}
			lcn++;
		}

		/*
		 * The hinted start yielded nothing: fall back to the zone's
		 * stored allocator position once, then advance page by page.
		 */
		if (!used_zone_pos) {
			used_zone_pos = 1;
			if (search_zone == 1)
				zone_start = vol->mft_zone_pos;
			else if (search_zone == 2)
				zone_start = vol->data1_zone_pos;
			else
				zone_start = vol->data2_zone_pos;

			if (!zone_start || zone_start == vol->mft_zone_start ||
			    zone_start == vol->mft_zone_end)
				pass = 2;
			bmp_pos = zone_start;
		} else {
next_bmp_pos:
			bmp_pos += buf_size;
		}

		ntfs_debug("After inner while loop: buf_size 0x%x, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i.",
				buf_size, lcn, bmp_pos, need_writeback);
		if (bmp_pos < zone_end) {
			ntfs_debug("Continuing outer while loop, bmp_pos 0x%llx, zone_end 0x%llx.",
					bmp_pos, zone_end);
			continue;
		}
zone_pass_done:	/* Finished with the current zone pass. */
		ntfs_debug("At zone_pass_done, pass %i.", pass);
		if (pass == 1) {
			/*
			 * Now do pass 2, scanning the first part of the zone
			 * we omitted in pass 1.
			 */
			pass = 2;
			zone_end = zone_start;
			switch (search_zone) {
			case 1: /* mft_zone */
				zone_start = vol->mft_zone_start;
				break;
			case 2: /* data1_zone */
				zone_start = vol->mft_zone_end;
				break;
			case 4: /* data2_zone */
				zone_start = 0;
				break;
			default:
				WARN_ON(1);
			}
			/* Sanity check. */
			if (zone_end < zone_start)
				zone_end = zone_start;
			bmp_pos = zone_start;
			ntfs_debug("Continuing outer while loop, pass 2, zone_start 0x%llx, zone_end 0x%llx, bmp_pos 0x%llx.",
					zone_start, zone_end, bmp_pos);
			continue;
		} /* pass == 2 */
done_zones_check:
		ntfs_debug("At done_zones_check, search_zone %i, done_zones before 0x%x, done_zones after 0x%x.",
				search_zone, done_zones,
				done_zones | search_zone);
		done_zones |= search_zone;
		if (done_zones < 7) {
			ntfs_debug("Switching zone.");
			/* Now switch to the next zone we haven't done yet. */
			pass = 1;
			switch (search_zone) {
			case 1:
				ntfs_debug("Switching from mft zone to data1 zone.");
				/* Update mft zone position. */
				if (rlpos && used_zone_pos) {
					s64 tc;

					ntfs_debug("Before checks, vol->mft_zone_pos 0x%llx.",
							vol->mft_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->mft_zone_end) {
						vol->mft_zone_pos =
								vol->mft_lcn;
						if (!vol->mft_zone_end)
							vol->mft_zone_pos = 0;
					} else if ((bmp_initial_pos >=
							vol->mft_zone_pos ||
							tc > vol->mft_zone_pos)
							&& tc >= vol->mft_lcn)
						vol->mft_zone_pos = tc;
					ntfs_debug("After checks, vol->mft_zone_pos 0x%llx.",
							vol->mft_zone_pos);
				}
				/* Switch from mft zone to data1 zone. */
switch_to_data1_zone:		search_zone = 2;
				zone_start = bmp_initial_pos =
						vol->data1_zone_pos;
				zone_end = vol->nr_clusters;
				if (zone_start == vol->mft_zone_end)
					pass = 2;
				if (zone_start >= zone_end) {
					vol->data1_zone_pos = zone_start =
							vol->mft_zone_end;
					pass = 2;
				}
				break;
			case 2:
				ntfs_debug("Switching from data1 zone to data2 zone.");
				/* Update data1 zone position. */
				if (rlpos && used_zone_pos) {
					s64 tc;

					ntfs_debug("Before checks, vol->data1_zone_pos 0x%llx.",
							vol->data1_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->nr_clusters)
						vol->data1_zone_pos =
							     vol->mft_zone_end;
					else if ((bmp_initial_pos >=
						    vol->data1_zone_pos ||
						    tc > vol->data1_zone_pos)
						    && tc >= vol->mft_zone_end)
						vol->data1_zone_pos = tc;
					ntfs_debug("After checks, vol->data1_zone_pos 0x%llx.",
							vol->data1_zone_pos);
				}
				/* Switch from data1 zone to data2 zone. */
				search_zone = 4;
				zone_start = bmp_initial_pos =
						vol->data2_zone_pos;
				zone_end = vol->mft_zone_start;
				if (!zone_start)
					pass = 2;
				if (zone_start >= zone_end) {
					vol->data2_zone_pos = zone_start =
							bmp_initial_pos = 0;
					pass = 2;
				}
				break;
			case 4:
				ntfs_debug("Switching from data2 zone to data1 zone.");
				/* Update data2 zone position. */
				if (rlpos && used_zone_pos) {
					s64 tc;

					ntfs_debug("Before checks, vol->data2_zone_pos 0x%llx.",
							vol->data2_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->mft_zone_start)
						vol->data2_zone_pos = 0;
					else if (bmp_initial_pos >=
						      vol->data2_zone_pos ||
						      tc > vol->data2_zone_pos)
						vol->data2_zone_pos = tc;
					ntfs_debug("After checks, vol->data2_zone_pos 0x%llx.",
							vol->data2_zone_pos);
				}
				/* Switch from data2 zone to data1 zone. */
				goto switch_to_data1_zone;
			default:
				WARN_ON(1);
			}
			ntfs_debug("After zone switch, search_zone %i, pass %i, bmp_initial_pos 0x%llx, zone_start 0x%llx, zone_end 0x%llx.",
					search_zone, pass,
					bmp_initial_pos,
					zone_start,
					zone_end);
			bmp_pos = zone_start;
			if (zone_start == zone_end) {
				ntfs_debug("Empty zone, going to done_zones_check.");
				/* Empty zone. Don't bother searching it. */
				goto done_zones_check;
			}
			ntfs_debug("Continuing outer while loop.");
			continue;
		} /* done_zones == 7 */
		ntfs_debug("All zones are finished.");
		/*
		 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
		 * MFT_ZONE, we have really run out of space.
		 */
		mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
		ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end 0x%llx, mft_zone_size 0x%llx.",
				vol->mft_zone_start, vol->mft_zone_end,
				mft_zone_size);
		if (zone == MFT_ZONE || mft_zone_size <= 0) {
			ntfs_debug("No free clusters left, going to out.");
			/* Really no more space left on device. */
			err = -ENOSPC;
			goto out;
		} /* zone == DATA_ZONE && mft_zone_size > 0 */
		ntfs_debug("Shrinking mft zone.");
		zone_end = vol->mft_zone_end;
		mft_zone_size >>= 1;
		if (mft_zone_size > 0)
			vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
		else /* mft zone and data2 zone no longer exist. */
			vol->data2_zone_pos = vol->mft_zone_start =
					vol->mft_zone_end = 0;
		if (vol->mft_zone_pos >= vol->mft_zone_end) {
			vol->mft_zone_pos = vol->mft_lcn;
			if (!vol->mft_zone_end)
				vol->mft_zone_pos = 0;
		}
		bmp_pos = zone_start = bmp_initial_pos =
				vol->data1_zone_pos = vol->mft_zone_end;
		search_zone = 2;
		pass = 2;
		done_zones &= ~2;
		ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, vol->mft_zone_start 0x%llx, vol->mft_zone_end 0x%llx, vol->mft_zone_pos 0x%llx, search_zone 2, pass 2, dones_zones 0x%x, zone_start 0x%llx, zone_end 0x%llx, vol->data1_zone_pos 0x%llx, continuing outer while loop.",
				mft_zone_size, vol->mft_zone_start,
				vol->mft_zone_end, vol->mft_zone_pos,
				done_zones, zone_start, zone_end,
				vol->data1_zone_pos);
	}
	ntfs_debug("After outer while loop.");
out:
	ntfs_debug("At out.");
	/* Add runlist terminator element. */
	if (likely(rl)) {
		/*
		 * rl is only allocated right before the first run is
		 * appended, so rl != NULL implies rlpos >= 1 here.
		 */
		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
		rl[rlpos].length = 0;
	}
	if (!IS_ERR_OR_NULL(folio)) {
		if (need_writeback) {
			ntfs_debug("Marking page dirty.");
			folio_mark_dirty(folio);
			need_writeback = 0;
		}
		folio_unlock(folio);
		kunmap_local(buf);
		folio_put(folio);
	}
	if (likely(!err)) {
		if (!rl) {
			err = -EIO;
			goto out_restore;
		}
		/*
		 * NOTE(review): only the first run's length is released here;
		 * confirm this is intended when the allocation spans several
		 * runs.
		 */
		if (is_dealloc == true)
			ntfs_release_dirty_clusters(vol, rl->length);
		ntfs_debug("Done.");
		goto out_restore;
	}
	if (err != -ENOSPC)
		ntfs_error(vol->sb,
			"Failed to allocate clusters, aborting (error %i).",
			err);
	if (rl) {
		int err2;

		if (err == -ENOSPC)
			ntfs_debug("Not enough space to complete allocation, err -ENOSPC, first free lcn 0x%llx, could allocate up to 0x%llx clusters.",
					rl[0].lcn, count - clusters);
		/* Deallocate all allocated clusters. */
		ntfs_debug("Attempting rollback...");
		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
		if (err2) {
			ntfs_error(vol->sb,
				"Failed to rollback (error %i). Leaving inconsistent metadata! Unmount and run chkdsk.",
				err2);
			NVolSetErrors(vol);
		}
		/* Free the runlist. */
		kvfree(rl);
	} else if (err == -ENOSPC)
		ntfs_debug("No space left at all, err = -ENOSPC, first free lcn = 0x%llx.",
				vol->data1_zone_pos);
	/* On failure, drop all outstanding dirty-cluster reservations. */
	atomic64_set(&vol->dirty_clusters, 0);

out_restore:
	up_write(&vol->lcnbmp_lock);
	memalloc_nofs_restore(memalloc_flags);

	return err < 0 ? ERR_PTR(err) : rl;
}
776 
777 /*
778  * __ntfs_cluster_free - free clusters on an ntfs volume
779  * @ni:		ntfs inode whose runlist describes the clusters to free
780  * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
781  * @count:	number of clusters to free or -1 for all clusters
782  * @ctx:	active attribute search context if present or NULL if not
783  * @is_rollback:	true if this is a rollback operation
784  *
785  * Free @count clusters starting at the cluster @start_vcn in the runlist
786  * described by the vfs inode @ni.
787  *
788  * If @count is -1, all clusters from @start_vcn to the end of the runlist are
789  * deallocated.  Thus, to completely free all clusters in a runlist, use
790  * @start_vcn = 0 and @count = -1.
791  *
792  * If @ctx is specified, it is an active search context of @ni and its base mft
793  * record.  This is needed when __ntfs_cluster_free() encounters unmapped
794  * runlist fragments and allows their mapping.  If you do not have the mft
795  * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
796  * perform the necessary mapping and unmapping.
797  *
798  * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
799  * before returning.  Thus, @ctx will be left pointing to the same attribute on
800  * return as on entry.  However, the actual pointers in @ctx may point to
801  * different memory locations on return, so you must remember to reset any
802  * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
803  * you will probably want to do:
804  *	m = ctx->mrec;
805  *	a = ctx->attr;
806  * Assuming you cache ctx->attr in a variable @a of type attr_record * and that
807  * you cache ctx->mrec in a variable @m of type struct mft_record *.
808  *
809  * @is_rollback should always be 'false', it is for internal use to rollback
810  * errors.  You probably want to use ntfs_cluster_free() instead.
811  *
812  * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
813  * remove from the runlist or mark sparse the freed runs later.
814  *
815  * Return the number of deallocated clusters (not counting sparse ones) on
816  * success and -errno on error.
817  *
818  * WARNING: If @ctx is supplied, regardless of whether success or failure is
819  *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
820  *	    is no longer valid, i.e. you need to either call
821  *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
822  *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
823  *	    why the mapping of the old inode failed.
824  *
825  * Locking: - The runlist described by @ni must be locked for writing on entry
826  *	      and is locked on return.  Note the runlist may be modified when
827  *	      needed runlist fragments need to be mapped.
828  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
829  *	      on return.
830  *	    - This function takes the volume lcn bitmap lock for writing and
831  *	      modifies the bitmap contents.
832  *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
833  *	      entry and it will be left unmapped on return.
834  *	    - If @ctx is not NULL, the base mft record must be mapped on entry
835  *	      and it will be left mapped on return.
836  */
837 s64 __ntfs_cluster_free(struct ntfs_inode *ni, const s64 start_vcn, s64 count,
838 		struct ntfs_attr_search_ctx *ctx, const bool is_rollback)
839 {
840 	s64 delta, to_free, total_freed, real_freed;
841 	struct ntfs_volume *vol;
842 	struct inode *lcnbmp_vi;
843 	struct runlist_element *rl;
844 	int err;
845 	unsigned int memalloc_flags;
846 
847 	ntfs_debug("Entering for i_ino 0x%llx, start_vcn 0x%llx, count 0x%llx.%s",
848 			ni->mft_no, start_vcn, count,
849 			is_rollback ? " (rollback)" : "");
850 	vol = ni->vol;
851 	lcnbmp_vi = vol->lcnbmp_ino;
852 	if (start_vcn < 0 || count < -1)
853 		return -EINVAL;
854 
855 	if (!NVolFreeClusterKnown(vol))
856 		wait_event(vol->free_waitq, NVolFreeClusterKnown(vol));
857 
858 	/*
859 	 * Lock the lcn bitmap for writing but only if not rolling back.  We
860 	 * must hold the lock all the way including through rollback otherwise
861 	 * rollback is not possible because once we have cleared a bit and
862 	 * dropped the lock, anyone could have set the bit again, thus
863 	 * allocating the cluster for another use.
864 	 */
865 	if (likely(!is_rollback)) {
866 		memalloc_flags = memalloc_nofs_save();
867 		down_write(&vol->lcnbmp_lock);
868 	}
869 
870 	total_freed = real_freed = 0;
871 
872 	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
873 	if (IS_ERR(rl)) {
874 		err = PTR_ERR(rl);
875 		if (err == -ENOENT) {
876 			if (likely(!is_rollback)) {
877 				up_write(&vol->lcnbmp_lock);
878 				memalloc_nofs_restore(memalloc_flags);
879 			}
880 			return 0;
881 		}
882 
883 		if (!is_rollback)
884 			ntfs_error(vol->sb,
885 				"Failed to find first runlist element (error %d), aborting.",
886 				err);
887 		goto err_out;
888 	}
889 	if (unlikely(rl->lcn < LCN_HOLE)) {
890 		if (!is_rollback)
891 			ntfs_error(vol->sb, "First runlist element has invalid lcn, aborting.");
892 		err = -EIO;
893 		goto err_out;
894 	}
895 	/* Find the starting cluster inside the run that needs freeing. */
896 	delta = start_vcn - rl->vcn;
897 
898 	/* The number of clusters in this run that need freeing. */
899 	to_free = rl->length - delta;
900 	if (count >= 0 && to_free > count)
901 		to_free = count;
902 
903 	if (likely(rl->lcn >= 0)) {
904 		/* Do the actual freeing of the clusters in this run. */
905 		err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
906 				to_free, likely(!is_rollback) ? 0 : 1);
907 		if (unlikely(err)) {
908 			if (!is_rollback)
909 				ntfs_error(vol->sb,
910 					"Failed to clear first run (error %i), aborting.",
911 					err);
912 			goto err_out;
913 		}
914 		/* We have freed @to_free real clusters. */
915 		real_freed = to_free;
916 	}
917 	/* Go to the next run and adjust the number of clusters left to free. */
918 	++rl;
919 	if (count >= 0)
920 		count -= to_free;
921 
922 	/* Keep track of the total "freed" clusters, including sparse ones. */
923 	total_freed = to_free;
924 	/*
925 	 * Loop over the remaining runs, using @count as a capping value, and
926 	 * free them.
927 	 */
928 	for (; rl->length && count != 0; ++rl) {
929 		if (unlikely(rl->lcn < LCN_HOLE)) {
930 			s64 vcn;
931 
932 			/* Attempt to map runlist. */
933 			vcn = rl->vcn;
934 			rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
935 			if (IS_ERR(rl)) {
936 				err = PTR_ERR(rl);
937 				if (!is_rollback)
938 					ntfs_error(vol->sb,
939 						"Failed to map runlist fragment or failed to find subsequent runlist element.");
940 				goto err_out;
941 			}
942 			if (unlikely(rl->lcn < LCN_HOLE)) {
943 				if (!is_rollback)
944 					ntfs_error(vol->sb,
945 						"Runlist element has invalid lcn (0x%llx).",
946 						rl->lcn);
947 				err = -EIO;
948 				goto err_out;
949 			}
950 		}
951 		/* The number of clusters in this run that need freeing. */
952 		to_free = rl->length;
953 		if (count >= 0 && to_free > count)
954 			to_free = count;
955 
956 		if (likely(rl->lcn >= 0)) {
957 			/* Do the actual freeing of the clusters in the run. */
958 			err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
959 					to_free, likely(!is_rollback) ? 0 : 1);
960 			if (unlikely(err)) {
961 				if (!is_rollback)
962 					ntfs_error(vol->sb, "Failed to clear subsequent run.");
963 				goto err_out;
964 			}
965 			/* We have freed @to_free real clusters. */
966 			real_freed += to_free;
967 		}
968 		/* Adjust the number of clusters left to free. */
969 		if (count >= 0)
970 			count -= to_free;
971 
972 		/* Update the total done clusters. */
973 		total_freed += to_free;
974 	}
975 	ntfs_inc_free_clusters(vol, real_freed);
976 	if (likely(!is_rollback)) {
977 		up_write(&vol->lcnbmp_lock);
978 		memalloc_nofs_restore(memalloc_flags);
979 	}
980 
981 	WARN_ON(count > 0);
982 
983 	if (NVolDiscard(vol) && !is_rollback) {
984 		s64 total_discarded = 0, rl_off;
985 		u32 gran = bdev_discard_granularity(vol->sb->s_bdev);
986 
987 		rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
988 		if (IS_ERR(rl))
989 			return real_freed;
990 		rl_off = start_vcn - rl->vcn;
991 		while (rl->length && total_discarded < total_freed) {
992 			s64 to_discard = rl->length - rl_off;
993 
994 			if (to_discard + total_discarded > total_freed)
995 				to_discard = total_freed - total_discarded;
996 			if (rl->lcn >= 0) {
997 				sector_t start_sector, end_sector;
998 				int ret;
999 
1000 				start_sector = ALIGN(NTFS_CLU_TO_B(vol, rl->lcn + rl_off),
1001 						     gran) >> SECTOR_SHIFT;
1002 				end_sector = ALIGN_DOWN(NTFS_CLU_TO_B(vol,
1003 							rl->lcn + rl_off + to_discard),
1004 							gran) >> SECTOR_SHIFT;
1005 				if (start_sector < end_sector) {
1006 					ret = blkdev_issue_discard(vol->sb->s_bdev, start_sector,
1007 								   end_sector - start_sector,
1008 								   GFP_NOFS);
1009 					if (ret)
1010 						break;
1011 				}
1012 			}
1013 
1014 			total_discarded += to_discard;
1015 			++rl;
1016 			rl_off = 0;
1017 		}
1018 	}
1019 
1020 	/* We are done.  Return the number of actually freed clusters. */
1021 	ntfs_debug("Done.");
1022 	return real_freed;
1023 err_out:
1024 	if (is_rollback)
1025 		return err;
1026 	/* If no real clusters were freed, no need to rollback. */
1027 	if (!real_freed) {
1028 		up_write(&vol->lcnbmp_lock);
1029 		memalloc_nofs_restore(memalloc_flags);
1030 		return err;
1031 	}
1032 	/*
1033 	 * Attempt to rollback and if that succeeds just return the error code.
1034 	 * If rollback fails, set the volume errors flag, emit an error
1035 	 * message, and return the error code.
1036 	 */
1037 	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
1038 	if (delta < 0) {
1039 		ntfs_error(vol->sb,
1040 			"Failed to rollback (error %i).  Leaving inconsistent metadata!  Unmount and run chkdsk.",
1041 			(int)delta);
1042 		NVolSetErrors(vol);
1043 	}
1044 	ntfs_dec_free_clusters(vol, delta);
1045 	up_write(&vol->lcnbmp_lock);
1046 	memalloc_nofs_restore(memalloc_flags);
1047 	ntfs_error(vol->sb, "Aborting (error %i).", err);
1048 	return err;
1049 }
1050