xref: /linux/fs/xfs/xfs_extent_busy.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * Copyright (c) 2010 David Chinner.
5  * Copyright (c) 2011 Christoph Hellwig.
6  * All Rights Reserved.
7  */
8 #include "xfs.h"
9 #include "xfs_fs.h"
10 #include "xfs_format.h"
11 #include "xfs_log_format.h"
12 #include "xfs_shared.h"
13 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_alloc.h"
16 #include "xfs_extent_busy.h"
17 #include "xfs_trace.h"
18 #include "xfs_trans.h"
19 #include "xfs_log.h"
20 #include "xfs_ag.h"
21 
22 static void
23 xfs_extent_busy_insert_list(
24 	struct xfs_perag	*pag,
25 	xfs_agblock_t		bno,
26 	xfs_extlen_t		len,
27 	unsigned int		flags,
28 	struct list_head	*busy_list)
29 {
30 	struct xfs_extent_busy	*new;
31 	struct xfs_extent_busy	*busyp;
32 	struct rb_node		**rbp;
33 	struct rb_node		*parent = NULL;
34 
35 	new = kzalloc(sizeof(struct xfs_extent_busy),
36 			GFP_KERNEL | __GFP_NOFAIL);
37 	new->agno = pag->pag_agno;
38 	new->bno = bno;
39 	new->length = len;
40 	INIT_LIST_HEAD(&new->list);
41 	new->flags = flags;
42 
43 	/* trace before insert to be able to see failed inserts */
44 	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
45 
46 	spin_lock(&pag->pagb_lock);
47 	rbp = &pag->pagb_tree.rb_node;
48 	while (*rbp) {
49 		parent = *rbp;
50 		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
51 
52 		if (new->bno < busyp->bno) {
53 			rbp = &(*rbp)->rb_left;
54 			ASSERT(new->bno + new->length <= busyp->bno);
55 		} else if (new->bno > busyp->bno) {
56 			rbp = &(*rbp)->rb_right;
57 			ASSERT(bno >= busyp->bno + busyp->length);
58 		} else {
59 			ASSERT(0);
60 		}
61 	}
62 
63 	rb_link_node(&new->rb_node, parent, rbp);
64 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
65 
66 	/* always process discard lists in fifo order */
67 	list_add_tail(&new->list, busy_list);
68 	spin_unlock(&pag->pagb_lock);
69 }
70 
71 void
72 xfs_extent_busy_insert(
73 	struct xfs_trans	*tp,
74 	struct xfs_perag	*pag,
75 	xfs_agblock_t		bno,
76 	xfs_extlen_t		len,
77 	unsigned int		flags)
78 {
79 	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
80 }
81 
82 void
83 xfs_extent_busy_insert_discard(
84 	struct xfs_perag	*pag,
85 	xfs_agblock_t		bno,
86 	xfs_extlen_t		len,
87 	struct list_head	*busy_list)
88 {
89 	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
90 			busy_list);
91 }
92 
93 /*
94  * Search for a busy extent within the range of the extent we are about to
95  * allocate.  You need to be holding the busy extent tree lock when calling
96  * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
97  * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
98  * match. This is done so that a non-zero return indicates an overlap that
99  * will require a synchronous transaction, but it can still be
100  * used to distinguish between a partial or exact match.
101  */
102 int
103 xfs_extent_busy_search(
104 	struct xfs_mount	*mp,
105 	struct xfs_perag	*pag,
106 	xfs_agblock_t		bno,
107 	xfs_extlen_t		len)
108 {
109 	struct rb_node		*rbp;
110 	struct xfs_extent_busy	*busyp;
111 	int			match = 0;
112 
113 	/* find closest start bno overlap */
114 	spin_lock(&pag->pagb_lock);
115 	rbp = pag->pagb_tree.rb_node;
116 	while (rbp) {
117 		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
118 		if (bno < busyp->bno) {
119 			/* may overlap, but exact start block is lower */
120 			if (bno + len > busyp->bno)
121 				match = -1;
122 			rbp = rbp->rb_left;
123 		} else if (bno > busyp->bno) {
124 			/* may overlap, but exact start block is higher */
125 			if (bno < busyp->bno + busyp->length)
126 				match = -1;
127 			rbp = rbp->rb_right;
128 		} else {
129 			/* bno matches busyp, length determines exact match */
130 			match = (busyp->length == len) ? 1 : -1;
131 			break;
132 		}
133 	}
134 	spin_unlock(&pag->pagb_lock);
135 	return match;
136 }
137 
138 /*
139  * The found free extent [fbno, fend] overlaps part or all of the given busy
140  * extent.  If the overlap covers the beginning, the end, or all of the busy
141  * extent, the overlapping portion can be made unbusy and used for the
142  * allocation.  We can't split a busy extent because we can't modify a
143  * transaction/CIL context busy list, but we can update an entry's block
144  * number or length.
145  *
146  * Returns true if the extent can safely be reused, or false if the search
147  * needs to be restarted.
148  */
149 STATIC bool
150 xfs_extent_busy_update_extent(
151 	struct xfs_mount	*mp,
152 	struct xfs_perag	*pag,
153 	struct xfs_extent_busy	*busyp,
154 	xfs_agblock_t		fbno,
155 	xfs_extlen_t		flen,
156 	bool			userdata) __releases(&pag->pagb_lock)
157 					  __acquires(&pag->pagb_lock)
158 {
159 	xfs_agblock_t		fend = fbno + flen;
160 	xfs_agblock_t		bbno = busyp->bno;
161 	xfs_agblock_t		bend = bbno + busyp->length;
162 
163 	/*
164 	 * This extent is currently being discarded.  Give the thread
165 	 * performing the discard a chance to mark the extent unbusy
166 	 * and retry.
167 	 */
168 	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
169 		spin_unlock(&pag->pagb_lock);
170 		delay(1);
171 		spin_lock(&pag->pagb_lock);
172 		return false;
173 	}
174 
175 	/*
176 	 * If there is a busy extent overlapping a user allocation, we have
177 	 * no choice but to force the log and retry the search.
178 	 *
179 	 * Fortunately this does not happen during normal operation, but
180 	 * only if the filesystem is very low on space and has to dip into
181 	 * the AGFL for normal allocations.
182 	 */
183 	if (userdata)
184 		goto out_force_log;
185 
186 	if (bbno < fbno && bend > fend) {
187 		/*
188 		 * Case 1:
189 		 *    bbno           bend
190 		 *    +BBBBBBBBBBBBBBBBB+
191 		 *        +---------+
192 		 *        fbno   fend
193 		 */
194 
195 		/*
196 		 * We would have to split the busy extent to be able to track
197 		 * it correct, which we cannot do because we would have to
198 		 * modify the list of busy extents attached to the transaction
199 		 * or CIL context, which is immutable.
200 		 *
201 		 * Force out the log to clear the busy extent and retry the
202 		 * search.
203 		 */
204 		goto out_force_log;
205 	} else if (bbno >= fbno && bend <= fend) {
206 		/*
207 		 * Case 2:
208 		 *    bbno           bend
209 		 *    +BBBBBBBBBBBBBBBBB+
210 		 *    +-----------------+
211 		 *    fbno           fend
212 		 *
213 		 * Case 3:
214 		 *    bbno           bend
215 		 *    +BBBBBBBBBBBBBBBBB+
216 		 *    +--------------------------+
217 		 *    fbno                    fend
218 		 *
219 		 * Case 4:
220 		 *             bbno           bend
221 		 *             +BBBBBBBBBBBBBBBBB+
222 		 *    +--------------------------+
223 		 *    fbno                    fend
224 		 *
225 		 * Case 5:
226 		 *             bbno           bend
227 		 *             +BBBBBBBBBBBBBBBBB+
228 		 *    +-----------------------------------+
229 		 *    fbno                             fend
230 		 *
231 		 */
232 
233 		/*
234 		 * The busy extent is fully covered by the extent we are
235 		 * allocating, and can simply be removed from the rbtree.
236 		 * However we cannot remove it from the immutable list
237 		 * tracking busy extents in the transaction or CIL context,
238 		 * so set the length to zero to mark it invalid.
239 		 *
240 		 * We also need to restart the busy extent search from the
241 		 * tree root, because erasing the node can rearrange the
242 		 * tree topology.
243 		 */
244 		rb_erase(&busyp->rb_node, &pag->pagb_tree);
245 		busyp->length = 0;
246 		return false;
247 	} else if (fend < bend) {
248 		/*
249 		 * Case 6:
250 		 *              bbno           bend
251 		 *             +BBBBBBBBBBBBBBBBB+
252 		 *             +---------+
253 		 *             fbno   fend
254 		 *
255 		 * Case 7:
256 		 *             bbno           bend
257 		 *             +BBBBBBBBBBBBBBBBB+
258 		 *    +------------------+
259 		 *    fbno            fend
260 		 *
261 		 */
262 		busyp->bno = fend;
263 		busyp->length = bend - fend;
264 	} else if (bbno < fbno) {
265 		/*
266 		 * Case 8:
267 		 *    bbno           bend
268 		 *    +BBBBBBBBBBBBBBBBB+
269 		 *        +-------------+
270 		 *        fbno       fend
271 		 *
272 		 * Case 9:
273 		 *    bbno           bend
274 		 *    +BBBBBBBBBBBBBBBBB+
275 		 *        +----------------------+
276 		 *        fbno                fend
277 		 */
278 		busyp->length = fbno - busyp->bno;
279 	} else {
280 		ASSERT(0);
281 	}
282 
283 	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
284 	return true;
285 
286 out_force_log:
287 	spin_unlock(&pag->pagb_lock);
288 	xfs_log_force(mp, XFS_LOG_SYNC);
289 	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
290 	spin_lock(&pag->pagb_lock);
291 	return false;
292 }
293 
294 
295 /*
296  * For a given extent [fbno, flen], make sure we can reuse it safely.
297  */
298 void
299 xfs_extent_busy_reuse(
300 	struct xfs_mount	*mp,
301 	struct xfs_perag	*pag,
302 	xfs_agblock_t		fbno,
303 	xfs_extlen_t		flen,
304 	bool			userdata)
305 {
306 	struct rb_node		*rbp;
307 
308 	ASSERT(flen > 0);
309 	spin_lock(&pag->pagb_lock);
310 restart:
311 	rbp = pag->pagb_tree.rb_node;
312 	while (rbp) {
313 		struct xfs_extent_busy *busyp =
314 			rb_entry(rbp, struct xfs_extent_busy, rb_node);
315 		xfs_agblock_t	bbno = busyp->bno;
316 		xfs_agblock_t	bend = bbno + busyp->length;
317 
318 		if (fbno + flen <= bbno) {
319 			rbp = rbp->rb_left;
320 			continue;
321 		} else if (fbno >= bend) {
322 			rbp = rbp->rb_right;
323 			continue;
324 		}
325 
326 		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
327 						  userdata))
328 			goto restart;
329 	}
330 	spin_unlock(&pag->pagb_lock);
331 }
332 
333 /*
334  * For a given extent [fbno, flen], search the busy extent list to find a
335  * subset of the extent that is not busy.  If *rlen is smaller than
336  * args->minlen no suitable extent could be found, and the higher level
337  * code needs to force out the log and retry the allocation.
338  *
339  * Return the current busy generation for the AG if the extent is busy. This
340  * value can be used to wait for at least one of the currently busy extents
341  * to be cleared. Note that the busy list is not guaranteed to be empty after
342  * the gen is woken. The state of a specific extent must always be confirmed
343  * with another call to xfs_extent_busy_trim() before it can be used.
344  */
345 bool
346 xfs_extent_busy_trim(
347 	struct xfs_alloc_arg	*args,
348 	xfs_agblock_t		*bno,
349 	xfs_extlen_t		*len,
350 	unsigned		*busy_gen)
351 {
352 	xfs_agblock_t		fbno;
353 	xfs_extlen_t		flen;
354 	struct rb_node		*rbp;
355 	bool			ret = false;
356 
357 	ASSERT(*len > 0);
358 
359 	spin_lock(&args->pag->pagb_lock);
360 	fbno = *bno;
361 	flen = *len;
362 	rbp = args->pag->pagb_tree.rb_node;
363 	while (rbp && flen >= args->minlen) {
364 		struct xfs_extent_busy *busyp =
365 			rb_entry(rbp, struct xfs_extent_busy, rb_node);
366 		xfs_agblock_t	fend = fbno + flen;
367 		xfs_agblock_t	bbno = busyp->bno;
368 		xfs_agblock_t	bend = bbno + busyp->length;
369 
370 		if (fend <= bbno) {
371 			rbp = rbp->rb_left;
372 			continue;
373 		} else if (fbno >= bend) {
374 			rbp = rbp->rb_right;
375 			continue;
376 		}
377 
378 		if (bbno <= fbno) {
379 			/* start overlap */
380 
381 			/*
382 			 * Case 1:
383 			 *    bbno           bend
384 			 *    +BBBBBBBBBBBBBBBBB+
385 			 *        +---------+
386 			 *        fbno   fend
387 			 *
388 			 * Case 2:
389 			 *    bbno           bend
390 			 *    +BBBBBBBBBBBBBBBBB+
391 			 *    +-------------+
392 			 *    fbno       fend
393 			 *
394 			 * Case 3:
395 			 *    bbno           bend
396 			 *    +BBBBBBBBBBBBBBBBB+
397 			 *        +-------------+
398 			 *        fbno       fend
399 			 *
400 			 * Case 4:
401 			 *    bbno           bend
402 			 *    +BBBBBBBBBBBBBBBBB+
403 			 *    +-----------------+
404 			 *    fbno           fend
405 			 *
406 			 * No unbusy region in extent, return failure.
407 			 */
408 			if (fend <= bend)
409 				goto fail;
410 
411 			/*
412 			 * Case 5:
413 			 *    bbno           bend
414 			 *    +BBBBBBBBBBBBBBBBB+
415 			 *        +----------------------+
416 			 *        fbno                fend
417 			 *
418 			 * Case 6:
419 			 *    bbno           bend
420 			 *    +BBBBBBBBBBBBBBBBB+
421 			 *    +--------------------------+
422 			 *    fbno                    fend
423 			 *
424 			 * Needs to be trimmed to:
425 			 *                       +-------+
426 			 *                       fbno fend
427 			 */
428 			fbno = bend;
429 		} else if (bend >= fend) {
430 			/* end overlap */
431 
432 			/*
433 			 * Case 7:
434 			 *             bbno           bend
435 			 *             +BBBBBBBBBBBBBBBBB+
436 			 *    +------------------+
437 			 *    fbno            fend
438 			 *
439 			 * Case 8:
440 			 *             bbno           bend
441 			 *             +BBBBBBBBBBBBBBBBB+
442 			 *    +--------------------------+
443 			 *    fbno                    fend
444 			 *
445 			 * Needs to be trimmed to:
446 			 *    +-------+
447 			 *    fbno fend
448 			 */
449 			fend = bbno;
450 		} else {
451 			/* middle overlap */
452 
453 			/*
454 			 * Case 9:
455 			 *             bbno           bend
456 			 *             +BBBBBBBBBBBBBBBBB+
457 			 *    +-----------------------------------+
458 			 *    fbno                             fend
459 			 *
460 			 * Can be trimmed to:
461 			 *    +-------+        OR         +-------+
462 			 *    fbno fend                   fbno fend
463 			 *
464 			 * Backward allocation leads to significant
465 			 * fragmentation of directories, which degrades
466 			 * directory performance, therefore we always want to
467 			 * choose the option that produces forward allocation
468 			 * patterns.
469 			 * Preferring the lower bno extent will make the next
470 			 * request use "fend" as the start of the next
471 			 * allocation;  if the segment is no longer busy at
472 			 * that point, we'll get a contiguous allocation, but
473 			 * even if it is still busy, we will get a forward
474 			 * allocation.
475 			 * We try to avoid choosing the segment at "bend",
476 			 * because that can lead to the next allocation
477 			 * taking the segment at "fbno", which would be a
478 			 * backward allocation.  We only use the segment at
479 			 * "fbno" if it is much larger than the current
480 			 * requested size, because in that case there's a
481 			 * good chance subsequent allocations will be
482 			 * contiguous.
483 			 */
484 			if (bbno - fbno >= args->maxlen) {
485 				/* left candidate fits perfect */
486 				fend = bbno;
487 			} else if (fend - bend >= args->maxlen * 4) {
488 				/* right candidate has enough free space */
489 				fbno = bend;
490 			} else if (bbno - fbno >= args->minlen) {
491 				/* left candidate fits minimum requirement */
492 				fend = bbno;
493 			} else {
494 				goto fail;
495 			}
496 		}
497 
498 		flen = fend - fbno;
499 	}
500 out:
501 
502 	if (fbno != *bno || flen != *len) {
503 		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
504 					  fbno, flen);
505 		*bno = fbno;
506 		*len = flen;
507 		*busy_gen = args->pag->pagb_gen;
508 		ret = true;
509 	}
510 	spin_unlock(&args->pag->pagb_lock);
511 	return ret;
512 fail:
513 	/*
514 	 * Return a zero extent length as failure indications.  All callers
515 	 * re-check if the trimmed extent satisfies the minlen requirement.
516 	 */
517 	flen = 0;
518 	goto out;
519 }
520 
521 static bool
522 xfs_extent_busy_clear_one(
523 	struct xfs_perag	*pag,
524 	struct xfs_extent_busy	*busyp,
525 	bool			do_discard)
526 {
527 	if (busyp->length) {
528 		if (do_discard &&
529 		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
530 			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
531 			return false;
532 		}
533 		trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno,
534 				busyp->bno, busyp->length);
535 		rb_erase(&busyp->rb_node, &pag->pagb_tree);
536 	}
537 
538 	list_del_init(&busyp->list);
539 	kfree(busyp);
540 	return true;
541 }
542 
543 /*
544  * Remove all extents on the passed in list from the busy extents tree.
545  * If do_discard is set skip extents that need to be discarded, and mark
546  * these as undergoing a discard operation instead.
547  */
548 void
549 xfs_extent_busy_clear(
550 	struct xfs_mount	*mp,
551 	struct list_head	*list,
552 	bool			do_discard)
553 {
554 	struct xfs_extent_busy	*busyp, *next;
555 
556 	busyp = list_first_entry_or_null(list, typeof(*busyp), list);
557 	if (!busyp)
558 		return;
559 
560 	do {
561 		bool			wakeup = false;
562 		struct xfs_perag	*pag;
563 
564 		pag = xfs_perag_get(mp, busyp->agno);
565 		spin_lock(&pag->pagb_lock);
566 		do {
567 			next = list_next_entry(busyp, list);
568 			if (xfs_extent_busy_clear_one(pag, busyp, do_discard))
569 				wakeup = true;
570 			busyp = next;
571 		} while (!list_entry_is_head(busyp, list, list) &&
572 			 busyp->agno == pag->pag_agno);
573 
574 		if (wakeup) {
575 			pag->pagb_gen++;
576 			wake_up_all(&pag->pagb_wait);
577 		}
578 		spin_unlock(&pag->pagb_lock);
579 		xfs_perag_put(pag);
580 	} while (!list_entry_is_head(busyp, list, list));
581 }
582 
583 /*
584  * Flush out all busy extents for this AG.
585  *
586  * If the current transaction is holding busy extents, the caller may not want
587  * to wait for committed busy extents to resolve. If we are being told just to
588  * try a flush or progress has been made since we last skipped a busy extent,
589  * return immediately to allow the caller to try again.
590  *
591  * If we are freeing extents, we might actually be holding the only free extents
592  * in the transaction busy list and the log force won't resolve that situation.
593  * In this case, we must return -EAGAIN to avoid a deadlock by informing the
594  * caller it needs to commit the busy extents it holds before retrying the
595  * extent free operation.
596  */
597 int
598 xfs_extent_busy_flush(
599 	struct xfs_trans	*tp,
600 	struct xfs_perag	*pag,
601 	unsigned		busy_gen,
602 	uint32_t		alloc_flags)
603 {
604 	DEFINE_WAIT		(wait);
605 	int			error;
606 
607 	error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
608 	if (error)
609 		return error;
610 
611 	/* Avoid deadlocks on uncommitted busy extents. */
612 	if (!list_empty(&tp->t_busy)) {
613 		if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
614 			return 0;
615 
616 		if (busy_gen != READ_ONCE(pag->pagb_gen))
617 			return 0;
618 
619 		if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
620 			return -EAGAIN;
621 	}
622 
623 	/* Wait for committed busy extents to resolve. */
624 	do {
625 		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
626 		if  (busy_gen != READ_ONCE(pag->pagb_gen))
627 			break;
628 		schedule();
629 	} while (1);
630 
631 	finish_wait(&pag->pagb_wait, &wait);
632 	return 0;
633 }
634 
635 void
636 xfs_extent_busy_wait_all(
637 	struct xfs_mount	*mp)
638 {
639 	struct xfs_perag	*pag;
640 	DEFINE_WAIT		(wait);
641 	xfs_agnumber_t		agno;
642 
643 	for_each_perag(mp, agno, pag) {
644 		do {
645 			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
646 			if  (RB_EMPTY_ROOT(&pag->pagb_tree))
647 				break;
648 			schedule();
649 		} while (1);
650 		finish_wait(&pag->pagb_wait, &wait);
651 	}
652 }
653 
654 /*
655  * Callback for list_sort to sort busy extents by the AG they reside in.
656  */
657 int
658 xfs_extent_busy_ag_cmp(
659 	void			*priv,
660 	const struct list_head	*l1,
661 	const struct list_head	*l2)
662 {
663 	struct xfs_extent_busy	*b1 =
664 		container_of(l1, struct xfs_extent_busy, list);
665 	struct xfs_extent_busy	*b2 =
666 		container_of(l2, struct xfs_extent_busy, list);
667 	s32 diff;
668 
669 	diff = b1->agno - b2->agno;
670 	if (!diff)
671 		diff = b1->bno - b2->bno;
672 	return diff;
673 }
674 
675 /* Are there any busy extents in this AG? */
676 bool
677 xfs_extent_busy_list_empty(
678 	struct xfs_perag	*pag)
679 {
680 	bool			res;
681 
682 	spin_lock(&pag->pagb_lock);
683 	res = RB_EMPTY_ROOT(&pag->pagb_tree);
684 	spin_unlock(&pag->pagb_lock);
685 	return res;
686 }
687