xref: /linux/fs/xfs/libxfs/xfs_alloc.c (revision 68a052239fc4b351e961f698b824f7654a346091)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 #include "xfs_health.h"
30 #include "xfs_extfree_item.h"
31 
32 struct kmem_cache	*xfs_extfree_item_cache;
33 
34 struct workqueue_struct *xfs_alloc_wq;
35 
36 #define	XFSA_FIXUP_BNO_OK	1
37 #define	XFSA_FIXUP_CNT_OK	2
38 
39 /*
40  * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
41  * the beginning of the block for a proper header with the location information
42  * and CRC.
43  */
44 unsigned int
45 xfs_agfl_size(
46 	struct xfs_mount	*mp)
47 {
48 	unsigned int		size = mp->m_sb.sb_sectsize;
49 
50 	if (xfs_has_crc(mp))
51 		size -= sizeof(struct xfs_agfl);
52 
53 	return size / sizeof(xfs_agblock_t);
54 }
55 
56 unsigned int
57 xfs_refc_block(
58 	struct xfs_mount	*mp)
59 {
60 	if (xfs_has_rmapbt(mp))
61 		return XFS_RMAP_BLOCK(mp) + 1;
62 	if (xfs_has_finobt(mp))
63 		return XFS_FIBT_BLOCK(mp) + 1;
64 	return XFS_IBT_BLOCK(mp) + 1;
65 }
66 
67 xfs_extlen_t
68 xfs_prealloc_blocks(
69 	struct xfs_mount	*mp)
70 {
71 	if (xfs_has_reflink(mp))
72 		return xfs_refc_block(mp) + 1;
73 	if (xfs_has_rmapbt(mp))
74 		return XFS_RMAP_BLOCK(mp) + 1;
75 	if (xfs_has_finobt(mp))
76 		return XFS_FIBT_BLOCK(mp) + 1;
77 	return XFS_IBT_BLOCK(mp) + 1;
78 }
79 
80 /*
81  * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
82  * guarantee that we can refill the AGFL prior to allocating space in a nearly
83  * full AG.  Although the space described by the free space btrees, the
84  * blocks used by the freesp btrees themselves, and the blocks owned by the
85  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
86  * free space in the AG drop so low that the free space btrees cannot refill an
87  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
88  * until the fs goes down, we subtract this many AG blocks from the incore
89  * fdblocks to ensure user allocation does not overcommit the space the
90  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
91  * withhold space from xfs_dec_fdblocks, so we do not account for that here.
92  */
93 #define XFS_ALLOCBT_AGFL_RESERVE	4
94 
95 /*
96  * Compute the number of blocks that we set aside to guarantee the ability to
97  * refill the AGFL and handle a full bmap btree split.
98  *
99  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
100  * AGF buffer (PV 947395), we place constraints on the relationship among
101  * actual allocations for data blocks, freelist blocks, and potential file data
102  * bmap btree blocks. However, these restrictions may result in no actual space
103  * allocated for a delayed extent, for example, a data block in a certain AG is
104  * allocated but there is no additional block for the additional bmap btree
105  * block due to a split of the bmap btree of the file. The result of this may
106  * lead to an infinite loop when the file gets flushed to disk and all delayed
107  * extents need to be actually allocated. To get around this, we explicitly set
108  * aside a few blocks which will not be reserved in delayed allocation.
109  *
110  * For each AG, we need to reserve enough blocks to replenish a totally empty
111  * AGFL and 4 more to handle a potential split of the file's bmap btree.
112  */
113 unsigned int
114 xfs_alloc_set_aside(
115 	struct xfs_mount	*mp)
116 {
117 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
118 }
119 
120 /*
121  * When deciding how much space to allocate out of an AG, we limit the
122  * allocation maximum size to the size the AG. However, we cannot use all the
123  * blocks in the AG - some are permanently used by metadata. These
124  * blocks are generally:
125  *	- the AG superblock, AGF, AGI and AGFL
126  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
127  *	  the AGI free inode and rmap btree root blocks.
128  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
129  *	- the rmapbt root block
130  *
131  * The AG headers are sector sized, so the amount of space they take up is
132  * dependent on filesystem geometry. The others are all single blocks.
133  */
134 unsigned int
135 xfs_alloc_ag_max_usable(
136 	struct xfs_mount	*mp)
137 {
138 	unsigned int		blocks;
139 
140 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
141 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
142 	blocks += 3;			/* AGF, AGI btree root blocks */
143 	if (xfs_has_finobt(mp))
144 		blocks++;		/* finobt root block */
145 	if (xfs_has_rmapbt(mp))
146 		blocks++;		/* rmap root block */
147 	if (xfs_has_reflink(mp))
148 		blocks++;		/* refcount root block */
149 
150 	return mp->m_sb.sb_agblocks - blocks;
151 }
152 
153 
154 static int
155 xfs_alloc_lookup(
156 	struct xfs_btree_cur	*cur,
157 	xfs_lookup_t		dir,
158 	xfs_agblock_t		bno,
159 	xfs_extlen_t		len,
160 	int			*stat)
161 {
162 	int			error;
163 
164 	cur->bc_rec.a.ar_startblock = bno;
165 	cur->bc_rec.a.ar_blockcount = len;
166 	error = xfs_btree_lookup(cur, dir, stat);
167 	if (*stat == 1)
168 		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
169 	else
170 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
171 	return error;
172 }
173 
174 /*
175  * Lookup the record equal to [bno, len] in the btree given by cur.
176  */
177 static inline int				/* error */
178 xfs_alloc_lookup_eq(
179 	struct xfs_btree_cur	*cur,	/* btree cursor */
180 	xfs_agblock_t		bno,	/* starting block of extent */
181 	xfs_extlen_t		len,	/* length of extent */
182 	int			*stat)	/* success/failure */
183 {
184 	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
185 }
186 
187 /*
188  * Lookup the first record greater than or equal to [bno, len]
189  * in the btree given by cur.
190  */
191 int				/* error */
192 xfs_alloc_lookup_ge(
193 	struct xfs_btree_cur	*cur,	/* btree cursor */
194 	xfs_agblock_t		bno,	/* starting block of extent */
195 	xfs_extlen_t		len,	/* length of extent */
196 	int			*stat)	/* success/failure */
197 {
198 	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
199 }
200 
201 /*
202  * Lookup the first record less than or equal to [bno, len]
203  * in the btree given by cur.
204  */
205 int					/* error */
206 xfs_alloc_lookup_le(
207 	struct xfs_btree_cur	*cur,	/* btree cursor */
208 	xfs_agblock_t		bno,	/* starting block of extent */
209 	xfs_extlen_t		len,	/* length of extent */
210 	int			*stat)	/* success/failure */
211 {
212 	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
213 }
214 
215 static inline bool
216 xfs_alloc_cur_active(
217 	struct xfs_btree_cur	*cur)
218 {
219 	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
220 }
221 
222 /*
223  * Update the record referred to by cur to the value given
224  * by [bno, len].
225  * This either works (return 0) or gets an EFSCORRUPTED error.
226  */
227 STATIC int				/* error */
228 xfs_alloc_update(
229 	struct xfs_btree_cur	*cur,	/* btree cursor */
230 	xfs_agblock_t		bno,	/* starting block of extent */
231 	xfs_extlen_t		len)	/* length of extent */
232 {
233 	union xfs_btree_rec	rec;
234 
235 	rec.alloc.ar_startblock = cpu_to_be32(bno);
236 	rec.alloc.ar_blockcount = cpu_to_be32(len);
237 	return xfs_btree_update(cur, &rec);
238 }
239 
240 /* Convert the ondisk btree record to its incore representation. */
241 void
242 xfs_alloc_btrec_to_irec(
243 	const union xfs_btree_rec	*rec,
244 	struct xfs_alloc_rec_incore	*irec)
245 {
246 	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
247 	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
248 }
249 
250 /* Simple checks for free space records. */
251 xfs_failaddr_t
252 xfs_alloc_check_irec(
253 	struct xfs_perag			*pag,
254 	const struct xfs_alloc_rec_incore	*irec)
255 {
256 	if (irec->ar_blockcount == 0)
257 		return __this_address;
258 
259 	/* check for valid extent range, including overflow */
260 	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
261 		return __this_address;
262 
263 	return NULL;
264 }
265 
266 static inline int
267 xfs_alloc_complain_bad_rec(
268 	struct xfs_btree_cur		*cur,
269 	xfs_failaddr_t			fa,
270 	const struct xfs_alloc_rec_incore *irec)
271 {
272 	struct xfs_mount		*mp = cur->bc_mp;
273 
274 	xfs_warn(mp,
275 		"%sbt record corruption in AG %d detected at %pS!",
276 		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
277 	xfs_warn(mp,
278 		"start block 0x%x block count 0x%x", irec->ar_startblock,
279 		irec->ar_blockcount);
280 	xfs_btree_mark_sick(cur);
281 	return -EFSCORRUPTED;
282 }
283 
284 /*
285  * Get the data from the pointed-to record.
286  */
287 int					/* error */
288 xfs_alloc_get_rec(
289 	struct xfs_btree_cur	*cur,	/* btree cursor */
290 	xfs_agblock_t		*bno,	/* output: starting block of extent */
291 	xfs_extlen_t		*len,	/* output: length of extent */
292 	int			*stat)	/* output: success/failure */
293 {
294 	struct xfs_alloc_rec_incore irec;
295 	union xfs_btree_rec	*rec;
296 	xfs_failaddr_t		fa;
297 	int			error;
298 
299 	error = xfs_btree_get_rec(cur, &rec, stat);
300 	if (error || !(*stat))
301 		return error;
302 
303 	xfs_alloc_btrec_to_irec(rec, &irec);
304 	fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
305 	if (fa)
306 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
307 
308 	*bno = irec.ar_startblock;
309 	*len = irec.ar_blockcount;
310 	return 0;
311 }
312 
313 /*
314  * Compute aligned version of the found extent.
315  * Takes alignment and min length into account.
316  */
317 STATIC bool
318 xfs_alloc_compute_aligned(
319 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
320 	xfs_agblock_t	foundbno,	/* starting block in found extent */
321 	xfs_extlen_t	foundlen,	/* length in found extent */
322 	xfs_agblock_t	*resbno,	/* result block number */
323 	xfs_extlen_t	*reslen,	/* result length */
324 	unsigned	*busy_gen)
325 {
326 	xfs_agblock_t	bno = foundbno;
327 	xfs_extlen_t	len = foundlen;
328 	xfs_extlen_t	diff;
329 	bool		busy;
330 
331 	/* Trim busy sections out of found extent */
332 	busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen,
333 			args->maxlen, &bno, &len, busy_gen);
334 
335 	/*
336 	 * If we have a largish extent that happens to start before min_agbno,
337 	 * see if we can shift it into range...
338 	 */
339 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
340 		diff = args->min_agbno - bno;
341 		if (len > diff) {
342 			bno += diff;
343 			len -= diff;
344 		}
345 	}
346 
347 	if (args->alignment > 1 && len >= args->minlen) {
348 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
349 
350 		diff = aligned_bno - bno;
351 
352 		*resbno = aligned_bno;
353 		*reslen = diff >= len ? 0 : len - diff;
354 	} else {
355 		*resbno = bno;
356 		*reslen = len;
357 	}
358 
359 	return busy;
360 }
361 
362 /*
363  * Compute best start block and diff for "near" allocations.
364  * freelen >= wantlen already checked by caller.
365  */
366 STATIC xfs_extlen_t			/* difference value (absolute) */
367 xfs_alloc_compute_diff(
368 	xfs_agblock_t	wantbno,	/* target starting block */
369 	xfs_extlen_t	wantlen,	/* target length */
370 	xfs_extlen_t	alignment,	/* target alignment */
371 	int		datatype,	/* are we allocating data? */
372 	xfs_agblock_t	freebno,	/* freespace's starting block */
373 	xfs_extlen_t	freelen,	/* freespace's length */
374 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
375 {
376 	xfs_agblock_t	freeend;	/* end of freespace extent */
377 	xfs_agblock_t	newbno1;	/* return block number */
378 	xfs_agblock_t	newbno2;	/* other new block number */
379 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
380 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
381 	xfs_agblock_t	wantend;	/* end of target extent */
382 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
383 
384 	ASSERT(freelen >= wantlen);
385 	freeend = freebno + freelen;
386 	wantend = wantbno + wantlen;
387 	/*
388 	 * We want to allocate from the start of a free extent if it is past
389 	 * the desired block or if we are allocating user data and the free
390 	 * extent is before desired block. The second case is there to allow
391 	 * for contiguous allocation from the remaining free space if the file
392 	 * grows in the short term.
393 	 */
394 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
395 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
396 			newbno1 = NULLAGBLOCK;
397 	} else if (freeend >= wantend && alignment > 1) {
398 		newbno1 = roundup(wantbno, alignment);
399 		newbno2 = newbno1 - alignment;
400 		if (newbno1 >= freeend)
401 			newbno1 = NULLAGBLOCK;
402 		else
403 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
404 		if (newbno2 < freebno)
405 			newbno2 = NULLAGBLOCK;
406 		else
407 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
408 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
409 			if (newlen1 < newlen2 ||
410 			    (newlen1 == newlen2 &&
411 			     abs_diff(newbno1, wantbno) >
412 			     abs_diff(newbno2, wantbno)))
413 				newbno1 = newbno2;
414 		} else if (newbno2 != NULLAGBLOCK)
415 			newbno1 = newbno2;
416 	} else if (freeend >= wantend) {
417 		newbno1 = wantbno;
418 	} else if (alignment > 1) {
419 		newbno1 = roundup(freeend - wantlen, alignment);
420 		if (newbno1 > freeend - wantlen &&
421 		    newbno1 - alignment >= freebno)
422 			newbno1 -= alignment;
423 		else if (newbno1 >= freeend)
424 			newbno1 = NULLAGBLOCK;
425 	} else
426 		newbno1 = freeend - wantlen;
427 	*newbnop = newbno1;
428 	return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno);
429 }
430 
431 /*
432  * Fix up the length, based on mod and prod.
433  * len should be k * prod + mod for some k.
434  * If len is too small it is returned unchanged.
435  * If len hits maxlen it is left alone.
436  */
437 STATIC void
438 xfs_alloc_fix_len(
439 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
440 {
441 	xfs_extlen_t	k;
442 	xfs_extlen_t	rlen;
443 
444 	ASSERT(args->mod < args->prod);
445 	rlen = args->len;
446 	ASSERT(rlen >= args->minlen);
447 	ASSERT(rlen <= args->maxlen);
448 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
449 	    (args->mod == 0 && rlen < args->prod))
450 		return;
451 	k = rlen % args->prod;
452 	if (k == args->mod)
453 		return;
454 	if (k > args->mod)
455 		rlen = rlen - (k - args->mod);
456 	else
457 		rlen = rlen - args->prod + (args->mod - k);
458 	/* casts to (int) catch length underflows */
459 	if ((int)rlen < (int)args->minlen)
460 		return;
461 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
462 	ASSERT(rlen % args->prod == args->mod);
463 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
464 		rlen + args->minleft);
465 	args->len = rlen;
466 }
467 
468 /*
469  * Determine if the cursor points to the block that contains the right-most
470  * block of records in the by-count btree. This block contains the largest
471  * contiguous free extent in the AG, so if we modify a record in this block we
472  * need to call xfs_alloc_fixup_longest() once the modifications are done to
473  * ensure the agf->agf_longest field is kept up to date with the longest free
474  * extent tracked by the by-count btree.
475  */
476 static bool
477 xfs_alloc_cursor_at_lastrec(
478 	struct xfs_btree_cur	*cnt_cur)
479 {
480 	struct xfs_btree_block	*block;
481 	union xfs_btree_ptr	ptr;
482 	struct xfs_buf		*bp;
483 
484 	block = xfs_btree_get_block(cnt_cur, 0, &bp);
485 
486 	xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB);
487 	return xfs_btree_ptr_is_null(cnt_cur, &ptr);
488 }
489 
490 /*
491  * Find the rightmost record of the cntbt, and return the longest free space
492  * recorded in it. Simply set both the block number and the length to their
493  * maximum values before searching.
494  */
495 static int
496 xfs_cntbt_longest(
497 	struct xfs_btree_cur	*cnt_cur,
498 	xfs_extlen_t		*longest)
499 {
500 	struct xfs_alloc_rec_incore irec;
501 	union xfs_btree_rec	    *rec;
502 	int			    stat = 0;
503 	int			    error;
504 
505 	memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec));
506 	error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat);
507 	if (error)
508 		return error;
509 	if (!stat) {
510 		/* totally empty tree */
511 		*longest = 0;
512 		return 0;
513 	}
514 
515 	error = xfs_btree_get_rec(cnt_cur, &rec, &stat);
516 	if (error)
517 		return error;
518 	if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) {
519 		xfs_btree_mark_sick(cnt_cur);
520 		return -EFSCORRUPTED;
521 	}
522 
523 	xfs_alloc_btrec_to_irec(rec, &irec);
524 	*longest = irec.ar_blockcount;
525 	return 0;
526 }
527 
528 /*
529  * Update the longest contiguous free extent in the AG from the by-count cursor
530  * that is passed to us. This should be done at the end of any allocation or
531  * freeing operation that touches the longest extent in the btree.
532  *
533  * Needing to update the longest extent can be determined by calling
534  * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record
535  * modification but before the modification begins.
536  */
537 static int
538 xfs_alloc_fixup_longest(
539 	struct xfs_btree_cur	*cnt_cur)
540 {
541 	struct xfs_perag	*pag = to_perag(cnt_cur->bc_group);
542 	struct xfs_buf		*bp = cnt_cur->bc_ag.agbp;
543 	struct xfs_agf		*agf = bp->b_addr;
544 	xfs_extlen_t		longest = 0;
545 	int			error;
546 
547 	/* Lookup last rec in order to update AGF. */
548 	error = xfs_cntbt_longest(cnt_cur, &longest);
549 	if (error)
550 		return error;
551 
552 	pag->pagf_longest = longest;
553 	agf->agf_longest = cpu_to_be32(pag->pagf_longest);
554 	xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST);
555 
556 	return 0;
557 }
558 
559 /*
560  * Update the two btrees, logically removing from freespace the extent
561  * starting at rbno, rlen blocks.  The extent is contained within the
562  * actual (current) free extent fbno for flen blocks.
563  * Flags are passed in indicating whether the cursors are set to the
564  * relevant records.
565  */
566 STATIC int				/* error code */
567 xfs_alloc_fixup_trees(
568 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
569 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
570 	xfs_agblock_t	fbno,		/* starting block of free extent */
571 	xfs_extlen_t	flen,		/* length of free extent */
572 	xfs_agblock_t	rbno,		/* starting block of returned extent */
573 	xfs_extlen_t	rlen,		/* length of returned extent */
574 	int		flags)		/* flags, XFSA_FIXUP_... */
575 {
576 	int		error;		/* error code */
577 	int		i;		/* operation results */
578 	xfs_agblock_t	nfbno1;		/* first new free startblock */
579 	xfs_agblock_t	nfbno2;		/* second new free startblock */
580 	xfs_extlen_t	nflen1=0;	/* first new free length */
581 	xfs_extlen_t	nflen2=0;	/* second new free length */
582 	struct xfs_mount *mp;
583 	bool		fixup_longest = false;
584 
585 	mp = cnt_cur->bc_mp;
586 
587 	/*
588 	 * Look up the record in the by-size tree if necessary.
589 	 */
590 	if (flags & XFSA_FIXUP_CNT_OK) {
591 #ifdef DEBUG
592 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
593 			return error;
594 		if (XFS_IS_CORRUPT(mp,
595 				   i != 1 ||
596 				   nfbno1 != fbno ||
597 				   nflen1 != flen)) {
598 			xfs_btree_mark_sick(cnt_cur);
599 			return -EFSCORRUPTED;
600 		}
601 #endif
602 	} else {
603 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
604 			return error;
605 		if (XFS_IS_CORRUPT(mp, i != 1)) {
606 			xfs_btree_mark_sick(cnt_cur);
607 			return -EFSCORRUPTED;
608 		}
609 	}
610 	/*
611 	 * Look up the record in the by-block tree if necessary.
612 	 */
613 	if (flags & XFSA_FIXUP_BNO_OK) {
614 #ifdef DEBUG
615 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
616 			return error;
617 		if (XFS_IS_CORRUPT(mp,
618 				   i != 1 ||
619 				   nfbno1 != fbno ||
620 				   nflen1 != flen)) {
621 			xfs_btree_mark_sick(bno_cur);
622 			return -EFSCORRUPTED;
623 		}
624 #endif
625 	} else {
626 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
627 			return error;
628 		if (XFS_IS_CORRUPT(mp, i != 1)) {
629 			xfs_btree_mark_sick(bno_cur);
630 			return -EFSCORRUPTED;
631 		}
632 	}
633 
634 #ifdef DEBUG
635 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
636 		struct xfs_btree_block	*bnoblock;
637 		struct xfs_btree_block	*cntblock;
638 
639 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
640 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
641 
642 		if (XFS_IS_CORRUPT(mp,
643 				   bnoblock->bb_numrecs !=
644 				   cntblock->bb_numrecs)) {
645 			xfs_btree_mark_sick(bno_cur);
646 			return -EFSCORRUPTED;
647 		}
648 	}
649 #endif
650 
651 	/*
652 	 * Deal with all four cases: the allocated record is contained
653 	 * within the freespace record, so we can have new freespace
654 	 * at either (or both) end, or no freespace remaining.
655 	 */
656 	if (rbno == fbno && rlen == flen)
657 		nfbno1 = nfbno2 = NULLAGBLOCK;
658 	else if (rbno == fbno) {
659 		nfbno1 = rbno + rlen;
660 		nflen1 = flen - rlen;
661 		nfbno2 = NULLAGBLOCK;
662 	} else if (rbno + rlen == fbno + flen) {
663 		nfbno1 = fbno;
664 		nflen1 = flen - rlen;
665 		nfbno2 = NULLAGBLOCK;
666 	} else {
667 		nfbno1 = fbno;
668 		nflen1 = rbno - fbno;
669 		nfbno2 = rbno + rlen;
670 		nflen2 = (fbno + flen) - nfbno2;
671 	}
672 
673 	if (xfs_alloc_cursor_at_lastrec(cnt_cur))
674 		fixup_longest = true;
675 
676 	/*
677 	 * Delete the entry from the by-size btree.
678 	 */
679 	if ((error = xfs_btree_delete(cnt_cur, &i)))
680 		return error;
681 	if (XFS_IS_CORRUPT(mp, i != 1)) {
682 		xfs_btree_mark_sick(cnt_cur);
683 		return -EFSCORRUPTED;
684 	}
685 	/*
686 	 * Add new by-size btree entry(s).
687 	 */
688 	if (nfbno1 != NULLAGBLOCK) {
689 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
690 			return error;
691 		if (XFS_IS_CORRUPT(mp, i != 0)) {
692 			xfs_btree_mark_sick(cnt_cur);
693 			return -EFSCORRUPTED;
694 		}
695 		if ((error = xfs_btree_insert(cnt_cur, &i)))
696 			return error;
697 		if (XFS_IS_CORRUPT(mp, i != 1)) {
698 			xfs_btree_mark_sick(cnt_cur);
699 			return -EFSCORRUPTED;
700 		}
701 	}
702 	if (nfbno2 != NULLAGBLOCK) {
703 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
704 			return error;
705 		if (XFS_IS_CORRUPT(mp, i != 0)) {
706 			xfs_btree_mark_sick(cnt_cur);
707 			return -EFSCORRUPTED;
708 		}
709 		if ((error = xfs_btree_insert(cnt_cur, &i)))
710 			return error;
711 		if (XFS_IS_CORRUPT(mp, i != 1)) {
712 			xfs_btree_mark_sick(cnt_cur);
713 			return -EFSCORRUPTED;
714 		}
715 	}
716 	/*
717 	 * Fix up the by-block btree entry(s).
718 	 */
719 	if (nfbno1 == NULLAGBLOCK) {
720 		/*
721 		 * No remaining freespace, just delete the by-block tree entry.
722 		 */
723 		if ((error = xfs_btree_delete(bno_cur, &i)))
724 			return error;
725 		if (XFS_IS_CORRUPT(mp, i != 1)) {
726 			xfs_btree_mark_sick(bno_cur);
727 			return -EFSCORRUPTED;
728 		}
729 	} else {
730 		/*
731 		 * Update the by-block entry to start later|be shorter.
732 		 */
733 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
734 			return error;
735 	}
736 	if (nfbno2 != NULLAGBLOCK) {
737 		/*
738 		 * 2 resulting free entries, need to add one.
739 		 */
740 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
741 			return error;
742 		if (XFS_IS_CORRUPT(mp, i != 0)) {
743 			xfs_btree_mark_sick(bno_cur);
744 			return -EFSCORRUPTED;
745 		}
746 		if ((error = xfs_btree_insert(bno_cur, &i)))
747 			return error;
748 		if (XFS_IS_CORRUPT(mp, i != 1)) {
749 			xfs_btree_mark_sick(bno_cur);
750 			return -EFSCORRUPTED;
751 		}
752 	}
753 
754 	if (fixup_longest)
755 		return xfs_alloc_fixup_longest(cnt_cur);
756 
757 	return 0;
758 }
759 
760 /*
761  * We do not verify the AGFL contents against AGF-based index counters here,
762  * even though we may have access to the perag that contains shadow copies. We
763  * don't know if the AGF based counters have been checked, and if they have they
764  * still may be inconsistent because they haven't yet been reset on the first
765  * allocation after the AGF has been read in.
766  *
767  * This means we can only check that all agfl entries contain valid or null
768  * values because we can't reliably determine the active range to exclude
769  * NULLAGBNO as a valid value.
770  *
771  * However, we can't even do that for v4 format filesystems because there are
772  * old versions of mkfs out there that does not initialise the AGFL to known,
773  * verifiable values. HEnce we can't tell the difference between a AGFL block
774  * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
775  *
776  * As a result, we can only fully validate AGFL block numbers when we pull them
777  * from the freelist in xfs_alloc_get_freelist().
778  */
779 static xfs_failaddr_t
780 xfs_agfl_verify(
781 	struct xfs_buf	*bp)
782 {
783 	struct xfs_mount *mp = bp->b_mount;
784 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
785 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
786 	int		i;
787 
788 	if (!xfs_has_crc(mp))
789 		return NULL;
790 
791 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
792 		return __this_address;
793 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
794 		return __this_address;
795 	/*
796 	 * during growfs operations, the perag is not fully initialised,
797 	 * so we can't use it for any useful checking. growfs ensures we can't
798 	 * use it by using uncached buffers that don't have the perag attached
799 	 * so we can detect and avoid this problem.
800 	 */
801 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag)))
802 		return __this_address;
803 
804 	for (i = 0; i < xfs_agfl_size(mp); i++) {
805 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
806 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
807 			return __this_address;
808 	}
809 
810 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
811 		return __this_address;
812 	return NULL;
813 }
814 
815 static void
816 xfs_agfl_read_verify(
817 	struct xfs_buf	*bp)
818 {
819 	struct xfs_mount *mp = bp->b_mount;
820 	xfs_failaddr_t	fa;
821 
822 	/*
823 	 * There is no verification of non-crc AGFLs because mkfs does not
824 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
825 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
826 	 * can't verify just those entries are valid.
827 	 */
828 	if (!xfs_has_crc(mp))
829 		return;
830 
831 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
832 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
833 	else {
834 		fa = xfs_agfl_verify(bp);
835 		if (fa)
836 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
837 	}
838 }
839 
840 static void
841 xfs_agfl_write_verify(
842 	struct xfs_buf	*bp)
843 {
844 	struct xfs_mount	*mp = bp->b_mount;
845 	struct xfs_buf_log_item	*bip = bp->b_log_item;
846 	xfs_failaddr_t		fa;
847 
848 	/* no verification of non-crc AGFLs */
849 	if (!xfs_has_crc(mp))
850 		return;
851 
852 	fa = xfs_agfl_verify(bp);
853 	if (fa) {
854 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
855 		return;
856 	}
857 
858 	if (bip)
859 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
860 
861 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
862 }
863 
864 const struct xfs_buf_ops xfs_agfl_buf_ops = {
865 	.name = "xfs_agfl",
866 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
867 	.verify_read = xfs_agfl_read_verify,
868 	.verify_write = xfs_agfl_write_verify,
869 	.verify_struct = xfs_agfl_verify,
870 };
871 
872 /*
873  * Read in the allocation group free block array.
874  */
875 int
876 xfs_alloc_read_agfl(
877 	struct xfs_perag	*pag,
878 	struct xfs_trans	*tp,
879 	struct xfs_buf		**bpp)
880 {
881 	struct xfs_mount	*mp = pag_mount(pag);
882 	struct xfs_buf		*bp;
883 	int			error;
884 
885 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
886 			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)),
887 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
888 	if (xfs_metadata_is_sick(error))
889 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
890 	if (error)
891 		return error;
892 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
893 	*bpp = bp;
894 	return 0;
895 }
896 
897 STATIC int
898 xfs_alloc_update_counters(
899 	struct xfs_trans	*tp,
900 	struct xfs_buf		*agbp,
901 	long			len)
902 {
903 	struct xfs_agf		*agf = agbp->b_addr;
904 
905 	agbp->b_pag->pagf_freeblks += len;
906 	be32_add_cpu(&agf->agf_freeblks, len);
907 
908 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
909 		     be32_to_cpu(agf->agf_length))) {
910 		xfs_buf_mark_corrupt(agbp);
911 		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
912 		return -EFSCORRUPTED;
913 	}
914 
915 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
916 	return 0;
917 }
918 
919 /*
920  * Block allocation algorithm and data structures.
921  */
922 struct xfs_alloc_cur {
923 	struct xfs_btree_cur		*cnt;	/* btree cursors */
924 	struct xfs_btree_cur		*bnolt;
925 	struct xfs_btree_cur		*bnogt;
926 	xfs_extlen_t			cur_len;/* current search length */
927 	xfs_agblock_t			rec_bno;/* extent startblock */
928 	xfs_extlen_t			rec_len;/* extent length */
929 	xfs_agblock_t			bno;	/* alloc bno */
930 	xfs_extlen_t			len;	/* alloc len */
931 	xfs_extlen_t			diff;	/* diff from search bno */
932 	unsigned int			busy_gen;/* busy state */
933 	bool				busy;
934 };
935 
936 /*
937  * Set up cursors, etc. in the extent allocation cursor. This function can be
938  * called multiple times to reset an initialized structure without having to
939  * reallocate cursors.
940  */
941 static int
942 xfs_alloc_cur_setup(
943 	struct xfs_alloc_arg	*args,
944 	struct xfs_alloc_cur	*acur)
945 {
946 	int			error;
947 	int			i;
948 
949 	acur->cur_len = args->maxlen;
950 	acur->rec_bno = 0;
951 	acur->rec_len = 0;
952 	acur->bno = 0;
953 	acur->len = 0;
954 	acur->diff = -1;
955 	acur->busy = false;
956 	acur->busy_gen = 0;
957 
958 	/*
959 	 * Perform an initial cntbt lookup to check for availability of maxlen
960 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
961 	 * attempt a small allocation.
962 	 */
963 	if (!acur->cnt)
964 		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
965 					args->agbp, args->pag);
966 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
967 	if (error)
968 		return error;
969 
970 	/*
971 	 * Allocate the bnobt left and right search cursors.
972 	 */
973 	if (!acur->bnolt)
974 		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
975 					args->agbp, args->pag);
976 	if (!acur->bnogt)
977 		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
978 					args->agbp, args->pag);
979 	return i == 1 ? 0 : -ENOSPC;
980 }
981 
982 static void
983 xfs_alloc_cur_close(
984 	struct xfs_alloc_cur	*acur,
985 	bool			error)
986 {
987 	int			cur_error = XFS_BTREE_NOERROR;
988 
989 	if (error)
990 		cur_error = XFS_BTREE_ERROR;
991 
992 	if (acur->cnt)
993 		xfs_btree_del_cursor(acur->cnt, cur_error);
994 	if (acur->bnolt)
995 		xfs_btree_del_cursor(acur->bnolt, cur_error);
996 	if (acur->bnogt)
997 		xfs_btree_del_cursor(acur->bnogt, cur_error);
998 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
999 }
1000 
1001 /*
1002  * Check an extent for allocation and track the best available candidate in the
1003  * allocation structure. The cursor is deactivated if it has entered an out of
1004  * range state based on allocation arguments. Optionally return the extent
1005  * extent geometry and allocation status if requested by the caller.
1006  */
1007 static int
1008 xfs_alloc_cur_check(
1009 	struct xfs_alloc_arg	*args,
1010 	struct xfs_alloc_cur	*acur,
1011 	struct xfs_btree_cur	*cur,
1012 	int			*new)
1013 {
1014 	int			error, i;
1015 	xfs_agblock_t		bno, bnoa, bnew;
1016 	xfs_extlen_t		len, lena, diff = -1;
1017 	bool			busy;
1018 	unsigned		busy_gen = 0;
1019 	bool			deactivate = false;
1020 	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);
1021 
1022 	*new = 0;
1023 
1024 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1025 	if (error)
1026 		return error;
1027 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1028 		xfs_btree_mark_sick(cur);
1029 		return -EFSCORRUPTED;
1030 	}
1031 
1032 	/*
1033 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
1034 	 * range (i.e., walking backwards looking for a minlen extent).
1035 	 */
1036 	if (len < args->minlen) {
1037 		deactivate = !isbnobt;
1038 		goto out;
1039 	}
1040 
1041 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
1042 					 &busy_gen);
1043 	acur->busy |= busy;
1044 	if (busy)
1045 		acur->busy_gen = busy_gen;
1046 	/* deactivate a bnobt cursor outside of locality range */
1047 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
1048 		deactivate = isbnobt;
1049 		goto out;
1050 	}
1051 	if (lena < args->minlen)
1052 		goto out;
1053 
1054 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
1055 	xfs_alloc_fix_len(args);
1056 	ASSERT(args->len >= args->minlen);
1057 	if (args->len < acur->len)
1058 		goto out;
1059 
1060 	/*
1061 	 * We have an aligned record that satisfies minlen and beats or matches
1062 	 * the candidate extent size. Compare locality for near allocation mode.
1063 	 */
1064 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
1065 				      args->alignment, args->datatype,
1066 				      bnoa, lena, &bnew);
1067 	if (bnew == NULLAGBLOCK)
1068 		goto out;
1069 
1070 	/*
1071 	 * Deactivate a bnobt cursor with worse locality than the current best.
1072 	 */
1073 	if (diff > acur->diff) {
1074 		deactivate = isbnobt;
1075 		goto out;
1076 	}
1077 
1078 	ASSERT(args->len > acur->len ||
1079 	       (args->len == acur->len && diff <= acur->diff));
1080 	acur->rec_bno = bno;
1081 	acur->rec_len = len;
1082 	acur->bno = bnew;
1083 	acur->len = args->len;
1084 	acur->diff = diff;
1085 	*new = 1;
1086 
1087 	/*
1088 	 * We're done if we found a perfect allocation. This only deactivates
1089 	 * the current cursor, but this is just an optimization to terminate a
1090 	 * cntbt search that otherwise runs to the edge of the tree.
1091 	 */
1092 	if (acur->diff == 0 && acur->len == args->maxlen)
1093 		deactivate = true;
1094 out:
1095 	if (deactivate)
1096 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
1097 	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
1098 	return 0;
1099 }
1100 
1101 /*
1102  * Complete an allocation of a candidate extent. Remove the extent from both
1103  * trees and update the args structure.
1104  */
1105 STATIC int
1106 xfs_alloc_cur_finish(
1107 	struct xfs_alloc_arg	*args,
1108 	struct xfs_alloc_cur	*acur)
1109 {
1110 	int			error;
1111 
1112 	ASSERT(acur->cnt && acur->bnolt);
1113 	ASSERT(acur->bno >= acur->rec_bno);
1114 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
1115 	ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));
1116 
1117 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
1118 				      acur->rec_len, acur->bno, acur->len, 0);
1119 	if (error)
1120 		return error;
1121 
1122 	args->agbno = acur->bno;
1123 	args->len = acur->len;
1124 	args->wasfromfl = 0;
1125 
1126 	trace_xfs_alloc_cur(args);
1127 	return 0;
1128 }
1129 
1130 /*
1131  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
1132  * bno optimized lookup to search for extents with ideal size and locality.
1133  */
1134 STATIC int
1135 xfs_alloc_cntbt_iter(
1136 	struct xfs_alloc_arg		*args,
1137 	struct xfs_alloc_cur		*acur)
1138 {
1139 	struct xfs_btree_cur	*cur = acur->cnt;
1140 	xfs_agblock_t		bno;
1141 	xfs_extlen_t		len, cur_len;
1142 	int			error;
1143 	int			i;
1144 
1145 	if (!xfs_alloc_cur_active(cur))
1146 		return 0;
1147 
1148 	/* locality optimized lookup */
1149 	cur_len = acur->cur_len;
1150 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1151 	if (error)
1152 		return error;
1153 	if (i == 0)
1154 		return 0;
1155 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1156 	if (error)
1157 		return error;
1158 
1159 	/* check the current record and update search length from it */
1160 	error = xfs_alloc_cur_check(args, acur, cur, &i);
1161 	if (error)
1162 		return error;
1163 	ASSERT(len >= acur->cur_len);
1164 	acur->cur_len = len;
1165 
1166 	/*
1167 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1168 	 * secondary key and so the current record may lie just before or after
1169 	 * agbno. If it is past agbno, check the previous record too so long as
1170 	 * the length matches as it may be closer. Don't check a smaller record
1171 	 * because that could deactivate our cursor.
1172 	 */
1173 	if (bno > args->agbno) {
1174 		error = xfs_btree_decrement(cur, 0, &i);
1175 		if (!error && i) {
1176 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1177 			if (!error && i && len == acur->cur_len)
1178 				error = xfs_alloc_cur_check(args, acur, cur,
1179 							    &i);
1180 		}
1181 		if (error)
1182 			return error;
1183 	}
1184 
1185 	/*
1186 	 * Increment the search key until we find at least one allocation
1187 	 * candidate or if the extent we found was larger. Otherwise, double the
1188 	 * search key to optimize the search. Efficiency is more important here
1189 	 * than absolute best locality.
1190 	 */
1191 	cur_len <<= 1;
1192 	if (!acur->len || acur->cur_len >= cur_len)
1193 		acur->cur_len++;
1194 	else
1195 		acur->cur_len = cur_len;
1196 
1197 	return error;
1198 }
1199 
1200 /*
1201  * Deal with the case where only small freespaces remain. Either return the
1202  * contents of the last freespace record, or allocate space from the freelist if
1203  * there is nothing in the tree.
1204  */
1205 STATIC int			/* error */
1206 xfs_alloc_ag_vextent_small(
1207 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1208 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1209 	xfs_agblock_t		*fbnop,	/* result block number */
1210 	xfs_extlen_t		*flenp,	/* result length */
1211 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1212 {
1213 	struct xfs_agf		*agf = args->agbp->b_addr;
1214 	int			error = 0;
1215 	xfs_agblock_t		fbno = NULLAGBLOCK;
1216 	xfs_extlen_t		flen = 0;
1217 	int			i = 0;
1218 
1219 	/*
1220 	 * If a cntbt cursor is provided, try to allocate the largest record in
1221 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1222 	 * allocation. Make sure to respect minleft even when pulling from the
1223 	 * freelist.
1224 	 */
1225 	if (ccur)
1226 		error = xfs_btree_decrement(ccur, 0, &i);
1227 	if (error)
1228 		goto error;
1229 	if (i) {
1230 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1231 		if (error)
1232 			goto error;
1233 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1234 			xfs_btree_mark_sick(ccur);
1235 			error = -EFSCORRUPTED;
1236 			goto error;
1237 		}
1238 		goto out;
1239 	}
1240 
1241 	if (args->minlen != 1 || args->alignment != 1 ||
1242 	    args->resv == XFS_AG_RESV_AGFL ||
1243 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1244 		goto out;
1245 
1246 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1247 			&fbno, 0);
1248 	if (error)
1249 		goto error;
1250 	if (fbno == NULLAGBLOCK)
1251 		goto out;
1252 
1253 	xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1,
1254 			      (args->datatype & XFS_ALLOC_NOBUSY));
1255 
1256 	if (args->datatype & XFS_ALLOC_USERDATA) {
1257 		struct xfs_buf	*bp;
1258 
1259 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1260 				xfs_agbno_to_daddr(args->pag, fbno),
1261 				args->mp->m_bsize, 0, &bp);
1262 		if (error)
1263 			goto error;
1264 		xfs_trans_binval(args->tp, bp);
1265 	}
1266 	*fbnop = args->agbno = fbno;
1267 	*flenp = args->len = 1;
1268 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1269 		xfs_btree_mark_sick(ccur);
1270 		error = -EFSCORRUPTED;
1271 		goto error;
1272 	}
1273 	args->wasfromfl = 1;
1274 	trace_xfs_alloc_small_freelist(args);
1275 
1276 	/*
1277 	 * If we're feeding an AGFL block to something that doesn't live in the
1278 	 * free space, we need to clear out the OWN_AG rmap.
1279 	 */
1280 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1281 			      &XFS_RMAP_OINFO_AG);
1282 	if (error)
1283 		goto error;
1284 
1285 	*stat = 0;
1286 	return 0;
1287 
1288 out:
1289 	/*
1290 	 * Can't do the allocation, give up.
1291 	 */
1292 	if (flen < args->minlen) {
1293 		args->agbno = NULLAGBLOCK;
1294 		trace_xfs_alloc_small_notenough(args);
1295 		flen = 0;
1296 	}
1297 	*fbnop = fbno;
1298 	*flenp = flen;
1299 	*stat = 1;
1300 	trace_xfs_alloc_small_done(args);
1301 	return 0;
1302 
1303 error:
1304 	trace_xfs_alloc_small_error(args);
1305 	return error;
1306 }
1307 
1308 /*
1309  * Allocate a variable extent at exactly agno/bno.
1310  * Extent's length (returned in *len) will be between minlen and maxlen,
1311  * and of the form k * prod + mod unless there's nothing that large.
1312  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1313  */
1314 STATIC int			/* error */
1315 xfs_alloc_ag_vextent_exact(
1316 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1317 {
1318 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1319 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1320 	int		error;
1321 	xfs_agblock_t	fbno;	/* start block of found extent */
1322 	xfs_extlen_t	flen;	/* length of found extent */
1323 	xfs_agblock_t	tbno;	/* start block of busy extent */
1324 	xfs_extlen_t	tlen;	/* length of busy extent */
1325 	xfs_agblock_t	tend;	/* end block of busy extent */
1326 	int		i;	/* success/failure of operation */
1327 	unsigned	busy_gen;
1328 
1329 	ASSERT(args->alignment == 1);
1330 
1331 	/*
1332 	 * Allocate/initialize a cursor for the by-number freespace btree.
1333 	 */
1334 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
1335 					  args->pag);
1336 
1337 	/*
1338 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1339 	 * Look for the closest free block <= bno, it must contain bno
1340 	 * if any free block does.
1341 	 */
1342 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1343 	if (error)
1344 		goto error0;
1345 	if (!i)
1346 		goto not_found;
1347 
1348 	/*
1349 	 * Grab the freespace record.
1350 	 */
1351 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1352 	if (error)
1353 		goto error0;
1354 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1355 		xfs_btree_mark_sick(bno_cur);
1356 		error = -EFSCORRUPTED;
1357 		goto error0;
1358 	}
1359 	ASSERT(fbno <= args->agbno);
1360 
1361 	/*
1362 	 * Check for overlapping busy extents.
1363 	 */
1364 	tbno = fbno;
1365 	tlen = flen;
1366 	xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen,
1367 			&tbno, &tlen, &busy_gen);
1368 
1369 	/*
1370 	 * Give up if the start of the extent is busy, or the freespace isn't
1371 	 * long enough for the minimum request.
1372 	 */
1373 	if (tbno > args->agbno)
1374 		goto not_found;
1375 	if (tlen < args->minlen)
1376 		goto not_found;
1377 	tend = tbno + tlen;
1378 	if (tend < args->agbno + args->minlen)
1379 		goto not_found;
1380 
1381 	/*
1382 	 * End of extent will be smaller of the freespace end and the
1383 	 * maximal requested end.
1384 	 *
1385 	 * Fix the length according to mod and prod if given.
1386 	 */
1387 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1388 						- args->agbno;
1389 	xfs_alloc_fix_len(args);
1390 	ASSERT(args->agbno + args->len <= tend);
1391 
1392 	/*
1393 	 * We are allocating agbno for args->len
1394 	 * Allocate/initialize a cursor for the by-size btree.
1395 	 */
1396 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1397 					args->pag);
1398 	ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
1399 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1400 				      args->len, XFSA_FIXUP_BNO_OK);
1401 	if (error) {
1402 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1403 		goto error0;
1404 	}
1405 
1406 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1407 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1408 
1409 	args->wasfromfl = 0;
1410 	trace_xfs_alloc_exact_done(args);
1411 	return 0;
1412 
1413 not_found:
1414 	/* Didn't find it, return null. */
1415 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1416 	args->agbno = NULLAGBLOCK;
1417 	trace_xfs_alloc_exact_notfound(args);
1418 	return 0;
1419 
1420 error0:
1421 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1422 	trace_xfs_alloc_exact_error(args);
1423 	return error;
1424 }
1425 
1426 /*
1427  * Search a given number of btree records in a given direction. Check each
1428  * record against the good extent we've already found.
1429  */
1430 STATIC int
1431 xfs_alloc_walk_iter(
1432 	struct xfs_alloc_arg	*args,
1433 	struct xfs_alloc_cur	*acur,
1434 	struct xfs_btree_cur	*cur,
1435 	bool			increment,
1436 	bool			find_one, /* quit on first candidate */
1437 	int			count,    /* rec count (-1 for infinite) */
1438 	int			*stat)
1439 {
1440 	int			error;
1441 	int			i;
1442 
1443 	*stat = 0;
1444 
1445 	/*
1446 	 * Search so long as the cursor is active or we find a better extent.
1447 	 * The cursor is deactivated if it extends beyond the range of the
1448 	 * current allocation candidate.
1449 	 */
1450 	while (xfs_alloc_cur_active(cur) && count) {
1451 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1452 		if (error)
1453 			return error;
1454 		if (i == 1) {
1455 			*stat = 1;
1456 			if (find_one)
1457 				break;
1458 		}
1459 		if (!xfs_alloc_cur_active(cur))
1460 			break;
1461 
1462 		if (increment)
1463 			error = xfs_btree_increment(cur, 0, &i);
1464 		else
1465 			error = xfs_btree_decrement(cur, 0, &i);
1466 		if (error)
1467 			return error;
1468 		if (i == 0)
1469 			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
1470 
1471 		if (count > 0)
1472 			count--;
1473 	}
1474 
1475 	return 0;
1476 }
1477 
1478 /*
1479  * Search the by-bno and by-size btrees in parallel in search of an extent with
1480  * ideal locality based on the NEAR mode ->agbno locality hint.
1481  */
1482 STATIC int
1483 xfs_alloc_ag_vextent_locality(
1484 	struct xfs_alloc_arg	*args,
1485 	struct xfs_alloc_cur	*acur,
1486 	int			*stat)
1487 {
1488 	struct xfs_btree_cur	*fbcur = NULL;
1489 	int			error;
1490 	int			i;
1491 	bool			fbinc;
1492 
1493 	ASSERT(acur->len == 0);
1494 
1495 	*stat = 0;
1496 
1497 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1498 	if (error)
1499 		return error;
1500 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1501 	if (error)
1502 		return error;
1503 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1504 	if (error)
1505 		return error;
1506 
1507 	/*
1508 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1509 	 * right and lookup the closest extent to the locality hint for each
1510 	 * extent size key in the cntbt. The entire search terminates
1511 	 * immediately on a bnobt hit because that means we've found best case
1512 	 * locality. Otherwise the search continues until the cntbt cursor runs
1513 	 * off the end of the tree. If no allocation candidate is found at this
1514 	 * point, give up on locality, walk backwards from the end of the cntbt
1515 	 * and take the first available extent.
1516 	 *
1517 	 * The parallel tree searches balance each other out to provide fairly
1518 	 * consistent performance for various situations. The bnobt search can
1519 	 * have pathological behavior in the worst case scenario of larger
1520 	 * allocation requests and fragmented free space. On the other hand, the
1521 	 * bnobt is able to satisfy most smaller allocation requests much more
1522 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1523 	 * free space and sets of free extents for larger allocation requests
1524 	 * more quickly than the bnobt. Since the locality hint is just a hint
1525 	 * and we don't want to scan the entire bnobt for perfect locality, the
1526 	 * cntbt search essentially bounds the bnobt search such that we can
1527 	 * find good enough locality at reasonable performance in most cases.
1528 	 */
1529 	while (xfs_alloc_cur_active(acur->bnolt) ||
1530 	       xfs_alloc_cur_active(acur->bnogt) ||
1531 	       xfs_alloc_cur_active(acur->cnt)) {
1532 
1533 		trace_xfs_alloc_cur_lookup(args);
1534 
1535 		/*
1536 		 * Search the bnobt left and right. In the case of a hit, finish
1537 		 * the search in the opposite direction and we're done.
1538 		 */
1539 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1540 					    true, 1, &i);
1541 		if (error)
1542 			return error;
1543 		if (i == 1) {
1544 			trace_xfs_alloc_cur_left(args);
1545 			fbcur = acur->bnogt;
1546 			fbinc = true;
1547 			break;
1548 		}
1549 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1550 					    1, &i);
1551 		if (error)
1552 			return error;
1553 		if (i == 1) {
1554 			trace_xfs_alloc_cur_right(args);
1555 			fbcur = acur->bnolt;
1556 			fbinc = false;
1557 			break;
1558 		}
1559 
1560 		/*
1561 		 * Check the extent with best locality based on the current
1562 		 * extent size search key and keep track of the best candidate.
1563 		 */
1564 		error = xfs_alloc_cntbt_iter(args, acur);
1565 		if (error)
1566 			return error;
1567 		if (!xfs_alloc_cur_active(acur->cnt)) {
1568 			trace_xfs_alloc_cur_lookup_done(args);
1569 			break;
1570 		}
1571 	}
1572 
1573 	/*
1574 	 * If we failed to find anything due to busy extents, return empty
1575 	 * handed so the caller can flush and retry. If no busy extents were
1576 	 * found, walk backwards from the end of the cntbt as a last resort.
1577 	 */
1578 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1579 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1580 		if (error)
1581 			return error;
1582 		if (i) {
1583 			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
1584 			fbcur = acur->cnt;
1585 			fbinc = false;
1586 		}
1587 	}
1588 
1589 	/*
1590 	 * Search in the opposite direction for a better entry in the case of
1591 	 * a bnobt hit or walk backwards from the end of the cntbt.
1592 	 */
1593 	if (fbcur) {
1594 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1595 					    &i);
1596 		if (error)
1597 			return error;
1598 	}
1599 
1600 	if (acur->len)
1601 		*stat = 1;
1602 
1603 	return 0;
1604 }
1605 
1606 /* Check the last block of the cnt btree for allocations. */
1607 static int
1608 xfs_alloc_ag_vextent_lastblock(
1609 	struct xfs_alloc_arg	*args,
1610 	struct xfs_alloc_cur	*acur,
1611 	xfs_agblock_t		*bno,
1612 	xfs_extlen_t		*len,
1613 	bool			*allocated)
1614 {
1615 	int			error;
1616 	int			i;
1617 
1618 #ifdef DEBUG
1619 	/* Randomly don't execute the first algorithm. */
1620 	if (get_random_u32_below(2))
1621 		return 0;
1622 #endif
1623 
1624 	/*
1625 	 * Start from the entry that lookup found, sequence through all larger
1626 	 * free blocks.  If we're actually pointing at a record smaller than
1627 	 * maxlen, go to the start of this block, and skip all those smaller
1628 	 * than minlen.
1629 	 */
1630 	if (*len || args->alignment > 1) {
1631 		acur->cnt->bc_levels[0].ptr = 1;
1632 		do {
1633 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1634 			if (error)
1635 				return error;
1636 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1637 				xfs_btree_mark_sick(acur->cnt);
1638 				return -EFSCORRUPTED;
1639 			}
1640 			if (*len >= args->minlen)
1641 				break;
1642 			error = xfs_btree_increment(acur->cnt, 0, &i);
1643 			if (error)
1644 				return error;
1645 		} while (i);
1646 		ASSERT(*len >= args->minlen);
1647 		if (!i)
1648 			return 0;
1649 	}
1650 
1651 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1652 	if (error)
1653 		return error;
1654 
1655 	/*
1656 	 * It didn't work.  We COULD be in a case where there's a good record
1657 	 * somewhere, so try again.
1658 	 */
1659 	if (acur->len == 0)
1660 		return 0;
1661 
1662 	trace_xfs_alloc_near_first(args);
1663 	*allocated = true;
1664 	return 0;
1665 }
1666 
1667 /*
1668  * Allocate a variable extent near bno in the allocation group agno.
1669  * Extent's length (returned in len) will be between minlen and maxlen,
1670  * and of the form k * prod + mod unless there's nothing that large.
1671  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1672  */
1673 STATIC int
1674 xfs_alloc_ag_vextent_near(
1675 	struct xfs_alloc_arg	*args,
1676 	uint32_t		alloc_flags)
1677 {
1678 	struct xfs_alloc_cur	acur = {};
1679 	int			error;		/* error code */
1680 	int			i;		/* result code, temporary */
1681 	xfs_agblock_t		bno;
1682 	xfs_extlen_t		len;
1683 
1684 	/* handle uninitialized agbno range so caller doesn't have to */
1685 	if (!args->min_agbno && !args->max_agbno)
1686 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1687 	ASSERT(args->min_agbno <= args->max_agbno);
1688 
1689 	/* clamp agbno to the range if it's outside */
1690 	if (args->agbno < args->min_agbno)
1691 		args->agbno = args->min_agbno;
1692 	if (args->agbno > args->max_agbno)
1693 		args->agbno = args->max_agbno;
1694 
1695 	/* Retry once quickly if we find busy extents before blocking. */
1696 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1697 restart:
1698 	len = 0;
1699 
1700 	/*
1701 	 * Set up cursors and see if there are any free extents as big as
1702 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1703 	 * empty.
1704 	 */
1705 	error = xfs_alloc_cur_setup(args, &acur);
1706 	if (error == -ENOSPC) {
1707 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1708 				&len, &i);
1709 		if (error)
1710 			goto out;
1711 		if (i == 0 || len == 0) {
1712 			trace_xfs_alloc_near_noentry(args);
1713 			goto out;
1714 		}
1715 		ASSERT(i == 1);
1716 	} else if (error) {
1717 		goto out;
1718 	}
1719 
1720 	/*
1721 	 * First algorithm.
1722 	 * If the requested extent is large wrt the freespaces available
1723 	 * in this a.g., then the cursor will be pointing to a btree entry
1724 	 * near the right edge of the tree.  If it's in the last btree leaf
1725 	 * block, then we just examine all the entries in that block
1726 	 * that are big enough, and pick the best one.
1727 	 */
1728 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1729 		bool		allocated = false;
1730 
1731 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1732 				&allocated);
1733 		if (error)
1734 			goto out;
1735 		if (allocated)
1736 			goto alloc_finish;
1737 	}
1738 
1739 	/*
1740 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1741 	 * locality.
1742 	 */
1743 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1744 	if (error)
1745 		goto out;
1746 
1747 	/*
1748 	 * If we couldn't get anything, give up.
1749 	 */
1750 	if (!acur.len) {
1751 		if (acur.busy) {
1752 			/*
1753 			 * Our only valid extents must have been busy. Flush and
1754 			 * retry the allocation again. If we get an -EAGAIN
1755 			 * error, we're being told that a deadlock was avoided
1756 			 * and the current transaction needs committing before
1757 			 * the allocation can be retried.
1758 			 */
1759 			trace_xfs_alloc_near_busy(args);
1760 			error = xfs_extent_busy_flush(args->tp,
1761 					pag_group(args->pag), acur.busy_gen,
1762 					alloc_flags);
1763 			if (error)
1764 				goto out;
1765 
1766 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1767 			goto restart;
1768 		}
1769 		trace_xfs_alloc_size_neither(args);
1770 		args->agbno = NULLAGBLOCK;
1771 		goto out;
1772 	}
1773 
1774 alloc_finish:
1775 	/* fix up btrees on a successful allocation */
1776 	error = xfs_alloc_cur_finish(args, &acur);
1777 
1778 out:
1779 	xfs_alloc_cur_close(&acur, error);
1780 	return error;
1781 }
1782 
1783 /*
1784  * Allocate a variable extent anywhere in the allocation group agno.
1785  * Extent's length (returned in len) will be between minlen and maxlen,
1786  * and of the form k * prod + mod unless there's nothing that large.
1787  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1788  */
1789 static int
1790 xfs_alloc_ag_vextent_size(
1791 	struct xfs_alloc_arg	*args,
1792 	uint32_t		alloc_flags)
1793 {
1794 	struct xfs_agf		*agf = args->agbp->b_addr;
1795 	struct xfs_btree_cur	*bno_cur;
1796 	struct xfs_btree_cur	*cnt_cur;
1797 	xfs_agblock_t		fbno;		/* start of found freespace */
1798 	xfs_extlen_t		flen;		/* length of found freespace */
1799 	xfs_agblock_t		rbno;		/* returned block number */
1800 	xfs_extlen_t		rlen;		/* length of returned extent */
1801 	bool			busy;
1802 	unsigned		busy_gen;
1803 	int			error;
1804 	int			i;
1805 
1806 	/* Retry once quickly if we find busy extents before blocking. */
1807 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1808 restart:
1809 	/*
1810 	 * Allocate and initialize a cursor for the by-size btree.
1811 	 */
1812 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1813 					args->pag);
1814 	bno_cur = NULL;
1815 
1816 	/*
1817 	 * Look for an entry >= maxlen+alignment-1 blocks.
1818 	 */
1819 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1820 			args->maxlen + args->alignment - 1, &i)))
1821 		goto error0;
1822 
1823 	/*
1824 	 * If none then we have to settle for a smaller extent. In the case that
1825 	 * there are no large extents, this will return the last entry in the
1826 	 * tree unless the tree is empty. In the case that there are only busy
1827 	 * large extents, this will return the largest small extent unless there
1828 	 * are no smaller extents available.
1829 	 */
1830 	if (!i) {
1831 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1832 						   &fbno, &flen, &i);
1833 		if (error)
1834 			goto error0;
1835 		if (i == 0 || flen == 0) {
1836 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1837 			trace_xfs_alloc_size_noentry(args);
1838 			return 0;
1839 		}
1840 		ASSERT(i == 1);
1841 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1842 				&rlen, &busy_gen);
1843 	} else {
1844 		/*
1845 		 * Search for a non-busy extent that is large enough.
1846 		 */
1847 		for (;;) {
1848 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1849 			if (error)
1850 				goto error0;
1851 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1852 				xfs_btree_mark_sick(cnt_cur);
1853 				error = -EFSCORRUPTED;
1854 				goto error0;
1855 			}
1856 
1857 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1858 					&rbno, &rlen, &busy_gen);
1859 
1860 			if (rlen >= args->maxlen)
1861 				break;
1862 
1863 			error = xfs_btree_increment(cnt_cur, 0, &i);
1864 			if (error)
1865 				goto error0;
1866 			if (i)
1867 				continue;
1868 
1869 			/*
1870 			 * Our only valid extents must have been busy. Flush and
1871 			 * retry the allocation again. If we get an -EAGAIN
1872 			 * error, we're being told that a deadlock was avoided
1873 			 * and the current transaction needs committing before
1874 			 * the allocation can be retried.
1875 			 */
1876 			trace_xfs_alloc_size_busy(args);
1877 			error = xfs_extent_busy_flush(args->tp,
1878 					pag_group(args->pag), busy_gen,
1879 					alloc_flags);
1880 			if (error)
1881 				goto error0;
1882 
1883 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1884 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1885 			goto restart;
1886 		}
1887 	}
1888 
1889 	/*
1890 	 * In the first case above, we got the last entry in the
1891 	 * by-size btree.  Now we check to see if the space hits maxlen
1892 	 * once aligned; if not, we search left for something better.
1893 	 * This can't happen in the second case above.
1894 	 */
1895 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1896 	if (XFS_IS_CORRUPT(args->mp,
1897 			   rlen != 0 &&
1898 			   (rlen > flen ||
1899 			    rbno + rlen > fbno + flen))) {
1900 		xfs_btree_mark_sick(cnt_cur);
1901 		error = -EFSCORRUPTED;
1902 		goto error0;
1903 	}
1904 	if (rlen < args->maxlen) {
1905 		xfs_agblock_t	bestfbno;
1906 		xfs_extlen_t	bestflen;
1907 		xfs_agblock_t	bestrbno;
1908 		xfs_extlen_t	bestrlen;
1909 
1910 		bestrlen = rlen;
1911 		bestrbno = rbno;
1912 		bestflen = flen;
1913 		bestfbno = fbno;
1914 		for (;;) {
1915 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1916 				goto error0;
1917 			if (i == 0)
1918 				break;
1919 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1920 					&i)))
1921 				goto error0;
1922 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1923 				xfs_btree_mark_sick(cnt_cur);
1924 				error = -EFSCORRUPTED;
1925 				goto error0;
1926 			}
1927 			if (flen <= bestrlen)
1928 				break;
1929 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1930 					&rbno, &rlen, &busy_gen);
1931 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1932 			if (XFS_IS_CORRUPT(args->mp,
1933 					   rlen != 0 &&
1934 					   (rlen > flen ||
1935 					    rbno + rlen > fbno + flen))) {
1936 				xfs_btree_mark_sick(cnt_cur);
1937 				error = -EFSCORRUPTED;
1938 				goto error0;
1939 			}
1940 			if (rlen > bestrlen) {
1941 				bestrlen = rlen;
1942 				bestrbno = rbno;
1943 				bestflen = flen;
1944 				bestfbno = fbno;
1945 				if (rlen == args->maxlen)
1946 					break;
1947 			}
1948 		}
1949 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1950 				&i)))
1951 			goto error0;
1952 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1953 			xfs_btree_mark_sick(cnt_cur);
1954 			error = -EFSCORRUPTED;
1955 			goto error0;
1956 		}
1957 		rlen = bestrlen;
1958 		rbno = bestrbno;
1959 		flen = bestflen;
1960 		fbno = bestfbno;
1961 	}
1962 	args->wasfromfl = 0;
1963 	/*
1964 	 * Fix up the length.
1965 	 */
1966 	args->len = rlen;
1967 	if (rlen < args->minlen) {
1968 		if (busy) {
1969 			/*
1970 			 * Our only valid extents must have been busy. Flush and
1971 			 * retry the allocation again. If we get an -EAGAIN
1972 			 * error, we're being told that a deadlock was avoided
1973 			 * and the current transaction needs committing before
1974 			 * the allocation can be retried.
1975 			 */
1976 			trace_xfs_alloc_size_busy(args);
1977 			error = xfs_extent_busy_flush(args->tp,
1978 					pag_group(args->pag), busy_gen,
1979 					alloc_flags);
1980 			if (error)
1981 				goto error0;
1982 
1983 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1984 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1985 			goto restart;
1986 		}
1987 		goto out_nominleft;
1988 	}
1989 	xfs_alloc_fix_len(args);
1990 
1991 	rlen = args->len;
1992 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1993 		xfs_btree_mark_sick(cnt_cur);
1994 		error = -EFSCORRUPTED;
1995 		goto error0;
1996 	}
1997 	/*
1998 	 * Allocate and initialize a cursor for the by-block tree.
1999 	 */
2000 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
2001 					args->pag);
2002 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
2003 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
2004 		goto error0;
2005 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2006 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2007 	cnt_cur = bno_cur = NULL;
2008 	args->len = rlen;
2009 	args->agbno = rbno;
2010 	if (XFS_IS_CORRUPT(args->mp,
2011 			   args->agbno + args->len >
2012 			   be32_to_cpu(agf->agf_length))) {
2013 		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
2014 		error = -EFSCORRUPTED;
2015 		goto error0;
2016 	}
2017 	trace_xfs_alloc_size_done(args);
2018 	return 0;
2019 
2020 error0:
2021 	trace_xfs_alloc_size_error(args);
2022 	if (cnt_cur)
2023 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2024 	if (bno_cur)
2025 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2026 	return error;
2027 
2028 out_nominleft:
2029 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2030 	trace_xfs_alloc_size_nominleft(args);
2031 	args->agbno = NULLAGBLOCK;
2032 	return 0;
2033 }
2034 
2035 /*
2036  * Free the extent starting at agno/bno for length.
2037  */
2038 int
2039 xfs_free_ag_extent(
2040 	struct xfs_trans		*tp,
2041 	struct xfs_buf			*agbp,
2042 	xfs_agblock_t			bno,
2043 	xfs_extlen_t			len,
2044 	const struct xfs_owner_info	*oinfo,
2045 	enum xfs_ag_resv_type		type)
2046 {
2047 	struct xfs_mount		*mp;
2048 	struct xfs_btree_cur		*bno_cur;
2049 	struct xfs_btree_cur		*cnt_cur;
2050 	xfs_agblock_t			gtbno; /* start of right neighbor */
2051 	xfs_extlen_t			gtlen; /* length of right neighbor */
2052 	xfs_agblock_t			ltbno; /* start of left neighbor */
2053 	xfs_extlen_t			ltlen; /* length of left neighbor */
2054 	xfs_agblock_t			nbno; /* new starting block of freesp */
2055 	xfs_extlen_t			nlen; /* new length of freespace */
2056 	int				haveleft; /* have a left neighbor */
2057 	int				haveright; /* have a right neighbor */
2058 	int				i;
2059 	int				error;
2060 	struct xfs_perag		*pag = agbp->b_pag;
2061 	bool				fixup_longest = false;
2062 
2063 	bno_cur = cnt_cur = NULL;
2064 	mp = tp->t_mountp;
2065 
2066 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
2067 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
2068 		if (error)
2069 			goto error0;
2070 	}
2071 
2072 	/*
2073 	 * Allocate and initialize a cursor for the by-block btree.
2074 	 */
2075 	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
2076 	/*
2077 	 * Look for a neighboring block on the left (lower block numbers)
2078 	 * that is contiguous with this space.
2079 	 */
2080 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
2081 		goto error0;
2082 	if (haveleft) {
2083 		/*
2084 		 * There is a block to our left.
2085 		 */
2086 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
2087 			goto error0;
2088 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2089 			xfs_btree_mark_sick(bno_cur);
2090 			error = -EFSCORRUPTED;
2091 			goto error0;
2092 		}
2093 		/*
2094 		 * It's not contiguous, though.
2095 		 */
2096 		if (ltbno + ltlen < bno)
2097 			haveleft = 0;
2098 		else {
2099 			/*
2100 			 * If this failure happens the request to free this
2101 			 * space was invalid, it's (partly) already free.
2102 			 * Very bad.
2103 			 */
2104 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
2105 				xfs_btree_mark_sick(bno_cur);
2106 				error = -EFSCORRUPTED;
2107 				goto error0;
2108 			}
2109 		}
2110 	}
2111 	/*
2112 	 * Look for a neighboring block on the right (higher block numbers)
2113 	 * that is contiguous with this space.
2114 	 */
2115 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2116 		goto error0;
2117 	if (haveright) {
2118 		/*
2119 		 * There is a block to our right.
2120 		 */
2121 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2122 			goto error0;
2123 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2124 			xfs_btree_mark_sick(bno_cur);
2125 			error = -EFSCORRUPTED;
2126 			goto error0;
2127 		}
2128 		/*
2129 		 * It's not contiguous, though.
2130 		 */
2131 		if (bno + len < gtbno)
2132 			haveright = 0;
2133 		else {
2134 			/*
2135 			 * If this failure happens the request to free this
2136 			 * space was invalid, it's (partly) already free.
2137 			 * Very bad.
2138 			 */
2139 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
2140 				xfs_btree_mark_sick(bno_cur);
2141 				error = -EFSCORRUPTED;
2142 				goto error0;
2143 			}
2144 		}
2145 	}
2146 	/*
2147 	 * Now allocate and initialize a cursor for the by-size tree.
2148 	 */
2149 	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
2150 	/*
2151 	 * Have both left and right contiguous neighbors.
2152 	 * Merge all three into a single free block.
2153 	 */
2154 	if (haveleft && haveright) {
2155 		/*
2156 		 * Delete the old by-size entry on the left.
2157 		 */
2158 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2159 			goto error0;
2160 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2161 			xfs_btree_mark_sick(cnt_cur);
2162 			error = -EFSCORRUPTED;
2163 			goto error0;
2164 		}
2165 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2166 			goto error0;
2167 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2168 			xfs_btree_mark_sick(cnt_cur);
2169 			error = -EFSCORRUPTED;
2170 			goto error0;
2171 		}
2172 		/*
2173 		 * Delete the old by-size entry on the right.
2174 		 */
2175 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2176 			goto error0;
2177 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2178 			xfs_btree_mark_sick(cnt_cur);
2179 			error = -EFSCORRUPTED;
2180 			goto error0;
2181 		}
2182 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2183 			goto error0;
2184 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2185 			xfs_btree_mark_sick(cnt_cur);
2186 			error = -EFSCORRUPTED;
2187 			goto error0;
2188 		}
2189 		/*
2190 		 * Delete the old by-block entry for the right block.
2191 		 */
2192 		if ((error = xfs_btree_delete(bno_cur, &i)))
2193 			goto error0;
2194 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2195 			xfs_btree_mark_sick(bno_cur);
2196 			error = -EFSCORRUPTED;
2197 			goto error0;
2198 		}
2199 		/*
2200 		 * Move the by-block cursor back to the left neighbor.
2201 		 */
2202 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2203 			goto error0;
2204 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2205 			xfs_btree_mark_sick(bno_cur);
2206 			error = -EFSCORRUPTED;
2207 			goto error0;
2208 		}
2209 #ifdef DEBUG
2210 		/*
2211 		 * Check that this is the right record: delete didn't
2212 		 * mangle the cursor.
2213 		 */
2214 		{
2215 			xfs_agblock_t	xxbno;
2216 			xfs_extlen_t	xxlen;
2217 
2218 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2219 					&i)))
2220 				goto error0;
2221 			if (XFS_IS_CORRUPT(mp,
2222 					   i != 1 ||
2223 					   xxbno != ltbno ||
2224 					   xxlen != ltlen)) {
2225 				xfs_btree_mark_sick(bno_cur);
2226 				error = -EFSCORRUPTED;
2227 				goto error0;
2228 			}
2229 		}
2230 #endif
2231 		/*
2232 		 * Update remaining by-block entry to the new, joined block.
2233 		 */
2234 		nbno = ltbno;
2235 		nlen = len + ltlen + gtlen;
2236 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2237 			goto error0;
2238 	}
2239 	/*
2240 	 * Have only a left contiguous neighbor.
2241 	 * Merge it together with the new freespace.
2242 	 */
2243 	else if (haveleft) {
2244 		/*
2245 		 * Delete the old by-size entry on the left.
2246 		 */
2247 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2248 			goto error0;
2249 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2250 			xfs_btree_mark_sick(cnt_cur);
2251 			error = -EFSCORRUPTED;
2252 			goto error0;
2253 		}
2254 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2255 			goto error0;
2256 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2257 			xfs_btree_mark_sick(cnt_cur);
2258 			error = -EFSCORRUPTED;
2259 			goto error0;
2260 		}
2261 		/*
2262 		 * Back up the by-block cursor to the left neighbor, and
2263 		 * update its length.
2264 		 */
2265 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2266 			goto error0;
2267 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2268 			xfs_btree_mark_sick(bno_cur);
2269 			error = -EFSCORRUPTED;
2270 			goto error0;
2271 		}
2272 		nbno = ltbno;
2273 		nlen = len + ltlen;
2274 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2275 			goto error0;
2276 	}
2277 	/*
2278 	 * Have only a right contiguous neighbor.
2279 	 * Merge it together with the new freespace.
2280 	 */
2281 	else if (haveright) {
2282 		/*
2283 		 * Delete the old by-size entry on the right.
2284 		 */
2285 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2286 			goto error0;
2287 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2288 			xfs_btree_mark_sick(cnt_cur);
2289 			error = -EFSCORRUPTED;
2290 			goto error0;
2291 		}
2292 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2293 			goto error0;
2294 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2295 			xfs_btree_mark_sick(cnt_cur);
2296 			error = -EFSCORRUPTED;
2297 			goto error0;
2298 		}
2299 		/*
2300 		 * Update the starting block and length of the right
2301 		 * neighbor in the by-block tree.
2302 		 */
2303 		nbno = bno;
2304 		nlen = len + gtlen;
2305 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2306 			goto error0;
2307 	}
2308 	/*
2309 	 * No contiguous neighbors.
2310 	 * Insert the new freespace into the by-block tree.
2311 	 */
2312 	else {
2313 		nbno = bno;
2314 		nlen = len;
2315 		if ((error = xfs_btree_insert(bno_cur, &i)))
2316 			goto error0;
2317 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2318 			xfs_btree_mark_sick(bno_cur);
2319 			error = -EFSCORRUPTED;
2320 			goto error0;
2321 		}
2322 	}
2323 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2324 	bno_cur = NULL;
2325 
2326 	/*
2327 	 * In all cases we need to insert the new freespace in the by-size tree.
2328 	 *
2329 	 * If this new freespace is being inserted in the block that contains
2330 	 * the largest free space in the btree, make sure we also fix up the
2331 	 * agf->agf-longest tracker field.
2332 	 */
2333 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2334 		goto error0;
2335 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2336 		xfs_btree_mark_sick(cnt_cur);
2337 		error = -EFSCORRUPTED;
2338 		goto error0;
2339 	}
2340 	if (xfs_alloc_cursor_at_lastrec(cnt_cur))
2341 		fixup_longest = true;
2342 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2343 		goto error0;
2344 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2345 		xfs_btree_mark_sick(cnt_cur);
2346 		error = -EFSCORRUPTED;
2347 		goto error0;
2348 	}
2349 	if (fixup_longest) {
2350 		error = xfs_alloc_fixup_longest(cnt_cur);
2351 		if (error)
2352 			goto error0;
2353 	}
2354 
2355 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2356 	cnt_cur = NULL;
2357 
2358 	/*
2359 	 * Update the freespace totals in the ag and superblock.
2360 	 */
2361 	error = xfs_alloc_update_counters(tp, agbp, len);
2362 	xfs_ag_resv_free_extent(pag, type, tp, len);
2363 	if (error)
2364 		goto error0;
2365 
2366 	XFS_STATS_INC(mp, xs_freex);
2367 	XFS_STATS_ADD(mp, xs_freeb, len);
2368 
2369 	trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright);
2370 
2371 	return 0;
2372 
2373  error0:
2374 	trace_xfs_free_extent(pag, bno, len, type, -1, -1);
2375 	if (bno_cur)
2376 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2377 	if (cnt_cur)
2378 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2379 	return error;
2380 }
2381 
2382 /*
2383  * Visible (exported) allocation/free functions.
2384  * Some of these are used just by xfs_alloc_btree.c and this file.
2385  */
2386 
2387 /*
2388  * Compute and fill in value of m_alloc_maxlevels.
2389  */
2390 void
2391 xfs_alloc_compute_maxlevels(
2392 	xfs_mount_t	*mp)	/* file system mount structure */
2393 {
2394 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2395 			(mp->m_sb.sb_agblocks + 1) / 2);
2396 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2397 }
2398 
2399 /*
2400  * Find the length of the longest extent in an AG.  The 'need' parameter
2401  * specifies how much space we're going to need for the AGFL and the
2402  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2403  * other callers.
2404  */
2405 xfs_extlen_t
2406 xfs_alloc_longest_free_extent(
2407 	struct xfs_perag	*pag,
2408 	xfs_extlen_t		need,
2409 	xfs_extlen_t		reserved)
2410 {
2411 	xfs_extlen_t		delta = 0;
2412 
2413 	/*
2414 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2415 	 * longest extent.
2416 	 */
2417 	if (need > pag->pagf_flcount)
2418 		delta = need - pag->pagf_flcount;
2419 
2420 	/*
2421 	 * If we cannot maintain others' reservations with space from the
2422 	 * not-longest freesp extents, we'll have to subtract /that/ from
2423 	 * the longest extent too.
2424 	 */
2425 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2426 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2427 
2428 	/*
2429 	 * If the longest extent is long enough to satisfy all the
2430 	 * reservations and AGFL rules in place, we can return this extent.
2431 	 */
2432 	if (pag->pagf_longest > delta)
2433 		return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable,
2434 				pag->pagf_longest - delta);
2435 
2436 	/* Otherwise, let the caller try for 1 block if there's space. */
2437 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2438 }
2439 
2440 /*
2441  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2442  * return the largest possible minimum length.
2443  */
2444 unsigned int
2445 xfs_alloc_min_freelist(
2446 	struct xfs_mount	*mp,
2447 	struct xfs_perag	*pag)
2448 {
2449 	/* AG btrees have at least 1 level. */
2450 	const unsigned int	bno_level = pag ? pag->pagf_bno_level : 1;
2451 	const unsigned int	cnt_level = pag ? pag->pagf_cnt_level : 1;
2452 	const unsigned int	rmap_level = pag ? pag->pagf_rmap_level : 1;
2453 	unsigned int		min_free;
2454 
2455 	ASSERT(mp->m_alloc_maxlevels > 0);
2456 
2457 	/*
2458 	 * For a btree shorter than the maximum height, the worst case is that
2459 	 * every level gets split and a new level is added, then while inserting
2460 	 * another entry to refill the AGFL, every level under the old root gets
2461 	 * split again. This is:
2462 	 *
2463 	 *   (full height split reservation) + (AGFL refill split height)
2464 	 * = (current height + 1) + (current height - 1)
2465 	 * = (new height) + (new height - 2)
2466 	 * = 2 * new height - 2
2467 	 *
2468 	 * For a btree of maximum height, the worst case is that every level
2469 	 * under the root gets split, then while inserting another entry to
2470 	 * refill the AGFL, every level under the root gets split again. This is
2471 	 * also:
2472 	 *
2473 	 *   2 * (current height - 1)
2474 	 * = 2 * (new height - 1)
2475 	 * = 2 * new height - 2
2476 	 */
2477 
2478 	/* space needed by-bno freespace btree */
2479 	min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2480 	/* space needed by-size freespace btree */
2481 	min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2482 	/* space needed reverse mapping used space btree */
2483 	if (xfs_has_rmapbt(mp))
2484 		min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
2485 	return min_free;
2486 }
2487 
2488 /*
2489  * Check if the operation we are fixing up the freelist for should go ahead or
2490  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2491  * is dependent on whether the size and shape of free space available will
2492  * permit the requested allocation to take place.
2493  */
2494 static bool
2495 xfs_alloc_space_available(
2496 	struct xfs_alloc_arg	*args,
2497 	xfs_extlen_t		min_free,
2498 	int			flags)
2499 {
2500 	struct xfs_perag	*pag = args->pag;
2501 	xfs_extlen_t		alloc_len, longest;
2502 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2503 	int			available;
2504 	xfs_extlen_t		agflcount;
2505 
2506 	if (flags & XFS_ALLOC_FLAG_FREEING)
2507 		return true;
2508 
2509 	reservation = xfs_ag_resv_needed(pag, args->resv);
2510 
2511 	/* do we have enough contiguous free space for the allocation? */
2512 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2513 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2514 	if (longest < alloc_len)
2515 		return false;
2516 
2517 	/*
2518 	 * Do we have enough free space remaining for the allocation? Don't
2519 	 * account extra agfl blocks because we are about to defer free them,
2520 	 * making them unavailable until the current transaction commits.
2521 	 */
2522 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2523 	available = (int)(pag->pagf_freeblks + agflcount -
2524 			  reservation - min_free - args->minleft);
2525 	if (available < (int)max(args->total, alloc_len))
2526 		return false;
2527 
2528 	/*
2529 	 * Clamp maxlen to the amount of free space available for the actual
2530 	 * extent allocation.
2531 	 */
2532 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2533 		args->maxlen = available;
2534 		ASSERT(args->maxlen > 0);
2535 		ASSERT(args->maxlen >= args->minlen);
2536 	}
2537 
2538 	return true;
2539 }
2540 
2541 /*
2542  * Check the agfl fields of the agf for inconsistency or corruption.
2543  *
2544  * The original purpose was to detect an agfl header padding mismatch between
2545  * current and early v5 kernels. This problem manifests as a 1-slot size
2546  * difference between the on-disk flcount and the active [first, last] range of
2547  * a wrapped agfl.
2548  *
2549  * However, we need to use these same checks to catch agfl count corruptions
2550  * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2551  * way, we need to reset the agfl and warn the user.
2552  *
2553  * Return true if a reset is required before the agfl can be used, false
2554  * otherwise.
2555  */
2556 static bool
2557 xfs_agfl_needs_reset(
2558 	struct xfs_mount	*mp,
2559 	struct xfs_agf		*agf)
2560 {
2561 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2562 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2563 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2564 	int			agfl_size = xfs_agfl_size(mp);
2565 	int			active;
2566 
2567 	/*
2568 	 * The agf read verifier catches severe corruption of these fields.
2569 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2570 	 * the verifier allows it.
2571 	 */
2572 	if (f >= agfl_size || l >= agfl_size)
2573 		return true;
2574 	if (c > agfl_size)
2575 		return true;
2576 
2577 	/*
2578 	 * Check consistency between the on-disk count and the active range. An
2579 	 * agfl padding mismatch manifests as an inconsistent flcount.
2580 	 */
2581 	if (c && l >= f)
2582 		active = l - f + 1;
2583 	else if (c)
2584 		active = agfl_size - f + l + 1;
2585 	else
2586 		active = 0;
2587 
2588 	return active != c;
2589 }
2590 
2591 /*
2592  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2593  * agfl content cannot be trusted. Warn the user that a repair is required to
2594  * recover leaked blocks.
2595  *
2596  * The purpose of this mechanism is to handle filesystems affected by the agfl
2597  * header padding mismatch problem. A reset keeps the filesystem online with a
2598  * relatively minor free space accounting inconsistency rather than suffer the
2599  * inevitable crash from use of an invalid agfl block.
2600  */
2601 static void
2602 xfs_agfl_reset(
2603 	struct xfs_trans	*tp,
2604 	struct xfs_buf		*agbp,
2605 	struct xfs_perag	*pag)
2606 {
2607 	struct xfs_mount	*mp = tp->t_mountp;
2608 	struct xfs_agf		*agf = agbp->b_addr;
2609 
2610 	ASSERT(xfs_perag_agfl_needs_reset(pag));
2611 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2612 
2613 	xfs_warn(mp,
2614 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2615 	       "Please unmount and run xfs_repair.",
2616 		pag_agno(pag), pag->pagf_flcount);
2617 
2618 	agf->agf_flfirst = 0;
2619 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2620 	agf->agf_flcount = 0;
2621 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2622 				    XFS_AGF_FLCOUNT);
2623 
2624 	pag->pagf_flcount = 0;
2625 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2626 }
2627 
2628 /*
2629  * Add the extent to the list of extents to be free at transaction end.
2630  * The list is maintained sorted (by block number).
2631  */
2632 static int
2633 xfs_defer_extent_free(
2634 	struct xfs_trans		*tp,
2635 	xfs_fsblock_t			bno,
2636 	xfs_filblks_t			len,
2637 	const struct xfs_owner_info	*oinfo,
2638 	enum xfs_ag_resv_type		type,
2639 	unsigned int			free_flags,
2640 	struct xfs_defer_pending	**dfpp)
2641 {
2642 	struct xfs_extent_free_item	*xefi;
2643 	struct xfs_mount		*mp = tp->t_mountp;
2644 
2645 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2646 	ASSERT(!isnullstartblock(bno));
2647 	ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
2648 
2649 	if (free_flags & XFS_FREE_EXTENT_REALTIME) {
2650 		if (type != XFS_AG_RESV_NONE) {
2651 			ASSERT(type == XFS_AG_RESV_NONE);
2652 			return -EFSCORRUPTED;
2653 		}
2654 		if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
2655 			return -EFSCORRUPTED;
2656 	} else {
2657 		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2658 			return -EFSCORRUPTED;
2659 	}
2660 
2661 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2662 			       GFP_KERNEL | __GFP_NOFAIL);
2663 	xefi->xefi_startblock = bno;
2664 	xefi->xefi_blockcount = (xfs_extlen_t)len;
2665 	xefi->xefi_agresv = type;
2666 	if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
2667 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2668 	if (free_flags & XFS_FREE_EXTENT_REALTIME)
2669 		xefi->xefi_flags |= XFS_EFI_REALTIME;
2670 	if (oinfo) {
2671 		ASSERT(oinfo->oi_offset == 0);
2672 
2673 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2674 			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2675 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2676 			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2677 		xefi->xefi_owner = oinfo->oi_owner;
2678 	} else {
2679 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2680 	}
2681 
2682 	xfs_extent_free_defer_add(tp, xefi, dfpp);
2683 	return 0;
2684 }
2685 
2686 int
2687 xfs_free_extent_later(
2688 	struct xfs_trans		*tp,
2689 	xfs_fsblock_t			bno,
2690 	xfs_filblks_t			len,
2691 	const struct xfs_owner_info	*oinfo,
2692 	enum xfs_ag_resv_type		type,
2693 	unsigned int			free_flags)
2694 {
2695 	struct xfs_defer_pending	*dontcare = NULL;
2696 
2697 	return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
2698 			&dontcare);
2699 }
2700 
2701 /*
2702  * Set up automatic freeing of unwritten space in the filesystem.
2703  *
2704  * This function attached a paused deferred extent free item to the
2705  * transaction.  Pausing means that the EFI will be logged in the next
2706  * transaction commit, but the pending EFI will not be finished until the
2707  * pending item is unpaused.
2708  *
2709  * If the system goes down after the EFI has been persisted to the log but
2710  * before the pending item is unpaused, log recovery will find the EFI, fail to
2711  * find the EFD, and free the space.
2712  *
2713  * If the pending item is unpaused, the next transaction commit will log an EFD
2714  * without freeing the space.
2715  *
2716  * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
2717  * @args structure are set to the relevant values.
2718  */
2719 int
2720 xfs_alloc_schedule_autoreap(
2721 	const struct xfs_alloc_arg	*args,
2722 	unsigned int			free_flags,
2723 	struct xfs_alloc_autoreap	*aarp)
2724 {
2725 	int				error;
2726 
2727 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
2728 			&args->oinfo, args->resv, free_flags, &aarp->dfp);
2729 	if (error)
2730 		return error;
2731 
2732 	xfs_defer_item_pause(args->tp, aarp->dfp);
2733 	return 0;
2734 }
2735 
2736 /*
2737  * Cancel automatic freeing of unwritten space in the filesystem.
2738  *
2739  * Earlier, we created a paused deferred extent free item and attached it to
2740  * this transaction so that we could automatically roll back a new space
2741  * allocation if the system went down.  Now we want to cancel the paused work
2742  * item by marking the EFI stale so we don't actually free the space, unpausing
2743  * the pending item and logging an EFD.
2744  *
2745  * The caller generally should have already mapped the space into the ondisk
2746  * filesystem.  If the reserved space was partially used, the caller must call
2747  * xfs_free_extent_later to create a new EFI to free the unused space.
2748  */
2749 void
2750 xfs_alloc_cancel_autoreap(
2751 	struct xfs_trans		*tp,
2752 	struct xfs_alloc_autoreap	*aarp)
2753 {
2754 	struct xfs_defer_pending	*dfp = aarp->dfp;
2755 	struct xfs_extent_free_item	*xefi;
2756 
2757 	if (!dfp)
2758 		return;
2759 
2760 	list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
2761 		xefi->xefi_flags |= XFS_EFI_CANCELLED;
2762 
2763 	xfs_defer_item_unpause(tp, dfp);
2764 }
2765 
2766 /*
2767  * Commit automatic freeing of unwritten space in the filesystem.
2768  *
2769  * This unpauses an earlier _schedule_autoreap and commits to freeing the
2770  * allocated space.  Call this if none of the reserved space was used.
2771  */
2772 void
2773 xfs_alloc_commit_autoreap(
2774 	struct xfs_trans		*tp,
2775 	struct xfs_alloc_autoreap	*aarp)
2776 {
2777 	if (aarp->dfp)
2778 		xfs_defer_item_unpause(tp, aarp->dfp);
2779 }
2780 
2781 /*
2782  * Check if an AGF has a free extent record whose length is equal to
2783  * args->minlen.
2784  */
2785 STATIC int
2786 xfs_exact_minlen_extent_available(
2787 	struct xfs_alloc_arg	*args,
2788 	struct xfs_buf		*agbp,
2789 	int			*stat)
2790 {
2791 	struct xfs_btree_cur	*cnt_cur;
2792 	xfs_agblock_t		fbno;
2793 	xfs_extlen_t		flen;
2794 	int			error = 0;
2795 
2796 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
2797 					args->pag);
2798 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2799 	if (error)
2800 		goto out;
2801 
2802 	if (*stat == 0) {
2803 		xfs_btree_mark_sick(cnt_cur);
2804 		error = -EFSCORRUPTED;
2805 		goto out;
2806 	}
2807 
2808 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2809 	if (error)
2810 		goto out;
2811 
2812 	if (*stat == 1 && flen != args->minlen)
2813 		*stat = 0;
2814 
2815 out:
2816 	xfs_btree_del_cursor(cnt_cur, error);
2817 
2818 	return error;
2819 }
2820 
2821 /*
2822  * Decide whether to use this allocation group for this allocation.
2823  * If so, fix up the btree freelist's size.
2824  */
2825 int			/* error */
2826 xfs_alloc_fix_freelist(
2827 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2828 	uint32_t		alloc_flags)
2829 {
2830 	struct xfs_mount	*mp = args->mp;
2831 	struct xfs_perag	*pag = args->pag;
2832 	struct xfs_trans	*tp = args->tp;
2833 	struct xfs_buf		*agbp = NULL;
2834 	struct xfs_buf		*agflbp = NULL;
2835 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2836 	xfs_agblock_t		bno;	/* freelist block */
2837 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2838 	int			error = 0;
2839 
2840 	/* deferred ops (AGFL block frees) require permanent transactions */
2841 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2842 
2843 	if (!xfs_perag_initialised_agf(pag)) {
2844 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2845 		if (error) {
2846 			/* Couldn't lock the AGF so skip this AG. */
2847 			if (error == -EAGAIN)
2848 				error = 0;
2849 			goto out_no_agbp;
2850 		}
2851 	}
2852 
2853 	/*
2854 	 * If this is a metadata preferred pag and we are user data then try
2855 	 * somewhere else if we are not being asked to try harder at this
2856 	 * point
2857 	 */
2858 	if (xfs_perag_prefers_metadata(pag) &&
2859 	    (args->datatype & XFS_ALLOC_USERDATA) &&
2860 	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2861 		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
2862 		goto out_agbp_relse;
2863 	}
2864 
2865 	need = xfs_alloc_min_freelist(mp, pag);
2866 	if (!xfs_alloc_space_available(args, need, alloc_flags |
2867 			XFS_ALLOC_FLAG_CHECK))
2868 		goto out_agbp_relse;
2869 
2870 	/*
2871 	 * Get the a.g. freespace buffer.
2872 	 * Can fail if we're not blocking on locks, and it's held.
2873 	 */
2874 	if (!agbp) {
2875 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2876 		if (error) {
2877 			/* Couldn't lock the AGF so skip this AG. */
2878 			if (error == -EAGAIN)
2879 				error = 0;
2880 			goto out_no_agbp;
2881 		}
2882 	}
2883 
2884 	/* reset a padding mismatched agfl before final free space check */
2885 	if (xfs_perag_agfl_needs_reset(pag))
2886 		xfs_agfl_reset(tp, agbp, pag);
2887 
2888 	/* If there isn't enough total space or single-extent, reject it. */
2889 	need = xfs_alloc_min_freelist(mp, pag);
2890 	if (!xfs_alloc_space_available(args, need, alloc_flags))
2891 		goto out_agbp_relse;
2892 
2893 	if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) {
2894 		int stat;
2895 
2896 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2897 		if (error || !stat)
2898 			goto out_agbp_relse;
2899 	}
2900 
2901 	/*
2902 	 * Make the freelist shorter if it's too long.
2903 	 *
2904 	 * Note that from this point onwards, we will always release the agf and
2905 	 * agfl buffers on error. This handles the case where we error out and
2906 	 * the buffers are clean or may not have been joined to the transaction
2907 	 * and hence need to be released manually. If they have been joined to
2908 	 * the transaction, then xfs_trans_brelse() will handle them
2909 	 * appropriately based on the recursion count and dirty state of the
2910 	 * buffer.
2911 	 *
2912 	 * XXX (dgc): When we have lots of free space, does this buy us
2913 	 * anything other than extra overhead when we need to put more blocks
2914 	 * back on the free list? Maybe we should only do this when space is
2915 	 * getting low or the AGFL is more than half full?
2916 	 *
2917 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2918 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2919 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2920 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2921 	 * both required to ensure that rmaps are correctly recorded for the
2922 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2923 	 * repair/rmap.c in xfsprogs for details.
2924 	 */
2925 	memset(&targs, 0, sizeof(targs));
2926 	/* struct copy below */
2927 	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
2928 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2929 	else
2930 		targs.oinfo = XFS_RMAP_OINFO_AG;
2931 	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2932 			pag->pagf_flcount > need) {
2933 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2934 		if (error)
2935 			goto out_agbp_relse;
2936 
2937 		/*
2938 		 * Defer the AGFL block free.
2939 		 *
2940 		 * This helps to prevent log reservation overruns due to too
2941 		 * many allocation operations in a transaction. AGFL frees are
2942 		 * prone to this problem because for one they are always freed
2943 		 * one at a time.  Further, an immediate AGFL block free can
2944 		 * cause a btree join and require another block free before the
2945 		 * real allocation can proceed.
2946 		 * Deferring the free disconnects freeing up the AGFL slot from
2947 		 * freeing the block.
2948 		 */
2949 		error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno),
2950 				1, &targs.oinfo, XFS_AG_RESV_AGFL, 0);
2951 		if (error)
2952 			goto out_agbp_relse;
2953 	}
2954 
2955 	targs.tp = tp;
2956 	targs.mp = mp;
2957 	targs.agbp = agbp;
2958 	targs.agno = args->agno;
2959 	targs.alignment = targs.minlen = targs.prod = 1;
2960 	targs.pag = pag;
2961 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2962 	if (error)
2963 		goto out_agbp_relse;
2964 
2965 	/* Make the freelist longer if it's too short. */
2966 	while (pag->pagf_flcount < need) {
2967 		targs.agbno = 0;
2968 		targs.maxlen = need - pag->pagf_flcount;
2969 		targs.resv = XFS_AG_RESV_AGFL;
2970 
2971 		/* Allocate as many blocks as possible at once. */
2972 		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
2973 		if (error)
2974 			goto out_agflbp_relse;
2975 
2976 		/*
2977 		 * Stop if we run out.  Won't happen if callers are obeying
2978 		 * the restrictions correctly.  Can happen for free calls
2979 		 * on a completely full ag.
2980 		 */
2981 		if (targs.agbno == NULLAGBLOCK) {
2982 			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
2983 				break;
2984 			goto out_agflbp_relse;
2985 		}
2986 
2987 		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2988 			error = xfs_rmap_alloc(tp, agbp, pag,
2989 				       targs.agbno, targs.len, &targs.oinfo);
2990 			if (error)
2991 				goto out_agflbp_relse;
2992 		}
2993 		error = xfs_alloc_update_counters(tp, agbp,
2994 						  -((long)(targs.len)));
2995 		if (error)
2996 			goto out_agflbp_relse;
2997 
2998 		/*
2999 		 * Put each allocated block on the list.
3000 		 */
3001 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
3002 			error = xfs_alloc_put_freelist(pag, tp, agbp,
3003 							agflbp, bno, 0);
3004 			if (error)
3005 				goto out_agflbp_relse;
3006 		}
3007 	}
3008 	xfs_trans_brelse(tp, agflbp);
3009 	args->agbp = agbp;
3010 	return 0;
3011 
3012 out_agflbp_relse:
3013 	xfs_trans_brelse(tp, agflbp);
3014 out_agbp_relse:
3015 	if (agbp)
3016 		xfs_trans_brelse(tp, agbp);
3017 out_no_agbp:
3018 	args->agbp = NULL;
3019 	return error;
3020 }
3021 
3022 /*
3023  * Get a block from the freelist.
3024  * Returns with the buffer for the block gotten.
3025  */
3026 int
3027 xfs_alloc_get_freelist(
3028 	struct xfs_perag	*pag,
3029 	struct xfs_trans	*tp,
3030 	struct xfs_buf		*agbp,
3031 	xfs_agblock_t		*bnop,
3032 	int			btreeblk)
3033 {
3034 	struct xfs_agf		*agf = agbp->b_addr;
3035 	struct xfs_buf		*agflbp;
3036 	xfs_agblock_t		bno;
3037 	__be32			*agfl_bno;
3038 	int			error;
3039 	uint32_t		logflags;
3040 	struct xfs_mount	*mp = tp->t_mountp;
3041 
3042 	/*
3043 	 * Freelist is empty, give up.
3044 	 */
3045 	if (!agf->agf_flcount) {
3046 		*bnop = NULLAGBLOCK;
3047 		return 0;
3048 	}
3049 	/*
3050 	 * Read the array of free blocks.
3051 	 */
3052 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3053 	if (error)
3054 		return error;
3055 
3056 
3057 	/*
3058 	 * Get the block number and update the data structures.
3059 	 */
3060 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3061 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
3062 	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
3063 		return -EFSCORRUPTED;
3064 
3065 	be32_add_cpu(&agf->agf_flfirst, 1);
3066 	xfs_trans_brelse(tp, agflbp);
3067 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
3068 		agf->agf_flfirst = 0;
3069 
3070 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3071 	be32_add_cpu(&agf->agf_flcount, -1);
3072 	pag->pagf_flcount--;
3073 
3074 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
3075 	if (btreeblk) {
3076 		be32_add_cpu(&agf->agf_btreeblks, 1);
3077 		pag->pagf_btreeblks++;
3078 		logflags |= XFS_AGF_BTREEBLKS;
3079 	}
3080 
3081 	xfs_alloc_log_agf(tp, agbp, logflags);
3082 	*bnop = bno;
3083 
3084 	return 0;
3085 }
3086 
3087 /*
3088  * Log the given fields from the agf structure.
3089  */
3090 void
3091 xfs_alloc_log_agf(
3092 	struct xfs_trans	*tp,
3093 	struct xfs_buf		*bp,
3094 	uint32_t		fields)
3095 {
3096 	int	first;		/* first byte offset */
3097 	int	last;		/* last byte offset */
3098 	static const short	offsets[] = {
3099 		offsetof(xfs_agf_t, agf_magicnum),
3100 		offsetof(xfs_agf_t, agf_versionnum),
3101 		offsetof(xfs_agf_t, agf_seqno),
3102 		offsetof(xfs_agf_t, agf_length),
3103 		offsetof(xfs_agf_t, agf_bno_root),   /* also cnt/rmap root */
3104 		offsetof(xfs_agf_t, agf_bno_level),  /* also cnt/rmap levels */
3105 		offsetof(xfs_agf_t, agf_flfirst),
3106 		offsetof(xfs_agf_t, agf_fllast),
3107 		offsetof(xfs_agf_t, agf_flcount),
3108 		offsetof(xfs_agf_t, agf_freeblks),
3109 		offsetof(xfs_agf_t, agf_longest),
3110 		offsetof(xfs_agf_t, agf_btreeblks),
3111 		offsetof(xfs_agf_t, agf_uuid),
3112 		offsetof(xfs_agf_t, agf_rmap_blocks),
3113 		offsetof(xfs_agf_t, agf_refcount_blocks),
3114 		offsetof(xfs_agf_t, agf_refcount_root),
3115 		offsetof(xfs_agf_t, agf_refcount_level),
3116 		/* needed so that we don't log the whole rest of the structure: */
3117 		offsetof(xfs_agf_t, agf_spare64),
3118 		sizeof(xfs_agf_t)
3119 	};
3120 
3121 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
3122 
3123 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
3124 
3125 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
3126 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
3127 }
3128 
3129 /*
3130  * Put the block on the freelist for the allocation group.
3131  */
3132 int
3133 xfs_alloc_put_freelist(
3134 	struct xfs_perag	*pag,
3135 	struct xfs_trans	*tp,
3136 	struct xfs_buf		*agbp,
3137 	struct xfs_buf		*agflbp,
3138 	xfs_agblock_t		bno,
3139 	int			btreeblk)
3140 {
3141 	struct xfs_mount	*mp = tp->t_mountp;
3142 	struct xfs_agf		*agf = agbp->b_addr;
3143 	__be32			*blockp;
3144 	int			error;
3145 	uint32_t		logflags;
3146 	__be32			*agfl_bno;
3147 	int			startoff;
3148 
3149 	if (!agflbp) {
3150 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3151 		if (error)
3152 			return error;
3153 	}
3154 
3155 	be32_add_cpu(&agf->agf_fllast, 1);
3156 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
3157 		agf->agf_fllast = 0;
3158 
3159 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3160 	be32_add_cpu(&agf->agf_flcount, 1);
3161 	pag->pagf_flcount++;
3162 
3163 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
3164 	if (btreeblk) {
3165 		be32_add_cpu(&agf->agf_btreeblks, -1);
3166 		pag->pagf_btreeblks--;
3167 		logflags |= XFS_AGF_BTREEBLKS;
3168 	}
3169 
3170 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
3171 
3172 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3173 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
3174 	*blockp = cpu_to_be32(bno);
3175 	startoff = (char *)blockp - (char *)agflbp->b_addr;
3176 
3177 	xfs_alloc_log_agf(tp, agbp, logflags);
3178 
3179 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
3180 	xfs_trans_log_buf(tp, agflbp, startoff,
3181 			  startoff + sizeof(xfs_agblock_t) - 1);
3182 	return 0;
3183 }
3184 
3185 /*
3186  * Check that this AGF/AGI header's sequence number and length matches the AG
3187  * number and size in fsblocks.
3188  */
3189 xfs_failaddr_t
3190 xfs_validate_ag_length(
3191 	struct xfs_buf		*bp,
3192 	uint32_t		seqno,
3193 	uint32_t		length)
3194 {
3195 	struct xfs_mount	*mp = bp->b_mount;
3196 	/*
3197 	 * During growfs operations, the perag is not fully initialised,
3198 	 * so we can't use it for any useful checking. growfs ensures we can't
3199 	 * use it by using uncached buffers that don't have the perag attached
3200 	 * so we can detect and avoid this problem.
3201 	 */
3202 	if (bp->b_pag && seqno != pag_agno(bp->b_pag))
3203 		return __this_address;
3204 
3205 	/*
3206 	 * Only the last AG in the filesystem is allowed to be shorter
3207 	 * than the AG size recorded in the superblock.
3208 	 */
3209 	if (length != mp->m_sb.sb_agblocks) {
3210 		/*
3211 		 * During growfs, the new last AG can get here before we
3212 		 * have updated the superblock. Give it a pass on the seqno
3213 		 * check.
3214 		 */
3215 		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3216 			return __this_address;
3217 		if (length < XFS_MIN_AG_BLOCKS)
3218 			return __this_address;
3219 		if (length > mp->m_sb.sb_agblocks)
3220 			return __this_address;
3221 	}
3222 
3223 	return NULL;
3224 }
3225 
3226 /*
3227  * Verify the AGF is consistent.
3228  *
3229  * We do not verify the AGFL indexes in the AGF are fully consistent here
3230  * because of issues with variable on-disk structure sizes. Instead, we check
3231  * the agfl indexes for consistency when we initialise the perag from the AGF
3232  * information after a read completes.
3233  *
3234  * If the index is inconsistent, then we mark the perag as needing an AGFL
3235  * reset. The first AGFL update performed then resets the AGFL indexes and
3236  * refills the AGFL with known good free blocks, allowing the filesystem to
3237  * continue operating normally at the cost of a few leaked free space blocks.
3238  */
3239 static xfs_failaddr_t
3240 xfs_agf_verify(
3241 	struct xfs_buf		*bp)
3242 {
3243 	struct xfs_mount	*mp = bp->b_mount;
3244 	struct xfs_agf		*agf = bp->b_addr;
3245 	xfs_failaddr_t		fa;
3246 	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
3247 	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
3248 
3249 	if (xfs_has_crc(mp)) {
3250 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
3251 			return __this_address;
3252 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
3253 			return __this_address;
3254 	}
3255 
3256 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
3257 		return __this_address;
3258 
3259 	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
3260 		return __this_address;
3261 
3262 	/*
3263 	 * Both agf_seqno and agf_length need to validated before anything else
3264 	 * block number related in the AGF or AGFL can be checked.
3265 	 */
3266 	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3267 	if (fa)
3268 		return fa;
3269 
3270 	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3271 		return __this_address;
3272 	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3273 		return __this_address;
3274 	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
3275 		return __this_address;
3276 
3277 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
3278 	    be32_to_cpu(agf->agf_freeblks) > agf_length)
3279 		return __this_address;
3280 
3281 	if (be32_to_cpu(agf->agf_bno_level) < 1 ||
3282 	    be32_to_cpu(agf->agf_cnt_level) < 1 ||
3283 	    be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
3284 	    be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
3285 		return __this_address;
3286 
3287 	if (xfs_has_lazysbcount(mp) &&
3288 	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
3289 		return __this_address;
3290 
3291 	if (xfs_has_rmapbt(mp)) {
3292 		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3293 			return __this_address;
3294 
3295 		if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
3296 		    be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
3297 			return __this_address;
3298 	}
3299 
3300 	if (xfs_has_reflink(mp)) {
3301 		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3302 			return __this_address;
3303 
3304 		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3305 		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3306 			return __this_address;
3307 	}
3308 
3309 	return NULL;
3310 }
3311 
3312 static void
3313 xfs_agf_read_verify(
3314 	struct xfs_buf	*bp)
3315 {
3316 	struct xfs_mount *mp = bp->b_mount;
3317 	xfs_failaddr_t	fa;
3318 
3319 	if (xfs_has_crc(mp) &&
3320 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3321 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3322 	else {
3323 		fa = xfs_agf_verify(bp);
3324 		if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
3325 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3326 	}
3327 }
3328 
3329 static void
3330 xfs_agf_write_verify(
3331 	struct xfs_buf	*bp)
3332 {
3333 	struct xfs_mount	*mp = bp->b_mount;
3334 	struct xfs_buf_log_item	*bip = bp->b_log_item;
3335 	struct xfs_agf		*agf = bp->b_addr;
3336 	xfs_failaddr_t		fa;
3337 
3338 	fa = xfs_agf_verify(bp);
3339 	if (fa) {
3340 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3341 		return;
3342 	}
3343 
3344 	if (!xfs_has_crc(mp))
3345 		return;
3346 
3347 	if (bip)
3348 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3349 
3350 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3351 }
3352 
3353 const struct xfs_buf_ops xfs_agf_buf_ops = {
3354 	.name = "xfs_agf",
3355 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3356 	.verify_read = xfs_agf_read_verify,
3357 	.verify_write = xfs_agf_write_verify,
3358 	.verify_struct = xfs_agf_verify,
3359 };
3360 
3361 /*
3362  * Read in the allocation group header (free/alloc section).
3363  */
3364 int
3365 xfs_read_agf(
3366 	struct xfs_perag	*pag,
3367 	struct xfs_trans	*tp,
3368 	int			flags,
3369 	struct xfs_buf		**agfbpp)
3370 {
3371 	struct xfs_mount	*mp = pag_mount(pag);
3372 	int			error;
3373 
3374 	trace_xfs_read_agf(pag);
3375 
3376 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3377 			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)),
3378 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3379 	if (xfs_metadata_is_sick(error))
3380 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
3381 	if (error)
3382 		return error;
3383 
3384 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3385 	return 0;
3386 }
3387 
3388 /*
3389  * Read in the allocation group header (free/alloc section) and initialise the
3390  * perag structure if necessary. If the caller provides @agfbpp, then return the
3391  * locked buffer to the caller, otherwise free it.
3392  */
3393 int
3394 xfs_alloc_read_agf(
3395 	struct xfs_perag	*pag,
3396 	struct xfs_trans	*tp,
3397 	int			flags,
3398 	struct xfs_buf		**agfbpp)
3399 {
3400 	struct xfs_mount	*mp = pag_mount(pag);
3401 	struct xfs_buf		*agfbp;
3402 	struct xfs_agf		*agf;
3403 	int			error;
3404 	int			allocbt_blks;
3405 
3406 	trace_xfs_alloc_read_agf(pag);
3407 
3408 	/* We don't support trylock when freeing. */
3409 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3410 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3411 	error = xfs_read_agf(pag, tp,
3412 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3413 			&agfbp);
3414 	if (error)
3415 		return error;
3416 
3417 	agf = agfbp->b_addr;
3418 	if (!xfs_perag_initialised_agf(pag)) {
3419 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3420 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3421 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3422 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3423 		pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
3424 		pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
3425 		pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
3426 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3427 		if (xfs_agfl_needs_reset(mp, agf))
3428 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3429 		else
3430 			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3431 
3432 		/*
3433 		 * Update the in-core allocbt counter. Filter out the rmapbt
3434 		 * subset of the btreeblks counter because the rmapbt is managed
3435 		 * by perag reservation. Subtract one for the rmapbt root block
3436 		 * because the rmap counter includes it while the btreeblks
3437 		 * counter only tracks non-root blocks.
3438 		 */
3439 		allocbt_blks = pag->pagf_btreeblks;
3440 		if (xfs_has_rmapbt(mp))
3441 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3442 		if (allocbt_blks > 0)
3443 			atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
3444 
3445 		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3446 	}
3447 
3448 #ifdef DEBUG
3449 	/*
3450 	 * It's possible for the AGF to be out of sync if the block device is
3451 	 * silently dropping writes. This can happen in fstests with dmflakey
3452 	 * enabled, which allows the buffer to be cleaned and reclaimed by
3453 	 * memory pressure and then re-read from disk here. We will get a
3454 	 * stale version of the AGF from disk, and nothing good can happen from
3455 	 * here. Hence if we detect this situation, immediately shut down the
3456 	 * filesystem.
3457 	 *
3458 	 * This can also happen if we are already in the middle of a forced
3459 	 * shutdown, so don't bother checking if we are already shut down.
3460 	 */
3461 	if (!xfs_is_shutdown(pag_mount(pag))) {
3462 		bool	ok = true;
3463 
3464 		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
3465 		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
3466 		ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
3467 		ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
3468 		ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
3469 		ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
3470 		ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
3471 
3472 		if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
3473 			xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
3474 			xfs_trans_brelse(tp, agfbp);
3475 			xfs_force_shutdown(pag_mount(pag),
3476 					SHUTDOWN_CORRUPT_ONDISK);
3477 			return -EFSCORRUPTED;
3478 		}
3479 	}
3480 #endif /* DEBUG */
3481 
3482 	if (agfbpp)
3483 		*agfbpp = agfbp;
3484 	else
3485 		xfs_trans_brelse(tp, agfbp);
3486 	return 0;
3487 }
3488 
3489 /*
3490  * Pre-proces allocation arguments to set initial state that we don't require
3491  * callers to set up correctly, as well as bounds check the allocation args
3492  * that are set up.
3493  */
3494 static int
3495 xfs_alloc_vextent_check_args(
3496 	struct xfs_alloc_arg	*args,
3497 	xfs_fsblock_t		target,
3498 	xfs_agnumber_t		*minimum_agno)
3499 {
3500 	struct xfs_mount	*mp = args->mp;
3501 	xfs_agblock_t		agsize;
3502 
3503 	args->fsbno = NULLFSBLOCK;
3504 
3505 	*minimum_agno = 0;
3506 	if (args->tp->t_highest_agno != NULLAGNUMBER)
3507 		*minimum_agno = args->tp->t_highest_agno;
3508 
3509 	/*
3510 	 * Just fix this up, for the case where the last a.g. is shorter
3511 	 * (or there's only one a.g.) and the caller couldn't easily figure
3512 	 * that out (xfs_bmap_alloc).
3513 	 */
3514 	agsize = mp->m_sb.sb_agblocks;
3515 	if (args->maxlen > agsize)
3516 		args->maxlen = agsize;
3517 	if (args->alignment == 0)
3518 		args->alignment = 1;
3519 
3520 	ASSERT(args->minlen > 0);
3521 	ASSERT(args->maxlen > 0);
3522 	ASSERT(args->alignment > 0);
3523 	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3524 
3525 	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3526 	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3527 	ASSERT(args->minlen <= args->maxlen);
3528 	ASSERT(args->minlen <= agsize);
3529 	ASSERT(args->mod < args->prod);
3530 
3531 	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3532 	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3533 	    args->minlen > args->maxlen || args->minlen > agsize ||
3534 	    args->mod >= args->prod) {
3535 		trace_xfs_alloc_vextent_badargs(args);
3536 		return -ENOSPC;
3537 	}
3538 
3539 	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3540 		trace_xfs_alloc_vextent_skip_deadlock(args);
3541 		return -ENOSPC;
3542 	}
3543 	return 0;
3544 
3545 }
3546 
3547 /*
3548  * Prepare an AG for allocation. If the AG is not prepared to accept the
3549  * allocation, return failure.
3550  *
3551  * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3552  * modified to hold their own perag references.
3553  */
3554 static int
3555 xfs_alloc_vextent_prepare_ag(
3556 	struct xfs_alloc_arg	*args,
3557 	uint32_t		alloc_flags)
3558 {
3559 	bool			need_pag = !args->pag;
3560 	int			error;
3561 
3562 	if (need_pag)
3563 		args->pag = xfs_perag_get(args->mp, args->agno);
3564 
3565 	args->agbp = NULL;
3566 	error = xfs_alloc_fix_freelist(args, alloc_flags);
3567 	if (error) {
3568 		trace_xfs_alloc_vextent_nofix(args);
3569 		if (need_pag)
3570 			xfs_perag_put(args->pag);
3571 		args->agbno = NULLAGBLOCK;
3572 		return error;
3573 	}
3574 	if (!args->agbp) {
3575 		/* cannot allocate in this AG at all */
3576 		trace_xfs_alloc_vextent_noagbp(args);
3577 		args->agbno = NULLAGBLOCK;
3578 		return 0;
3579 	}
3580 	args->wasfromfl = 0;
3581 	return 0;
3582 }
3583 
3584 /*
3585  * Post-process allocation results to account for the allocation if it succeed
3586  * and set the allocated block number correctly for the caller.
3587  *
3588  * XXX: we should really be returning ENOSPC for ENOSPC, not
3589  * hiding it behind a "successful" NULLFSBLOCK allocation.
3590  */
3591 static int
3592 xfs_alloc_vextent_finish(
3593 	struct xfs_alloc_arg	*args,
3594 	xfs_agnumber_t		minimum_agno,
3595 	int			alloc_error,
3596 	bool			drop_perag)
3597 {
3598 	struct xfs_mount	*mp = args->mp;
3599 	int			error = 0;
3600 
3601 	/*
3602 	 * We can end up here with a locked AGF. If we failed, the caller is
3603 	 * likely going to try to allocate again with different parameters, and
3604 	 * that can widen the AGs that are searched for free space. If we have
3605 	 * to do BMBT block allocation, we have to do a new allocation.
3606 	 *
3607 	 * Hence leaving this function with the AGF locked opens up potential
3608 	 * ABBA AGF deadlocks because a future allocation attempt in this
3609 	 * transaction may attempt to lock a lower number AGF.
3610 	 *
3611 	 * We can't release the AGF until the transaction is commited, so at
3612 	 * this point we must update the "first allocation" tracker to point at
3613 	 * this AG if the tracker is empty or points to a lower AG. This allows
3614 	 * the next allocation attempt to be modified appropriately to avoid
3615 	 * deadlocks.
3616 	 */
3617 	if (args->agbp &&
3618 	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3619 	     args->agno > minimum_agno))
3620 		args->tp->t_highest_agno = args->agno;
3621 
3622 	/*
3623 	 * If the allocation failed with an error or we had an ENOSPC result,
3624 	 * preserve the returned error whilst also marking the allocation result
3625 	 * as "no extent allocated". This ensures that callers that fail to
3626 	 * capture the error will still treat it as a failed allocation.
3627 	 */
3628 	if (alloc_error || args->agbno == NULLAGBLOCK) {
3629 		args->fsbno = NULLFSBLOCK;
3630 		error = alloc_error;
3631 		goto out_drop_perag;
3632 	}
3633 
3634 	args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);
3635 
3636 	ASSERT(args->len >= args->minlen);
3637 	ASSERT(args->len <= args->maxlen);
3638 	ASSERT(args->agbno % args->alignment == 0);
3639 	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3640 
3641 	/* if not file data, insert new block into the reverse map btree */
3642 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3643 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3644 				       args->agbno, args->len, &args->oinfo);
3645 		if (error)
3646 			goto out_drop_perag;
3647 	}
3648 
3649 	if (!args->wasfromfl) {
3650 		error = xfs_alloc_update_counters(args->tp, args->agbp,
3651 						  -((long)(args->len)));
3652 		if (error)
3653 			goto out_drop_perag;
3654 
3655 		ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
3656 				args->agbno, args->len));
3657 	}
3658 
3659 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3660 
3661 	XFS_STATS_INC(mp, xs_allocx);
3662 	XFS_STATS_ADD(mp, xs_allocb, args->len);
3663 
3664 	trace_xfs_alloc_vextent_finish(args);
3665 
3666 out_drop_perag:
3667 	if (drop_perag && args->pag) {
3668 		xfs_perag_rele(args->pag);
3669 		args->pag = NULL;
3670 	}
3671 	return error;
3672 }
3673 
3674 /*
3675  * Allocate within a single AG only. This uses a best-fit length algorithm so if
3676  * you need an exact sized allocation without locality constraints, this is the
3677  * fastest way to do it.
3678  *
3679  * Caller is expected to hold a perag reference in args->pag.
3680  */
3681 int
3682 xfs_alloc_vextent_this_ag(
3683 	struct xfs_alloc_arg	*args,
3684 	xfs_agnumber_t		agno)
3685 {
3686 	xfs_agnumber_t		minimum_agno;
3687 	uint32_t		alloc_flags = 0;
3688 	int			error;
3689 
3690 	ASSERT(args->pag != NULL);
3691 	ASSERT(pag_agno(args->pag) == agno);
3692 
3693 	args->agno = agno;
3694 	args->agbno = 0;
3695 
3696 	trace_xfs_alloc_vextent_this_ag(args);
3697 
3698 	error = xfs_alloc_vextent_check_args(args,
3699 			xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
3700 	if (error) {
3701 		if (error == -ENOSPC)
3702 			return 0;
3703 		return error;
3704 	}
3705 
3706 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3707 	if (!error && args->agbp)
3708 		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3709 
3710 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3711 }
3712 
3713 /*
3714  * Iterate all AGs trying to allocate an extent starting from @start_ag.
3715  *
3716  * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
3717  * allocation attempts in @start_agno have locality information. If we fail to
3718  * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
3719  * we attempt to allocation in as there is no locality optimisation possible for
3720  * those allocations.
3721  *
3722  * On return, args->pag may be left referenced if we finish before the "all
3723  * failed" return point. The allocation finish still needs the perag, and
3724  * so the caller will release it once they've finished the allocation.
3725  *
3726  * When we wrap the AG iteration at the end of the filesystem, we have to be
3727  * careful not to wrap into AGs below ones we already have locked in the
3728  * transaction if we are doing a blocking iteration. This will result in an
3729  * out-of-order locking of AGFs and hence can cause deadlocks.
3730  */
3731 static int
3732 xfs_alloc_vextent_iterate_ags(
3733 	struct xfs_alloc_arg	*args,
3734 	xfs_agnumber_t		minimum_agno,
3735 	xfs_agnumber_t		start_agno,
3736 	xfs_agblock_t		target_agbno,
3737 	uint32_t		alloc_flags)
3738 {
3739 	struct xfs_mount	*mp = args->mp;
3740 	xfs_agnumber_t		restart_agno = minimum_agno;
3741 	xfs_agnumber_t		agno;
3742 	int			error = 0;
3743 
3744 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
3745 		restart_agno = 0;
3746 restart:
3747 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3748 			mp->m_sb.sb_agcount, agno, args->pag) {
3749 		args->agno = agno;
3750 		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3751 		if (error)
3752 			break;
3753 		if (!args->agbp) {
3754 			trace_xfs_alloc_vextent_loopfailed(args);
3755 			continue;
3756 		}
3757 
3758 		/*
3759 		 * Allocation is supposed to succeed now, so break out of the
3760 		 * loop regardless of whether we succeed or not.
3761 		 */
3762 		if (args->agno == start_agno && target_agbno) {
3763 			args->agbno = target_agbno;
3764 			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3765 		} else {
3766 			args->agbno = 0;
3767 			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3768 		}
3769 		break;
3770 	}
3771 	if (error) {
3772 		xfs_perag_rele(args->pag);
3773 		args->pag = NULL;
3774 		return error;
3775 	}
3776 	if (args->agbp)
3777 		return 0;
3778 
3779 	/*
3780 	 * We didn't find an AG we can alloation from. If we were given
3781 	 * constraining flags by the caller, drop them and retry the allocation
3782 	 * without any constraints being set.
3783 	 */
3784 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3785 		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
3786 		restart_agno = minimum_agno;
3787 		goto restart;
3788 	}
3789 
3790 	ASSERT(args->pag == NULL);
3791 	trace_xfs_alloc_vextent_allfailed(args);
3792 	return 0;
3793 }
3794 
3795 /*
3796  * Iterate from the AGs from the start AG to the end of the filesystem, trying
3797  * to allocate blocks. It starts with a near allocation attempt in the initial
3798  * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3799  * back to zero if allowed by previous allocations in this transaction,
3800  * otherwise will wrap back to the start AG and run a second blocking pass to
3801  * the end of the filesystem.
3802  */
3803 int
3804 xfs_alloc_vextent_start_ag(
3805 	struct xfs_alloc_arg	*args,
3806 	xfs_fsblock_t		target)
3807 {
3808 	struct xfs_mount	*mp = args->mp;
3809 	xfs_agnumber_t		minimum_agno;
3810 	xfs_agnumber_t		start_agno;
3811 	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3812 	bool			bump_rotor = false;
3813 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3814 	int			error;
3815 
3816 	ASSERT(args->pag == NULL);
3817 
3818 	args->agno = NULLAGNUMBER;
3819 	args->agbno = NULLAGBLOCK;
3820 
3821 	trace_xfs_alloc_vextent_start_ag(args);
3822 
3823 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3824 	if (error) {
3825 		if (error == -ENOSPC)
3826 			return 0;
3827 		return error;
3828 	}
3829 
3830 	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3831 	    xfs_is_inode32(mp)) {
3832 		target = XFS_AGB_TO_FSB(mp,
3833 				((mp->m_agfrotor / rotorstep) %
3834 				mp->m_sb.sb_agcount), 0);
3835 		bump_rotor = 1;
3836 	}
3837 
3838 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3839 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3840 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3841 
3842 	if (bump_rotor) {
3843 		if (args->agno == start_agno)
3844 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3845 				(mp->m_sb.sb_agcount * rotorstep);
3846 		else
3847 			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3848 				(mp->m_sb.sb_agcount * rotorstep);
3849 	}
3850 
3851 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3852 }
3853 
3854 /*
3855  * Iterate from the agno indicated via @target through to the end of the
3856  * filesystem attempting blocking allocation. This does not wrap or try a second
3857  * pass, so will not recurse into AGs lower than indicated by the target.
3858  */
3859 int
3860 xfs_alloc_vextent_first_ag(
3861 	struct xfs_alloc_arg	*args,
3862 	xfs_fsblock_t		target)
3863  {
3864 	struct xfs_mount	*mp = args->mp;
3865 	xfs_agnumber_t		minimum_agno;
3866 	xfs_agnumber_t		start_agno;
3867 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3868 	int			error;
3869 
3870 	ASSERT(args->pag == NULL);
3871 
3872 	args->agno = NULLAGNUMBER;
3873 	args->agbno = NULLAGBLOCK;
3874 
3875 	trace_xfs_alloc_vextent_first_ag(args);
3876 
3877 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3878 	if (error) {
3879 		if (error == -ENOSPC)
3880 			return 0;
3881 		return error;
3882 	}
3883 
3884 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3885 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3886 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3887 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3888 }
3889 
3890 /*
3891  * Allocate at the exact block target or fail. Caller is expected to hold a
3892  * perag reference in args->pag.
3893  */
3894 int
3895 xfs_alloc_vextent_exact_bno(
3896 	struct xfs_alloc_arg	*args,
3897 	xfs_fsblock_t		target)
3898 {
3899 	struct xfs_mount	*mp = args->mp;
3900 	xfs_agnumber_t		minimum_agno;
3901 	int			error;
3902 
3903 	ASSERT(args->pag != NULL);
3904 	ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
3905 
3906 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3907 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3908 
3909 	trace_xfs_alloc_vextent_exact_bno(args);
3910 
3911 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3912 	if (error) {
3913 		if (error == -ENOSPC)
3914 			return 0;
3915 		return error;
3916 	}
3917 
3918 	error = xfs_alloc_vextent_prepare_ag(args, 0);
3919 	if (!error && args->agbp)
3920 		error = xfs_alloc_ag_vextent_exact(args);
3921 
3922 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3923 }
3924 
3925 /*
3926  * Allocate an extent as close to the target as possible. If there are not
3927  * viable candidates in the AG, then fail the allocation.
3928  *
3929  * Caller may or may not have a per-ag reference in args->pag.
3930  */
3931 int
3932 xfs_alloc_vextent_near_bno(
3933 	struct xfs_alloc_arg	*args,
3934 	xfs_fsblock_t		target)
3935 {
3936 	struct xfs_mount	*mp = args->mp;
3937 	xfs_agnumber_t		minimum_agno;
3938 	bool			needs_perag = args->pag == NULL;
3939 	uint32_t		alloc_flags = 0;
3940 	int			error;
3941 
3942 	if (!needs_perag)
3943 		ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
3944 
3945 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3946 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3947 
3948 	trace_xfs_alloc_vextent_near_bno(args);
3949 
3950 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3951 	if (error) {
3952 		if (error == -ENOSPC)
3953 			return 0;
3954 		return error;
3955 	}
3956 
3957 	if (needs_perag)
3958 		args->pag = xfs_perag_grab(mp, args->agno);
3959 
3960 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3961 	if (!error && args->agbp)
3962 		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3963 
3964 	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3965 }
3966 
3967 /* Ensure that the freelist is at full capacity. */
3968 int
3969 xfs_free_extent_fix_freelist(
3970 	struct xfs_trans	*tp,
3971 	struct xfs_perag	*pag,
3972 	struct xfs_buf		**agbp)
3973 {
3974 	struct xfs_alloc_arg	args;
3975 	int			error;
3976 
3977 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3978 	args.tp = tp;
3979 	args.mp = tp->t_mountp;
3980 	args.agno = pag_agno(pag);
3981 	args.pag = pag;
3982 
3983 	/*
3984 	 * validate that the block number is legal - the enables us to detect
3985 	 * and handle a silent filesystem corruption rather than crashing.
3986 	 */
3987 	if (args.agno >= args.mp->m_sb.sb_agcount)
3988 		return -EFSCORRUPTED;
3989 
3990 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3991 	if (error)
3992 		return error;
3993 
3994 	*agbp = args.agbp;
3995 	return 0;
3996 }
3997 
3998 /*
3999  * Free an extent.
4000  * Just break up the extent address and hand off to xfs_free_ag_extent
4001  * after fixing up the freelist.
4002  */
4003 int
4004 __xfs_free_extent(
4005 	struct xfs_trans		*tp,
4006 	struct xfs_perag		*pag,
4007 	xfs_agblock_t			agbno,
4008 	xfs_extlen_t			len,
4009 	const struct xfs_owner_info	*oinfo,
4010 	enum xfs_ag_resv_type		type,
4011 	bool				skip_discard)
4012 {
4013 	struct xfs_mount		*mp = tp->t_mountp;
4014 	struct xfs_buf			*agbp;
4015 	struct xfs_agf			*agf;
4016 	int				error;
4017 	unsigned int			busy_flags = 0;
4018 
4019 	ASSERT(len != 0);
4020 	ASSERT(type != XFS_AG_RESV_AGFL);
4021 
4022 	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
4023 		return -EIO;
4024 
4025 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
4026 	if (error) {
4027 		if (xfs_metadata_is_sick(error))
4028 			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
4029 		return error;
4030 	}
4031 
4032 	agf = agbp->b_addr;
4033 
4034 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
4035 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
4036 		error = -EFSCORRUPTED;
4037 		goto err_release;
4038 	}
4039 
4040 	/* validate the extent size is legal now we have the agf locked */
4041 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
4042 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
4043 		error = -EFSCORRUPTED;
4044 		goto err_release;
4045 	}
4046 
4047 	error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
4048 	if (error)
4049 		goto err_release;
4050 
4051 	if (skip_discard)
4052 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
4053 	xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
4054 	return 0;
4055 
4056 err_release:
4057 	xfs_trans_brelse(tp, agbp);
4058 	return error;
4059 }
4060 
4061 struct xfs_alloc_query_range_info {
4062 	xfs_alloc_query_range_fn	fn;
4063 	void				*priv;
4064 };
4065 
4066 /* Format btree record and pass to our callback. */
4067 STATIC int
4068 xfs_alloc_query_range_helper(
4069 	struct xfs_btree_cur		*cur,
4070 	const union xfs_btree_rec	*rec,
4071 	void				*priv)
4072 {
4073 	struct xfs_alloc_query_range_info	*query = priv;
4074 	struct xfs_alloc_rec_incore		irec;
4075 	xfs_failaddr_t				fa;
4076 
4077 	xfs_alloc_btrec_to_irec(rec, &irec);
4078 	fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
4079 	if (fa)
4080 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
4081 
4082 	return query->fn(cur, &irec, query->priv);
4083 }
4084 
4085 /* Find all free space within a given range of blocks. */
4086 int
4087 xfs_alloc_query_range(
4088 	struct xfs_btree_cur			*cur,
4089 	const struct xfs_alloc_rec_incore	*low_rec,
4090 	const struct xfs_alloc_rec_incore	*high_rec,
4091 	xfs_alloc_query_range_fn		fn,
4092 	void					*priv)
4093 {
4094 	union xfs_btree_irec			low_brec = { .a = *low_rec };
4095 	union xfs_btree_irec			high_brec = { .a = *high_rec };
4096 	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
4097 
4098 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4099 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
4100 			xfs_alloc_query_range_helper, &query);
4101 }
4102 
4103 /* Find all free space records. */
4104 int
4105 xfs_alloc_query_all(
4106 	struct xfs_btree_cur			*cur,
4107 	xfs_alloc_query_range_fn		fn,
4108 	void					*priv)
4109 {
4110 	struct xfs_alloc_query_range_info	query;
4111 
4112 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4113 	query.priv = priv;
4114 	query.fn = fn;
4115 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
4116 }
4117 
4118 /*
4119  * Scan part of the keyspace of the free space and tell us if the area has no
4120  * records, is fully mapped by records, or is partially filled.
4121  */
4122 int
4123 xfs_alloc_has_records(
4124 	struct xfs_btree_cur	*cur,
4125 	xfs_agblock_t		bno,
4126 	xfs_extlen_t		len,
4127 	enum xbtree_recpacking	*outcome)
4128 {
4129 	union xfs_btree_irec	low;
4130 	union xfs_btree_irec	high;
4131 
4132 	memset(&low, 0, sizeof(low));
4133 	low.a.ar_startblock = bno;
4134 	memset(&high, 0xFF, sizeof(high));
4135 	high.a.ar_startblock = bno + len - 1;
4136 
4137 	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
4138 }
4139 
4140 /*
4141  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
4142  * error code or XFS_ITER_*.
4143  */
4144 int
4145 xfs_agfl_walk(
4146 	struct xfs_mount	*mp,
4147 	struct xfs_agf		*agf,
4148 	struct xfs_buf		*agflbp,
4149 	xfs_agfl_walk_fn	walk_fn,
4150 	void			*priv)
4151 {
4152 	__be32			*agfl_bno;
4153 	unsigned int		i;
4154 	int			error;
4155 
4156 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
4157 	i = be32_to_cpu(agf->agf_flfirst);
4158 
4159 	/* Nothing to walk in an empty AGFL. */
4160 	if (agf->agf_flcount == cpu_to_be32(0))
4161 		return 0;
4162 
4163 	/* Otherwise, walk from first to last, wrapping as needed. */
4164 	for (;;) {
4165 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
4166 		if (error)
4167 			return error;
4168 		if (i == be32_to_cpu(agf->agf_fllast))
4169 			break;
4170 		if (++i == xfs_agfl_size(mp))
4171 			i = 0;
4172 	}
4173 
4174 	return 0;
4175 }
4176 
4177 int __init
4178 xfs_extfree_intent_init_cache(void)
4179 {
4180 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
4181 			sizeof(struct xfs_extent_free_item),
4182 			0, 0, NULL);
4183 
4184 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
4185 }
4186 
4187 void
4188 xfs_extfree_intent_destroy_cache(void)
4189 {
4190 	kmem_cache_destroy(xfs_extfree_item_cache);
4191 	xfs_extfree_item_cache = NULL;
4192 }
4193