xref: /linux/fs/xfs/libxfs/xfs_alloc.c (revision 6fe1910e8557e73d81f30d862f7b731d24ea60be)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 #include "xfs_health.h"
30 
31 struct kmem_cache	*xfs_extfree_item_cache;
32 
33 struct workqueue_struct *xfs_alloc_wq;
34 
35 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 
37 #define	XFSA_FIXUP_BNO_OK	1
38 #define	XFSA_FIXUP_CNT_OK	2
39 
40 /*
41  * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots at
42  * the beginning of the block for a proper header with the location information
43  * and CRC.
44  */
45 unsigned int
46 xfs_agfl_size(
47 	struct xfs_mount	*mp)
48 {
49 	unsigned int		size = mp->m_sb.sb_sectsize;
50 
51 	if (xfs_has_crc(mp))
52 		size -= sizeof(struct xfs_agfl);
53 
54 	return size / sizeof(xfs_agblock_t);
55 }
56 
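/*
 * Illustrative worked example (not from the original source; values assume the
 * v5 on-disk format where sizeof(struct xfs_agfl) is 36 bytes and
 * sizeof(xfs_agblock_t) is 4 bytes):
 *
 *   512-byte sectors, CRCs enabled:  (512 - 36) / 4 = 119 AGFL slots
 *   512-byte sectors, no CRCs:        512 / 4       = 128 AGFL slots
 */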
57 unsigned int
58 xfs_refc_block(
59 	struct xfs_mount	*mp)
60 {
61 	if (xfs_has_rmapbt(mp))
62 		return XFS_RMAP_BLOCK(mp) + 1;
63 	if (xfs_has_finobt(mp))
64 		return XFS_FIBT_BLOCK(mp) + 1;
65 	return XFS_IBT_BLOCK(mp) + 1;
66 }
67 
68 xfs_extlen_t
69 xfs_prealloc_blocks(
70 	struct xfs_mount	*mp)
71 {
72 	if (xfs_has_reflink(mp))
73 		return xfs_refc_block(mp) + 1;
74 	if (xfs_has_rmapbt(mp))
75 		return XFS_RMAP_BLOCK(mp) + 1;
76 	if (xfs_has_finobt(mp))
77 		return XFS_FIBT_BLOCK(mp) + 1;
78 	return XFS_IBT_BLOCK(mp) + 1;
79 }
80 
81 /*
82  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
83  * guarantee that we can refill the AGFL prior to allocating space in a nearly
84  * full AG.  Although the space described by the free space btrees, the
85  * blocks used by the freesp btrees themselves, and the blocks owned by the
86  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
87  * free space in the AG drop so low that the free space btrees cannot refill an
88  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
89  * until the fs goes down, we subtract this many AG blocks from the incore
90  * fdblocks to ensure user allocation does not overcommit the space the
91  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
92  * withhold space from xfs_mod_fdblocks, so we do not account for that here.
93  */
94 #define XFS_ALLOCBT_AGFL_RESERVE	4
95 
96 /*
97  * Compute the number of blocks that we set aside to guarantee the ability to
98  * refill the AGFL and handle a full bmap btree split.
99  *
100  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
101  * AGF buffer (PV 947395), we place constraints on the relationship among
102  * actual allocations for data blocks, freelist blocks, and potential file data
103  * bmap btree blocks. However, these restrictions may leave no space available
104  * for a delayed extent: for example, a data block in a certain AG may be
105  * allocated while no additional block is available for the extra bmap btree
106  * block required by a split of the file's bmap btree. This can lead to an
107  * infinite loop when the file gets flushed to disk and all delayed extents
108  * need to be actually allocated. To get around this, we explicitly set
109  * aside a few blocks which will not be reserved in delayed allocation.
110  *
111  * For each AG, we need to reserve enough blocks to replenish a totally empty
112  * AGFL and 4 more to handle a potential split of the file's bmap btree.
113  */
114 unsigned int
115 xfs_alloc_set_aside(
116 	struct xfs_mount	*mp)
117 {
118 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
119 }
120 
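/*
 * Illustrative example (not from the original source): each AG sets aside
 * XFS_ALLOCBT_AGFL_RESERVE + 4 = 8 blocks, so a filesystem with, say, 4 AGs
 * withholds 4 * 8 = 32 blocks from the incore free block count.
 */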
121 /*
122  * When deciding how much space to allocate out of an AG, we limit the
123  * allocation maximum size to the size of the AG. However, we cannot use all the
124  * blocks in the AG - some are permanently used by metadata. These
125  * blocks are generally:
126  *	- the AG superblock, AGF, AGI and AGFL
127  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
128  *	  the AGI free inode and rmap btree root blocks.
129  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
130  *	- the refcount btree root block, if reflink is enabled
131  *
132  * The AG headers are sector sized, so the amount of space they take up is
133  * dependent on filesystem geometry. The others are all single blocks.
134  */
135 unsigned int
136 xfs_alloc_ag_max_usable(
137 	struct xfs_mount	*mp)
138 {
139 	unsigned int		blocks;
140 
141 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
142 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
143 	blocks += 3;			/* AGF, AGI btree root blocks */
144 	if (xfs_has_finobt(mp))
145 		blocks++;		/* finobt root block */
146 	if (xfs_has_rmapbt(mp))
147 		blocks++;		/* rmap root block */
148 	if (xfs_has_reflink(mp))
149 		blocks++;		/* refcount root block */
150 
151 	return mp->m_sb.sb_agblocks - blocks;
152 }
153 
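/*
 * Illustrative example (not from the original source; assumes 4096-byte
 * blocks, 512-byte sectors and finobt, rmapbt and reflink all enabled): the
 * four sector-sized headers round up to 1 block, plus 4 AGFL reserve blocks,
 * 3 btree root blocks and 3 optional root blocks, so 11 blocks are subtracted
 * from sb_agblocks.
 */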
154 /*
155  * Lookup the record equal to [bno, len] in the btree given by cur.
156  */
157 STATIC int				/* error */
158 xfs_alloc_lookup_eq(
159 	struct xfs_btree_cur	*cur,	/* btree cursor */
160 	xfs_agblock_t		bno,	/* starting block of extent */
161 	xfs_extlen_t		len,	/* length of extent */
162 	int			*stat)	/* success/failure */
163 {
164 	int			error;
165 
166 	cur->bc_rec.a.ar_startblock = bno;
167 	cur->bc_rec.a.ar_blockcount = len;
168 	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
169 	cur->bc_ag.abt.active = (*stat == 1);
170 	return error;
171 }
172 
173 /*
174  * Lookup the first record greater than or equal to [bno, len]
175  * in the btree given by cur.
176  */
177 int				/* error */
178 xfs_alloc_lookup_ge(
179 	struct xfs_btree_cur	*cur,	/* btree cursor */
180 	xfs_agblock_t		bno,	/* starting block of extent */
181 	xfs_extlen_t		len,	/* length of extent */
182 	int			*stat)	/* success/failure */
183 {
184 	int			error;
185 
186 	cur->bc_rec.a.ar_startblock = bno;
187 	cur->bc_rec.a.ar_blockcount = len;
188 	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
189 	cur->bc_ag.abt.active = (*stat == 1);
190 	return error;
191 }
192 
193 /*
194  * Lookup the first record less than or equal to [bno, len]
195  * in the btree given by cur.
196  */
197 int					/* error */
198 xfs_alloc_lookup_le(
199 	struct xfs_btree_cur	*cur,	/* btree cursor */
200 	xfs_agblock_t		bno,	/* starting block of extent */
201 	xfs_extlen_t		len,	/* length of extent */
202 	int			*stat)	/* success/failure */
203 {
204 	int			error;
205 	cur->bc_rec.a.ar_startblock = bno;
206 	cur->bc_rec.a.ar_blockcount = len;
207 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
208 	cur->bc_ag.abt.active = (*stat == 1);
209 	return error;
210 }
211 
212 static inline bool
213 xfs_alloc_cur_active(
214 	struct xfs_btree_cur	*cur)
215 {
216 	return cur && cur->bc_ag.abt.active;
217 }
218 
219 /*
220  * Update the record referred to by cur to the value given
221  * by [bno, len].
222  * This either works (return 0) or gets an EFSCORRUPTED error.
223  */
224 STATIC int				/* error */
225 xfs_alloc_update(
226 	struct xfs_btree_cur	*cur,	/* btree cursor */
227 	xfs_agblock_t		bno,	/* starting block of extent */
228 	xfs_extlen_t		len)	/* length of extent */
229 {
230 	union xfs_btree_rec	rec;
231 
232 	rec.alloc.ar_startblock = cpu_to_be32(bno);
233 	rec.alloc.ar_blockcount = cpu_to_be32(len);
234 	return xfs_btree_update(cur, &rec);
235 }
236 
237 /* Convert the ondisk btree record to its incore representation. */
238 void
239 xfs_alloc_btrec_to_irec(
240 	const union xfs_btree_rec	*rec,
241 	struct xfs_alloc_rec_incore	*irec)
242 {
243 	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
244 	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
245 }
246 
247 /* Simple checks for free space records. */
248 xfs_failaddr_t
249 xfs_alloc_check_irec(
250 	struct xfs_perag			*pag,
251 	const struct xfs_alloc_rec_incore	*irec)
252 {
253 	if (irec->ar_blockcount == 0)
254 		return __this_address;
255 
256 	/* check for valid extent range, including overflow */
257 	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
258 		return __this_address;
259 
260 	return NULL;
261 }
262 
263 static inline int
264 xfs_alloc_complain_bad_rec(
265 	struct xfs_btree_cur		*cur,
266 	xfs_failaddr_t			fa,
267 	const struct xfs_alloc_rec_incore *irec)
268 {
269 	struct xfs_mount		*mp = cur->bc_mp;
270 
271 	xfs_warn(mp,
272 		"%s Freespace BTree record corruption in AG %d detected at %pS!",
273 		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
274 		cur->bc_ag.pag->pag_agno, fa);
275 	xfs_warn(mp,
276 		"start block 0x%x block count 0x%x", irec->ar_startblock,
277 		irec->ar_blockcount);
278 	xfs_btree_mark_sick(cur);
279 	return -EFSCORRUPTED;
280 }
281 
282 /*
283  * Get the data from the pointed-to record.
284  */
285 int					/* error */
286 xfs_alloc_get_rec(
287 	struct xfs_btree_cur	*cur,	/* btree cursor */
288 	xfs_agblock_t		*bno,	/* output: starting block of extent */
289 	xfs_extlen_t		*len,	/* output: length of extent */
290 	int			*stat)	/* output: success/failure */
291 {
292 	struct xfs_alloc_rec_incore irec;
293 	union xfs_btree_rec	*rec;
294 	xfs_failaddr_t		fa;
295 	int			error;
296 
297 	error = xfs_btree_get_rec(cur, &rec, stat);
298 	if (error || !(*stat))
299 		return error;
300 
301 	xfs_alloc_btrec_to_irec(rec, &irec);
302 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
303 	if (fa)
304 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
305 
306 	*bno = irec.ar_startblock;
307 	*len = irec.ar_blockcount;
308 	return 0;
309 }
310 
311 /*
312  * Compute aligned version of the found extent.
313  * Takes alignment and min length into account.
314  */
315 STATIC bool
316 xfs_alloc_compute_aligned(
317 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
318 	xfs_agblock_t	foundbno,	/* starting block in found extent */
319 	xfs_extlen_t	foundlen,	/* length in found extent */
320 	xfs_agblock_t	*resbno,	/* result block number */
321 	xfs_extlen_t	*reslen,	/* result length */
322 	unsigned	*busy_gen)
323 {
324 	xfs_agblock_t	bno = foundbno;
325 	xfs_extlen_t	len = foundlen;
326 	xfs_extlen_t	diff;
327 	bool		busy;
328 
329 	/* Trim busy sections out of found extent */
330 	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
331 
332 	/*
333 	 * If we have a largish extent that happens to start before min_agbno,
334 	 * see if we can shift it into range...
335 	 */
336 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
337 		diff = args->min_agbno - bno;
338 		if (len > diff) {
339 			bno += diff;
340 			len -= diff;
341 		}
342 	}
343 
344 	if (args->alignment > 1 && len >= args->minlen) {
345 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
346 
347 		diff = aligned_bno - bno;
348 
349 		*resbno = aligned_bno;
350 		*reslen = diff >= len ? 0 : len - diff;
351 	} else {
352 		*resbno = bno;
353 		*reslen = len;
354 	}
355 
356 	return busy;
357 }
358 
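/*
 * Illustrative example (not from the original source): with
 * args->alignment == 8, args->minlen <= 20 and a busy-trimmed free extent of
 * [bno = 100, len = 20], the aligned start is roundup(100, 8) = 104, so
 * *resbno = 104 and *reslen = 20 - 4 = 16 blocks.
 */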
359 /*
360  * Compute best start block and diff for "near" allocations.
361  * freelen >= wantlen already checked by caller.
362  */
363 STATIC xfs_extlen_t			/* difference value (absolute) */
364 xfs_alloc_compute_diff(
365 	xfs_agblock_t	wantbno,	/* target starting block */
366 	xfs_extlen_t	wantlen,	/* target length */
367 	xfs_extlen_t	alignment,	/* target alignment */
368 	int		datatype,	/* are we allocating data? */
369 	xfs_agblock_t	freebno,	/* freespace's starting block */
370 	xfs_extlen_t	freelen,	/* freespace's length */
371 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
372 {
373 	xfs_agblock_t	freeend;	/* end of freespace extent */
374 	xfs_agblock_t	newbno1;	/* return block number */
375 	xfs_agblock_t	newbno2;	/* other new block number */
376 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
377 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
378 	xfs_agblock_t	wantend;	/* end of target extent */
379 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
380 
381 	ASSERT(freelen >= wantlen);
382 	freeend = freebno + freelen;
383 	wantend = wantbno + wantlen;
384 	/*
385 	 * We want to allocate from the start of a free extent if it is past
386 	 * the desired block or if we are allocating user data and the free
387 	 * extent is before desired block. The second case is there to allow
388 	 * for contiguous allocation from the remaining free space if the file
389 	 * grows in the short term.
390 	 */
391 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
392 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
393 			newbno1 = NULLAGBLOCK;
394 	} else if (freeend >= wantend && alignment > 1) {
395 		newbno1 = roundup(wantbno, alignment);
396 		newbno2 = newbno1 - alignment;
397 		if (newbno1 >= freeend)
398 			newbno1 = NULLAGBLOCK;
399 		else
400 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
401 		if (newbno2 < freebno)
402 			newbno2 = NULLAGBLOCK;
403 		else
404 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
405 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
406 			if (newlen1 < newlen2 ||
407 			    (newlen1 == newlen2 &&
408 			     XFS_ABSDIFF(newbno1, wantbno) >
409 			     XFS_ABSDIFF(newbno2, wantbno)))
410 				newbno1 = newbno2;
411 		} else if (newbno2 != NULLAGBLOCK)
412 			newbno1 = newbno2;
413 	} else if (freeend >= wantend) {
414 		newbno1 = wantbno;
415 	} else if (alignment > 1) {
416 		newbno1 = roundup(freeend - wantlen, alignment);
417 		if (newbno1 > freeend - wantlen &&
418 		    newbno1 - alignment >= freebno)
419 			newbno1 -= alignment;
420 		else if (newbno1 >= freeend)
421 			newbno1 = NULLAGBLOCK;
422 	} else
423 		newbno1 = freeend - wantlen;
424 	*newbnop = newbno1;
425 	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
426 }
427 
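/*
 * Illustrative example (not from the original source; metadata allocation,
 * alignment == 1): for a target of [wantbno = 50, wantlen = 10] and a free
 * extent of [freebno = 60, freelen = 30], the free extent starts past the
 * desired block, so the best start is freebno itself: *newbnop = 60 and the
 * returned diff is XFS_ABSDIFF(60, 50) = 10.
 */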
428 /*
429  * Fix up the length, based on mod and prod.
430  * len should be k * prod + mod for some k.
431  * If len is too small it is returned unchanged.
432  * If len hits maxlen it is left alone.
433  */
434 STATIC void
435 xfs_alloc_fix_len(
436 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
437 {
438 	xfs_extlen_t	k;
439 	xfs_extlen_t	rlen;
440 
441 	ASSERT(args->mod < args->prod);
442 	rlen = args->len;
443 	ASSERT(rlen >= args->minlen);
444 	ASSERT(rlen <= args->maxlen);
445 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
446 	    (args->mod == 0 && rlen < args->prod))
447 		return;
448 	k = rlen % args->prod;
449 	if (k == args->mod)
450 		return;
451 	if (k > args->mod)
452 		rlen = rlen - (k - args->mod);
453 	else
454 		rlen = rlen - args->prod + (args->mod - k);
455 	/* casts to (int) catch length underflows */
456 	if ((int)rlen < (int)args->minlen)
457 		return;
458 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
459 	ASSERT(rlen % args->prod == args->mod);
460 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
461 		rlen + args->minleft);
462 	args->len = rlen;
463 }
464 
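/*
 * Illustrative example (not from the original source): with args->prod = 4,
 * args->mod = 1, args->maxlen = 16, args->minlen <= 9 and a candidate length
 * of 11, k = 11 % 4 = 3 > mod, so the length is trimmed to 11 - (3 - 1) = 9,
 * which satisfies 9 % 4 == 1 (i.e. k * prod + mod with k = 2).
 */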
465 /*
466  * Update the two btrees, logically removing from freespace the extent
467  * starting at rbno, rlen blocks.  The extent is contained within the
468  * actual (current) free extent fbno for flen blocks.
469  * Flags are passed in indicating whether the cursors are set to the
470  * relevant records.
471  */
472 STATIC int				/* error code */
473 xfs_alloc_fixup_trees(
474 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
475 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
476 	xfs_agblock_t	fbno,		/* starting block of free extent */
477 	xfs_extlen_t	flen,		/* length of free extent */
478 	xfs_agblock_t	rbno,		/* starting block of returned extent */
479 	xfs_extlen_t	rlen,		/* length of returned extent */
480 	int		flags)		/* flags, XFSA_FIXUP_... */
481 {
482 	int		error;		/* error code */
483 	int		i;		/* operation results */
484 	xfs_agblock_t	nfbno1;		/* first new free startblock */
485 	xfs_agblock_t	nfbno2;		/* second new free startblock */
486 	xfs_extlen_t	nflen1=0;	/* first new free length */
487 	xfs_extlen_t	nflen2=0;	/* second new free length */
488 	struct xfs_mount *mp;
489 
490 	mp = cnt_cur->bc_mp;
491 
492 	/*
493 	 * Look up the record in the by-size tree if necessary.
494 	 */
495 	if (flags & XFSA_FIXUP_CNT_OK) {
496 #ifdef DEBUG
497 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
498 			return error;
499 		if (XFS_IS_CORRUPT(mp,
500 				   i != 1 ||
501 				   nfbno1 != fbno ||
502 				   nflen1 != flen)) {
503 			xfs_btree_mark_sick(cnt_cur);
504 			return -EFSCORRUPTED;
505 		}
506 #endif
507 	} else {
508 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
509 			return error;
510 		if (XFS_IS_CORRUPT(mp, i != 1)) {
511 			xfs_btree_mark_sick(cnt_cur);
512 			return -EFSCORRUPTED;
513 		}
514 	}
515 	/*
516 	 * Look up the record in the by-block tree if necessary.
517 	 */
518 	if (flags & XFSA_FIXUP_BNO_OK) {
519 #ifdef DEBUG
520 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
521 			return error;
522 		if (XFS_IS_CORRUPT(mp,
523 				   i != 1 ||
524 				   nfbno1 != fbno ||
525 				   nflen1 != flen)) {
526 			xfs_btree_mark_sick(bno_cur);
527 			return -EFSCORRUPTED;
528 		}
529 #endif
530 	} else {
531 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
532 			return error;
533 		if (XFS_IS_CORRUPT(mp, i != 1)) {
534 			xfs_btree_mark_sick(bno_cur);
535 			return -EFSCORRUPTED;
536 		}
537 	}
538 
539 #ifdef DEBUG
540 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
541 		struct xfs_btree_block	*bnoblock;
542 		struct xfs_btree_block	*cntblock;
543 
544 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
545 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
546 
547 		if (XFS_IS_CORRUPT(mp,
548 				   bnoblock->bb_numrecs !=
549 				   cntblock->bb_numrecs)) {
550 			xfs_btree_mark_sick(bno_cur);
551 			return -EFSCORRUPTED;
552 		}
553 	}
554 #endif
555 
556 	/*
557 	 * Deal with all four cases: the allocated record is contained
558 	 * within the freespace record, so we can have new freespace
559 	 * at either (or both) end, or no freespace remaining.
560 	 */
561 	if (rbno == fbno && rlen == flen)
562 		nfbno1 = nfbno2 = NULLAGBLOCK;
563 	else if (rbno == fbno) {
564 		nfbno1 = rbno + rlen;
565 		nflen1 = flen - rlen;
566 		nfbno2 = NULLAGBLOCK;
567 	} else if (rbno + rlen == fbno + flen) {
568 		nfbno1 = fbno;
569 		nflen1 = flen - rlen;
570 		nfbno2 = NULLAGBLOCK;
571 	} else {
572 		nfbno1 = fbno;
573 		nflen1 = rbno - fbno;
574 		nfbno2 = rbno + rlen;
575 		nflen2 = (fbno + flen) - nfbno2;
576 	}
577 	/*
578 	 * Delete the entry from the by-size btree.
579 	 */
580 	if ((error = xfs_btree_delete(cnt_cur, &i)))
581 		return error;
582 	if (XFS_IS_CORRUPT(mp, i != 1)) {
583 		xfs_btree_mark_sick(cnt_cur);
584 		return -EFSCORRUPTED;
585 	}
586 	/*
587 	 * Add new by-size btree entry(s).
588 	 */
589 	if (nfbno1 != NULLAGBLOCK) {
590 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
591 			return error;
592 		if (XFS_IS_CORRUPT(mp, i != 0)) {
593 			xfs_btree_mark_sick(cnt_cur);
594 			return -EFSCORRUPTED;
595 		}
596 		if ((error = xfs_btree_insert(cnt_cur, &i)))
597 			return error;
598 		if (XFS_IS_CORRUPT(mp, i != 1)) {
599 			xfs_btree_mark_sick(cnt_cur);
600 			return -EFSCORRUPTED;
601 		}
602 	}
603 	if (nfbno2 != NULLAGBLOCK) {
604 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
605 			return error;
606 		if (XFS_IS_CORRUPT(mp, i != 0)) {
607 			xfs_btree_mark_sick(cnt_cur);
608 			return -EFSCORRUPTED;
609 		}
610 		if ((error = xfs_btree_insert(cnt_cur, &i)))
611 			return error;
612 		if (XFS_IS_CORRUPT(mp, i != 1)) {
613 			xfs_btree_mark_sick(cnt_cur);
614 			return -EFSCORRUPTED;
615 		}
616 	}
617 	/*
618 	 * Fix up the by-block btree entry(s).
619 	 */
620 	if (nfbno1 == NULLAGBLOCK) {
621 		/*
622 		 * No remaining freespace, just delete the by-block tree entry.
623 		 */
624 		if ((error = xfs_btree_delete(bno_cur, &i)))
625 			return error;
626 		if (XFS_IS_CORRUPT(mp, i != 1)) {
627 			xfs_btree_mark_sick(bno_cur);
628 			return -EFSCORRUPTED;
629 		}
630 	} else {
631 		/*
632 		 * Update the by-block entry to start later|be shorter.
633 		 */
634 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
635 			return error;
636 	}
637 	if (nfbno2 != NULLAGBLOCK) {
638 		/*
639 		 * 2 resulting free entries, need to add one.
640 		 */
641 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
642 			return error;
643 		if (XFS_IS_CORRUPT(mp, i != 0)) {
644 			xfs_btree_mark_sick(bno_cur);
645 			return -EFSCORRUPTED;
646 		}
647 		if ((error = xfs_btree_insert(bno_cur, &i)))
648 			return error;
649 		if (XFS_IS_CORRUPT(mp, i != 1)) {
650 			xfs_btree_mark_sick(bno_cur);
651 			return -EFSCORRUPTED;
652 		}
653 	}
654 	return 0;
655 }
656 
657 /*
658  * We do not verify the AGFL contents against AGF-based index counters here,
659  * even though we may have access to the perag that contains shadow copies. We
660  * don't know if the AGF-based counters have been checked, and if they have,
661  * they may still be inconsistent because they haven't yet been reset on the first
662  * allocation after the AGF has been read in.
663  *
664  * This means we can only check that all agfl entries contain valid or null
665  * values because we can't reliably determine the active range to exclude
666  * NULLAGBNO as a valid value.
667  *
668  * However, we can't even do that for v4 format filesystems because there are
669  * old versions of mkfs out there that do not initialise the AGFL to known,
670  * verifiable values. Hence we can't tell the difference between an AGFL block
671  * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
672  *
673  * As a result, we can only fully validate AGFL block numbers when we pull them
674  * from the freelist in xfs_alloc_get_freelist().
675  */
676 static xfs_failaddr_t
677 xfs_agfl_verify(
678 	struct xfs_buf	*bp)
679 {
680 	struct xfs_mount *mp = bp->b_mount;
681 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
682 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
683 	int		i;
684 
685 	if (!xfs_has_crc(mp))
686 		return NULL;
687 
688 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
689 		return __this_address;
690 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
691 		return __this_address;
692 	/*
693 	 * during growfs operations, the perag is not fully initialised,
694 	 * so we can't use it for any useful checking. growfs ensures we can't
695 	 * use it by using uncached buffers that don't have the perag attached
696 	 * so we can detect and avoid this problem.
697 	 */
698 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
699 		return __this_address;
700 
701 	for (i = 0; i < xfs_agfl_size(mp); i++) {
702 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
703 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
704 			return __this_address;
705 	}
706 
707 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
708 		return __this_address;
709 	return NULL;
710 }
711 
712 static void
713 xfs_agfl_read_verify(
714 	struct xfs_buf	*bp)
715 {
716 	struct xfs_mount *mp = bp->b_mount;
717 	xfs_failaddr_t	fa;
718 
719 	/*
720 	 * There is no verification of non-crc AGFLs because mkfs does not
721 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
722 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
723 	 * can't verify just those entries are valid.
724 	 */
725 	if (!xfs_has_crc(mp))
726 		return;
727 
728 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
729 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
730 	else {
731 		fa = xfs_agfl_verify(bp);
732 		if (fa)
733 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
734 	}
735 }
736 
737 static void
738 xfs_agfl_write_verify(
739 	struct xfs_buf	*bp)
740 {
741 	struct xfs_mount	*mp = bp->b_mount;
742 	struct xfs_buf_log_item	*bip = bp->b_log_item;
743 	xfs_failaddr_t		fa;
744 
745 	/* no verification of non-crc AGFLs */
746 	if (!xfs_has_crc(mp))
747 		return;
748 
749 	fa = xfs_agfl_verify(bp);
750 	if (fa) {
751 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
752 		return;
753 	}
754 
755 	if (bip)
756 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
757 
758 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
759 }
760 
761 const struct xfs_buf_ops xfs_agfl_buf_ops = {
762 	.name = "xfs_agfl",
763 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
764 	.verify_read = xfs_agfl_read_verify,
765 	.verify_write = xfs_agfl_write_verify,
766 	.verify_struct = xfs_agfl_verify,
767 };
768 
769 /*
770  * Read in the allocation group free block array.
771  */
772 int
773 xfs_alloc_read_agfl(
774 	struct xfs_perag	*pag,
775 	struct xfs_trans	*tp,
776 	struct xfs_buf		**bpp)
777 {
778 	struct xfs_mount	*mp = pag->pag_mount;
779 	struct xfs_buf		*bp;
780 	int			error;
781 
782 	error = xfs_trans_read_buf(
783 			mp, tp, mp->m_ddev_targp,
784 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
785 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
786 	if (xfs_metadata_is_sick(error))
787 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
788 	if (error)
789 		return error;
790 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
791 	*bpp = bp;
792 	return 0;
793 }
794 
795 STATIC int
796 xfs_alloc_update_counters(
797 	struct xfs_trans	*tp,
798 	struct xfs_buf		*agbp,
799 	long			len)
800 {
801 	struct xfs_agf		*agf = agbp->b_addr;
802 
803 	agbp->b_pag->pagf_freeblks += len;
804 	be32_add_cpu(&agf->agf_freeblks, len);
805 
806 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
807 		     be32_to_cpu(agf->agf_length))) {
808 		xfs_buf_mark_corrupt(agbp);
809 		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
810 		return -EFSCORRUPTED;
811 	}
812 
813 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
814 	return 0;
815 }
816 
817 /*
818  * Block allocation algorithm and data structures.
819  */
820 struct xfs_alloc_cur {
821 	struct xfs_btree_cur		*cnt;	/* btree cursors */
822 	struct xfs_btree_cur		*bnolt;
823 	struct xfs_btree_cur		*bnogt;
824 	xfs_extlen_t			cur_len;/* current search length */
825 	xfs_agblock_t			rec_bno;/* extent startblock */
826 	xfs_extlen_t			rec_len;/* extent length */
827 	xfs_agblock_t			bno;	/* alloc bno */
828 	xfs_extlen_t			len;	/* alloc len */
829 	xfs_extlen_t			diff;	/* diff from search bno */
830 	unsigned int			busy_gen;/* busy state */
831 	bool				busy;
832 };
833 
834 /*
835  * Set up cursors, etc. in the extent allocation cursor. This function can be
836  * called multiple times to reset an initialized structure without having to
837  * reallocate cursors.
838  */
839 static int
840 xfs_alloc_cur_setup(
841 	struct xfs_alloc_arg	*args,
842 	struct xfs_alloc_cur	*acur)
843 {
844 	int			error;
845 	int			i;
846 
847 	acur->cur_len = args->maxlen;
848 	acur->rec_bno = 0;
849 	acur->rec_len = 0;
850 	acur->bno = 0;
851 	acur->len = 0;
852 	acur->diff = -1;
853 	acur->busy = false;
854 	acur->busy_gen = 0;
855 
856 	/*
857 	 * Perform an initial cntbt lookup to check for availability of maxlen
858 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
859 	 * attempt a small allocation.
860 	 */
861 	if (!acur->cnt)
862 		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
863 					args->agbp, args->pag, XFS_BTNUM_CNT);
864 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
865 	if (error)
866 		return error;
867 
868 	/*
869 	 * Allocate the bnobt left and right search cursors.
870 	 */
871 	if (!acur->bnolt)
872 		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
873 					args->agbp, args->pag, XFS_BTNUM_BNO);
874 	if (!acur->bnogt)
875 		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
876 					args->agbp, args->pag, XFS_BTNUM_BNO);
877 	return i == 1 ? 0 : -ENOSPC;
878 }
879 
880 static void
881 xfs_alloc_cur_close(
882 	struct xfs_alloc_cur	*acur,
883 	bool			error)
884 {
885 	int			cur_error = XFS_BTREE_NOERROR;
886 
887 	if (error)
888 		cur_error = XFS_BTREE_ERROR;
889 
890 	if (acur->cnt)
891 		xfs_btree_del_cursor(acur->cnt, cur_error);
892 	if (acur->bnolt)
893 		xfs_btree_del_cursor(acur->bnolt, cur_error);
894 	if (acur->bnogt)
895 		xfs_btree_del_cursor(acur->bnogt, cur_error);
896 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
897 }
898 
899 /*
900  * Check an extent for allocation and track the best available candidate in the
901  * allocation structure. The cursor is deactivated if it has entered an out of
902  * range state based on allocation arguments. Optionally return the extent
903  * geometry and allocation status if requested by the caller.
904  */
905 static int
906 xfs_alloc_cur_check(
907 	struct xfs_alloc_arg	*args,
908 	struct xfs_alloc_cur	*acur,
909 	struct xfs_btree_cur	*cur,
910 	int			*new)
911 {
912 	int			error, i;
913 	xfs_agblock_t		bno, bnoa, bnew;
914 	xfs_extlen_t		len, lena, diff = -1;
915 	bool			busy;
916 	unsigned		busy_gen = 0;
917 	bool			deactivate = false;
918 	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
919 
920 	*new = 0;
921 
922 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
923 	if (error)
924 		return error;
925 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
926 		xfs_btree_mark_sick(cur);
927 		return -EFSCORRUPTED;
928 	}
929 
930 	/*
931 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
932 	 * range (i.e., walking backwards looking for a minlen extent).
933 	 */
934 	if (len < args->minlen) {
935 		deactivate = !isbnobt;
936 		goto out;
937 	}
938 
939 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
940 					 &busy_gen);
941 	acur->busy |= busy;
942 	if (busy)
943 		acur->busy_gen = busy_gen;
944 	/* deactivate a bnobt cursor outside of locality range */
945 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
946 		deactivate = isbnobt;
947 		goto out;
948 	}
949 	if (lena < args->minlen)
950 		goto out;
951 
952 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
953 	xfs_alloc_fix_len(args);
954 	ASSERT(args->len >= args->minlen);
955 	if (args->len < acur->len)
956 		goto out;
957 
958 	/*
959 	 * We have an aligned record that satisfies minlen and beats or matches
960 	 * the candidate extent size. Compare locality for near allocation mode.
961 	 */
962 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
963 				      args->alignment, args->datatype,
964 				      bnoa, lena, &bnew);
965 	if (bnew == NULLAGBLOCK)
966 		goto out;
967 
968 	/*
969 	 * Deactivate a bnobt cursor with worse locality than the current best.
970 	 */
971 	if (diff > acur->diff) {
972 		deactivate = isbnobt;
973 		goto out;
974 	}
975 
976 	ASSERT(args->len > acur->len ||
977 	       (args->len == acur->len && diff <= acur->diff));
978 	acur->rec_bno = bno;
979 	acur->rec_len = len;
980 	acur->bno = bnew;
981 	acur->len = args->len;
982 	acur->diff = diff;
983 	*new = 1;
984 
985 	/*
986 	 * We're done if we found a perfect allocation. This only deactivates
987 	 * the current cursor, but this is just an optimization to terminate a
988 	 * cntbt search that otherwise runs to the edge of the tree.
989 	 */
990 	if (acur->diff == 0 && acur->len == args->maxlen)
991 		deactivate = true;
992 out:
993 	if (deactivate)
994 		cur->bc_ag.abt.active = false;
995 	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
996 				  *new);
997 	return 0;
998 }
999 
1000 /*
1001  * Complete an allocation of a candidate extent. Remove the extent from both
1002  * trees and update the args structure.
1003  */
1004 STATIC int
1005 xfs_alloc_cur_finish(
1006 	struct xfs_alloc_arg	*args,
1007 	struct xfs_alloc_cur	*acur)
1008 {
1009 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1010 	int			error;
1011 
1012 	ASSERT(acur->cnt && acur->bnolt);
1013 	ASSERT(acur->bno >= acur->rec_bno);
1014 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
1015 	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
1016 
1017 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
1018 				      acur->rec_len, acur->bno, acur->len, 0);
1019 	if (error)
1020 		return error;
1021 
1022 	args->agbno = acur->bno;
1023 	args->len = acur->len;
1024 	args->wasfromfl = 0;
1025 
1026 	trace_xfs_alloc_cur(args);
1027 	return 0;
1028 }
1029 
1030 /*
1031  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
1032  * bno optimized lookup to search for extents with ideal size and locality.
1033  */
1034 STATIC int
1035 xfs_alloc_cntbt_iter(
1036 	struct xfs_alloc_arg		*args,
1037 	struct xfs_alloc_cur		*acur)
1038 {
1039 	struct xfs_btree_cur	*cur = acur->cnt;
1040 	xfs_agblock_t		bno;
1041 	xfs_extlen_t		len, cur_len;
1042 	int			error;
1043 	int			i;
1044 
1045 	if (!xfs_alloc_cur_active(cur))
1046 		return 0;
1047 
1048 	/* locality optimized lookup */
1049 	cur_len = acur->cur_len;
1050 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1051 	if (error)
1052 		return error;
1053 	if (i == 0)
1054 		return 0;
1055 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1056 	if (error)
1057 		return error;
1058 
1059 	/* check the current record and update search length from it */
1060 	error = xfs_alloc_cur_check(args, acur, cur, &i);
1061 	if (error)
1062 		return error;
1063 	ASSERT(len >= acur->cur_len);
1064 	acur->cur_len = len;
1065 
1066 	/*
1067 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1068 	 * secondary key and so the current record may lie just before or after
1069 	 * agbno. If it is past agbno, check the previous record too so long as
1070 	 * the length matches as it may be closer. Don't check a smaller record
1071 	 * because that could deactivate our cursor.
1072 	 */
1073 	if (bno > args->agbno) {
1074 		error = xfs_btree_decrement(cur, 0, &i);
1075 		if (!error && i) {
1076 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1077 			if (!error && i && len == acur->cur_len)
1078 				error = xfs_alloc_cur_check(args, acur, cur,
1079 							    &i);
1080 		}
1081 		if (error)
1082 			return error;
1083 	}
1084 
1085 	/*
1086 	 * Increment the search key by one if we haven't yet found an allocation
1087 	 * candidate, or if the extent we just found already meets or exceeds the
1088 	 * doubled key. Otherwise, double the search key to speed up the search.
1089 	 * Efficiency is more important here than absolute best locality.
1090 	 */
1091 	cur_len <<= 1;
1092 	if (!acur->len || acur->cur_len >= cur_len)
1093 		acur->cur_len++;
1094 	else
1095 		acur->cur_len = cur_len;
1096 
1097 	return error;
1098 }
1099 
1100 /*
1101  * Deal with the case where only small freespaces remain. Either return the
1102  * contents of the last freespace record, or allocate space from the freelist if
1103  * there is nothing in the tree.
1104  */
1105 STATIC int			/* error */
1106 xfs_alloc_ag_vextent_small(
1107 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1108 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1109 	xfs_agblock_t		*fbnop,	/* result block number */
1110 	xfs_extlen_t		*flenp,	/* result length */
1111 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1112 {
1113 	struct xfs_agf		*agf = args->agbp->b_addr;
1114 	int			error = 0;
1115 	xfs_agblock_t		fbno = NULLAGBLOCK;
1116 	xfs_extlen_t		flen = 0;
1117 	int			i = 0;
1118 
1119 	/*
1120 	 * If a cntbt cursor is provided, try to allocate the largest record in
1121 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1122 	 * allocation. Make sure to respect minleft even when pulling from the
1123 	 * freelist.
1124 	 */
1125 	if (ccur)
1126 		error = xfs_btree_decrement(ccur, 0, &i);
1127 	if (error)
1128 		goto error;
1129 	if (i) {
1130 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1131 		if (error)
1132 			goto error;
1133 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1134 			xfs_btree_mark_sick(ccur);
1135 			error = -EFSCORRUPTED;
1136 			goto error;
1137 		}
1138 		goto out;
1139 	}
1140 
1141 	if (args->minlen != 1 || args->alignment != 1 ||
1142 	    args->resv == XFS_AG_RESV_AGFL ||
1143 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1144 		goto out;
1145 
1146 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1147 			&fbno, 0);
1148 	if (error)
1149 		goto error;
1150 	if (fbno == NULLAGBLOCK)
1151 		goto out;
1152 
1153 	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1154 			      (args->datatype & XFS_ALLOC_NOBUSY));
1155 
1156 	if (args->datatype & XFS_ALLOC_USERDATA) {
1157 		struct xfs_buf	*bp;
1158 
1159 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1160 				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1161 				args->mp->m_bsize, 0, &bp);
1162 		if (error)
1163 			goto error;
1164 		xfs_trans_binval(args->tp, bp);
1165 	}
1166 	*fbnop = args->agbno = fbno;
1167 	*flenp = args->len = 1;
1168 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1169 		xfs_btree_mark_sick(ccur);
1170 		error = -EFSCORRUPTED;
1171 		goto error;
1172 	}
1173 	args->wasfromfl = 1;
1174 	trace_xfs_alloc_small_freelist(args);
1175 
1176 	/*
1177 	 * If we're feeding an AGFL block to something that doesn't live in the
1178 	 * free space, we need to clear out the OWN_AG rmap.
1179 	 */
1180 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1181 			      &XFS_RMAP_OINFO_AG);
1182 	if (error)
1183 		goto error;
1184 
1185 	*stat = 0;
1186 	return 0;
1187 
1188 out:
1189 	/*
1190 	 * Can't do the allocation, give up.
1191 	 */
1192 	if (flen < args->minlen) {
1193 		args->agbno = NULLAGBLOCK;
1194 		trace_xfs_alloc_small_notenough(args);
1195 		flen = 0;
1196 	}
1197 	*fbnop = fbno;
1198 	*flenp = flen;
1199 	*stat = 1;
1200 	trace_xfs_alloc_small_done(args);
1201 	return 0;
1202 
1203 error:
1204 	trace_xfs_alloc_small_error(args);
1205 	return error;
1206 }
1207 
1208 /*
1209  * Allocate a variable extent at exactly agno/bno.
1210  * Extent's length (returned in *len) will be between minlen and maxlen,
1211  * and of the form k * prod + mod unless there's nothing that large.
1212  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1213  */
1214 STATIC int			/* error */
1215 xfs_alloc_ag_vextent_exact(
1216 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1217 {
1218 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1219 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1220 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1221 	int		error;
1222 	xfs_agblock_t	fbno;	/* start block of found extent */
1223 	xfs_extlen_t	flen;	/* length of found extent */
1224 	xfs_agblock_t	tbno;	/* start block of busy extent */
1225 	xfs_extlen_t	tlen;	/* length of busy extent */
1226 	xfs_agblock_t	tend;	/* end block of busy extent */
1227 	int		i;	/* success/failure of operation */
1228 	unsigned	busy_gen;
1229 
1230 	ASSERT(args->alignment == 1);
1231 
1232 	/*
1233 	 * Allocate/initialize a cursor for the by-number freespace btree.
1234 	 */
1235 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1236 					  args->pag, XFS_BTNUM_BNO);
1237 
1238 	/*
1239 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1240 	 * Look for the closest free block <= bno, it must contain bno
1241 	 * if any free block does.
1242 	 */
1243 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1244 	if (error)
1245 		goto error0;
1246 	if (!i)
1247 		goto not_found;
1248 
1249 	/*
1250 	 * Grab the freespace record.
1251 	 */
1252 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1253 	if (error)
1254 		goto error0;
1255 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1256 		xfs_btree_mark_sick(bno_cur);
1257 		error = -EFSCORRUPTED;
1258 		goto error0;
1259 	}
1260 	ASSERT(fbno <= args->agbno);
1261 
1262 	/*
1263 	 * Check for overlapping busy extents.
1264 	 */
1265 	tbno = fbno;
1266 	tlen = flen;
1267 	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1268 
1269 	/*
1270 	 * Give up if the start of the extent is busy, or the freespace isn't
1271 	 * long enough for the minimum request.
1272 	 */
1273 	if (tbno > args->agbno)
1274 		goto not_found;
1275 	if (tlen < args->minlen)
1276 		goto not_found;
1277 	tend = tbno + tlen;
1278 	if (tend < args->agbno + args->minlen)
1279 		goto not_found;
1280 
1281 	/*
1282 	 * End of extent will be smaller of the freespace end and the
1283 	 * maximal requested end.
1284 	 *
1285 	 * Fix the length according to mod and prod if given.
1286 	 */
1287 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1288 						- args->agbno;
1289 	xfs_alloc_fix_len(args);
1290 	ASSERT(args->agbno + args->len <= tend);
1291 
1292 	/*
1293 	 * We are allocating agbno for args->len
1294 	 * Allocate/initialize a cursor for the by-size btree.
1295 	 */
1296 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1297 					args->pag, XFS_BTNUM_CNT);
1298 	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1299 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1300 				      args->len, XFSA_FIXUP_BNO_OK);
1301 	if (error) {
1302 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1303 		goto error0;
1304 	}
1305 
1306 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1307 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1308 
1309 	args->wasfromfl = 0;
1310 	trace_xfs_alloc_exact_done(args);
1311 	return 0;
1312 
1313 not_found:
1314 	/* Didn't find it, return null. */
1315 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1316 	args->agbno = NULLAGBLOCK;
1317 	trace_xfs_alloc_exact_notfound(args);
1318 	return 0;
1319 
1320 error0:
1321 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1322 	trace_xfs_alloc_exact_error(args);
1323 	return error;
1324 }
1325 
1326 /*
1327  * Search a given number of btree records in a given direction. Check each
1328  * record against the good extent we've already found.
1329  */
1330 STATIC int
1331 xfs_alloc_walk_iter(
1332 	struct xfs_alloc_arg	*args,
1333 	struct xfs_alloc_cur	*acur,
1334 	struct xfs_btree_cur	*cur,
1335 	bool			increment,
1336 	bool			find_one, /* quit on first candidate */
1337 	int			count,    /* rec count (-1 for infinite) */
1338 	int			*stat)
1339 {
1340 	int			error;
1341 	int			i;
1342 
1343 	*stat = 0;
1344 
1345 	/*
1346 	 * Search so long as the cursor is active or we find a better extent.
1347 	 * The cursor is deactivated if it extends beyond the range of the
1348 	 * current allocation candidate.
1349 	 */
1350 	while (xfs_alloc_cur_active(cur) && count) {
1351 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1352 		if (error)
1353 			return error;
1354 		if (i == 1) {
1355 			*stat = 1;
1356 			if (find_one)
1357 				break;
1358 		}
1359 		if (!xfs_alloc_cur_active(cur))
1360 			break;
1361 
1362 		if (increment)
1363 			error = xfs_btree_increment(cur, 0, &i);
1364 		else
1365 			error = xfs_btree_decrement(cur, 0, &i);
1366 		if (error)
1367 			return error;
1368 		if (i == 0)
1369 			cur->bc_ag.abt.active = false;
1370 
1371 		if (count > 0)
1372 			count--;
1373 	}
1374 
1375 	return 0;
1376 }
1377 
1378 /*
1379  * Search the by-bno and by-size btrees in parallel in search of an extent with
1380  * ideal locality based on the NEAR mode ->agbno locality hint.
1381  */
1382 STATIC int
1383 xfs_alloc_ag_vextent_locality(
1384 	struct xfs_alloc_arg	*args,
1385 	struct xfs_alloc_cur	*acur,
1386 	int			*stat)
1387 {
1388 	struct xfs_btree_cur	*fbcur = NULL;
1389 	int			error;
1390 	int			i;
1391 	bool			fbinc;
1392 
1393 	ASSERT(acur->len == 0);
1394 
1395 	*stat = 0;
1396 
1397 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1398 	if (error)
1399 		return error;
1400 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1401 	if (error)
1402 		return error;
1403 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1404 	if (error)
1405 		return error;
1406 
1407 	/*
1408 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1409 	 * right and lookup the closest extent to the locality hint for each
1410 	 * extent size key in the cntbt. The entire search terminates
1411 	 * immediately on a bnobt hit because that means we've found best case
1412 	 * locality. Otherwise the search continues until the cntbt cursor runs
1413 	 * off the end of the tree. If no allocation candidate is found at this
1414 	 * point, give up on locality, walk backwards from the end of the cntbt
1415 	 * and take the first available extent.
1416 	 *
1417 	 * The parallel tree searches balance each other out to provide fairly
1418 	 * consistent performance for various situations. The bnobt search can
1419 	 * have pathological behavior in the worst case scenario of larger
1420 	 * allocation requests and fragmented free space. On the other hand, the
1421 	 * bnobt is able to satisfy most smaller allocation requests much more
1422 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1423 	 * free space and sets of free extents for larger allocation requests
1424 	 * more quickly than the bnobt. Since the locality hint is just a hint
1425 	 * and we don't want to scan the entire bnobt for perfect locality, the
1426 	 * cntbt search essentially bounds the bnobt search such that we can
1427 	 * find good enough locality at reasonable performance in most cases.
1428 	 */
1429 	while (xfs_alloc_cur_active(acur->bnolt) ||
1430 	       xfs_alloc_cur_active(acur->bnogt) ||
1431 	       xfs_alloc_cur_active(acur->cnt)) {
1432 
1433 		trace_xfs_alloc_cur_lookup(args);
1434 
1435 		/*
1436 		 * Search the bnobt left and right. In the case of a hit, finish
1437 		 * the search in the opposite direction and we're done.
1438 		 */
1439 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1440 					    true, 1, &i);
1441 		if (error)
1442 			return error;
1443 		if (i == 1) {
1444 			trace_xfs_alloc_cur_left(args);
1445 			fbcur = acur->bnogt;
1446 			fbinc = true;
1447 			break;
1448 		}
1449 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1450 					    1, &i);
1451 		if (error)
1452 			return error;
1453 		if (i == 1) {
1454 			trace_xfs_alloc_cur_right(args);
1455 			fbcur = acur->bnolt;
1456 			fbinc = false;
1457 			break;
1458 		}
1459 
1460 		/*
1461 		 * Check the extent with best locality based on the current
1462 		 * extent size search key and keep track of the best candidate.
1463 		 */
1464 		error = xfs_alloc_cntbt_iter(args, acur);
1465 		if (error)
1466 			return error;
1467 		if (!xfs_alloc_cur_active(acur->cnt)) {
1468 			trace_xfs_alloc_cur_lookup_done(args);
1469 			break;
1470 		}
1471 	}
1472 
1473 	/*
1474 	 * If we failed to find anything due to busy extents, return empty
1475 	 * handed so the caller can flush and retry. If no busy extents were
1476 	 * found, walk backwards from the end of the cntbt as a last resort.
1477 	 */
1478 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1479 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1480 		if (error)
1481 			return error;
1482 		if (i) {
1483 			acur->cnt->bc_ag.abt.active = true;
1484 			fbcur = acur->cnt;
1485 			fbinc = false;
1486 		}
1487 	}
1488 
1489 	/*
1490 	 * Search in the opposite direction for a better entry in the case of
1491 	 * a bnobt hit or walk backwards from the end of the cntbt.
1492 	 */
1493 	if (fbcur) {
1494 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1495 					    &i);
1496 		if (error)
1497 			return error;
1498 	}
1499 
1500 	if (acur->len)
1501 		*stat = 1;
1502 
1503 	return 0;
1504 }
1505 
1506 /* Check the last block of the cnt btree for allocations. */
1507 static int
1508 xfs_alloc_ag_vextent_lastblock(
1509 	struct xfs_alloc_arg	*args,
1510 	struct xfs_alloc_cur	*acur,
1511 	xfs_agblock_t		*bno,
1512 	xfs_extlen_t		*len,
1513 	bool			*allocated)
1514 {
1515 	int			error;
1516 	int			i;
1517 
1518 #ifdef DEBUG
1519 	/* Randomly don't execute the first algorithm. */
1520 	if (get_random_u32_below(2))
1521 		return 0;
1522 #endif
1523 
1524 	/*
1525 	 * Start from the entry that lookup found, sequence through all larger
1526 	 * free blocks.  If we're actually pointing at a record smaller than
1527 	 * maxlen, go to the start of this block, and skip all those smaller
1528 	 * than minlen.
1529 	 */
1530 	if (*len || args->alignment > 1) {
1531 		acur->cnt->bc_levels[0].ptr = 1;
1532 		do {
1533 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1534 			if (error)
1535 				return error;
1536 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1537 				xfs_btree_mark_sick(acur->cnt);
1538 				return -EFSCORRUPTED;
1539 			}
1540 			if (*len >= args->minlen)
1541 				break;
1542 			error = xfs_btree_increment(acur->cnt, 0, &i);
1543 			if (error)
1544 				return error;
1545 		} while (i);
1546 		ASSERT(*len >= args->minlen);
1547 		if (!i)
1548 			return 0;
1549 	}
1550 
1551 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1552 	if (error)
1553 		return error;
1554 
1555 	/*
1556 	 * It didn't work.  We COULD be in a case where there's a good record
1557 	 * somewhere, so try again.
1558 	 */
1559 	if (acur->len == 0)
1560 		return 0;
1561 
1562 	trace_xfs_alloc_near_first(args);
1563 	*allocated = true;
1564 	return 0;
1565 }
1566 
1567 /*
1568  * Allocate a variable extent near bno in the allocation group agno.
1569  * Extent's length (returned in len) will be between minlen and maxlen,
1570  * and of the form k * prod + mod unless there's nothing that large.
1571  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1572  */
1573 STATIC int
1574 xfs_alloc_ag_vextent_near(
1575 	struct xfs_alloc_arg	*args,
1576 	uint32_t		alloc_flags)
1577 {
1578 	struct xfs_alloc_cur	acur = {};
1579 	int			error;		/* error code */
1580 	int			i;		/* result code, temporary */
1581 	xfs_agblock_t		bno;
1582 	xfs_extlen_t		len;
1583 
1584 	/* handle uninitialized agbno range so caller doesn't have to */
1585 	if (!args->min_agbno && !args->max_agbno)
1586 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1587 	ASSERT(args->min_agbno <= args->max_agbno);
1588 
1589 	/* clamp agbno to the range if it's outside */
1590 	if (args->agbno < args->min_agbno)
1591 		args->agbno = args->min_agbno;
1592 	if (args->agbno > args->max_agbno)
1593 		args->agbno = args->max_agbno;
1594 
1595 	/* Retry once quickly if we find busy extents before blocking. */
1596 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1597 restart:
1598 	len = 0;
1599 
1600 	/*
1601 	 * Set up cursors and see if there are any free extents as big as
1602 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1603 	 * empty.
1604 	 */
1605 	error = xfs_alloc_cur_setup(args, &acur);
1606 	if (error == -ENOSPC) {
1607 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1608 				&len, &i);
1609 		if (error)
1610 			goto out;
1611 		if (i == 0 || len == 0) {
1612 			trace_xfs_alloc_near_noentry(args);
1613 			goto out;
1614 		}
1615 		ASSERT(i == 1);
1616 	} else if (error) {
1617 		goto out;
1618 	}
1619 
1620 	/*
1621 	 * First algorithm.
1622 	 * If the requested extent is large wrt the freespaces available
1623 	 * in this a.g., then the cursor will be pointing to a btree entry
1624 	 * near the right edge of the tree.  If it's in the last btree leaf
1625 	 * block, then we just examine all the entries in that block
1626 	 * that are big enough, and pick the best one.
1627 	 */
1628 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1629 		bool		allocated = false;
1630 
1631 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1632 				&allocated);
1633 		if (error)
1634 			goto out;
1635 		if (allocated)
1636 			goto alloc_finish;
1637 	}
1638 
1639 	/*
1640 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1641 	 * locality.
1642 	 */
1643 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1644 	if (error)
1645 		goto out;
1646 
1647 	/*
1648 	 * If we couldn't get anything, give up.
1649 	 */
1650 	if (!acur.len) {
1651 		if (acur.busy) {
1652 			/*
1653 			 * Our only valid extents must have been busy. Flush and
1654 			 * retry the allocation again. If we get an -EAGAIN
1655 			 * error, we're being told that a deadlock was avoided
1656 			 * and the current transaction needs committing before
1657 			 * the allocation can be retried.
1658 			 */
1659 			trace_xfs_alloc_near_busy(args);
1660 			error = xfs_extent_busy_flush(args->tp, args->pag,
1661 					acur.busy_gen, alloc_flags);
1662 			if (error)
1663 				goto out;
1664 
1665 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1666 			goto restart;
1667 		}
1668 		trace_xfs_alloc_size_neither(args);
1669 		args->agbno = NULLAGBLOCK;
1670 		goto out;
1671 	}
1672 
1673 alloc_finish:
1674 	/* fix up btrees on a successful allocation */
1675 	error = xfs_alloc_cur_finish(args, &acur);
1676 
1677 out:
1678 	xfs_alloc_cur_close(&acur, error);
1679 	return error;
1680 }
1681 
1682 /*
1683  * Allocate a variable extent anywhere in the allocation group agno.
1684  * Extent's length (returned in len) will be between minlen and maxlen,
1685  * and of the form k * prod + mod unless there's nothing that large.
1686  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1687  */
1688 static int
1689 xfs_alloc_ag_vextent_size(
1690 	struct xfs_alloc_arg	*args,
1691 	uint32_t		alloc_flags)
1692 {
1693 	struct xfs_agf		*agf = args->agbp->b_addr;
1694 	struct xfs_btree_cur	*bno_cur;
1695 	struct xfs_btree_cur	*cnt_cur;
1696 	xfs_agblock_t		fbno;		/* start of found freespace */
1697 	xfs_extlen_t		flen;		/* length of found freespace */
1698 	xfs_agblock_t		rbno;		/* returned block number */
1699 	xfs_extlen_t		rlen;		/* length of returned extent */
1700 	bool			busy;
1701 	unsigned		busy_gen;
1702 	int			error;
1703 	int			i;
1704 
1705 	/* Retry once quickly if we find busy extents before blocking. */
1706 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1707 restart:
1708 	/*
1709 	 * Allocate and initialize a cursor for the by-size btree.
1710 	 */
1711 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1712 					args->pag, XFS_BTNUM_CNT);
1713 	bno_cur = NULL;
1714 
1715 	/*
1716 	 * Look for an entry >= maxlen+alignment-1 blocks.
1717 	 */
1718 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1719 			args->maxlen + args->alignment - 1, &i)))
1720 		goto error0;
1721 
1722 	/*
1723 	 * If none then we have to settle for a smaller extent. In the case that
1724 	 * there are no large extents, this will return the last entry in the
1725 	 * tree unless the tree is empty. In the case that there are only busy
1726 	 * large extents, this will return the largest small extent unless there
1727 	 * are no smaller extents available.
1728 	 */
1729 	if (!i) {
1730 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1731 						   &fbno, &flen, &i);
1732 		if (error)
1733 			goto error0;
1734 		if (i == 0 || flen == 0) {
1735 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1736 			trace_xfs_alloc_size_noentry(args);
1737 			return 0;
1738 		}
1739 		ASSERT(i == 1);
1740 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1741 				&rlen, &busy_gen);
1742 	} else {
1743 		/*
1744 		 * Search for a non-busy extent that is large enough.
1745 		 */
1746 		for (;;) {
1747 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1748 			if (error)
1749 				goto error0;
1750 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1751 				xfs_btree_mark_sick(cnt_cur);
1752 				error = -EFSCORRUPTED;
1753 				goto error0;
1754 			}
1755 
1756 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1757 					&rbno, &rlen, &busy_gen);
1758 
1759 			if (rlen >= args->maxlen)
1760 				break;
1761 
1762 			error = xfs_btree_increment(cnt_cur, 0, &i);
1763 			if (error)
1764 				goto error0;
1765 			if (i)
1766 				continue;
1767 
1768 			/*
1769 			 * Our only valid extents must have been busy. Flush and
1770 			 * retry the allocation again. If we get an -EAGAIN
1771 			 * error, we're being told that a deadlock was avoided
1772 			 * and the current transaction needs committing before
1773 			 * the allocation can be retried.
1774 			 */
1775 			trace_xfs_alloc_size_busy(args);
1776 			error = xfs_extent_busy_flush(args->tp, args->pag,
1777 					busy_gen, alloc_flags);
1778 			if (error)
1779 				goto error0;
1780 
1781 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1782 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1783 			goto restart;
1784 		}
1785 	}
1786 
1787 	/*
1788 	 * In the first case above, we got the last entry in the
1789 	 * by-size btree.  Now we check to see if the space hits maxlen
1790 	 * once aligned; if not, we search left for something better.
1791 	 * This can't happen in the second case above.
1792 	 */
1793 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1794 	if (XFS_IS_CORRUPT(args->mp,
1795 			   rlen != 0 &&
1796 			   (rlen > flen ||
1797 			    rbno + rlen > fbno + flen))) {
1798 		xfs_btree_mark_sick(cnt_cur);
1799 		error = -EFSCORRUPTED;
1800 		goto error0;
1801 	}
1802 	if (rlen < args->maxlen) {
1803 		xfs_agblock_t	bestfbno;
1804 		xfs_extlen_t	bestflen;
1805 		xfs_agblock_t	bestrbno;
1806 		xfs_extlen_t	bestrlen;
1807 
1808 		bestrlen = rlen;
1809 		bestrbno = rbno;
1810 		bestflen = flen;
1811 		bestfbno = fbno;
1812 		for (;;) {
1813 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1814 				goto error0;
1815 			if (i == 0)
1816 				break;
1817 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1818 					&i)))
1819 				goto error0;
1820 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1821 				xfs_btree_mark_sick(cnt_cur);
1822 				error = -EFSCORRUPTED;
1823 				goto error0;
1824 			}
1825 			if (flen < bestrlen)
1826 				break;
1827 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1828 					&rbno, &rlen, &busy_gen);
1829 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1830 			if (XFS_IS_CORRUPT(args->mp,
1831 					   rlen != 0 &&
1832 					   (rlen > flen ||
1833 					    rbno + rlen > fbno + flen))) {
1834 				xfs_btree_mark_sick(cnt_cur);
1835 				error = -EFSCORRUPTED;
1836 				goto error0;
1837 			}
1838 			if (rlen > bestrlen) {
1839 				bestrlen = rlen;
1840 				bestrbno = rbno;
1841 				bestflen = flen;
1842 				bestfbno = fbno;
1843 				if (rlen == args->maxlen)
1844 					break;
1845 			}
1846 		}
1847 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1848 				&i)))
1849 			goto error0;
1850 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1851 			xfs_btree_mark_sick(cnt_cur);
1852 			error = -EFSCORRUPTED;
1853 			goto error0;
1854 		}
1855 		rlen = bestrlen;
1856 		rbno = bestrbno;
1857 		flen = bestflen;
1858 		fbno = bestfbno;
1859 	}
1860 	args->wasfromfl = 0;
1861 	/*
1862 	 * Fix up the length.
1863 	 */
1864 	args->len = rlen;
1865 	if (rlen < args->minlen) {
1866 		if (busy) {
1867 			/*
1868 			 * Our only valid extents must have been busy. Flush and
1869 			 * retry the allocation again. If we get an -EAGAIN
1870 			 * error, we're being told that a deadlock was avoided
1871 			 * and the current transaction needs committing before
1872 			 * the allocation can be retried.
1873 			 */
1874 			trace_xfs_alloc_size_busy(args);
1875 			error = xfs_extent_busy_flush(args->tp, args->pag,
1876 					busy_gen, alloc_flags);
1877 			if (error)
1878 				goto error0;
1879 
1880 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1881 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1882 			goto restart;
1883 		}
1884 		goto out_nominleft;
1885 	}
1886 	xfs_alloc_fix_len(args);
1887 
1888 	rlen = args->len;
1889 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1890 		xfs_btree_mark_sick(cnt_cur);
1891 		error = -EFSCORRUPTED;
1892 		goto error0;
1893 	}
1894 	/*
1895 	 * Allocate and initialize a cursor for the by-block tree.
1896 	 */
1897 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1898 					args->pag, XFS_BTNUM_BNO);
1899 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1900 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1901 		goto error0;
1902 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1903 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1904 	cnt_cur = bno_cur = NULL;
1905 	args->len = rlen;
1906 	args->agbno = rbno;
1907 	if (XFS_IS_CORRUPT(args->mp,
1908 			   args->agbno + args->len >
1909 			   be32_to_cpu(agf->agf_length))) {
1910 		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
1911 		error = -EFSCORRUPTED;
1912 		goto error0;
1913 	}
1914 	trace_xfs_alloc_size_done(args);
1915 	return 0;
1916 
1917 error0:
1918 	trace_xfs_alloc_size_error(args);
1919 	if (cnt_cur)
1920 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1921 	if (bno_cur)
1922 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1923 	return error;
1924 
1925 out_nominleft:
1926 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1927 	trace_xfs_alloc_size_nominleft(args);
1928 	args->agbno = NULLAGBLOCK;
1929 	return 0;
1930 }
1931 
1932 /*
1933  * Free the extent starting at agno/bno for length.
1934  */
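/*
 * Merge illustration (hypothetical numbers): freeing agbno 50 for 10 blocks
 * with a left neighbor at [40, 10] and a right neighbor at [60, 5] deletes
 * both neighbors and leaves one free extent at [40, 25]; with only the left
 * neighbor present the result is [40, 20], and with no contiguous neighbor
 * a new [50, 10] record is inserted.
 */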
1935 STATIC int
1936 xfs_free_ag_extent(
1937 	struct xfs_trans		*tp,
1938 	struct xfs_buf			*agbp,
1939 	xfs_agnumber_t			agno,
1940 	xfs_agblock_t			bno,
1941 	xfs_extlen_t			len,
1942 	const struct xfs_owner_info	*oinfo,
1943 	enum xfs_ag_resv_type		type)
1944 {
1945 	struct xfs_mount		*mp;
1946 	struct xfs_btree_cur		*bno_cur;
1947 	struct xfs_btree_cur		*cnt_cur;
1948 	xfs_agblock_t			gtbno; /* start of right neighbor */
1949 	xfs_extlen_t			gtlen; /* length of right neighbor */
1950 	xfs_agblock_t			ltbno; /* start of left neighbor */
1951 	xfs_extlen_t			ltlen; /* length of left neighbor */
1952 	xfs_agblock_t			nbno; /* new starting block of freesp */
1953 	xfs_extlen_t			nlen; /* new length of freespace */
1954 	int				haveleft; /* have a left neighbor */
1955 	int				haveright; /* have a right neighbor */
1956 	int				i;
1957 	int				error;
1958 	struct xfs_perag		*pag = agbp->b_pag;
1959 
1960 	bno_cur = cnt_cur = NULL;
1961 	mp = tp->t_mountp;
1962 
1963 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1964 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1965 		if (error)
1966 			goto error0;
1967 	}
1968 
1969 	/*
1970 	 * Allocate and initialize a cursor for the by-block btree.
1971 	 */
1972 	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1973 	/*
1974 	 * Look for a neighboring block on the left (lower block numbers)
1975 	 * that is contiguous with this space.
1976 	 */
1977 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1978 		goto error0;
1979 	if (haveleft) {
1980 		/*
1981 		 * There is a block to our left.
1982 		 */
1983 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1984 			goto error0;
1985 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1986 			xfs_btree_mark_sick(bno_cur);
1987 			error = -EFSCORRUPTED;
1988 			goto error0;
1989 		}
1990 		/*
1991 		 * It's not contiguous, though.
1992 		 */
1993 		if (ltbno + ltlen < bno)
1994 			haveleft = 0;
1995 		else {
1996 			/*
1997 			 * If this failure happens the request to free this
1998 			 * space was invalid, it's (partly) already free.
1999 			 * Very bad.
2000 			 */
2001 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
2002 				xfs_btree_mark_sick(bno_cur);
2003 				error = -EFSCORRUPTED;
2004 				goto error0;
2005 			}
2006 		}
2007 	}
2008 	/*
2009 	 * Look for a neighboring block on the right (higher block numbers)
2010 	 * that is contiguous with this space.
2011 	 */
2012 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2013 		goto error0;
2014 	if (haveright) {
2015 		/*
2016 		 * There is a block to our right.
2017 		 */
2018 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2019 			goto error0;
2020 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2021 			xfs_btree_mark_sick(bno_cur);
2022 			error = -EFSCORRUPTED;
2023 			goto error0;
2024 		}
2025 		/*
2026 		 * It's not contiguous, though.
2027 		 */
2028 		if (bno + len < gtbno)
2029 			haveright = 0;
2030 		else {
2031 			/*
2032 			 * If this failure happens the request to free this
2033 			 * space was invalid, it's (partly) already free.
2034 			 * Very bad.
2035 			 */
2036 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
2037 				xfs_btree_mark_sick(bno_cur);
2038 				error = -EFSCORRUPTED;
2039 				goto error0;
2040 			}
2041 		}
2042 	}
2043 	/*
2044 	 * Now allocate and initialize a cursor for the by-size tree.
2045 	 */
2046 	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2047 	/*
2048 	 * Have both left and right contiguous neighbors.
2049 	 * Merge all three into a single free block.
2050 	 */
2051 	if (haveleft && haveright) {
2052 		/*
2053 		 * Delete the old by-size entry on the left.
2054 		 */
2055 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2056 			goto error0;
2057 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2058 			xfs_btree_mark_sick(cnt_cur);
2059 			error = -EFSCORRUPTED;
2060 			goto error0;
2061 		}
2062 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2063 			goto error0;
2064 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2065 			xfs_btree_mark_sick(cnt_cur);
2066 			error = -EFSCORRUPTED;
2067 			goto error0;
2068 		}
2069 		/*
2070 		 * Delete the old by-size entry on the right.
2071 		 */
2072 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2073 			goto error0;
2074 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2075 			xfs_btree_mark_sick(cnt_cur);
2076 			error = -EFSCORRUPTED;
2077 			goto error0;
2078 		}
2079 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2080 			goto error0;
2081 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2082 			xfs_btree_mark_sick(cnt_cur);
2083 			error = -EFSCORRUPTED;
2084 			goto error0;
2085 		}
2086 		/*
2087 		 * Delete the old by-block entry for the right block.
2088 		 */
2089 		if ((error = xfs_btree_delete(bno_cur, &i)))
2090 			goto error0;
2091 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2092 			xfs_btree_mark_sick(bno_cur);
2093 			error = -EFSCORRUPTED;
2094 			goto error0;
2095 		}
2096 		/*
2097 		 * Move the by-block cursor back to the left neighbor.
2098 		 */
2099 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2100 			goto error0;
2101 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2102 			xfs_btree_mark_sick(bno_cur);
2103 			error = -EFSCORRUPTED;
2104 			goto error0;
2105 		}
2106 #ifdef DEBUG
2107 		/*
2108 		 * Check that this is the right record: delete didn't
2109 		 * mangle the cursor.
2110 		 */
2111 		{
2112 			xfs_agblock_t	xxbno;
2113 			xfs_extlen_t	xxlen;
2114 
2115 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2116 					&i)))
2117 				goto error0;
2118 			if (XFS_IS_CORRUPT(mp,
2119 					   i != 1 ||
2120 					   xxbno != ltbno ||
2121 					   xxlen != ltlen)) {
2122 				xfs_btree_mark_sick(bno_cur);
2123 				error = -EFSCORRUPTED;
2124 				goto error0;
2125 			}
2126 		}
2127 #endif
2128 		/*
2129 		 * Update remaining by-block entry to the new, joined block.
2130 		 */
2131 		nbno = ltbno;
2132 		nlen = len + ltlen + gtlen;
2133 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2134 			goto error0;
2135 	}
2136 	/*
2137 	 * Have only a left contiguous neighbor.
2138 	 * Merge it together with the new freespace.
2139 	 */
2140 	else if (haveleft) {
2141 		/*
2142 		 * Delete the old by-size entry on the left.
2143 		 */
2144 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2145 			goto error0;
2146 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2147 			xfs_btree_mark_sick(cnt_cur);
2148 			error = -EFSCORRUPTED;
2149 			goto error0;
2150 		}
2151 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2152 			goto error0;
2153 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2154 			xfs_btree_mark_sick(cnt_cur);
2155 			error = -EFSCORRUPTED;
2156 			goto error0;
2157 		}
2158 		/*
2159 		 * Back up the by-block cursor to the left neighbor, and
2160 		 * update its length.
2161 		 */
2162 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2163 			goto error0;
2164 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2165 			xfs_btree_mark_sick(bno_cur);
2166 			error = -EFSCORRUPTED;
2167 			goto error0;
2168 		}
2169 		nbno = ltbno;
2170 		nlen = len + ltlen;
2171 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2172 			goto error0;
2173 	}
2174 	/*
2175 	 * Have only a right contiguous neighbor.
2176 	 * Merge it together with the new freespace.
2177 	 */
2178 	else if (haveright) {
2179 		/*
2180 		 * Delete the old by-size entry on the right.
2181 		 */
2182 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2183 			goto error0;
2184 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2185 			xfs_btree_mark_sick(cnt_cur);
2186 			error = -EFSCORRUPTED;
2187 			goto error0;
2188 		}
2189 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2190 			goto error0;
2191 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2192 			xfs_btree_mark_sick(cnt_cur);
2193 			error = -EFSCORRUPTED;
2194 			goto error0;
2195 		}
2196 		/*
2197 		 * Update the starting block and length of the right
2198 		 * neighbor in the by-block tree.
2199 		 */
2200 		nbno = bno;
2201 		nlen = len + gtlen;
2202 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2203 			goto error0;
2204 	}
2205 	/*
2206 	 * No contiguous neighbors.
2207 	 * Insert the new freespace into the by-block tree.
2208 	 */
2209 	else {
2210 		nbno = bno;
2211 		nlen = len;
2212 		if ((error = xfs_btree_insert(bno_cur, &i)))
2213 			goto error0;
2214 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2215 			xfs_btree_mark_sick(bno_cur);
2216 			error = -EFSCORRUPTED;
2217 			goto error0;
2218 		}
2219 	}
2220 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2221 	bno_cur = NULL;
2222 	/*
2223 	 * In all cases we need to insert the new freespace in the by-size tree.
2224 	 */
2225 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2226 		goto error0;
2227 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2228 		xfs_btree_mark_sick(cnt_cur);
2229 		error = -EFSCORRUPTED;
2230 		goto error0;
2231 	}
2232 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2233 		goto error0;
2234 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2235 		xfs_btree_mark_sick(cnt_cur);
2236 		error = -EFSCORRUPTED;
2237 		goto error0;
2238 	}
2239 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2240 	cnt_cur = NULL;
2241 
2242 	/*
2243 	 * Update the freespace totals in the ag and superblock.
2244 	 */
2245 	error = xfs_alloc_update_counters(tp, agbp, len);
2246 	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2247 	if (error)
2248 		goto error0;
2249 
2250 	XFS_STATS_INC(mp, xs_freex);
2251 	XFS_STATS_ADD(mp, xs_freeb, len);
2252 
2253 	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2254 
2255 	return 0;
2256 
2257  error0:
2258 	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2259 	if (bno_cur)
2260 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2261 	if (cnt_cur)
2262 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2263 	return error;
2264 }
2265 
2266 /*
2267  * Visible (exported) allocation/free functions.
2268  * Some of these are used just by xfs_alloc_btree.c and this file.
2269  */
2270 
2271 /*
2272  * Compute and fill in value of m_alloc_maxlevels.
2273  */
2274 void
2275 xfs_alloc_compute_maxlevels(
2276 	xfs_mount_t	*mp)	/* file system mount structure */
2277 {
2278 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2279 			(mp->m_sb.sb_agblocks + 1) / 2);
2280 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2281 }
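
/*
 * Worked example for the record count above (hypothetical numbers): a
 * 1,000,000-block AG can hold at most 500,000 free space records in the
 * worst case of alternating single-block free and allocated extents, which
 * is what the (sb_agblocks + 1) / 2 term models.
 */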
2282 
2283 /*
2284  * Find the length of the longest extent in an AG.  The 'need' parameter
2285  * specifies how much space we're going to need for the AGFL and the
2286  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2287  * other callers.
2288  */
2289 xfs_extlen_t
2290 xfs_alloc_longest_free_extent(
2291 	struct xfs_perag	*pag,
2292 	xfs_extlen_t		need,
2293 	xfs_extlen_t		reserved)
2294 {
2295 	xfs_extlen_t		delta = 0;
2296 
2297 	/*
2298 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2299 	 * longest extent.
2300 	 */
2301 	if (need > pag->pagf_flcount)
2302 		delta = need - pag->pagf_flcount;
2303 
2304 	/*
2305 	 * If we cannot maintain others' reservations with space from the
2306 	 * not-longest freesp extents, we'll have to subtract /that/ from
2307 	 * the longest extent too.
2308 	 */
2309 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2310 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2311 
2312 	/*
2313 	 * If the longest extent is long enough to satisfy all the
2314 	 * reservations and AGFL rules in place, we can return this extent.
2315 	 */
2316 	if (pag->pagf_longest > delta)
2317 		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2318 				pag->pagf_longest - delta);
2319 
2320 	/* Otherwise, let the caller try for 1 block if there's space. */
2321 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2322 }
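
/*
 * Worked example (hypothetical numbers): with pagf_flcount = 2 and need = 6
 * the AGFL recharge costs delta = 4; if pagf_freeblks = 100, pagf_longest =
 * 90 and reserved = 15, the non-longest free space (10 blocks) is 5 short of
 * the reservation, so delta grows to 9 and the usable longest extent is
 * min(m_ag_max_usable, 90 - 9) = 81 blocks (assuming m_ag_max_usable is
 * larger).
 */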
2323 
2324 /*
2325  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2326  * return the largest possible minimum length.
2327  */
2328 unsigned int
2329 xfs_alloc_min_freelist(
2330 	struct xfs_mount	*mp,
2331 	struct xfs_perag	*pag)
2332 {
2333 	/* AG btrees have at least 1 level. */
2334 	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2335 	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
2336 	unsigned int		min_free;
2337 
2338 	ASSERT(mp->m_alloc_maxlevels > 0);
2339 
2340 	/*
2341 	 * For a btree shorter than the maximum height, the worst case is that
2342 	 * every level gets split and a new level is added, then while inserting
2343 	 * another entry to refill the AGFL, every level under the old root gets
2344 	 * split again. This is:
2345 	 *
2346 	 *   (full height split reservation) + (AGFL refill split height)
2347 	 * = (current height + 1) + (current height - 1)
2348 	 * = (new height) + (new height - 2)
2349 	 * = 2 * new height - 2
2350 	 *
2351 	 * For a btree of maximum height, the worst case is that every level
2352 	 * under the root gets split, then while inserting another entry to
2353 	 * refill the AGFL, every level under the root gets split again. This is
2354 	 * also:
2355 	 *
2356 	 *   2 * (current height - 1)
2357 	 * = 2 * (new height - 1)
2358 	 * = 2 * new height - 2
2359 	 */
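	/*
	 * Worked example (hypothetical numbers): with m_alloc_maxlevels = 5
	 * and a by-bno btree of current height 3, the reservation below is
	 * min(3 + 1, 5) * 2 - 2 = 6 blocks: a full height split (4 blocks)
	 * plus an AGFL refill split below the old root (2 blocks).
	 */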
2360 
2361 	/* space needed by-bno freespace btree */
2362 	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2363 				       mp->m_alloc_maxlevels) * 2 - 2;
2364 	/* space needed by-size freespace btree */
2365 	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2366 				       mp->m_alloc_maxlevels) * 2 - 2;
2367 	/* space needed reverse mapping used space btree */
2368 	if (xfs_has_rmapbt(mp))
2369 		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2370 						mp->m_rmap_maxlevels) * 2 - 2;
2371 
2372 	return min_free;
2373 }
2374 
2375 /*
2376  * Check if the operation we are fixing up the freelist for should go ahead or
2377  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2378  * is dependent on whether the size and shape of free space available will
2379  * permit the requested allocation to take place.
2380  */
2381 static bool
2382 xfs_alloc_space_available(
2383 	struct xfs_alloc_arg	*args,
2384 	xfs_extlen_t		min_free,
2385 	int			flags)
2386 {
2387 	struct xfs_perag	*pag = args->pag;
2388 	xfs_extlen_t		alloc_len, longest;
2389 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2390 	int			available;
2391 	xfs_extlen_t		agflcount;
2392 
2393 	if (flags & XFS_ALLOC_FLAG_FREEING)
2394 		return true;
2395 
2396 	reservation = xfs_ag_resv_needed(pag, args->resv);
2397 
2398 	/* do we have enough contiguous free space for the allocation? */
2399 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2400 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2401 	if (longest < alloc_len)
2402 		return false;
2403 
2404 	/*
2405 	 * Do we have enough free space remaining for the allocation? Don't
2406 	 * account extra agfl blocks because we are about to defer freeing them,
2407 	 * making them unavailable until the current transaction commits.
2408 	 */
2409 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2410 	available = (int)(pag->pagf_freeblks + agflcount -
2411 			  reservation - min_free - args->minleft);
2412 	if (available < (int)max(args->total, alloc_len))
2413 		return false;
2414 
2415 	/*
2416 	 * Clamp maxlen to the amount of free space available for the actual
2417 	 * extent allocation.
2418 	 */
2419 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2420 		args->maxlen = available;
2421 		ASSERT(args->maxlen > 0);
2422 		ASSERT(args->maxlen >= args->minlen);
2423 	}
2424 
2425 	return true;
2426 }
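
/*
 * Worked example for the checks above (hypothetical numbers): with minlen =
 * 8, alignment = 1 and minalignslop = 0, alloc_len is 8; with min_free = 6,
 * pagf_flcount = 4, pagf_freeblks = 100, reservation = 10 and minleft = 0,
 * available = 100 + 4 - 10 - 6 - 0 = 88, which must be at least
 * max(args->total, 8) for the AG to be usable.
 */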
2427 
2428 int
2429 xfs_free_agfl_block(
2430 	struct xfs_trans	*tp,
2431 	xfs_agnumber_t		agno,
2432 	xfs_agblock_t		agbno,
2433 	struct xfs_buf		*agbp,
2434 	struct xfs_owner_info	*oinfo)
2435 {
2436 	int			error;
2437 	struct xfs_buf		*bp;
2438 
2439 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2440 				   XFS_AG_RESV_AGFL);
2441 	if (error)
2442 		return error;
2443 
2444 	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2445 			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2446 			tp->t_mountp->m_bsize, 0, &bp);
2447 	if (error)
2448 		return error;
2449 	xfs_trans_binval(tp, bp);
2450 
2451 	return 0;
2452 }
2453 
2454 /*
2455  * Check the agfl fields of the agf for inconsistency or corruption.
2456  *
2457  * The original purpose was to detect an agfl header padding mismatch between
2458  * current and early v5 kernels. This problem manifests as a 1-slot size
2459  * difference between the on-disk flcount and the active [first, last] range of
2460  * a wrapped agfl.
2461  *
2462  * However, we need to use these same checks to catch agfl count corruptions
2463  * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2464  * way, we need to reset the agfl and warn the user.
2465  *
2466  * Return true if a reset is required before the agfl can be used, false
2467  * otherwise.
2468  */
2469 static bool
2470 xfs_agfl_needs_reset(
2471 	struct xfs_mount	*mp,
2472 	struct xfs_agf		*agf)
2473 {
2474 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2475 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2476 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2477 	int			agfl_size = xfs_agfl_size(mp);
2478 	int			active;
2479 
2480 	/*
2481 	 * The agf read verifier catches severe corruption of these fields.
2482 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2483 	 * the verifier allows it.
2484 	 */
2485 	if (f >= agfl_size || l >= agfl_size)
2486 		return true;
2487 	if (c > agfl_size)
2488 		return true;
2489 
2490 	/*
2491 	 * Check consistency between the on-disk count and the active range. An
2492 	 * agfl padding mismatch manifests as an inconsistent flcount.
2493 	 */
2494 	if (c && l >= f)
2495 		active = l - f + 1;
2496 	else if (c)
2497 		active = agfl_size - f + l + 1;
2498 	else
2499 		active = 0;
2500 
2501 	return active != c;
2502 }
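
/*
 * Example of the wrapped case above (hypothetical numbers): with agfl_size =
 * 100, flfirst = 95, fllast = 2 and a nonzero flcount, the active range is
 * 100 - 95 + 2 + 1 = 8 slots; an on-disk flcount of 7, the off-by-one that
 * the padding mismatch produces, therefore forces a reset.
 */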
2503 
2504 /*
2505  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2506  * agfl content cannot be trusted. Warn the user that a repair is required to
2507  * recover leaked blocks.
2508  *
2509  * The purpose of this mechanism is to handle filesystems affected by the agfl
2510  * header padding mismatch problem. A reset keeps the filesystem online with a
2511  * relatively minor free space accounting inconsistency rather than suffer the
2512  * inevitable crash from use of an invalid agfl block.
2513  */
2514 static void
2515 xfs_agfl_reset(
2516 	struct xfs_trans	*tp,
2517 	struct xfs_buf		*agbp,
2518 	struct xfs_perag	*pag)
2519 {
2520 	struct xfs_mount	*mp = tp->t_mountp;
2521 	struct xfs_agf		*agf = agbp->b_addr;
2522 
2523 	ASSERT(xfs_perag_agfl_needs_reset(pag));
2524 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2525 
2526 	xfs_warn(mp,
2527 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2528 	       "Please unmount and run xfs_repair.",
2529 	         pag->pag_agno, pag->pagf_flcount);
2530 
2531 	agf->agf_flfirst = 0;
2532 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2533 	agf->agf_flcount = 0;
2534 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2535 				    XFS_AGF_FLCOUNT);
2536 
2537 	pag->pagf_flcount = 0;
2538 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2539 }
2540 
2541 /*
2542  * Defer an AGFL block free. This is effectively equivalent to
2543  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2544  *
2545  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2546  * allocation operations in a transaction. AGFL frees are prone to this problem
2547  * because, for one, they are always freed one at a time. Further, an immediate
2548  * AGFL block free can cause a btree join and require another block free before
2549  * the real allocation can proceed. Deferring the free disconnects freeing up
2550  * the AGFL slot from freeing the block.
2551  */
2552 static int
2553 xfs_defer_agfl_block(
2554 	struct xfs_trans		*tp,
2555 	xfs_agnumber_t			agno,
2556 	xfs_agblock_t			agbno,
2557 	struct xfs_owner_info		*oinfo)
2558 {
2559 	struct xfs_mount		*mp = tp->t_mountp;
2560 	struct xfs_extent_free_item	*xefi;
2561 	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
2562 
2563 	ASSERT(xfs_extfree_item_cache != NULL);
2564 	ASSERT(oinfo != NULL);
2565 
2566 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
2567 		return -EFSCORRUPTED;
2568 
2569 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2570 			       GFP_KERNEL | __GFP_NOFAIL);
2571 	xefi->xefi_startblock = fsbno;
2572 	xefi->xefi_blockcount = 1;
2573 	xefi->xefi_owner = oinfo->oi_owner;
2574 	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
2575 
2576 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2577 
2578 	xfs_extent_free_get_group(mp, xefi);
2579 	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
2580 	return 0;
2581 }
2582 
2583 /*
2584  * Add the extent to the list of extents to be freed at transaction end.
2585  * The list is maintained sorted (by block number).
2586  */
2587 static int
2588 xfs_defer_extent_free(
2589 	struct xfs_trans		*tp,
2590 	xfs_fsblock_t			bno,
2591 	xfs_filblks_t			len,
2592 	const struct xfs_owner_info	*oinfo,
2593 	enum xfs_ag_resv_type		type,
2594 	bool				skip_discard,
2595 	struct xfs_defer_pending	**dfpp)
2596 {
2597 	struct xfs_extent_free_item	*xefi;
2598 	struct xfs_mount		*mp = tp->t_mountp;
2599 #ifdef DEBUG
2600 	xfs_agnumber_t			agno;
2601 	xfs_agblock_t			agbno;
2602 
2603 	ASSERT(bno != NULLFSBLOCK);
2604 	ASSERT(len > 0);
2605 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2606 	ASSERT(!isnullstartblock(bno));
2607 	agno = XFS_FSB_TO_AGNO(mp, bno);
2608 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2609 	ASSERT(agno < mp->m_sb.sb_agcount);
2610 	ASSERT(agbno < mp->m_sb.sb_agblocks);
2611 	ASSERT(len < mp->m_sb.sb_agblocks);
2612 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2613 #endif
2614 	ASSERT(xfs_extfree_item_cache != NULL);
2615 	ASSERT(type != XFS_AG_RESV_AGFL);
2616 
2617 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2618 		return -EFSCORRUPTED;
2619 
2620 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2621 			       GFP_KERNEL | __GFP_NOFAIL);
2622 	xefi->xefi_startblock = bno;
2623 	xefi->xefi_blockcount = (xfs_extlen_t)len;
2624 	xefi->xefi_agresv = type;
2625 	if (skip_discard)
2626 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2627 	if (oinfo) {
2628 		ASSERT(oinfo->oi_offset == 0);
2629 
2630 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2631 			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2632 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2633 			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2634 		xefi->xefi_owner = oinfo->oi_owner;
2635 	} else {
2636 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2637 	}
2638 	trace_xfs_bmap_free_defer(mp,
2639 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2640 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2641 
2642 	xfs_extent_free_get_group(mp, xefi);
2643 	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
2644 	return 0;
2645 }
2646 
2647 int
2648 xfs_free_extent_later(
2649 	struct xfs_trans		*tp,
2650 	xfs_fsblock_t			bno,
2651 	xfs_filblks_t			len,
2652 	const struct xfs_owner_info	*oinfo,
2653 	enum xfs_ag_resv_type		type,
2654 	bool				skip_discard)
2655 {
2656 	struct xfs_defer_pending	*dontcare = NULL;
2657 
2658 	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
2659 			&dontcare);
2660 }
2661 
2662 /*
2663  * Set up automatic freeing of unwritten space in the filesystem.
2664  *
2665  * This function attaches a paused deferred extent free item to the
2666  * transaction.  Pausing means that the EFI will be logged in the next
2667  * transaction commit, but the pending EFI will not be finished until the
2668  * pending item is unpaused.
2669  *
2670  * If the system goes down after the EFI has been persisted to the log but
2671  * before the pending item is unpaused, log recovery will find the EFI, fail to
2672  * find the EFD, and free the space.
2673  *
2674  * If the pending item is unpaused, the next transaction commit will log an EFD
2675  * without freeing the space.
2676  *
2677  * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
2678  * @args structure are set to the relevant values.
2679  */
2680 int
2681 xfs_alloc_schedule_autoreap(
2682 	const struct xfs_alloc_arg	*args,
2683 	bool				skip_discard,
2684 	struct xfs_alloc_autoreap	*aarp)
2685 {
2686 	int				error;
2687 
2688 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
2689 			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
2690 	if (error)
2691 		return error;
2692 
2693 	xfs_defer_item_pause(args->tp, aarp->dfp);
2694 	return 0;
2695 }
2696 
2697 /*
2698  * Cancel automatic freeing of unwritten space in the filesystem.
2699  *
2700  * Earlier, we created a paused deferred extent free item and attached it to
2701  * this transaction so that we could automatically roll back a new space
2702  * allocation if the system went down.  Now we want to cancel the paused work
2703  * item by marking the EFI stale so we don't actually free the space, unpausing
2704  * the pending item and logging an EFD.
2705  *
2706  * The caller generally should have already mapped the space into the ondisk
2707  * filesystem.  If the reserved space was partially used, the caller must call
2708  * xfs_free_extent_later to create a new EFI to free the unused space.
2709  */
2710 void
2711 xfs_alloc_cancel_autoreap(
2712 	struct xfs_trans		*tp,
2713 	struct xfs_alloc_autoreap	*aarp)
2714 {
2715 	struct xfs_defer_pending	*dfp = aarp->dfp;
2716 	struct xfs_extent_free_item	*xefi;
2717 
2718 	if (!dfp)
2719 		return;
2720 
2721 	list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
2722 		xefi->xefi_flags |= XFS_EFI_CANCELLED;
2723 
2724 	xfs_defer_item_unpause(tp, dfp);
2725 }
2726 
2727 /*
2728  * Commit automatic freeing of unwritten space in the filesystem.
2729  *
2730  * This unpauses an earlier _schedule_autoreap and commits to freeing the
2731  * allocated space.  Call this if none of the reserved space was used.
2732  */
2733 void
2734 xfs_alloc_commit_autoreap(
2735 	struct xfs_trans		*tp,
2736 	struct xfs_alloc_autoreap	*aarp)
2737 {
2738 	if (aarp->dfp)
2739 		xfs_defer_item_unpause(tp, aarp->dfp);
2740 }
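
/*
 * Sketch of the intended autoreap call sequence, pieced together from the
 * comments above (illustrative only; "space_was_mapped" is a hypothetical
 * flag, and error handling is omitted):
 *
 *	struct xfs_alloc_autoreap	aarp = { };
 *
 *	xfs_alloc_vextent_this_ag(&args, agno);
 *	xfs_alloc_schedule_autoreap(&args, false, &aarp);
 *	... map the new space into the ondisk filesystem ...
 *	if (space_was_mapped)
 *		xfs_alloc_cancel_autoreap(tp, &aarp);
 *	else
 *		xfs_alloc_commit_autoreap(tp, &aarp);
 */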
2741 
2742 #ifdef DEBUG
2743 /*
2744  * Check if an AGF has a free extent record whose length is equal to
2745  * args->minlen.
2746  */
2747 STATIC int
2748 xfs_exact_minlen_extent_available(
2749 	struct xfs_alloc_arg	*args,
2750 	struct xfs_buf		*agbp,
2751 	int			*stat)
2752 {
2753 	struct xfs_btree_cur	*cnt_cur;
2754 	xfs_agblock_t		fbno;
2755 	xfs_extlen_t		flen;
2756 	int			error = 0;
2757 
2758 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2759 					args->pag, XFS_BTNUM_CNT);
2760 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2761 	if (error)
2762 		goto out;
2763 
2764 	if (*stat == 0) {
2765 		xfs_btree_mark_sick(cnt_cur);
2766 		error = -EFSCORRUPTED;
2767 		goto out;
2768 	}
2769 
2770 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2771 	if (error)
2772 		goto out;
2773 
2774 	if (*stat == 1 && flen != args->minlen)
2775 		*stat = 0;
2776 
2777 out:
2778 	xfs_btree_del_cursor(cnt_cur, error);
2779 
2780 	return error;
2781 }
2782 #endif
2783 
2784 /*
2785  * Decide whether to use this allocation group for this allocation.
2786  * If so, fix up the btree freelist's size.
2787  */
2788 int			/* error */
2789 xfs_alloc_fix_freelist(
2790 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2791 	uint32_t		alloc_flags)
2792 {
2793 	struct xfs_mount	*mp = args->mp;
2794 	struct xfs_perag	*pag = args->pag;
2795 	struct xfs_trans	*tp = args->tp;
2796 	struct xfs_buf		*agbp = NULL;
2797 	struct xfs_buf		*agflbp = NULL;
2798 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2799 	xfs_agblock_t		bno;	/* freelist block */
2800 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2801 	int			error = 0;
2802 
2803 	/* deferred ops (AGFL block frees) require permanent transactions */
2804 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2805 
2806 	if (!xfs_perag_initialised_agf(pag)) {
2807 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2808 		if (error) {
2809 			/* Couldn't lock the AGF so skip this AG. */
2810 			if (error == -EAGAIN)
2811 				error = 0;
2812 			goto out_no_agbp;
2813 		}
2814 	}
2815 
2816 	/*
2817 	 * If this is a metadata-preferred pag and we are allocating user data,
2818 	 * then try somewhere else unless we are being asked to try harder at
2819 	 * this point.
2820 	 */
2821 	if (xfs_perag_prefers_metadata(pag) &&
2822 	    (args->datatype & XFS_ALLOC_USERDATA) &&
2823 	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2824 		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
2825 		goto out_agbp_relse;
2826 	}
2827 
2828 	need = xfs_alloc_min_freelist(mp, pag);
2829 	if (!xfs_alloc_space_available(args, need, alloc_flags |
2830 			XFS_ALLOC_FLAG_CHECK))
2831 		goto out_agbp_relse;
2832 
2833 	/*
2834 	 * Get the a.g. freespace buffer.
2835 	 * Can fail if we're not blocking on locks, and it's held.
2836 	 */
2837 	if (!agbp) {
2838 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2839 		if (error) {
2840 			/* Couldn't lock the AGF so skip this AG. */
2841 			if (error == -EAGAIN)
2842 				error = 0;
2843 			goto out_no_agbp;
2844 		}
2845 	}
2846 
2847 	/* reset a padding mismatched agfl before final free space check */
2848 	if (xfs_perag_agfl_needs_reset(pag))
2849 		xfs_agfl_reset(tp, agbp, pag);
2850 
2851 	/* If there isn't enough total space or a long enough single extent, reject it. */
2852 	need = xfs_alloc_min_freelist(mp, pag);
2853 	if (!xfs_alloc_space_available(args, need, alloc_flags))
2854 		goto out_agbp_relse;
2855 
2856 #ifdef DEBUG
2857 	if (args->alloc_minlen_only) {
2858 		int stat;
2859 
2860 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2861 		if (error || !stat)
2862 			goto out_agbp_relse;
2863 	}
2864 #endif
2865 	/*
2866 	 * Make the freelist shorter if it's too long.
2867 	 *
2868 	 * Note that from this point onwards, we will always release the agf and
2869 	 * agfl buffers on error. This handles the case where we error out and
2870 	 * the buffers are clean or may not have been joined to the transaction
2871 	 * and hence need to be released manually. If they have been joined to
2872 	 * the transaction, then xfs_trans_brelse() will handle them
2873 	 * appropriately based on the recursion count and dirty state of the
2874 	 * buffer.
2875 	 *
2876 	 * XXX (dgc): When we have lots of free space, does this buy us
2877 	 * anything other than extra overhead when we need to put more blocks
2878 	 * back on the free list? Maybe we should only do this when space is
2879 	 * getting low or the AGFL is more than half full?
2880 	 *
2881 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2882 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2883 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2884 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2885 	 * both required to ensure that rmaps are correctly recorded for the
2886 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2887 	 * repair/rmap.c in xfsprogs for details.
2888 	 */
2889 	memset(&targs, 0, sizeof(targs));
2890 	/* struct copy below */
2891 	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
2892 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2893 	else
2894 		targs.oinfo = XFS_RMAP_OINFO_AG;
2895 	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2896 			pag->pagf_flcount > need) {
2897 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2898 		if (error)
2899 			goto out_agbp_relse;
2900 
2901 		/* defer agfl frees */
2902 		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2903 		if (error)
2904 			goto out_agbp_relse;
2905 	}
2906 
2907 	targs.tp = tp;
2908 	targs.mp = mp;
2909 	targs.agbp = agbp;
2910 	targs.agno = args->agno;
2911 	targs.alignment = targs.minlen = targs.prod = 1;
2912 	targs.pag = pag;
2913 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2914 	if (error)
2915 		goto out_agbp_relse;
2916 
2917 	/* Make the freelist longer if it's too short. */
2918 	while (pag->pagf_flcount < need) {
2919 		targs.agbno = 0;
2920 		targs.maxlen = need - pag->pagf_flcount;
2921 		targs.resv = XFS_AG_RESV_AGFL;
2922 
2923 		/* Allocate as many blocks as possible at once. */
2924 		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
2925 		if (error)
2926 			goto out_agflbp_relse;
2927 
2928 		/*
2929 		 * Stop if we run out.  Won't happen if callers are obeying
2930 		 * the restrictions correctly.  Can happen for free calls
2931 		 * on a completely full ag.
2932 		 */
2933 		if (targs.agbno == NULLAGBLOCK) {
2934 			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
2935 				break;
2936 			goto out_agflbp_relse;
2937 		}
2938 
2939 		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2940 			error = xfs_rmap_alloc(tp, agbp, pag,
2941 				       targs.agbno, targs.len, &targs.oinfo);
2942 			if (error)
2943 				goto out_agflbp_relse;
2944 		}
2945 		error = xfs_alloc_update_counters(tp, agbp,
2946 						  -((long)(targs.len)));
2947 		if (error)
2948 			goto out_agflbp_relse;
2949 
2950 		/*
2951 		 * Put each allocated block on the list.
2952 		 */
2953 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2954 			error = xfs_alloc_put_freelist(pag, tp, agbp,
2955 							agflbp, bno, 0);
2956 			if (error)
2957 				goto out_agflbp_relse;
2958 		}
2959 	}
2960 	xfs_trans_brelse(tp, agflbp);
2961 	args->agbp = agbp;
2962 	return 0;
2963 
2964 out_agflbp_relse:
2965 	xfs_trans_brelse(tp, agflbp);
2966 out_agbp_relse:
2967 	if (agbp)
2968 		xfs_trans_brelse(tp, agbp);
2969 out_no_agbp:
2970 	args->agbp = NULL;
2971 	return error;
2972 }
2973 
2974 /*
2975  * Get a block from the freelist.
2976  * Returns the block number in *bnop, or NULLAGBLOCK if the freelist is empty.
2977  */
2978 int
2979 xfs_alloc_get_freelist(
2980 	struct xfs_perag	*pag,
2981 	struct xfs_trans	*tp,
2982 	struct xfs_buf		*agbp,
2983 	xfs_agblock_t		*bnop,
2984 	int			btreeblk)
2985 {
2986 	struct xfs_agf		*agf = agbp->b_addr;
2987 	struct xfs_buf		*agflbp;
2988 	xfs_agblock_t		bno;
2989 	__be32			*agfl_bno;
2990 	int			error;
2991 	uint32_t		logflags;
2992 	struct xfs_mount	*mp = tp->t_mountp;
2993 
2994 	/*
2995 	 * Freelist is empty, give up.
2996 	 */
2997 	if (!agf->agf_flcount) {
2998 		*bnop = NULLAGBLOCK;
2999 		return 0;
3000 	}
3001 	/*
3002 	 * Read the array of free blocks.
3003 	 */
3004 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3005 	if (error)
3006 		return error;
3007 
3008 
3009 	/*
3010 	 * Get the block number and update the data structures.
3011 	 */
3012 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3013 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
3014 	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
3015 		return -EFSCORRUPTED;
3016 
3017 	be32_add_cpu(&agf->agf_flfirst, 1);
3018 	xfs_trans_brelse(tp, agflbp);
3019 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
3020 		agf->agf_flfirst = 0;
3021 
3022 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3023 	be32_add_cpu(&agf->agf_flcount, -1);
3024 	pag->pagf_flcount--;
3025 
3026 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
3027 	if (btreeblk) {
3028 		be32_add_cpu(&agf->agf_btreeblks, 1);
3029 		pag->pagf_btreeblks++;
3030 		logflags |= XFS_AGF_BTREEBLKS;
3031 	}
3032 
3033 	xfs_alloc_log_agf(tp, agbp, logflags);
3034 	*bnop = bno;
3035 
3036 	return 0;
3037 }
3038 
3039 /*
3040  * Log the given fields from the agf structure.
3041  */
3042 void
3043 xfs_alloc_log_agf(
3044 	struct xfs_trans	*tp,
3045 	struct xfs_buf		*bp,
3046 	uint32_t		fields)
3047 {
3048 	int	first;		/* first byte offset */
3049 	int	last;		/* last byte offset */
3050 	static const short	offsets[] = {
3051 		offsetof(xfs_agf_t, agf_magicnum),
3052 		offsetof(xfs_agf_t, agf_versionnum),
3053 		offsetof(xfs_agf_t, agf_seqno),
3054 		offsetof(xfs_agf_t, agf_length),
3055 		offsetof(xfs_agf_t, agf_roots[0]),
3056 		offsetof(xfs_agf_t, agf_levels[0]),
3057 		offsetof(xfs_agf_t, agf_flfirst),
3058 		offsetof(xfs_agf_t, agf_fllast),
3059 		offsetof(xfs_agf_t, agf_flcount),
3060 		offsetof(xfs_agf_t, agf_freeblks),
3061 		offsetof(xfs_agf_t, agf_longest),
3062 		offsetof(xfs_agf_t, agf_btreeblks),
3063 		offsetof(xfs_agf_t, agf_uuid),
3064 		offsetof(xfs_agf_t, agf_rmap_blocks),
3065 		offsetof(xfs_agf_t, agf_refcount_blocks),
3066 		offsetof(xfs_agf_t, agf_refcount_root),
3067 		offsetof(xfs_agf_t, agf_refcount_level),
3068 		/* needed so that we don't log the whole rest of the structure: */
3069 		offsetof(xfs_agf_t, agf_spare64),
3070 		sizeof(xfs_agf_t)
3071 	};
3072 
3073 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
3074 
3075 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
3076 
3077 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
3078 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
3079 }
3080 
3081 /*
3082  * Put the block on the freelist for the allocation group.
3083  */
3084 int
3085 xfs_alloc_put_freelist(
3086 	struct xfs_perag	*pag,
3087 	struct xfs_trans	*tp,
3088 	struct xfs_buf		*agbp,
3089 	struct xfs_buf		*agflbp,
3090 	xfs_agblock_t		bno,
3091 	int			btreeblk)
3092 {
3093 	struct xfs_mount	*mp = tp->t_mountp;
3094 	struct xfs_agf		*agf = agbp->b_addr;
3095 	__be32			*blockp;
3096 	int			error;
3097 	uint32_t		logflags;
3098 	__be32			*agfl_bno;
3099 	int			startoff;
3100 
3101 	if (!agflbp) {
3102 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3103 		if (error)
3104 			return error;
3105 	}
3106 
3107 	be32_add_cpu(&agf->agf_fllast, 1);
3108 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
3109 		agf->agf_fllast = 0;
3110 
3111 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3112 	be32_add_cpu(&agf->agf_flcount, 1);
3113 	pag->pagf_flcount++;
3114 
3115 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
3116 	if (btreeblk) {
3117 		be32_add_cpu(&agf->agf_btreeblks, -1);
3118 		pag->pagf_btreeblks--;
3119 		logflags |= XFS_AGF_BTREEBLKS;
3120 	}
3121 
3122 	xfs_alloc_log_agf(tp, agbp, logflags);
3123 
3124 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
3125 
3126 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3127 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
3128 	*blockp = cpu_to_be32(bno);
3129 	startoff = (char *)blockp - (char *)agflbp->b_addr;
3130 
3131 	xfs_alloc_log_agf(tp, agbp, logflags);
3132 
3133 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
3134 	xfs_trans_log_buf(tp, agflbp, startoff,
3135 			  startoff + sizeof(xfs_agblock_t) - 1);
3136 	return 0;
3137 }
3138 
3139 /*
3140  * Check that this AGF/AGI header's sequence number and length matches the AG
3141  * number and size in fsblocks.
3142  */
3143 xfs_failaddr_t
3144 xfs_validate_ag_length(
3145 	struct xfs_buf		*bp,
3146 	uint32_t		seqno,
3147 	uint32_t		length)
3148 {
3149 	struct xfs_mount	*mp = bp->b_mount;
3150 	/*
3151 	 * During growfs operations, the perag is not fully initialised,
3152 	 * so we can't use it for any useful checking. growfs ensures we can't
3153 	 * use it by using uncached buffers that don't have the perag attached
3154 	 * so we can detect and avoid this problem.
3155 	 */
3156 	if (bp->b_pag && seqno != bp->b_pag->pag_agno)
3157 		return __this_address;
3158 
3159 	/*
3160 	 * Only the last AG in the filesystem is allowed to be shorter
3161 	 * than the AG size recorded in the superblock.
3162 	 */
3163 	if (length != mp->m_sb.sb_agblocks) {
3164 		/*
3165 		 * During growfs, the new last AG can get here before we
3166 		 * have updated the superblock. Give it a pass on the seqno
3167 		 * check.
3168 		 */
3169 		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3170 			return __this_address;
3171 		if (length < XFS_MIN_AG_BLOCKS)
3172 			return __this_address;
3173 		if (length > mp->m_sb.sb_agblocks)
3174 			return __this_address;
3175 	}
3176 
3177 	return NULL;
3178 }
3179 
3180 /*
3181  * Verify the AGF is consistent.
3182  *
3183  * We do not verify the AGFL indexes in the AGF are fully consistent here
3184  * because of issues with variable on-disk structure sizes. Instead, we check
3185  * the agfl indexes for consistency when we initialise the perag from the AGF
3186  * information after a read completes.
3187  *
3188  * If the index is inconsistent, then we mark the perag as needing an AGFL
3189  * reset. The first AGFL update performed then resets the AGFL indexes and
3190  * refills the AGFL with known good free blocks, allowing the filesystem to
3191  * continue operating normally at the cost of a few leaked free space blocks.
3192  */
3193 static xfs_failaddr_t
3194 xfs_agf_verify(
3195 	struct xfs_buf		*bp)
3196 {
3197 	struct xfs_mount	*mp = bp->b_mount;
3198 	struct xfs_agf		*agf = bp->b_addr;
3199 	xfs_failaddr_t		fa;
3200 	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
3201 	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
3202 
3203 	if (xfs_has_crc(mp)) {
3204 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
3205 			return __this_address;
3206 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
3207 			return __this_address;
3208 	}
3209 
3210 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
3211 		return __this_address;
3212 
3213 	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
3214 		return __this_address;
3215 
3216 	/*
3217 	 * Both agf_seqno and agf_length need to be validated before anything else
3218 	 * block number related in the AGF or AGFL can be checked.
3219 	 */
3220 	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3221 	if (fa)
3222 		return fa;
3223 
3224 	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3225 		return __this_address;
3226 	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3227 		return __this_address;
3228 	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
3229 		return __this_address;
3230 
3231 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
3232 	    be32_to_cpu(agf->agf_freeblks) > agf_length)
3233 		return __this_address;
3234 
3235 	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
3236 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
3237 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
3238 						mp->m_alloc_maxlevels ||
3239 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
3240 						mp->m_alloc_maxlevels)
3241 		return __this_address;
3242 
3243 	if (xfs_has_lazysbcount(mp) &&
3244 	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
3245 		return __this_address;
3246 
3247 	if (xfs_has_rmapbt(mp)) {
3248 		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3249 			return __this_address;
3250 
3251 		if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
3252 		    be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
3253 							mp->m_rmap_maxlevels)
3254 			return __this_address;
3255 	}
3256 
3257 	if (xfs_has_reflink(mp)) {
3258 		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3259 			return __this_address;
3260 
3261 		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3262 		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3263 			return __this_address;
3264 	}
3265 
3266 	return NULL;
3267 }
3268 
3269 static void
3270 xfs_agf_read_verify(
3271 	struct xfs_buf	*bp)
3272 {
3273 	struct xfs_mount *mp = bp->b_mount;
3274 	xfs_failaddr_t	fa;
3275 
3276 	if (xfs_has_crc(mp) &&
3277 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3278 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3279 	else {
3280 		fa = xfs_agf_verify(bp);
3281 		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3282 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3283 	}
3284 }
3285 
3286 static void
3287 xfs_agf_write_verify(
3288 	struct xfs_buf	*bp)
3289 {
3290 	struct xfs_mount	*mp = bp->b_mount;
3291 	struct xfs_buf_log_item	*bip = bp->b_log_item;
3292 	struct xfs_agf		*agf = bp->b_addr;
3293 	xfs_failaddr_t		fa;
3294 
3295 	fa = xfs_agf_verify(bp);
3296 	if (fa) {
3297 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3298 		return;
3299 	}
3300 
3301 	if (!xfs_has_crc(mp))
3302 		return;
3303 
3304 	if (bip)
3305 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3306 
3307 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3308 }
3309 
3310 const struct xfs_buf_ops xfs_agf_buf_ops = {
3311 	.name = "xfs_agf",
3312 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3313 	.verify_read = xfs_agf_read_verify,
3314 	.verify_write = xfs_agf_write_verify,
3315 	.verify_struct = xfs_agf_verify,
3316 };
3317 
3318 /*
3319  * Read in the allocation group header (free/alloc section).
3320  */
3321 int
3322 xfs_read_agf(
3323 	struct xfs_perag	*pag,
3324 	struct xfs_trans	*tp,
3325 	int			flags,
3326 	struct xfs_buf		**agfbpp)
3327 {
3328 	struct xfs_mount	*mp = pag->pag_mount;
3329 	int			error;
3330 
3331 	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3332 
3333 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3334 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3335 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3336 	if (xfs_metadata_is_sick(error))
3337 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
3338 	if (error)
3339 		return error;
3340 
3341 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3342 	return 0;
3343 }
3344 
3345 /*
3346  * Read in the allocation group header (free/alloc section) and initialise the
3347  * perag structure if necessary. If the caller provides @agfbpp, then return the
3348  * locked buffer to the caller, otherwise free it.
3349  */
3350 int
3351 xfs_alloc_read_agf(
3352 	struct xfs_perag	*pag,
3353 	struct xfs_trans	*tp,
3354 	int			flags,
3355 	struct xfs_buf		**agfbpp)
3356 {
3357 	struct xfs_buf		*agfbp;
3358 	struct xfs_agf		*agf;
3359 	int			error;
3360 	int			allocbt_blks;
3361 
3362 	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3363 
3364 	/* We don't support trylock when freeing. */
3365 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3366 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3367 	error = xfs_read_agf(pag, tp,
3368 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3369 			&agfbp);
3370 	if (error)
3371 		return error;
3372 
3373 	agf = agfbp->b_addr;
3374 	if (!xfs_perag_initialised_agf(pag)) {
3375 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3376 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3377 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3378 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3379 		pag->pagf_levels[XFS_BTNUM_BNOi] =
3380 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3381 		pag->pagf_levels[XFS_BTNUM_CNTi] =
3382 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3383 		pag->pagf_levels[XFS_BTNUM_RMAPi] =
3384 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3385 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3386 		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3387 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3388 		else
3389 			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3390 
3391 		/*
3392 		 * Update the in-core allocbt counter. Filter out the rmapbt
3393 		 * subset of the btreeblks counter because the rmapbt is managed
3394 		 * by perag reservation. Subtract one for the rmapbt root block
3395 		 * because the rmap counter includes it while the btreeblks
3396 		 * counter only tracks non-root blocks.
3397 		 */
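		/*
		 * e.g. (hypothetical numbers): pagf_btreeblks = 10 with
		 * agf_rmap_blocks = 4 (the root plus 3 other blocks) leaves
		 * allocbt_blks = 10 - (4 - 1) = 7 bnobt/cntbt blocks.
		 */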
3398 		allocbt_blks = pag->pagf_btreeblks;
3399 		if (xfs_has_rmapbt(pag->pag_mount))
3400 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3401 		if (allocbt_blks > 0)
3402 			atomic64_add(allocbt_blks,
3403 					&pag->pag_mount->m_allocbt_blks);
3404 
3405 		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3406 	}
3407 #ifdef DEBUG
3408 	else if (!xfs_is_shutdown(pag->pag_mount)) {
3409 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3410 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3411 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3412 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3413 		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3414 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3415 		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3416 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3417 	}
3418 #endif
3419 	if (agfbpp)
3420 		*agfbpp = agfbp;
3421 	else
3422 		xfs_trans_brelse(tp, agfbp);
3423 	return 0;
3424 }
3425 
3426 /*
3427  * Pre-process allocation arguments to set initial state that we don't require
3428  * callers to set up correctly, as well as bounds check the allocation args
3429  * that are set up.
3430  */
3431 static int
3432 xfs_alloc_vextent_check_args(
3433 	struct xfs_alloc_arg	*args,
3434 	xfs_fsblock_t		target,
3435 	xfs_agnumber_t		*minimum_agno)
3436 {
3437 	struct xfs_mount	*mp = args->mp;
3438 	xfs_agblock_t		agsize;
3439 
3440 	args->fsbno = NULLFSBLOCK;
3441 
3442 	*minimum_agno = 0;
3443 	if (args->tp->t_highest_agno != NULLAGNUMBER)
3444 		*minimum_agno = args->tp->t_highest_agno;
3445 
3446 	/*
3447 	 * Just fix this up, for the case where the last a.g. is shorter
3448 	 * (or there's only one a.g.) and the caller couldn't easily figure
3449 	 * that out (xfs_bmap_alloc).
3450 	 */
3451 	agsize = mp->m_sb.sb_agblocks;
3452 	if (args->maxlen > agsize)
3453 		args->maxlen = agsize;
3454 	if (args->alignment == 0)
3455 		args->alignment = 1;
3456 
3457 	ASSERT(args->minlen > 0);
3458 	ASSERT(args->maxlen > 0);
3459 	ASSERT(args->alignment > 0);
3460 	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3461 
3462 	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3463 	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3464 	ASSERT(args->minlen <= args->maxlen);
3465 	ASSERT(args->minlen <= agsize);
3466 	ASSERT(args->mod < args->prod);
3467 
3468 	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3469 	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3470 	    args->minlen > args->maxlen || args->minlen > agsize ||
3471 	    args->mod >= args->prod) {
3472 		trace_xfs_alloc_vextent_badargs(args);
3473 		return -ENOSPC;
3474 	}
3475 
3476 	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3477 		trace_xfs_alloc_vextent_skip_deadlock(args);
3478 		return -ENOSPC;
3479 	}
3480 	return 0;
3481 
3482 }
3483 
3484 /*
3485  * Prepare an AG for allocation. If the AG is not prepared to accept the
3486  * allocation, return failure.
3487  *
3488  * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3489  * modified to hold their own perag references.
3490  */
3491 static int
3492 xfs_alloc_vextent_prepare_ag(
3493 	struct xfs_alloc_arg	*args,
3494 	uint32_t		alloc_flags)
3495 {
3496 	bool			need_pag = !args->pag;
3497 	int			error;
3498 
3499 	if (need_pag)
3500 		args->pag = xfs_perag_get(args->mp, args->agno);
3501 
3502 	args->agbp = NULL;
3503 	error = xfs_alloc_fix_freelist(args, alloc_flags);
3504 	if (error) {
3505 		trace_xfs_alloc_vextent_nofix(args);
3506 		if (need_pag)
3507 			xfs_perag_put(args->pag);
3508 		args->agbno = NULLAGBLOCK;
3509 		return error;
3510 	}
3511 	if (!args->agbp) {
3512 		/* cannot allocate in this AG at all */
3513 		trace_xfs_alloc_vextent_noagbp(args);
3514 		args->agbno = NULLAGBLOCK;
3515 		return 0;
3516 	}
3517 	args->wasfromfl = 0;
3518 	return 0;
3519 }
3520 
3521 /*
3522  * Post-process allocation results to account for the allocation if it succeeded
3523  * and set the allocated block number correctly for the caller.
3524  *
3525  * XXX: we should really be returning ENOSPC for ENOSPC, not
3526  * hiding it behind a "successful" NULLFSBLOCK allocation.
3527  */
3528 static int
3529 xfs_alloc_vextent_finish(
3530 	struct xfs_alloc_arg	*args,
3531 	xfs_agnumber_t		minimum_agno,
3532 	int			alloc_error,
3533 	bool			drop_perag)
3534 {
3535 	struct xfs_mount	*mp = args->mp;
3536 	int			error = 0;
3537 
3538 	/*
3539 	 * We can end up here with a locked AGF. If we failed, the caller is
3540 	 * likely going to try to allocate again with different parameters, and
3541 	 * that can widen the set of AGs searched for free space. Likewise, any
3542 	 * BMBT block allocation we have to do later is a new allocation as well.
3543 	 *
3544 	 * Hence leaving this function with the AGF locked opens up potential
3545 	 * ABBA AGF deadlocks because a future allocation attempt in this
3546 	 * transaction may attempt to lock a lower number AGF.
3547 	 *
3548 	 * We can't release the AGF until the transaction is committed, so at
3549 	 * this point we must update the "first allocation" tracker to point at
3550 	 * this AG if the tracker is empty or points to a lower AG. This allows
3551 	 * the next allocation attempt to be modified appropriately to avoid
3552 	 * deadlocks.
3553 	 */
3554 	if (args->agbp &&
3555 	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3556 	     args->agno > minimum_agno))
3557 		args->tp->t_highest_agno = args->agno;
3558 
3559 	/*
3560 	 * If the allocation failed with an error or we had an ENOSPC result,
3561 	 * preserve the returned error whilst also marking the allocation result
3562 	 * as "no extent allocated". This ensures that callers that fail to
3563 	 * capture the error will still treat it as a failed allocation.
3564 	 */
3565 	if (alloc_error || args->agbno == NULLAGBLOCK) {
3566 		args->fsbno = NULLFSBLOCK;
3567 		error = alloc_error;
3568 		goto out_drop_perag;
3569 	}
3570 
3571 	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3572 
3573 	ASSERT(args->len >= args->minlen);
3574 	ASSERT(args->len <= args->maxlen);
3575 	ASSERT(args->agbno % args->alignment == 0);
3576 	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3577 
3578 	/* if not file data, insert new block into the reverse map btree */
3579 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3580 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3581 				       args->agbno, args->len, &args->oinfo);
3582 		if (error)
3583 			goto out_drop_perag;
3584 	}
3585 
3586 	if (!args->wasfromfl) {
3587 		error = xfs_alloc_update_counters(args->tp, args->agbp,
3588 						  -((long)(args->len)));
3589 		if (error)
3590 			goto out_drop_perag;
3591 
3592 		ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3593 				args->len));
3594 	}
3595 
3596 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3597 
3598 	XFS_STATS_INC(mp, xs_allocx);
3599 	XFS_STATS_ADD(mp, xs_allocb, args->len);
3600 
3601 	trace_xfs_alloc_vextent_finish(args);
3602 
3603 out_drop_perag:
3604 	if (drop_perag && args->pag) {
3605 		xfs_perag_rele(args->pag);
3606 		args->pag = NULL;
3607 	}
3608 	return error;
3609 }
3610 
3611 /*
3612  * Allocate within a single AG only. This uses a best-fit length algorithm, so
3613  * if you need an exact-sized allocation without locality constraints, this is the
3614  * fastest way to do it.
3615  *
3616  * Caller is expected to hold a perag reference in args->pag.
3617  */
3618 int
3619 xfs_alloc_vextent_this_ag(
3620 	struct xfs_alloc_arg	*args,
3621 	xfs_agnumber_t		agno)
3622 {
3623 	struct xfs_mount	*mp = args->mp;
3624 	xfs_agnumber_t		minimum_agno;
3625 	uint32_t		alloc_flags = 0;
3626 	int			error;
3627 
3628 	ASSERT(args->pag != NULL);
3629 	ASSERT(args->pag->pag_agno == agno);
3630 
3631 	args->agno = agno;
3632 	args->agbno = 0;
3633 
3634 	trace_xfs_alloc_vextent_this_ag(args);
3635 
3636 	error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3637 			&minimum_agno);
3638 	if (error) {
3639 		if (error == -ENOSPC)
3640 			return 0;
3641 		return error;
3642 	}
3643 
3644 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3645 	if (!error && args->agbp)
3646 		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3647 
3648 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3649 }
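
/*
 * Illustrative usage sketch (added here, not part of the original source):
 * a caller that wants a best-fit extent from one specific AG might fill in
 * the arguments roughly like this, assuming it already holds a transaction
 * in @tp and a perag reference in @pag, and sets up args.oinfo for the new
 * owner separately:
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp		= tp,
 *		.mp		= tp->t_mountp,
 *		.pag		= pag,
 *		.minlen		= 1,
 *		.maxlen		= want,
 *		.prod		= 1,
 *		.alignment	= 1,
 *		.resv		= XFS_AG_RESV_NONE,
 *	};
 *
 *	error = xfs_alloc_vextent_this_ag(&args, pag->pag_agno);
 *
 * On success args.fsbno/args.len describe the extent; args.fsbno is
 * NULLFSBLOCK if the AG had no suitable space.
 */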
3650 
3651 /*
3652  * Iterate all AGs trying to allocate an extent starting from @start_ag.
3653  *
3654  * If @target_agbno is non-zero, the allocation attempt in @start_agno has
3655  * locality information. If we fail to allocate in that AG, then we revert to
3656  * anywhere-in-AG for all the other AGs we attempt to allocate in, as there is
3657  * no locality optimisation possible for those allocations.
3658  *
3659  *
3660  * On return, args->pag may be left referenced if we finish before the "all
3661  * failed" return point. The allocation finish still needs the perag, and
3662  * so the caller will release it once they've finished the allocation.
3663  *
3664  * When we wrap the AG iteration at the end of the filesystem, we have to be
3665  * careful not to wrap into AGs below ones we already have locked in the
3666  * transaction if we are doing a blocking iteration. This will result in an
3667  * out-of-order locking of AGFs and hence can cause deadlocks.
3668  */
3669 static int
3670 xfs_alloc_vextent_iterate_ags(
3671 	struct xfs_alloc_arg	*args,
3672 	xfs_agnumber_t		minimum_agno,
3673 	xfs_agnumber_t		start_agno,
3674 	xfs_agblock_t		target_agbno,
3675 	uint32_t		alloc_flags)
3676 {
3677 	struct xfs_mount	*mp = args->mp;
3678 	xfs_agnumber_t		restart_agno = minimum_agno;
3679 	xfs_agnumber_t		agno;
3680 	int			error = 0;
3681 
3682 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
3683 		restart_agno = 0;
3684 restart:
3685 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3686 			mp->m_sb.sb_agcount, agno, args->pag) {
3687 		args->agno = agno;
3688 		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3689 		if (error)
3690 			break;
3691 		if (!args->agbp) {
3692 			trace_xfs_alloc_vextent_loopfailed(args);
3693 			continue;
3694 		}
3695 
3696 		/*
3697 		 * Allocation is supposed to succeed now, so break out of the
3698 		 * loop regardless of whether we succeed or not.
3699 		 */
3700 		if (args->agno == start_agno && target_agbno) {
3701 			args->agbno = target_agbno;
3702 			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3703 		} else {
3704 			args->agbno = 0;
3705 			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3706 		}
3707 		break;
3708 	}
3709 	if (error) {
3710 		xfs_perag_rele(args->pag);
3711 		args->pag = NULL;
3712 		return error;
3713 	}
3714 	if (args->agbp)
3715 		return 0;
3716 
3717 	/*
3718 	 * We didn't find an AG we can allocate from. If we were given
3719 	 * constraining flags by the caller, drop them and retry the allocation
3720 	 * without any constraints being set.
3721 	 */
3722 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3723 		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
3724 		restart_agno = minimum_agno;
3725 		goto restart;
3726 	}
3727 
3728 	ASSERT(args->pag == NULL);
3729 	trace_xfs_alloc_vextent_allfailed(args);
3730 	return 0;
3731 }
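
/*
 * Worked example (added for illustration, not in the original source): with
 * sb_agcount = 8, @start_agno = 5, @minimum_agno = 2 and
 * XFS_ALLOC_FLAG_TRYLOCK set, the first, non-blocking pass walks AGs
 * 5, 6, 7, 0, 1, 2, 3, 4 because restart_agno is forced to zero.  If every
 * trylock attempt fails, the flag is cleared and the blocking pass walks
 * 5, 6, 7 and then wraps only as far back as AG 2, so we never block on an
 * AGF below an AG already locked in this transaction.
 */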
3732 
3733 /*
3734  * Iterate the AGs from the start AG to the end of the filesystem, trying to
3735  * allocate blocks. It starts with a near allocation attempt in the initial
3736  * AG, then falls back to anywhere-in-AG after the first AG fails. It will wrap
3737  * back to zero if allowed by previous allocations in this transaction,
3738  * otherwise will wrap back to the start AG and run a second blocking pass to
3739  * the end of the filesystem.
3740  */
3741 int
3742 xfs_alloc_vextent_start_ag(
3743 	struct xfs_alloc_arg	*args,
3744 	xfs_fsblock_t		target)
3745 {
3746 	struct xfs_mount	*mp = args->mp;
3747 	xfs_agnumber_t		minimum_agno;
3748 	xfs_agnumber_t		start_agno;
3749 	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3750 	bool			bump_rotor = false;
3751 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3752 	int			error;
3753 
3754 	ASSERT(args->pag == NULL);
3755 
3756 	args->agno = NULLAGNUMBER;
3757 	args->agbno = NULLAGBLOCK;
3758 
3759 	trace_xfs_alloc_vextent_start_ag(args);
3760 
3761 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3762 	if (error) {
3763 		if (error == -ENOSPC)
3764 			return 0;
3765 		return error;
3766 	}
3767 
3768 	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3769 	    xfs_is_inode32(mp)) {
3770 		target = XFS_AGB_TO_FSB(mp,
3771 				((mp->m_agfrotor / rotorstep) %
3772 				mp->m_sb.sb_agcount), 0);
3773 		bump_rotor = true;
3774 	}
3775 
3776 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3777 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3778 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3779 
3780 	if (bump_rotor) {
3781 		if (args->agno == start_agno)
3782 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3783 				(mp->m_sb.sb_agcount * rotorstep);
3784 		else
3785 			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3786 				(mp->m_sb.sb_agcount * rotorstep);
3787 	}
3788 
3789 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3790 }
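
/*
 * Illustrative sketch (not part of the original source): an initial data
 * extent allocation with nothing more than a locality hint typically comes
 * through here. The hint variables are assumptions, and XFS_ALLOC_USERDATA
 * is assumed to be the matching datatype flag:
 *
 *	args.datatype = XFS_ALLOC_USERDATA | XFS_ALLOC_INITIAL_USER_DATA;
 *	error = xfs_alloc_vextent_start_ag(&args,
 *			XFS_AGB_TO_FSB(mp, hint_agno, hint_agbno));
 *
 * The first AG tried gets a near-bno attempt at the hint; every other AG is
 * searched by size only, and args.fsbno is NULLFSBLOCK if all AGs fail.
 */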
3791 
3792 /*
3793  * Iterate from the agno indicated via @target through to the end of the
3794  * filesystem attempting blocking allocation. This does not wrap or try a second
3795  * pass, so it will never consider AGs lower than the one indicated by @target.
3796  */
3797 int
3798 xfs_alloc_vextent_first_ag(
3799 	struct xfs_alloc_arg	*args,
3800 	xfs_fsblock_t		target)
3801 {
3802 	struct xfs_mount	*mp = args->mp;
3803 	xfs_agnumber_t		minimum_agno;
3804 	xfs_agnumber_t		start_agno;
3805 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3806 	int			error;
3807 
3808 	ASSERT(args->pag == NULL);
3809 
3810 	args->agno = NULLAGNUMBER;
3811 	args->agbno = NULLAGBLOCK;
3812 
3813 	trace_xfs_alloc_vextent_first_ag(args);
3814 
3815 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3816 	if (error) {
3817 		if (error == -ENOSPC)
3818 			return 0;
3819 		return error;
3820 	}
3821 
3822 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3823 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3824 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3825 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3826 }
3827 
3828 /*
3829  * Allocate at the exact block target or fail. Caller is expected to hold a
3830  * perag reference in args->pag.
3831  */
3832 int
3833 xfs_alloc_vextent_exact_bno(
3834 	struct xfs_alloc_arg	*args,
3835 	xfs_fsblock_t		target)
3836 {
3837 	struct xfs_mount	*mp = args->mp;
3838 	xfs_agnumber_t		minimum_agno;
3839 	int			error;
3840 
3841 	ASSERT(args->pag != NULL);
3842 	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3843 
3844 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3845 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3846 
3847 	trace_xfs_alloc_vextent_exact_bno(args);
3848 
3849 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3850 	if (error) {
3851 		if (error == -ENOSPC)
3852 			return 0;
3853 		return error;
3854 	}
3855 
3856 	error = xfs_alloc_vextent_prepare_ag(args, 0);
3857 	if (!error && args->agbp)
3858 		error = xfs_alloc_ag_vextent_exact(args);
3859 
3860 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3861 }
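
/*
 * Example (added for illustration): extending an existing extent in place is
 * the typical use of the exact allocator - ask for the block immediately
 * after the current extent and accept failure if it is busy or allocated.
 * The caller state (@next_fsb and the rest of @args) is assumed:
 *
 *	args.pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, next_fsb));
 *	error = xfs_alloc_vextent_exact_bno(&args, next_fsb);
 *	xfs_perag_put(args.pag);
 *
 * args.fsbno is NULLFSBLOCK when the exact block could not be allocated.
 */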
3862 
3863 /*
3864  * Allocate an extent as close to the target as possible. If there are no
3865  * viable candidates in the AG, then fail the allocation.
3866  *
3867  * Caller may or may not have a per-ag reference in args->pag.
3868  */
3869 int
3870 xfs_alloc_vextent_near_bno(
3871 	struct xfs_alloc_arg	*args,
3872 	xfs_fsblock_t		target)
3873 {
3874 	struct xfs_mount	*mp = args->mp;
3875 	xfs_agnumber_t		minimum_agno;
3876 	bool			needs_perag = args->pag == NULL;
3877 	uint32_t		alloc_flags = 0;
3878 	int			error;
3879 
3880 	if (!needs_perag)
3881 		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3882 
3883 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3884 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3885 
3886 	trace_xfs_alloc_vextent_near_bno(args);
3887 
3888 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3889 	if (error) {
3890 		if (error == -ENOSPC)
3891 			return 0;
3892 		return error;
3893 	}
3894 
3895 	if (needs_perag)
3896 		args->pag = xfs_perag_grab(mp, args->agno);
3897 
3898 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3899 	if (!error && args->agbp)
3900 		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3901 
3902 	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3903 }
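
/*
 * Illustrative sketch (not in the original source): metadata allocations that
 * only want locality to an existing block, e.g. a new btree block near its
 * parent, can call this without holding a perag reference:
 *
 *	error = xfs_alloc_vextent_near_bno(&args,
 *			XFS_AGB_TO_FSB(mp, parent_agno, parent_agbno));
 *
 * In that case the perag is grabbed and released internally; if the caller
 * already holds args.pag it must cover the target AG and is left referenced.
 */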
3904 
3905 /* Ensure that the freelist is at full capacity. */
3906 int
3907 xfs_free_extent_fix_freelist(
3908 	struct xfs_trans	*tp,
3909 	struct xfs_perag	*pag,
3910 	struct xfs_buf		**agbp)
3911 {
3912 	struct xfs_alloc_arg	args;
3913 	int			error;
3914 
3915 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3916 	args.tp = tp;
3917 	args.mp = tp->t_mountp;
3918 	args.agno = pag->pag_agno;
3919 	args.pag = pag;
3920 
3921 	/*
3922 	 * Validate that the AG number is legal - this enables us to detect and
3923 	 * handle silent filesystem corruption rather than crashing.
3924 	 */
3925 	if (args.agno >= args.mp->m_sb.sb_agcount)
3926 		return -EFSCORRUPTED;
3927 
3928 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3929 	if (error)
3930 		return error;
3931 
3932 	*agbp = args.agbp;
3933 	return 0;
3934 }
3935 
3936 /*
3937  * Free an extent.
3938  * Just break up the extent address and hand off to xfs_free_ag_extent
3939  * after fixing up the freelist.
3940  */
3941 int
3942 __xfs_free_extent(
3943 	struct xfs_trans		*tp,
3944 	struct xfs_perag		*pag,
3945 	xfs_agblock_t			agbno,
3946 	xfs_extlen_t			len,
3947 	const struct xfs_owner_info	*oinfo,
3948 	enum xfs_ag_resv_type		type,
3949 	bool				skip_discard)
3950 {
3951 	struct xfs_mount		*mp = tp->t_mountp;
3952 	struct xfs_buf			*agbp;
3953 	struct xfs_agf			*agf;
3954 	int				error;
3955 	unsigned int			busy_flags = 0;
3956 
3957 	ASSERT(len != 0);
3958 	ASSERT(type != XFS_AG_RESV_AGFL);
3959 
3960 	if (XFS_TEST_ERROR(false, mp,
3961 			XFS_ERRTAG_FREE_EXTENT))
3962 		return -EIO;
3963 
3964 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3965 	if (error) {
3966 		if (xfs_metadata_is_sick(error))
3967 			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3968 		return error;
3969 	}
3970 
3971 	agf = agbp->b_addr;
3972 
3973 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3974 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3975 		error = -EFSCORRUPTED;
3976 		goto err_release;
3977 	}
3978 
3979 	/* validate that the extent size is legal now that we have the AGF locked */
3980 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3981 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3982 		error = -EFSCORRUPTED;
3983 		goto err_release;
3984 	}
3985 
3986 	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3987 			type);
3988 	if (error)
3989 		goto err_release;
3990 
3991 	if (skip_discard)
3992 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3993 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3994 	return 0;
3995 
3996 err_release:
3997 	xfs_trans_brelse(tp, agbp);
3998 	return error;
3999 }
4000 
4001 struct xfs_alloc_query_range_info {
4002 	xfs_alloc_query_range_fn	fn;
4003 	void				*priv;
4004 };
4005 
4006 /* Format btree record and pass to our callback. */
4007 STATIC int
4008 xfs_alloc_query_range_helper(
4009 	struct xfs_btree_cur		*cur,
4010 	const union xfs_btree_rec	*rec,
4011 	void				*priv)
4012 {
4013 	struct xfs_alloc_query_range_info	*query = priv;
4014 	struct xfs_alloc_rec_incore		irec;
4015 	xfs_failaddr_t				fa;
4016 
4017 	xfs_alloc_btrec_to_irec(rec, &irec);
4018 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
4019 	if (fa)
4020 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
4021 
4022 	return query->fn(cur, &irec, query->priv);
4023 }
4024 
4025 /* Find all free space within a given range of blocks. */
4026 int
4027 xfs_alloc_query_range(
4028 	struct xfs_btree_cur			*cur,
4029 	const struct xfs_alloc_rec_incore	*low_rec,
4030 	const struct xfs_alloc_rec_incore	*high_rec,
4031 	xfs_alloc_query_range_fn		fn,
4032 	void					*priv)
4033 {
4034 	union xfs_btree_irec			low_brec = { .a = *low_rec };
4035 	union xfs_btree_irec			high_brec = { .a = *high_rec };
4036 	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
4037 
4038 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
4039 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
4040 			xfs_alloc_query_range_helper, &query);
4041 }
4042 
4043 /* Find all free space records. */
4044 int
4045 xfs_alloc_query_all(
4046 	struct xfs_btree_cur			*cur,
4047 	xfs_alloc_query_range_fn		fn,
4048 	void					*priv)
4049 {
4050 	struct xfs_alloc_query_range_info	query;
4051 
4052 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
4053 	query.priv = priv;
4054 	query.fn = fn;
4055 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
4056 }
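
/*
 * Example callback (added for illustration, not part of the original source)
 * for xfs_alloc_query_range()/xfs_alloc_query_all(). The signature mirrors
 * the call made from xfs_alloc_query_range_helper() above; the counting
 * logic and names are hypothetical:
 *
 *	static int
 *	xfs_count_free_blocks_helper(
 *		struct xfs_btree_cur			*cur,
 *		const struct xfs_alloc_rec_incore	*rec,
 *		void					*priv)
 *	{
 *		xfs_extlen_t	*blocks = priv;
 *
 *		*blocks += rec->ar_blockcount;
 *		return 0;
 *	}
 *
 *	error = xfs_alloc_query_all(bno_cur, xfs_count_free_blocks_helper,
 *			&blocks);
 */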
4057 
4058 /*
4059  * Scan part of the keyspace of the free space and tell us if the area has no
4060  * records, is fully mapped by records, or is partially filled.
4061  */
4062 int
4063 xfs_alloc_has_records(
4064 	struct xfs_btree_cur	*cur,
4065 	xfs_agblock_t		bno,
4066 	xfs_extlen_t		len,
4067 	enum xbtree_recpacking	*outcome)
4068 {
4069 	union xfs_btree_irec	low;
4070 	union xfs_btree_irec	high;
4071 
4072 	memset(&low, 0, sizeof(low));
4073 	low.a.ar_startblock = bno;
4074 	memset(&high, 0xFF, sizeof(high));
4075 	high.a.ar_startblock = bno + len - 1;
4076 
4077 	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
4078 }
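
/*
 * Usage sketch (illustration only): callers generally just act on the
 * packing result. The XBTREE_RECPACKING_* names are assumed from the
 * xbtree_recpacking enum:
 *
 *	error = xfs_alloc_has_records(bno_cur, agbno, len, &outcome);
 *	if (!error && outcome != XBTREE_RECPACKING_EMPTY)
 *		...	some or all of [agbno, agbno + len) is free space
 */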
4079 
4080 /*
4081  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
4082  * error code or XFS_ITER_*.
4083  */
4084 int
4085 xfs_agfl_walk(
4086 	struct xfs_mount	*mp,
4087 	struct xfs_agf		*agf,
4088 	struct xfs_buf		*agflbp,
4089 	xfs_agfl_walk_fn	walk_fn,
4090 	void			*priv)
4091 {
4092 	__be32			*agfl_bno;
4093 	unsigned int		i;
4094 	int			error;
4095 
4096 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
4097 	i = be32_to_cpu(agf->agf_flfirst);
4098 
4099 	/* Nothing to walk in an empty AGFL. */
4100 	if (agf->agf_flcount == cpu_to_be32(0))
4101 		return 0;
4102 
4103 	/* Otherwise, walk from first to last, wrapping as needed. */
4104 	for (;;) {
4105 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
4106 		if (error)
4107 			return error;
4108 		if (i == be32_to_cpu(agf->agf_fllast))
4109 			break;
4110 		if (++i == xfs_agfl_size(mp))
4111 			i = 0;
4112 	}
4113 
4114 	return 0;
4115 }
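
/*
 * Example walk function (added for illustration; the callback signature is
 * taken from the walk_fn invocation above, everything else is hypothetical):
 *
 *	static int
 *	xfs_agfl_count_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int	*count = priv;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	error = xfs_agfl_walk(mp, agbp->b_addr, agflbp, xfs_agfl_count_block,
 *			&count);
 *
 * Returning a nonzero value from the callback stops the walk; negative
 * errnos and XFS_ITER_* values are passed straight back to the caller.
 */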
4116 
4117 int __init
4118 xfs_extfree_intent_init_cache(void)
4119 {
4120 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
4121 			sizeof(struct xfs_extent_free_item),
4122 			0, 0, NULL);
4123 
4124 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
4125 }
4126 
4127 void
4128 xfs_extfree_intent_destroy_cache(void)
4129 {
4130 	kmem_cache_destroy(xfs_extfree_item_cache);
4131 	xfs_extfree_item_cache = NULL;
4132 }
4133