xref: /linux/fs/xfs/libxfs/xfs_alloc.c (revision 881f1bb5e25c8982ed963b2d319fc0fc732e55db)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 #include "xfs_health.h"
30 
31 struct kmem_cache	*xfs_extfree_item_cache;
32 
33 struct workqueue_struct *xfs_alloc_wq;
34 
35 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 
37 #define	XFSA_FIXUP_BNO_OK	1
38 #define	XFSA_FIXUP_CNT_OK	2
39 
40 /*
41  * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
42  * the beginning of the block for a proper header with the location information
43  * and CRC.
44  */
45 unsigned int
46 xfs_agfl_size(
47 	struct xfs_mount	*mp)
48 {
49 	unsigned int		size = mp->m_sb.sb_sectsize;
50 
51 	if (xfs_has_crc(mp))
52 		size -= sizeof(struct xfs_agfl);
53 
54 	return size / sizeof(xfs_agblock_t);
55 }
56 
57 unsigned int
58 xfs_refc_block(
59 	struct xfs_mount	*mp)
60 {
61 	if (xfs_has_rmapbt(mp))
62 		return XFS_RMAP_BLOCK(mp) + 1;
63 	if (xfs_has_finobt(mp))
64 		return XFS_FIBT_BLOCK(mp) + 1;
65 	return XFS_IBT_BLOCK(mp) + 1;
66 }
67 
68 xfs_extlen_t
69 xfs_prealloc_blocks(
70 	struct xfs_mount	*mp)
71 {
72 	if (xfs_has_reflink(mp))
73 		return xfs_refc_block(mp) + 1;
74 	if (xfs_has_rmapbt(mp))
75 		return XFS_RMAP_BLOCK(mp) + 1;
76 	if (xfs_has_finobt(mp))
77 		return XFS_FIBT_BLOCK(mp) + 1;
78 	return XFS_IBT_BLOCK(mp) + 1;
79 }
80 
81 /*
82  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
83  * guarantee that we can refill the AGFL prior to allocating space in a nearly
84  * full AG.  Although the space described by the free space btrees, the
85  * blocks used by the freesp btrees themselves, and the blocks owned by the
86  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
87  * free space in the AG drop so low that the free space btrees cannot refill an
88  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
89  * until the fs goes down, we subtract this many AG blocks from the incore
90  * fdblocks to ensure user allocation does not overcommit the space the
91  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
92  * withhold space from xfs_mod_fdblocks, so we do not account for that here.
93  */
94 #define XFS_ALLOCBT_AGFL_RESERVE	4
95 
96 /*
97  * Compute the number of blocks that we set aside to guarantee the ability to
98  * refill the AGFL and handle a full bmap btree split.
99  *
100  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
101  * AGF buffer (PV 947395), we place constraints on the relationship among
102  * actual allocations for data blocks, freelist blocks, and potential file data
103  * bmap btree blocks. However, these restrictions may result in no actual space
104  * allocated for a delayed extent, for example, a data block in a certain AG is
105  * allocated but there is no additional block for the additional bmap btree
106  * block due to a split of the bmap btree of the file. The result of this may
107  * lead to an infinite loop when the file gets flushed to disk and all delayed
108  * extents need to be actually allocated. To get around this, we explicitly set
109  * aside a few blocks which will not be reserved in delayed allocation.
110  *
111  * For each AG, we need to reserve enough blocks to replenish a totally empty
112  * AGFL and 4 more to handle a potential split of the file's bmap btree.
113  */
114 unsigned int
115 xfs_alloc_set_aside(
116 	struct xfs_mount	*mp)
117 {
118 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
119 }
120 
121 /*
122  * When deciding how much space to allocate out of an AG, we limit the
123  * allocation maximum size to the size the AG. However, we cannot use all the
124  * blocks in the AG - some are permanently used by metadata. These
125  * blocks are generally:
126  *	- the AG superblock, AGF, AGI and AGFL
127  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
128  *	  the AGI free inode and rmap btree root blocks.
129  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
130  *	- the rmapbt root block
131  *
132  * The AG headers are sector sized, so the amount of space they take up is
133  * dependent on filesystem geometry. The others are all single blocks.
134  */
135 unsigned int
136 xfs_alloc_ag_max_usable(
137 	struct xfs_mount	*mp)
138 {
139 	unsigned int		blocks;
140 
141 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
142 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
143 	blocks += 3;			/* AGF, AGI btree root blocks */
144 	if (xfs_has_finobt(mp))
145 		blocks++;		/* finobt root block */
146 	if (xfs_has_rmapbt(mp))
147 		blocks++;		/* rmap root block */
148 	if (xfs_has_reflink(mp))
149 		blocks++;		/* refcount root block */
150 
151 	return mp->m_sb.sb_agblocks - blocks;
152 }
153 
154 
155 static int
156 xfs_alloc_lookup(
157 	struct xfs_btree_cur	*cur,
158 	xfs_lookup_t		dir,
159 	xfs_agblock_t		bno,
160 	xfs_extlen_t		len,
161 	int			*stat)
162 {
163 	int			error;
164 
165 	cur->bc_rec.a.ar_startblock = bno;
166 	cur->bc_rec.a.ar_blockcount = len;
167 	error = xfs_btree_lookup(cur, dir, stat);
168 	if (*stat == 1)
169 		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
170 	else
171 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
172 	return error;
173 }
174 
175 /*
176  * Lookup the record equal to [bno, len] in the btree given by cur.
177  */
178 static inline int				/* error */
179 xfs_alloc_lookup_eq(
180 	struct xfs_btree_cur	*cur,	/* btree cursor */
181 	xfs_agblock_t		bno,	/* starting block of extent */
182 	xfs_extlen_t		len,	/* length of extent */
183 	int			*stat)	/* success/failure */
184 {
185 	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
186 }
187 
188 /*
189  * Lookup the first record greater than or equal to [bno, len]
190  * in the btree given by cur.
191  */
192 int				/* error */
193 xfs_alloc_lookup_ge(
194 	struct xfs_btree_cur	*cur,	/* btree cursor */
195 	xfs_agblock_t		bno,	/* starting block of extent */
196 	xfs_extlen_t		len,	/* length of extent */
197 	int			*stat)	/* success/failure */
198 {
199 	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
200 }
201 
202 /*
203  * Lookup the first record less than or equal to [bno, len]
204  * in the btree given by cur.
205  */
206 int					/* error */
207 xfs_alloc_lookup_le(
208 	struct xfs_btree_cur	*cur,	/* btree cursor */
209 	xfs_agblock_t		bno,	/* starting block of extent */
210 	xfs_extlen_t		len,	/* length of extent */
211 	int			*stat)	/* success/failure */
212 {
213 	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
214 }
215 
216 static inline bool
217 xfs_alloc_cur_active(
218 	struct xfs_btree_cur	*cur)
219 {
220 	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
221 }
222 
223 /*
224  * Update the record referred to by cur to the value given
225  * by [bno, len].
226  * This either works (return 0) or gets an EFSCORRUPTED error.
227  */
228 STATIC int				/* error */
229 xfs_alloc_update(
230 	struct xfs_btree_cur	*cur,	/* btree cursor */
231 	xfs_agblock_t		bno,	/* starting block of extent */
232 	xfs_extlen_t		len)	/* length of extent */
233 {
234 	union xfs_btree_rec	rec;
235 
236 	rec.alloc.ar_startblock = cpu_to_be32(bno);
237 	rec.alloc.ar_blockcount = cpu_to_be32(len);
238 	return xfs_btree_update(cur, &rec);
239 }
240 
241 /* Convert the ondisk btree record to its incore representation. */
242 void
243 xfs_alloc_btrec_to_irec(
244 	const union xfs_btree_rec	*rec,
245 	struct xfs_alloc_rec_incore	*irec)
246 {
247 	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
248 	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
249 }
250 
251 /* Simple checks for free space records. */
252 xfs_failaddr_t
253 xfs_alloc_check_irec(
254 	struct xfs_perag			*pag,
255 	const struct xfs_alloc_rec_incore	*irec)
256 {
257 	if (irec->ar_blockcount == 0)
258 		return __this_address;
259 
260 	/* check for valid extent range, including overflow */
261 	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
262 		return __this_address;
263 
264 	return NULL;
265 }
266 
267 static inline int
268 xfs_alloc_complain_bad_rec(
269 	struct xfs_btree_cur		*cur,
270 	xfs_failaddr_t			fa,
271 	const struct xfs_alloc_rec_incore *irec)
272 {
273 	struct xfs_mount		*mp = cur->bc_mp;
274 
275 	xfs_warn(mp,
276 		"%sbt record corruption in AG %d detected at %pS!",
277 		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
278 	xfs_warn(mp,
279 		"start block 0x%x block count 0x%x", irec->ar_startblock,
280 		irec->ar_blockcount);
281 	xfs_btree_mark_sick(cur);
282 	return -EFSCORRUPTED;
283 }
284 
285 /*
286  * Get the data from the pointed-to record.
287  */
288 int					/* error */
289 xfs_alloc_get_rec(
290 	struct xfs_btree_cur	*cur,	/* btree cursor */
291 	xfs_agblock_t		*bno,	/* output: starting block of extent */
292 	xfs_extlen_t		*len,	/* output: length of extent */
293 	int			*stat)	/* output: success/failure */
294 {
295 	struct xfs_alloc_rec_incore irec;
296 	union xfs_btree_rec	*rec;
297 	xfs_failaddr_t		fa;
298 	int			error;
299 
300 	error = xfs_btree_get_rec(cur, &rec, stat);
301 	if (error || !(*stat))
302 		return error;
303 
304 	xfs_alloc_btrec_to_irec(rec, &irec);
305 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
306 	if (fa)
307 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
308 
309 	*bno = irec.ar_startblock;
310 	*len = irec.ar_blockcount;
311 	return 0;
312 }
313 
314 /*
315  * Compute aligned version of the found extent.
316  * Takes alignment and min length into account.
317  */
318 STATIC bool
319 xfs_alloc_compute_aligned(
320 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
321 	xfs_agblock_t	foundbno,	/* starting block in found extent */
322 	xfs_extlen_t	foundlen,	/* length in found extent */
323 	xfs_agblock_t	*resbno,	/* result block number */
324 	xfs_extlen_t	*reslen,	/* result length */
325 	unsigned	*busy_gen)
326 {
327 	xfs_agblock_t	bno = foundbno;
328 	xfs_extlen_t	len = foundlen;
329 	xfs_extlen_t	diff;
330 	bool		busy;
331 
332 	/* Trim busy sections out of found extent */
333 	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
334 
335 	/*
336 	 * If we have a largish extent that happens to start before min_agbno,
337 	 * see if we can shift it into range...
338 	 */
339 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
340 		diff = args->min_agbno - bno;
341 		if (len > diff) {
342 			bno += diff;
343 			len -= diff;
344 		}
345 	}
346 
347 	if (args->alignment > 1 && len >= args->minlen) {
348 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
349 
350 		diff = aligned_bno - bno;
351 
352 		*resbno = aligned_bno;
353 		*reslen = diff >= len ? 0 : len - diff;
354 	} else {
355 		*resbno = bno;
356 		*reslen = len;
357 	}
358 
359 	return busy;
360 }
361 
362 /*
363  * Compute best start block and diff for "near" allocations.
364  * freelen >= wantlen already checked by caller.
365  */
366 STATIC xfs_extlen_t			/* difference value (absolute) */
367 xfs_alloc_compute_diff(
368 	xfs_agblock_t	wantbno,	/* target starting block */
369 	xfs_extlen_t	wantlen,	/* target length */
370 	xfs_extlen_t	alignment,	/* target alignment */
371 	int		datatype,	/* are we allocating data? */
372 	xfs_agblock_t	freebno,	/* freespace's starting block */
373 	xfs_extlen_t	freelen,	/* freespace's length */
374 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
375 {
376 	xfs_agblock_t	freeend;	/* end of freespace extent */
377 	xfs_agblock_t	newbno1;	/* return block number */
378 	xfs_agblock_t	newbno2;	/* other new block number */
379 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
380 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
381 	xfs_agblock_t	wantend;	/* end of target extent */
382 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
383 
384 	ASSERT(freelen >= wantlen);
385 	freeend = freebno + freelen;
386 	wantend = wantbno + wantlen;
387 	/*
388 	 * We want to allocate from the start of a free extent if it is past
389 	 * the desired block or if we are allocating user data and the free
390 	 * extent is before desired block. The second case is there to allow
391 	 * for contiguous allocation from the remaining free space if the file
392 	 * grows in the short term.
393 	 */
394 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
395 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
396 			newbno1 = NULLAGBLOCK;
397 	} else if (freeend >= wantend && alignment > 1) {
398 		newbno1 = roundup(wantbno, alignment);
399 		newbno2 = newbno1 - alignment;
400 		if (newbno1 >= freeend)
401 			newbno1 = NULLAGBLOCK;
402 		else
403 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
404 		if (newbno2 < freebno)
405 			newbno2 = NULLAGBLOCK;
406 		else
407 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
408 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
409 			if (newlen1 < newlen2 ||
410 			    (newlen1 == newlen2 &&
411 			     XFS_ABSDIFF(newbno1, wantbno) >
412 			     XFS_ABSDIFF(newbno2, wantbno)))
413 				newbno1 = newbno2;
414 		} else if (newbno2 != NULLAGBLOCK)
415 			newbno1 = newbno2;
416 	} else if (freeend >= wantend) {
417 		newbno1 = wantbno;
418 	} else if (alignment > 1) {
419 		newbno1 = roundup(freeend - wantlen, alignment);
420 		if (newbno1 > freeend - wantlen &&
421 		    newbno1 - alignment >= freebno)
422 			newbno1 -= alignment;
423 		else if (newbno1 >= freeend)
424 			newbno1 = NULLAGBLOCK;
425 	} else
426 		newbno1 = freeend - wantlen;
427 	*newbnop = newbno1;
428 	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
429 }
430 
431 /*
432  * Fix up the length, based on mod and prod.
433  * len should be k * prod + mod for some k.
434  * If len is too small it is returned unchanged.
435  * If len hits maxlen it is left alone.
436  */
437 STATIC void
438 xfs_alloc_fix_len(
439 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
440 {
441 	xfs_extlen_t	k;
442 	xfs_extlen_t	rlen;
443 
444 	ASSERT(args->mod < args->prod);
445 	rlen = args->len;
446 	ASSERT(rlen >= args->minlen);
447 	ASSERT(rlen <= args->maxlen);
448 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
449 	    (args->mod == 0 && rlen < args->prod))
450 		return;
451 	k = rlen % args->prod;
452 	if (k == args->mod)
453 		return;
454 	if (k > args->mod)
455 		rlen = rlen - (k - args->mod);
456 	else
457 		rlen = rlen - args->prod + (args->mod - k);
458 	/* casts to (int) catch length underflows */
459 	if ((int)rlen < (int)args->minlen)
460 		return;
461 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
462 	ASSERT(rlen % args->prod == args->mod);
463 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
464 		rlen + args->minleft);
465 	args->len = rlen;
466 }
467 
468 /*
469  * Update the two btrees, logically removing from freespace the extent
470  * starting at rbno, rlen blocks.  The extent is contained within the
471  * actual (current) free extent fbno for flen blocks.
472  * Flags are passed in indicating whether the cursors are set to the
473  * relevant records.
474  */
475 STATIC int				/* error code */
476 xfs_alloc_fixup_trees(
477 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
478 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
479 	xfs_agblock_t	fbno,		/* starting block of free extent */
480 	xfs_extlen_t	flen,		/* length of free extent */
481 	xfs_agblock_t	rbno,		/* starting block of returned extent */
482 	xfs_extlen_t	rlen,		/* length of returned extent */
483 	int		flags)		/* flags, XFSA_FIXUP_... */
484 {
485 	int		error;		/* error code */
486 	int		i;		/* operation results */
487 	xfs_agblock_t	nfbno1;		/* first new free startblock */
488 	xfs_agblock_t	nfbno2;		/* second new free startblock */
489 	xfs_extlen_t	nflen1=0;	/* first new free length */
490 	xfs_extlen_t	nflen2=0;	/* second new free length */
491 	struct xfs_mount *mp;
492 
493 	mp = cnt_cur->bc_mp;
494 
495 	/*
496 	 * Look up the record in the by-size tree if necessary.
497 	 */
498 	if (flags & XFSA_FIXUP_CNT_OK) {
499 #ifdef DEBUG
500 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
501 			return error;
502 		if (XFS_IS_CORRUPT(mp,
503 				   i != 1 ||
504 				   nfbno1 != fbno ||
505 				   nflen1 != flen)) {
506 			xfs_btree_mark_sick(cnt_cur);
507 			return -EFSCORRUPTED;
508 		}
509 #endif
510 	} else {
511 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
512 			return error;
513 		if (XFS_IS_CORRUPT(mp, i != 1)) {
514 			xfs_btree_mark_sick(cnt_cur);
515 			return -EFSCORRUPTED;
516 		}
517 	}
518 	/*
519 	 * Look up the record in the by-block tree if necessary.
520 	 */
521 	if (flags & XFSA_FIXUP_BNO_OK) {
522 #ifdef DEBUG
523 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
524 			return error;
525 		if (XFS_IS_CORRUPT(mp,
526 				   i != 1 ||
527 				   nfbno1 != fbno ||
528 				   nflen1 != flen)) {
529 			xfs_btree_mark_sick(bno_cur);
530 			return -EFSCORRUPTED;
531 		}
532 #endif
533 	} else {
534 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
535 			return error;
536 		if (XFS_IS_CORRUPT(mp, i != 1)) {
537 			xfs_btree_mark_sick(bno_cur);
538 			return -EFSCORRUPTED;
539 		}
540 	}
541 
542 #ifdef DEBUG
543 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
544 		struct xfs_btree_block	*bnoblock;
545 		struct xfs_btree_block	*cntblock;
546 
547 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
548 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
549 
550 		if (XFS_IS_CORRUPT(mp,
551 				   bnoblock->bb_numrecs !=
552 				   cntblock->bb_numrecs)) {
553 			xfs_btree_mark_sick(bno_cur);
554 			return -EFSCORRUPTED;
555 		}
556 	}
557 #endif
558 
559 	/*
560 	 * Deal with all four cases: the allocated record is contained
561 	 * within the freespace record, so we can have new freespace
562 	 * at either (or both) end, or no freespace remaining.
563 	 */
564 	if (rbno == fbno && rlen == flen)
565 		nfbno1 = nfbno2 = NULLAGBLOCK;
566 	else if (rbno == fbno) {
567 		nfbno1 = rbno + rlen;
568 		nflen1 = flen - rlen;
569 		nfbno2 = NULLAGBLOCK;
570 	} else if (rbno + rlen == fbno + flen) {
571 		nfbno1 = fbno;
572 		nflen1 = flen - rlen;
573 		nfbno2 = NULLAGBLOCK;
574 	} else {
575 		nfbno1 = fbno;
576 		nflen1 = rbno - fbno;
577 		nfbno2 = rbno + rlen;
578 		nflen2 = (fbno + flen) - nfbno2;
579 	}
580 	/*
581 	 * Delete the entry from the by-size btree.
582 	 */
583 	if ((error = xfs_btree_delete(cnt_cur, &i)))
584 		return error;
585 	if (XFS_IS_CORRUPT(mp, i != 1)) {
586 		xfs_btree_mark_sick(cnt_cur);
587 		return -EFSCORRUPTED;
588 	}
589 	/*
590 	 * Add new by-size btree entry(s).
591 	 */
592 	if (nfbno1 != NULLAGBLOCK) {
593 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
594 			return error;
595 		if (XFS_IS_CORRUPT(mp, i != 0)) {
596 			xfs_btree_mark_sick(cnt_cur);
597 			return -EFSCORRUPTED;
598 		}
599 		if ((error = xfs_btree_insert(cnt_cur, &i)))
600 			return error;
601 		if (XFS_IS_CORRUPT(mp, i != 1)) {
602 			xfs_btree_mark_sick(cnt_cur);
603 			return -EFSCORRUPTED;
604 		}
605 	}
606 	if (nfbno2 != NULLAGBLOCK) {
607 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
608 			return error;
609 		if (XFS_IS_CORRUPT(mp, i != 0)) {
610 			xfs_btree_mark_sick(cnt_cur);
611 			return -EFSCORRUPTED;
612 		}
613 		if ((error = xfs_btree_insert(cnt_cur, &i)))
614 			return error;
615 		if (XFS_IS_CORRUPT(mp, i != 1)) {
616 			xfs_btree_mark_sick(cnt_cur);
617 			return -EFSCORRUPTED;
618 		}
619 	}
620 	/*
621 	 * Fix up the by-block btree entry(s).
622 	 */
623 	if (nfbno1 == NULLAGBLOCK) {
624 		/*
625 		 * No remaining freespace, just delete the by-block tree entry.
626 		 */
627 		if ((error = xfs_btree_delete(bno_cur, &i)))
628 			return error;
629 		if (XFS_IS_CORRUPT(mp, i != 1)) {
630 			xfs_btree_mark_sick(bno_cur);
631 			return -EFSCORRUPTED;
632 		}
633 	} else {
634 		/*
635 		 * Update the by-block entry to start later|be shorter.
636 		 */
637 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
638 			return error;
639 	}
640 	if (nfbno2 != NULLAGBLOCK) {
641 		/*
642 		 * 2 resulting free entries, need to add one.
643 		 */
644 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
645 			return error;
646 		if (XFS_IS_CORRUPT(mp, i != 0)) {
647 			xfs_btree_mark_sick(bno_cur);
648 			return -EFSCORRUPTED;
649 		}
650 		if ((error = xfs_btree_insert(bno_cur, &i)))
651 			return error;
652 		if (XFS_IS_CORRUPT(mp, i != 1)) {
653 			xfs_btree_mark_sick(bno_cur);
654 			return -EFSCORRUPTED;
655 		}
656 	}
657 	return 0;
658 }
659 
660 /*
661  * We do not verify the AGFL contents against AGF-based index counters here,
662  * even though we may have access to the perag that contains shadow copies. We
663  * don't know if the AGF based counters have been checked, and if they have they
664  * still may be inconsistent because they haven't yet been reset on the first
665  * allocation after the AGF has been read in.
666  *
667  * This means we can only check that all agfl entries contain valid or null
668  * values because we can't reliably determine the active range to exclude
669  * NULLAGBNO as a valid value.
670  *
671  * However, we can't even do that for v4 format filesystems because there are
672  * old versions of mkfs out there that does not initialise the AGFL to known,
673  * verifiable values. HEnce we can't tell the difference between a AGFL block
674  * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
675  *
676  * As a result, we can only fully validate AGFL block numbers when we pull them
677  * from the freelist in xfs_alloc_get_freelist().
678  */
679 static xfs_failaddr_t
680 xfs_agfl_verify(
681 	struct xfs_buf	*bp)
682 {
683 	struct xfs_mount *mp = bp->b_mount;
684 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
685 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
686 	int		i;
687 
688 	if (!xfs_has_crc(mp))
689 		return NULL;
690 
691 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
692 		return __this_address;
693 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
694 		return __this_address;
695 	/*
696 	 * during growfs operations, the perag is not fully initialised,
697 	 * so we can't use it for any useful checking. growfs ensures we can't
698 	 * use it by using uncached buffers that don't have the perag attached
699 	 * so we can detect and avoid this problem.
700 	 */
701 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
702 		return __this_address;
703 
704 	for (i = 0; i < xfs_agfl_size(mp); i++) {
705 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
706 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
707 			return __this_address;
708 	}
709 
710 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
711 		return __this_address;
712 	return NULL;
713 }
714 
715 static void
716 xfs_agfl_read_verify(
717 	struct xfs_buf	*bp)
718 {
719 	struct xfs_mount *mp = bp->b_mount;
720 	xfs_failaddr_t	fa;
721 
722 	/*
723 	 * There is no verification of non-crc AGFLs because mkfs does not
724 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
725 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
726 	 * can't verify just those entries are valid.
727 	 */
728 	if (!xfs_has_crc(mp))
729 		return;
730 
731 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
732 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
733 	else {
734 		fa = xfs_agfl_verify(bp);
735 		if (fa)
736 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
737 	}
738 }
739 
740 static void
741 xfs_agfl_write_verify(
742 	struct xfs_buf	*bp)
743 {
744 	struct xfs_mount	*mp = bp->b_mount;
745 	struct xfs_buf_log_item	*bip = bp->b_log_item;
746 	xfs_failaddr_t		fa;
747 
748 	/* no verification of non-crc AGFLs */
749 	if (!xfs_has_crc(mp))
750 		return;
751 
752 	fa = xfs_agfl_verify(bp);
753 	if (fa) {
754 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
755 		return;
756 	}
757 
758 	if (bip)
759 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
760 
761 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
762 }
763 
764 const struct xfs_buf_ops xfs_agfl_buf_ops = {
765 	.name = "xfs_agfl",
766 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
767 	.verify_read = xfs_agfl_read_verify,
768 	.verify_write = xfs_agfl_write_verify,
769 	.verify_struct = xfs_agfl_verify,
770 };
771 
772 /*
773  * Read in the allocation group free block array.
774  */
775 int
776 xfs_alloc_read_agfl(
777 	struct xfs_perag	*pag,
778 	struct xfs_trans	*tp,
779 	struct xfs_buf		**bpp)
780 {
781 	struct xfs_mount	*mp = pag->pag_mount;
782 	struct xfs_buf		*bp;
783 	int			error;
784 
785 	error = xfs_trans_read_buf(
786 			mp, tp, mp->m_ddev_targp,
787 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
788 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
789 	if (xfs_metadata_is_sick(error))
790 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
791 	if (error)
792 		return error;
793 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
794 	*bpp = bp;
795 	return 0;
796 }
797 
798 STATIC int
799 xfs_alloc_update_counters(
800 	struct xfs_trans	*tp,
801 	struct xfs_buf		*agbp,
802 	long			len)
803 {
804 	struct xfs_agf		*agf = agbp->b_addr;
805 
806 	agbp->b_pag->pagf_freeblks += len;
807 	be32_add_cpu(&agf->agf_freeblks, len);
808 
809 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
810 		     be32_to_cpu(agf->agf_length))) {
811 		xfs_buf_mark_corrupt(agbp);
812 		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
813 		return -EFSCORRUPTED;
814 	}
815 
816 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
817 	return 0;
818 }
819 
820 /*
821  * Block allocation algorithm and data structures.
822  */
823 struct xfs_alloc_cur {
824 	struct xfs_btree_cur		*cnt;	/* btree cursors */
825 	struct xfs_btree_cur		*bnolt;
826 	struct xfs_btree_cur		*bnogt;
827 	xfs_extlen_t			cur_len;/* current search length */
828 	xfs_agblock_t			rec_bno;/* extent startblock */
829 	xfs_extlen_t			rec_len;/* extent length */
830 	xfs_agblock_t			bno;	/* alloc bno */
831 	xfs_extlen_t			len;	/* alloc len */
832 	xfs_extlen_t			diff;	/* diff from search bno */
833 	unsigned int			busy_gen;/* busy state */
834 	bool				busy;
835 };
836 
837 /*
838  * Set up cursors, etc. in the extent allocation cursor. This function can be
839  * called multiple times to reset an initialized structure without having to
840  * reallocate cursors.
841  */
842 static int
843 xfs_alloc_cur_setup(
844 	struct xfs_alloc_arg	*args,
845 	struct xfs_alloc_cur	*acur)
846 {
847 	int			error;
848 	int			i;
849 
850 	acur->cur_len = args->maxlen;
851 	acur->rec_bno = 0;
852 	acur->rec_len = 0;
853 	acur->bno = 0;
854 	acur->len = 0;
855 	acur->diff = -1;
856 	acur->busy = false;
857 	acur->busy_gen = 0;
858 
859 	/*
860 	 * Perform an initial cntbt lookup to check for availability of maxlen
861 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
862 	 * attempt a small allocation.
863 	 */
864 	if (!acur->cnt)
865 		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
866 					args->agbp, args->pag);
867 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
868 	if (error)
869 		return error;
870 
871 	/*
872 	 * Allocate the bnobt left and right search cursors.
873 	 */
874 	if (!acur->bnolt)
875 		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
876 					args->agbp, args->pag);
877 	if (!acur->bnogt)
878 		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
879 					args->agbp, args->pag);
880 	return i == 1 ? 0 : -ENOSPC;
881 }
882 
883 static void
884 xfs_alloc_cur_close(
885 	struct xfs_alloc_cur	*acur,
886 	bool			error)
887 {
888 	int			cur_error = XFS_BTREE_NOERROR;
889 
890 	if (error)
891 		cur_error = XFS_BTREE_ERROR;
892 
893 	if (acur->cnt)
894 		xfs_btree_del_cursor(acur->cnt, cur_error);
895 	if (acur->bnolt)
896 		xfs_btree_del_cursor(acur->bnolt, cur_error);
897 	if (acur->bnogt)
898 		xfs_btree_del_cursor(acur->bnogt, cur_error);
899 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
900 }
901 
902 /*
903  * Check an extent for allocation and track the best available candidate in the
904  * allocation structure. The cursor is deactivated if it has entered an out of
905  * range state based on allocation arguments. Optionally return the extent
906  * extent geometry and allocation status if requested by the caller.
907  */
908 static int
909 xfs_alloc_cur_check(
910 	struct xfs_alloc_arg	*args,
911 	struct xfs_alloc_cur	*acur,
912 	struct xfs_btree_cur	*cur,
913 	int			*new)
914 {
915 	int			error, i;
916 	xfs_agblock_t		bno, bnoa, bnew;
917 	xfs_extlen_t		len, lena, diff = -1;
918 	bool			busy;
919 	unsigned		busy_gen = 0;
920 	bool			deactivate = false;
921 	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);
922 
923 	*new = 0;
924 
925 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
926 	if (error)
927 		return error;
928 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
929 		xfs_btree_mark_sick(cur);
930 		return -EFSCORRUPTED;
931 	}
932 
933 	/*
934 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
935 	 * range (i.e., walking backwards looking for a minlen extent).
936 	 */
937 	if (len < args->minlen) {
938 		deactivate = !isbnobt;
939 		goto out;
940 	}
941 
942 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
943 					 &busy_gen);
944 	acur->busy |= busy;
945 	if (busy)
946 		acur->busy_gen = busy_gen;
947 	/* deactivate a bnobt cursor outside of locality range */
948 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
949 		deactivate = isbnobt;
950 		goto out;
951 	}
952 	if (lena < args->minlen)
953 		goto out;
954 
955 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
956 	xfs_alloc_fix_len(args);
957 	ASSERT(args->len >= args->minlen);
958 	if (args->len < acur->len)
959 		goto out;
960 
961 	/*
962 	 * We have an aligned record that satisfies minlen and beats or matches
963 	 * the candidate extent size. Compare locality for near allocation mode.
964 	 */
965 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
966 				      args->alignment, args->datatype,
967 				      bnoa, lena, &bnew);
968 	if (bnew == NULLAGBLOCK)
969 		goto out;
970 
971 	/*
972 	 * Deactivate a bnobt cursor with worse locality than the current best.
973 	 */
974 	if (diff > acur->diff) {
975 		deactivate = isbnobt;
976 		goto out;
977 	}
978 
979 	ASSERT(args->len > acur->len ||
980 	       (args->len == acur->len && diff <= acur->diff));
981 	acur->rec_bno = bno;
982 	acur->rec_len = len;
983 	acur->bno = bnew;
984 	acur->len = args->len;
985 	acur->diff = diff;
986 	*new = 1;
987 
988 	/*
989 	 * We're done if we found a perfect allocation. This only deactivates
990 	 * the current cursor, but this is just an optimization to terminate a
991 	 * cntbt search that otherwise runs to the edge of the tree.
992 	 */
993 	if (acur->diff == 0 && acur->len == args->maxlen)
994 		deactivate = true;
995 out:
996 	if (deactivate)
997 		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
998 	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
999 	return 0;
1000 }
1001 
1002 /*
1003  * Complete an allocation of a candidate extent. Remove the extent from both
1004  * trees and update the args structure.
1005  */
1006 STATIC int
1007 xfs_alloc_cur_finish(
1008 	struct xfs_alloc_arg	*args,
1009 	struct xfs_alloc_cur	*acur)
1010 {
1011 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1012 	int			error;
1013 
1014 	ASSERT(acur->cnt && acur->bnolt);
1015 	ASSERT(acur->bno >= acur->rec_bno);
1016 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
1017 	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
1018 
1019 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
1020 				      acur->rec_len, acur->bno, acur->len, 0);
1021 	if (error)
1022 		return error;
1023 
1024 	args->agbno = acur->bno;
1025 	args->len = acur->len;
1026 	args->wasfromfl = 0;
1027 
1028 	trace_xfs_alloc_cur(args);
1029 	return 0;
1030 }
1031 
1032 /*
1033  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
1034  * bno optimized lookup to search for extents with ideal size and locality.
1035  */
1036 STATIC int
1037 xfs_alloc_cntbt_iter(
1038 	struct xfs_alloc_arg		*args,
1039 	struct xfs_alloc_cur		*acur)
1040 {
1041 	struct xfs_btree_cur	*cur = acur->cnt;
1042 	xfs_agblock_t		bno;
1043 	xfs_extlen_t		len, cur_len;
1044 	int			error;
1045 	int			i;
1046 
1047 	if (!xfs_alloc_cur_active(cur))
1048 		return 0;
1049 
1050 	/* locality optimized lookup */
1051 	cur_len = acur->cur_len;
1052 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1053 	if (error)
1054 		return error;
1055 	if (i == 0)
1056 		return 0;
1057 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1058 	if (error)
1059 		return error;
1060 
1061 	/* check the current record and update search length from it */
1062 	error = xfs_alloc_cur_check(args, acur, cur, &i);
1063 	if (error)
1064 		return error;
1065 	ASSERT(len >= acur->cur_len);
1066 	acur->cur_len = len;
1067 
1068 	/*
1069 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1070 	 * secondary key and so the current record may lie just before or after
1071 	 * agbno. If it is past agbno, check the previous record too so long as
1072 	 * the length matches as it may be closer. Don't check a smaller record
1073 	 * because that could deactivate our cursor.
1074 	 */
1075 	if (bno > args->agbno) {
1076 		error = xfs_btree_decrement(cur, 0, &i);
1077 		if (!error && i) {
1078 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1079 			if (!error && i && len == acur->cur_len)
1080 				error = xfs_alloc_cur_check(args, acur, cur,
1081 							    &i);
1082 		}
1083 		if (error)
1084 			return error;
1085 	}
1086 
1087 	/*
1088 	 * Increment the search key until we find at least one allocation
1089 	 * candidate or if the extent we found was larger. Otherwise, double the
1090 	 * search key to optimize the search. Efficiency is more important here
1091 	 * than absolute best locality.
1092 	 */
1093 	cur_len <<= 1;
1094 	if (!acur->len || acur->cur_len >= cur_len)
1095 		acur->cur_len++;
1096 	else
1097 		acur->cur_len = cur_len;
1098 
1099 	return error;
1100 }
1101 
1102 /*
1103  * Deal with the case where only small freespaces remain. Either return the
1104  * contents of the last freespace record, or allocate space from the freelist if
1105  * there is nothing in the tree.
1106  */
1107 STATIC int			/* error */
1108 xfs_alloc_ag_vextent_small(
1109 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1110 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1111 	xfs_agblock_t		*fbnop,	/* result block number */
1112 	xfs_extlen_t		*flenp,	/* result length */
1113 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1114 {
1115 	struct xfs_agf		*agf = args->agbp->b_addr;
1116 	int			error = 0;
1117 	xfs_agblock_t		fbno = NULLAGBLOCK;
1118 	xfs_extlen_t		flen = 0;
1119 	int			i = 0;
1120 
1121 	/*
1122 	 * If a cntbt cursor is provided, try to allocate the largest record in
1123 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1124 	 * allocation. Make sure to respect minleft even when pulling from the
1125 	 * freelist.
1126 	 */
1127 	if (ccur)
1128 		error = xfs_btree_decrement(ccur, 0, &i);
1129 	if (error)
1130 		goto error;
1131 	if (i) {
1132 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1133 		if (error)
1134 			goto error;
1135 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1136 			xfs_btree_mark_sick(ccur);
1137 			error = -EFSCORRUPTED;
1138 			goto error;
1139 		}
1140 		goto out;
1141 	}
1142 
1143 	if (args->minlen != 1 || args->alignment != 1 ||
1144 	    args->resv == XFS_AG_RESV_AGFL ||
1145 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1146 		goto out;
1147 
1148 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1149 			&fbno, 0);
1150 	if (error)
1151 		goto error;
1152 	if (fbno == NULLAGBLOCK)
1153 		goto out;
1154 
1155 	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1156 			      (args->datatype & XFS_ALLOC_NOBUSY));
1157 
1158 	if (args->datatype & XFS_ALLOC_USERDATA) {
1159 		struct xfs_buf	*bp;
1160 
1161 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1162 				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1163 				args->mp->m_bsize, 0, &bp);
1164 		if (error)
1165 			goto error;
1166 		xfs_trans_binval(args->tp, bp);
1167 	}
1168 	*fbnop = args->agbno = fbno;
1169 	*flenp = args->len = 1;
1170 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1171 		xfs_btree_mark_sick(ccur);
1172 		error = -EFSCORRUPTED;
1173 		goto error;
1174 	}
1175 	args->wasfromfl = 1;
1176 	trace_xfs_alloc_small_freelist(args);
1177 
1178 	/*
1179 	 * If we're feeding an AGFL block to something that doesn't live in the
1180 	 * free space, we need to clear out the OWN_AG rmap.
1181 	 */
1182 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1183 			      &XFS_RMAP_OINFO_AG);
1184 	if (error)
1185 		goto error;
1186 
1187 	*stat = 0;
1188 	return 0;
1189 
1190 out:
1191 	/*
1192 	 * Can't do the allocation, give up.
1193 	 */
1194 	if (flen < args->minlen) {
1195 		args->agbno = NULLAGBLOCK;
1196 		trace_xfs_alloc_small_notenough(args);
1197 		flen = 0;
1198 	}
1199 	*fbnop = fbno;
1200 	*flenp = flen;
1201 	*stat = 1;
1202 	trace_xfs_alloc_small_done(args);
1203 	return 0;
1204 
1205 error:
1206 	trace_xfs_alloc_small_error(args);
1207 	return error;
1208 }
1209 
1210 /*
1211  * Allocate a variable extent at exactly agno/bno.
1212  * Extent's length (returned in *len) will be between minlen and maxlen,
1213  * and of the form k * prod + mod unless there's nothing that large.
1214  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1215  */
1216 STATIC int			/* error */
1217 xfs_alloc_ag_vextent_exact(
1218 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1219 {
1220 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1221 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1222 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1223 	int		error;
1224 	xfs_agblock_t	fbno;	/* start block of found extent */
1225 	xfs_extlen_t	flen;	/* length of found extent */
1226 	xfs_agblock_t	tbno;	/* start block of busy extent */
1227 	xfs_extlen_t	tlen;	/* length of busy extent */
1228 	xfs_agblock_t	tend;	/* end block of busy extent */
1229 	int		i;	/* success/failure of operation */
1230 	unsigned	busy_gen;
1231 
1232 	ASSERT(args->alignment == 1);
1233 
1234 	/*
1235 	 * Allocate/initialize a cursor for the by-number freespace btree.
1236 	 */
1237 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
1238 					  args->pag);
1239 
1240 	/*
1241 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1242 	 * Look for the closest free block <= bno, it must contain bno
1243 	 * if any free block does.
1244 	 */
1245 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1246 	if (error)
1247 		goto error0;
1248 	if (!i)
1249 		goto not_found;
1250 
1251 	/*
1252 	 * Grab the freespace record.
1253 	 */
1254 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1255 	if (error)
1256 		goto error0;
1257 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1258 		xfs_btree_mark_sick(bno_cur);
1259 		error = -EFSCORRUPTED;
1260 		goto error0;
1261 	}
1262 	ASSERT(fbno <= args->agbno);
1263 
1264 	/*
1265 	 * Check for overlapping busy extents.
1266 	 */
1267 	tbno = fbno;
1268 	tlen = flen;
1269 	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1270 
1271 	/*
1272 	 * Give up if the start of the extent is busy, or the freespace isn't
1273 	 * long enough for the minimum request.
1274 	 */
1275 	if (tbno > args->agbno)
1276 		goto not_found;
1277 	if (tlen < args->minlen)
1278 		goto not_found;
1279 	tend = tbno + tlen;
1280 	if (tend < args->agbno + args->minlen)
1281 		goto not_found;
1282 
1283 	/*
1284 	 * End of extent will be smaller of the freespace end and the
1285 	 * maximal requested end.
1286 	 *
1287 	 * Fix the length according to mod and prod if given.
1288 	 */
1289 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1290 						- args->agbno;
1291 	xfs_alloc_fix_len(args);
1292 	ASSERT(args->agbno + args->len <= tend);
1293 
1294 	/*
1295 	 * We are allocating agbno for args->len
1296 	 * Allocate/initialize a cursor for the by-size btree.
1297 	 */
1298 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1299 					args->pag);
1300 	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1301 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1302 				      args->len, XFSA_FIXUP_BNO_OK);
1303 	if (error) {
1304 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1305 		goto error0;
1306 	}
1307 
1308 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1309 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1310 
1311 	args->wasfromfl = 0;
1312 	trace_xfs_alloc_exact_done(args);
1313 	return 0;
1314 
1315 not_found:
1316 	/* Didn't find it, return null. */
1317 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1318 	args->agbno = NULLAGBLOCK;
1319 	trace_xfs_alloc_exact_notfound(args);
1320 	return 0;
1321 
1322 error0:
1323 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1324 	trace_xfs_alloc_exact_error(args);
1325 	return error;
1326 }
1327 
1328 /*
1329  * Search a given number of btree records in a given direction. Check each
1330  * record against the good extent we've already found.
1331  */
1332 STATIC int
1333 xfs_alloc_walk_iter(
1334 	struct xfs_alloc_arg	*args,
1335 	struct xfs_alloc_cur	*acur,
1336 	struct xfs_btree_cur	*cur,
1337 	bool			increment,
1338 	bool			find_one, /* quit on first candidate */
1339 	int			count,    /* rec count (-1 for infinite) */
1340 	int			*stat)
1341 {
1342 	int			error;
1343 	int			i;
1344 
1345 	*stat = 0;
1346 
1347 	/*
1348 	 * Search so long as the cursor is active or we find a better extent.
1349 	 * The cursor is deactivated if it extends beyond the range of the
1350 	 * current allocation candidate.
1351 	 */
1352 	while (xfs_alloc_cur_active(cur) && count) {
1353 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1354 		if (error)
1355 			return error;
1356 		if (i == 1) {
1357 			*stat = 1;
1358 			if (find_one)
1359 				break;
1360 		}
1361 		if (!xfs_alloc_cur_active(cur))
1362 			break;
1363 
1364 		if (increment)
1365 			error = xfs_btree_increment(cur, 0, &i);
1366 		else
1367 			error = xfs_btree_decrement(cur, 0, &i);
1368 		if (error)
1369 			return error;
1370 		if (i == 0)
1371 			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
1372 
1373 		if (count > 0)
1374 			count--;
1375 	}
1376 
1377 	return 0;
1378 }
1379 
1380 /*
1381  * Search the by-bno and by-size btrees in parallel in search of an extent with
1382  * ideal locality based on the NEAR mode ->agbno locality hint.
1383  */
1384 STATIC int
1385 xfs_alloc_ag_vextent_locality(
1386 	struct xfs_alloc_arg	*args,
1387 	struct xfs_alloc_cur	*acur,
1388 	int			*stat)
1389 {
1390 	struct xfs_btree_cur	*fbcur = NULL;
1391 	int			error;
1392 	int			i;
1393 	bool			fbinc;
1394 
1395 	ASSERT(acur->len == 0);
1396 
1397 	*stat = 0;
1398 
1399 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1400 	if (error)
1401 		return error;
1402 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1403 	if (error)
1404 		return error;
1405 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1406 	if (error)
1407 		return error;
1408 
1409 	/*
1410 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1411 	 * right and lookup the closest extent to the locality hint for each
1412 	 * extent size key in the cntbt. The entire search terminates
1413 	 * immediately on a bnobt hit because that means we've found best case
1414 	 * locality. Otherwise the search continues until the cntbt cursor runs
1415 	 * off the end of the tree. If no allocation candidate is found at this
1416 	 * point, give up on locality, walk backwards from the end of the cntbt
1417 	 * and take the first available extent.
1418 	 *
1419 	 * The parallel tree searches balance each other out to provide fairly
1420 	 * consistent performance for various situations. The bnobt search can
1421 	 * have pathological behavior in the worst case scenario of larger
1422 	 * allocation requests and fragmented free space. On the other hand, the
1423 	 * bnobt is able to satisfy most smaller allocation requests much more
1424 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1425 	 * free space and sets of free extents for larger allocation requests
1426 	 * more quickly than the bnobt. Since the locality hint is just a hint
1427 	 * and we don't want to scan the entire bnobt for perfect locality, the
1428 	 * cntbt search essentially bounds the bnobt search such that we can
1429 	 * find good enough locality at reasonable performance in most cases.
1430 	 */
1431 	while (xfs_alloc_cur_active(acur->bnolt) ||
1432 	       xfs_alloc_cur_active(acur->bnogt) ||
1433 	       xfs_alloc_cur_active(acur->cnt)) {
1434 
1435 		trace_xfs_alloc_cur_lookup(args);
1436 
1437 		/*
1438 		 * Search the bnobt left and right. In the case of a hit, finish
1439 		 * the search in the opposite direction and we're done.
1440 		 */
1441 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1442 					    true, 1, &i);
1443 		if (error)
1444 			return error;
1445 		if (i == 1) {
1446 			trace_xfs_alloc_cur_left(args);
1447 			fbcur = acur->bnogt;
1448 			fbinc = true;
1449 			break;
1450 		}
1451 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1452 					    1, &i);
1453 		if (error)
1454 			return error;
1455 		if (i == 1) {
1456 			trace_xfs_alloc_cur_right(args);
1457 			fbcur = acur->bnolt;
1458 			fbinc = false;
1459 			break;
1460 		}
1461 
1462 		/*
1463 		 * Check the extent with best locality based on the current
1464 		 * extent size search key and keep track of the best candidate.
1465 		 */
1466 		error = xfs_alloc_cntbt_iter(args, acur);
1467 		if (error)
1468 			return error;
1469 		if (!xfs_alloc_cur_active(acur->cnt)) {
1470 			trace_xfs_alloc_cur_lookup_done(args);
1471 			break;
1472 		}
1473 	}
1474 
1475 	/*
1476 	 * If we failed to find anything due to busy extents, return empty
1477 	 * handed so the caller can flush and retry. If no busy extents were
1478 	 * found, walk backwards from the end of the cntbt as a last resort.
1479 	 */
1480 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1481 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1482 		if (error)
1483 			return error;
1484 		if (i) {
1485 			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
1486 			fbcur = acur->cnt;
1487 			fbinc = false;
1488 		}
1489 	}
1490 
1491 	/*
1492 	 * Search in the opposite direction for a better entry in the case of
1493 	 * a bnobt hit or walk backwards from the end of the cntbt.
1494 	 */
1495 	if (fbcur) {
1496 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1497 					    &i);
1498 		if (error)
1499 			return error;
1500 	}
1501 
1502 	if (acur->len)
1503 		*stat = 1;
1504 
1505 	return 0;
1506 }
1507 
1508 /* Check the last block of the cnt btree for allocations. */
1509 static int
1510 xfs_alloc_ag_vextent_lastblock(
1511 	struct xfs_alloc_arg	*args,
1512 	struct xfs_alloc_cur	*acur,
1513 	xfs_agblock_t		*bno,
1514 	xfs_extlen_t		*len,
1515 	bool			*allocated)
1516 {
1517 	int			error;
1518 	int			i;
1519 
1520 #ifdef DEBUG
1521 	/* Randomly don't execute the first algorithm. */
1522 	if (get_random_u32_below(2))
1523 		return 0;
1524 #endif
1525 
1526 	/*
1527 	 * Start from the entry that lookup found, sequence through all larger
1528 	 * free blocks.  If we're actually pointing at a record smaller than
1529 	 * maxlen, go to the start of this block, and skip all those smaller
1530 	 * than minlen.
1531 	 */
1532 	if (*len || args->alignment > 1) {
1533 		acur->cnt->bc_levels[0].ptr = 1;
1534 		do {
1535 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1536 			if (error)
1537 				return error;
1538 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1539 				xfs_btree_mark_sick(acur->cnt);
1540 				return -EFSCORRUPTED;
1541 			}
1542 			if (*len >= args->minlen)
1543 				break;
1544 			error = xfs_btree_increment(acur->cnt, 0, &i);
1545 			if (error)
1546 				return error;
1547 		} while (i);
1548 		ASSERT(*len >= args->minlen);
1549 		if (!i)
1550 			return 0;
1551 	}
1552 
1553 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1554 	if (error)
1555 		return error;
1556 
1557 	/*
1558 	 * It didn't work.  We COULD be in a case where there's a good record
1559 	 * somewhere, so try again.
1560 	 */
1561 	if (acur->len == 0)
1562 		return 0;
1563 
1564 	trace_xfs_alloc_near_first(args);
1565 	*allocated = true;
1566 	return 0;
1567 }
1568 
1569 /*
1570  * Allocate a variable extent near bno in the allocation group agno.
1571  * Extent's length (returned in len) will be between minlen and maxlen,
1572  * and of the form k * prod + mod unless there's nothing that large.
1573  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1574  */
1575 STATIC int
1576 xfs_alloc_ag_vextent_near(
1577 	struct xfs_alloc_arg	*args,
1578 	uint32_t		alloc_flags)
1579 {
1580 	struct xfs_alloc_cur	acur = {};
1581 	int			error;		/* error code */
1582 	int			i;		/* result code, temporary */
1583 	xfs_agblock_t		bno;
1584 	xfs_extlen_t		len;
1585 
1586 	/* handle uninitialized agbno range so caller doesn't have to */
1587 	if (!args->min_agbno && !args->max_agbno)
1588 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1589 	ASSERT(args->min_agbno <= args->max_agbno);
1590 
1591 	/* clamp agbno to the range if it's outside */
1592 	if (args->agbno < args->min_agbno)
1593 		args->agbno = args->min_agbno;
1594 	if (args->agbno > args->max_agbno)
1595 		args->agbno = args->max_agbno;
1596 
1597 	/* Retry once quickly if we find busy extents before blocking. */
1598 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1599 restart:
1600 	len = 0;
1601 
1602 	/*
1603 	 * Set up cursors and see if there are any free extents as big as
1604 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1605 	 * empty.
1606 	 */
1607 	error = xfs_alloc_cur_setup(args, &acur);
1608 	if (error == -ENOSPC) {
1609 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1610 				&len, &i);
1611 		if (error)
1612 			goto out;
1613 		if (i == 0 || len == 0) {
1614 			trace_xfs_alloc_near_noentry(args);
1615 			goto out;
1616 		}
1617 		ASSERT(i == 1);
1618 	} else if (error) {
1619 		goto out;
1620 	}
1621 
1622 	/*
1623 	 * First algorithm.
1624 	 * If the requested extent is large wrt the freespaces available
1625 	 * in this a.g., then the cursor will be pointing to a btree entry
1626 	 * near the right edge of the tree.  If it's in the last btree leaf
1627 	 * block, then we just examine all the entries in that block
1628 	 * that are big enough, and pick the best one.
1629 	 */
1630 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1631 		bool		allocated = false;
1632 
1633 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1634 				&allocated);
1635 		if (error)
1636 			goto out;
1637 		if (allocated)
1638 			goto alloc_finish;
1639 	}
1640 
1641 	/*
1642 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1643 	 * locality.
1644 	 */
1645 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1646 	if (error)
1647 		goto out;
1648 
1649 	/*
1650 	 * If we couldn't get anything, give up.
1651 	 */
1652 	if (!acur.len) {
1653 		if (acur.busy) {
1654 			/*
1655 			 * Our only valid extents must have been busy. Flush and
1656 			 * retry the allocation again. If we get an -EAGAIN
1657 			 * error, we're being told that a deadlock was avoided
1658 			 * and the current transaction needs committing before
1659 			 * the allocation can be retried.
1660 			 */
1661 			trace_xfs_alloc_near_busy(args);
1662 			error = xfs_extent_busy_flush(args->tp, args->pag,
1663 					acur.busy_gen, alloc_flags);
1664 			if (error)
1665 				goto out;
1666 
1667 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1668 			goto restart;
1669 		}
1670 		trace_xfs_alloc_size_neither(args);
1671 		args->agbno = NULLAGBLOCK;
1672 		goto out;
1673 	}
1674 
1675 alloc_finish:
1676 	/* fix up btrees on a successful allocation */
1677 	error = xfs_alloc_cur_finish(args, &acur);
1678 
1679 out:
1680 	xfs_alloc_cur_close(&acur, error);
1681 	return error;
1682 }
1683 
1684 /*
1685  * Allocate a variable extent anywhere in the allocation group agno.
1686  * Extent's length (returned in len) will be between minlen and maxlen,
1687  * and of the form k * prod + mod unless there's nothing that large.
1688  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1689  */
1690 static int
1691 xfs_alloc_ag_vextent_size(
1692 	struct xfs_alloc_arg	*args,
1693 	uint32_t		alloc_flags)
1694 {
1695 	struct xfs_agf		*agf = args->agbp->b_addr;
1696 	struct xfs_btree_cur	*bno_cur;
1697 	struct xfs_btree_cur	*cnt_cur;
1698 	xfs_agblock_t		fbno;		/* start of found freespace */
1699 	xfs_extlen_t		flen;		/* length of found freespace */
1700 	xfs_agblock_t		rbno;		/* returned block number */
1701 	xfs_extlen_t		rlen;		/* length of returned extent */
1702 	bool			busy;
1703 	unsigned		busy_gen;
1704 	int			error;
1705 	int			i;
1706 
1707 	/* Retry once quickly if we find busy extents before blocking. */
1708 	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1709 restart:
1710 	/*
1711 	 * Allocate and initialize a cursor for the by-size btree.
1712 	 */
1713 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
1714 					args->pag);
1715 	bno_cur = NULL;
1716 
1717 	/*
1718 	 * Look for an entry >= maxlen+alignment-1 blocks.
1719 	 */
1720 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1721 			args->maxlen + args->alignment - 1, &i)))
1722 		goto error0;
1723 
1724 	/*
1725 	 * If none then we have to settle for a smaller extent. In the case that
1726 	 * there are no large extents, this will return the last entry in the
1727 	 * tree unless the tree is empty. In the case that there are only busy
1728 	 * large extents, this will return the largest small extent unless there
1729 	 * are no smaller extents available.
1730 	 */
1731 	if (!i) {
1732 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1733 						   &fbno, &flen, &i);
1734 		if (error)
1735 			goto error0;
1736 		if (i == 0 || flen == 0) {
1737 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1738 			trace_xfs_alloc_size_noentry(args);
1739 			return 0;
1740 		}
1741 		ASSERT(i == 1);
1742 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1743 				&rlen, &busy_gen);
1744 	} else {
1745 		/*
1746 		 * Search for a non-busy extent that is large enough.
1747 		 */
1748 		for (;;) {
1749 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1750 			if (error)
1751 				goto error0;
1752 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1753 				xfs_btree_mark_sick(cnt_cur);
1754 				error = -EFSCORRUPTED;
1755 				goto error0;
1756 			}
1757 
1758 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1759 					&rbno, &rlen, &busy_gen);
1760 
1761 			if (rlen >= args->maxlen)
1762 				break;
1763 
1764 			error = xfs_btree_increment(cnt_cur, 0, &i);
1765 			if (error)
1766 				goto error0;
1767 			if (i)
1768 				continue;
1769 
1770 			/*
1771 			 * Our only valid extents must have been busy. Flush and
1772 			 * retry the allocation again. If we get an -EAGAIN
1773 			 * error, we're being told that a deadlock was avoided
1774 			 * and the current transaction needs committing before
1775 			 * the allocation can be retried.
1776 			 */
1777 			trace_xfs_alloc_size_busy(args);
1778 			error = xfs_extent_busy_flush(args->tp, args->pag,
1779 					busy_gen, alloc_flags);
1780 			if (error)
1781 				goto error0;
1782 
1783 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1784 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1785 			goto restart;
1786 		}
1787 	}
1788 
1789 	/*
1790 	 * In the first case above, we got the last entry in the
1791 	 * by-size btree.  Now we check to see if the space hits maxlen
1792 	 * once aligned; if not, we search left for something better.
1793 	 * This can't happen in the second case above.
1794 	 */
1795 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1796 	if (XFS_IS_CORRUPT(args->mp,
1797 			   rlen != 0 &&
1798 			   (rlen > flen ||
1799 			    rbno + rlen > fbno + flen))) {
1800 		xfs_btree_mark_sick(cnt_cur);
1801 		error = -EFSCORRUPTED;
1802 		goto error0;
1803 	}
1804 	if (rlen < args->maxlen) {
1805 		xfs_agblock_t	bestfbno;
1806 		xfs_extlen_t	bestflen;
1807 		xfs_agblock_t	bestrbno;
1808 		xfs_extlen_t	bestrlen;
1809 
1810 		bestrlen = rlen;
1811 		bestrbno = rbno;
1812 		bestflen = flen;
1813 		bestfbno = fbno;
1814 		for (;;) {
1815 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1816 				goto error0;
1817 			if (i == 0)
1818 				break;
1819 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1820 					&i)))
1821 				goto error0;
1822 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1823 				xfs_btree_mark_sick(cnt_cur);
1824 				error = -EFSCORRUPTED;
1825 				goto error0;
1826 			}
1827 			if (flen < bestrlen)
1828 				break;
1829 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1830 					&rbno, &rlen, &busy_gen);
1831 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1832 			if (XFS_IS_CORRUPT(args->mp,
1833 					   rlen != 0 &&
1834 					   (rlen > flen ||
1835 					    rbno + rlen > fbno + flen))) {
1836 				xfs_btree_mark_sick(cnt_cur);
1837 				error = -EFSCORRUPTED;
1838 				goto error0;
1839 			}
1840 			if (rlen > bestrlen) {
1841 				bestrlen = rlen;
1842 				bestrbno = rbno;
1843 				bestflen = flen;
1844 				bestfbno = fbno;
1845 				if (rlen == args->maxlen)
1846 					break;
1847 			}
1848 		}
1849 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1850 				&i)))
1851 			goto error0;
1852 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1853 			xfs_btree_mark_sick(cnt_cur);
1854 			error = -EFSCORRUPTED;
1855 			goto error0;
1856 		}
1857 		rlen = bestrlen;
1858 		rbno = bestrbno;
1859 		flen = bestflen;
1860 		fbno = bestfbno;
1861 	}
1862 	args->wasfromfl = 0;
1863 	/*
1864 	 * Fix up the length.
1865 	 */
1866 	args->len = rlen;
1867 	if (rlen < args->minlen) {
1868 		if (busy) {
1869 			/*
1870 			 * Our only valid extents must have been busy. Flush and
1871 			 * retry the allocation again. If we get an -EAGAIN
1872 			 * error, we're being told that a deadlock was avoided
1873 			 * and the current transaction needs committing before
1874 			 * the allocation can be retried.
1875 			 */
1876 			trace_xfs_alloc_size_busy(args);
1877 			error = xfs_extent_busy_flush(args->tp, args->pag,
1878 					busy_gen, alloc_flags);
1879 			if (error)
1880 				goto error0;
1881 
1882 			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1883 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1884 			goto restart;
1885 		}
1886 		goto out_nominleft;
1887 	}
1888 	xfs_alloc_fix_len(args);
1889 
1890 	rlen = args->len;
1891 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1892 		xfs_btree_mark_sick(cnt_cur);
1893 		error = -EFSCORRUPTED;
1894 		goto error0;
1895 	}
1896 	/*
1897 	 * Allocate and initialize a cursor for the by-block tree.
1898 	 */
1899 	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
1900 					args->pag);
1901 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1902 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1903 		goto error0;
1904 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1905 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1906 	cnt_cur = bno_cur = NULL;
1907 	args->len = rlen;
1908 	args->agbno = rbno;
1909 	if (XFS_IS_CORRUPT(args->mp,
1910 			   args->agbno + args->len >
1911 			   be32_to_cpu(agf->agf_length))) {
1912 		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
1913 		error = -EFSCORRUPTED;
1914 		goto error0;
1915 	}
1916 	trace_xfs_alloc_size_done(args);
1917 	return 0;
1918 
1919 error0:
1920 	trace_xfs_alloc_size_error(args);
1921 	if (cnt_cur)
1922 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1923 	if (bno_cur)
1924 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1925 	return error;
1926 
1927 out_nominleft:
1928 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1929 	trace_xfs_alloc_size_nominleft(args);
1930 	args->agbno = NULLAGBLOCK;
1931 	return 0;
1932 }
1933 
1934 /*
1935  * Free the extent starting at agno/bno for length.
1936  */
1937 STATIC int
1938 xfs_free_ag_extent(
1939 	struct xfs_trans		*tp,
1940 	struct xfs_buf			*agbp,
1941 	xfs_agnumber_t			agno,
1942 	xfs_agblock_t			bno,
1943 	xfs_extlen_t			len,
1944 	const struct xfs_owner_info	*oinfo,
1945 	enum xfs_ag_resv_type		type)
1946 {
1947 	struct xfs_mount		*mp;
1948 	struct xfs_btree_cur		*bno_cur;
1949 	struct xfs_btree_cur		*cnt_cur;
1950 	xfs_agblock_t			gtbno; /* start of right neighbor */
1951 	xfs_extlen_t			gtlen; /* length of right neighbor */
1952 	xfs_agblock_t			ltbno; /* start of left neighbor */
1953 	xfs_extlen_t			ltlen; /* length of left neighbor */
1954 	xfs_agblock_t			nbno; /* new starting block of freesp */
1955 	xfs_extlen_t			nlen; /* new length of freespace */
1956 	int				haveleft; /* have a left neighbor */
1957 	int				haveright; /* have a right neighbor */
1958 	int				i;
1959 	int				error;
1960 	struct xfs_perag		*pag = agbp->b_pag;
1961 
1962 	bno_cur = cnt_cur = NULL;
1963 	mp = tp->t_mountp;
1964 
1965 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1966 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1967 		if (error)
1968 			goto error0;
1969 	}
1970 
1971 	/*
1972 	 * Allocate and initialize a cursor for the by-block btree.
1973 	 */
1974 	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
1975 	/*
1976 	 * Look for a neighboring block on the left (lower block numbers)
1977 	 * that is contiguous with this space.
1978 	 */
1979 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1980 		goto error0;
1981 	if (haveleft) {
1982 		/*
1983 		 * There is a block to our left.
1984 		 */
1985 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1986 			goto error0;
1987 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1988 			xfs_btree_mark_sick(bno_cur);
1989 			error = -EFSCORRUPTED;
1990 			goto error0;
1991 		}
1992 		/*
1993 		 * It's not contiguous, though.
1994 		 */
1995 		if (ltbno + ltlen < bno)
1996 			haveleft = 0;
1997 		else {
1998 			/*
1999 			 * If this failure happens the request to free this
2000 			 * space was invalid, it's (partly) already free.
2001 			 * Very bad.
2002 			 */
2003 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
2004 				xfs_btree_mark_sick(bno_cur);
2005 				error = -EFSCORRUPTED;
2006 				goto error0;
2007 			}
2008 		}
2009 	}
2010 	/*
2011 	 * Look for a neighboring block on the right (higher block numbers)
2012 	 * that is contiguous with this space.
2013 	 */
2014 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2015 		goto error0;
2016 	if (haveright) {
2017 		/*
2018 		 * There is a block to our right.
2019 		 */
2020 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2021 			goto error0;
2022 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2023 			xfs_btree_mark_sick(bno_cur);
2024 			error = -EFSCORRUPTED;
2025 			goto error0;
2026 		}
2027 		/*
2028 		 * It's not contiguous, though.
2029 		 */
2030 		if (bno + len < gtbno)
2031 			haveright = 0;
2032 		else {
2033 			/*
2034 			 * If this failure happens the request to free this
2035 			 * space was invalid, it's (partly) already free.
2036 			 * Very bad.
2037 			 */
2038 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
2039 				xfs_btree_mark_sick(bno_cur);
2040 				error = -EFSCORRUPTED;
2041 				goto error0;
2042 			}
2043 		}
2044 	}
2045 	/*
2046 	 * Now allocate and initialize a cursor for the by-size tree.
2047 	 */
2048 	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
2049 	/*
2050 	 * Have both left and right contiguous neighbors.
2051 	 * Merge all three into a single free block.
2052 	 */
2053 	if (haveleft && haveright) {
2054 		/*
2055 		 * Delete the old by-size entry on the left.
2056 		 */
2057 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2058 			goto error0;
2059 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2060 			xfs_btree_mark_sick(cnt_cur);
2061 			error = -EFSCORRUPTED;
2062 			goto error0;
2063 		}
2064 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2065 			goto error0;
2066 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2067 			xfs_btree_mark_sick(cnt_cur);
2068 			error = -EFSCORRUPTED;
2069 			goto error0;
2070 		}
2071 		/*
2072 		 * Delete the old by-size entry on the right.
2073 		 */
2074 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2075 			goto error0;
2076 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2077 			xfs_btree_mark_sick(cnt_cur);
2078 			error = -EFSCORRUPTED;
2079 			goto error0;
2080 		}
2081 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2082 			goto error0;
2083 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2084 			xfs_btree_mark_sick(cnt_cur);
2085 			error = -EFSCORRUPTED;
2086 			goto error0;
2087 		}
2088 		/*
2089 		 * Delete the old by-block entry for the right block.
2090 		 */
2091 		if ((error = xfs_btree_delete(bno_cur, &i)))
2092 			goto error0;
2093 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2094 			xfs_btree_mark_sick(bno_cur);
2095 			error = -EFSCORRUPTED;
2096 			goto error0;
2097 		}
2098 		/*
2099 		 * Move the by-block cursor back to the left neighbor.
2100 		 */
2101 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2102 			goto error0;
2103 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2104 			xfs_btree_mark_sick(bno_cur);
2105 			error = -EFSCORRUPTED;
2106 			goto error0;
2107 		}
2108 #ifdef DEBUG
2109 		/*
2110 		 * Check that this is the right record: delete didn't
2111 		 * mangle the cursor.
2112 		 */
2113 		{
2114 			xfs_agblock_t	xxbno;
2115 			xfs_extlen_t	xxlen;
2116 
2117 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2118 					&i)))
2119 				goto error0;
2120 			if (XFS_IS_CORRUPT(mp,
2121 					   i != 1 ||
2122 					   xxbno != ltbno ||
2123 					   xxlen != ltlen)) {
2124 				xfs_btree_mark_sick(bno_cur);
2125 				error = -EFSCORRUPTED;
2126 				goto error0;
2127 			}
2128 		}
2129 #endif
2130 		/*
2131 		 * Update remaining by-block entry to the new, joined block.
2132 		 */
2133 		nbno = ltbno;
2134 		nlen = len + ltlen + gtlen;
2135 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2136 			goto error0;
2137 	}
2138 	/*
2139 	 * Have only a left contiguous neighbor.
2140 	 * Merge it together with the new freespace.
2141 	 */
2142 	else if (haveleft) {
2143 		/*
2144 		 * Delete the old by-size entry on the left.
2145 		 */
2146 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2147 			goto error0;
2148 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2149 			xfs_btree_mark_sick(cnt_cur);
2150 			error = -EFSCORRUPTED;
2151 			goto error0;
2152 		}
2153 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2154 			goto error0;
2155 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2156 			xfs_btree_mark_sick(cnt_cur);
2157 			error = -EFSCORRUPTED;
2158 			goto error0;
2159 		}
2160 		/*
2161 		 * Back up the by-block cursor to the left neighbor, and
2162 		 * update its length.
2163 		 */
2164 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2165 			goto error0;
2166 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2167 			xfs_btree_mark_sick(bno_cur);
2168 			error = -EFSCORRUPTED;
2169 			goto error0;
2170 		}
2171 		nbno = ltbno;
2172 		nlen = len + ltlen;
2173 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2174 			goto error0;
2175 	}
2176 	/*
2177 	 * Have only a right contiguous neighbor.
2178 	 * Merge it together with the new freespace.
2179 	 */
2180 	else if (haveright) {
2181 		/*
2182 		 * Delete the old by-size entry on the right.
2183 		 */
2184 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2185 			goto error0;
2186 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2187 			xfs_btree_mark_sick(cnt_cur);
2188 			error = -EFSCORRUPTED;
2189 			goto error0;
2190 		}
2191 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2192 			goto error0;
2193 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2194 			xfs_btree_mark_sick(cnt_cur);
2195 			error = -EFSCORRUPTED;
2196 			goto error0;
2197 		}
2198 		/*
2199 		 * Update the starting block and length of the right
2200 		 * neighbor in the by-block tree.
2201 		 */
2202 		nbno = bno;
2203 		nlen = len + gtlen;
2204 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2205 			goto error0;
2206 	}
2207 	/*
2208 	 * No contiguous neighbors.
2209 	 * Insert the new freespace into the by-block tree.
2210 	 */
2211 	else {
2212 		nbno = bno;
2213 		nlen = len;
2214 		if ((error = xfs_btree_insert(bno_cur, &i)))
2215 			goto error0;
2216 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2217 			xfs_btree_mark_sick(bno_cur);
2218 			error = -EFSCORRUPTED;
2219 			goto error0;
2220 		}
2221 	}
2222 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2223 	bno_cur = NULL;
2224 	/*
2225 	 * In all cases we need to insert the new freespace in the by-size tree.
2226 	 */
2227 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2228 		goto error0;
2229 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2230 		xfs_btree_mark_sick(cnt_cur);
2231 		error = -EFSCORRUPTED;
2232 		goto error0;
2233 	}
2234 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2235 		goto error0;
2236 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2237 		xfs_btree_mark_sick(cnt_cur);
2238 		error = -EFSCORRUPTED;
2239 		goto error0;
2240 	}
2241 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2242 	cnt_cur = NULL;
2243 
2244 	/*
2245 	 * Update the freespace totals in the ag and superblock.
2246 	 */
2247 	error = xfs_alloc_update_counters(tp, agbp, len);
2248 	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2249 	if (error)
2250 		goto error0;
2251 
2252 	XFS_STATS_INC(mp, xs_freex);
2253 	XFS_STATS_ADD(mp, xs_freeb, len);
2254 
2255 	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2256 
2257 	return 0;
2258 
2259  error0:
2260 	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2261 	if (bno_cur)
2262 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2263 	if (cnt_cur)
2264 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2265 	return error;
2266 }
2267 
2268 /*
2269  * Visible (exported) allocation/free functions.
2270  * Some of these are used just by xfs_alloc_btree.c and this file.
2271  */
2272 
2273 /*
2274  * Compute and fill in value of m_alloc_maxlevels.
2275  */
2276 void
2277 xfs_alloc_compute_maxlevels(
2278 	xfs_mount_t	*mp)	/* file system mount structure */
2279 {
2280 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2281 			(mp->m_sb.sb_agblocks + 1) / 2);
2282 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2283 }
2284 
2285 /*
2286  * Find the length of the longest extent in an AG.  The 'need' parameter
2287  * specifies how much space we're going to need for the AGFL and the
2288  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2289  * other callers.
2290  */
2291 xfs_extlen_t
2292 xfs_alloc_longest_free_extent(
2293 	struct xfs_perag	*pag,
2294 	xfs_extlen_t		need,
2295 	xfs_extlen_t		reserved)
2296 {
2297 	xfs_extlen_t		delta = 0;
2298 
2299 	/*
2300 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2301 	 * longest extent.
2302 	 */
2303 	if (need > pag->pagf_flcount)
2304 		delta = need - pag->pagf_flcount;
2305 
2306 	/*
2307 	 * If we cannot maintain others' reservations with space from the
2308 	 * not-longest freesp extents, we'll have to subtract /that/ from
2309 	 * the longest extent too.
2310 	 */
2311 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2312 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2313 
2314 	/*
2315 	 * If the longest extent is long enough to satisfy all the
2316 	 * reservations and AGFL rules in place, we can return this extent.
2317 	 */
2318 	if (pag->pagf_longest > delta)
2319 		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2320 				pag->pagf_longest - delta);
2321 
2322 	/* Otherwise, let the caller try for 1 block if there's space. */
2323 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2324 }
2325 
2326 /*
2327  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2328  * return the largest possible minimum length.
2329  */
2330 unsigned int
2331 xfs_alloc_min_freelist(
2332 	struct xfs_mount	*mp,
2333 	struct xfs_perag	*pag)
2334 {
2335 	/* AG btrees have at least 1 level. */
2336 	const unsigned int	bno_level = pag ? pag->pagf_bno_level : 1;
2337 	const unsigned int	cnt_level = pag ? pag->pagf_cnt_level : 1;
2338 	const unsigned int	rmap_level = pag ? pag->pagf_rmap_level : 1;
2339 	unsigned int		min_free;
2340 
2341 	ASSERT(mp->m_alloc_maxlevels > 0);
2342 
2343 	/*
2344 	 * For a btree shorter than the maximum height, the worst case is that
2345 	 * every level gets split and a new level is added, then while inserting
2346 	 * another entry to refill the AGFL, every level under the old root gets
2347 	 * split again. This is:
2348 	 *
2349 	 *   (full height split reservation) + (AGFL refill split height)
2350 	 * = (current height + 1) + (current height - 1)
2351 	 * = (new height) + (new height - 2)
2352 	 * = 2 * new height - 2
2353 	 *
2354 	 * For a btree of maximum height, the worst case is that every level
2355 	 * under the root gets split, then while inserting another entry to
2356 	 * refill the AGFL, every level under the root gets split again. This is
2357 	 * also:
2358 	 *
2359 	 *   2 * (current height - 1)
2360 	 * = 2 * (new height - 1)
2361 	 * = 2 * new height - 2
2362 	 */
2363 
2364 	/* space needed by-bno freespace btree */
2365 	min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2366 	/* space needed by-size freespace btree */
2367 	min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
2368 	/* space needed reverse mapping used space btree */
2369 	if (xfs_has_rmapbt(mp))
2370 		min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
2371 	return min_free;
2372 }
2373 
2374 /*
2375  * Check if the operation we are fixing up the freelist for should go ahead or
2376  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2377  * is dependent on whether the size and shape of free space available will
2378  * permit the requested allocation to take place.
2379  */
2380 static bool
2381 xfs_alloc_space_available(
2382 	struct xfs_alloc_arg	*args,
2383 	xfs_extlen_t		min_free,
2384 	int			flags)
2385 {
2386 	struct xfs_perag	*pag = args->pag;
2387 	xfs_extlen_t		alloc_len, longest;
2388 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2389 	int			available;
2390 	xfs_extlen_t		agflcount;
2391 
2392 	if (flags & XFS_ALLOC_FLAG_FREEING)
2393 		return true;
2394 
2395 	reservation = xfs_ag_resv_needed(pag, args->resv);
2396 
2397 	/* do we have enough contiguous free space for the allocation? */
2398 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2399 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2400 	if (longest < alloc_len)
2401 		return false;
2402 
2403 	/*
2404 	 * Do we have enough free space remaining for the allocation? Don't
2405 	 * account extra agfl blocks because we are about to defer free them,
2406 	 * making them unavailable until the current transaction commits.
2407 	 */
2408 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2409 	available = (int)(pag->pagf_freeblks + agflcount -
2410 			  reservation - min_free - args->minleft);
2411 	if (available < (int)max(args->total, alloc_len))
2412 		return false;
2413 
2414 	/*
2415 	 * Clamp maxlen to the amount of free space available for the actual
2416 	 * extent allocation.
2417 	 */
2418 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2419 		args->maxlen = available;
2420 		ASSERT(args->maxlen > 0);
2421 		ASSERT(args->maxlen >= args->minlen);
2422 	}
2423 
2424 	return true;
2425 }
2426 
2427 int
2428 xfs_free_agfl_block(
2429 	struct xfs_trans	*tp,
2430 	xfs_agnumber_t		agno,
2431 	xfs_agblock_t		agbno,
2432 	struct xfs_buf		*agbp,
2433 	struct xfs_owner_info	*oinfo)
2434 {
2435 	int			error;
2436 	struct xfs_buf		*bp;
2437 
2438 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2439 				   XFS_AG_RESV_AGFL);
2440 	if (error)
2441 		return error;
2442 
2443 	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2444 			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2445 			tp->t_mountp->m_bsize, 0, &bp);
2446 	if (error)
2447 		return error;
2448 	xfs_trans_binval(tp, bp);
2449 
2450 	return 0;
2451 }
2452 
2453 /*
2454  * Check the agfl fields of the agf for inconsistency or corruption.
2455  *
2456  * The original purpose was to detect an agfl header padding mismatch between
2457  * current and early v5 kernels. This problem manifests as a 1-slot size
2458  * difference between the on-disk flcount and the active [first, last] range of
2459  * a wrapped agfl.
2460  *
2461  * However, we need to use these same checks to catch agfl count corruptions
2462  * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2463  * way, we need to reset the agfl and warn the user.
2464  *
2465  * Return true if a reset is required before the agfl can be used, false
2466  * otherwise.
2467  */
2468 static bool
2469 xfs_agfl_needs_reset(
2470 	struct xfs_mount	*mp,
2471 	struct xfs_agf		*agf)
2472 {
2473 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2474 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2475 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2476 	int			agfl_size = xfs_agfl_size(mp);
2477 	int			active;
2478 
2479 	/*
2480 	 * The agf read verifier catches severe corruption of these fields.
2481 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2482 	 * the verifier allows it.
2483 	 */
2484 	if (f >= agfl_size || l >= agfl_size)
2485 		return true;
2486 	if (c > agfl_size)
2487 		return true;
2488 
2489 	/*
2490 	 * Check consistency between the on-disk count and the active range. An
2491 	 * agfl padding mismatch manifests as an inconsistent flcount.
2492 	 */
2493 	if (c && l >= f)
2494 		active = l - f + 1;
2495 	else if (c)
2496 		active = agfl_size - f + l + 1;
2497 	else
2498 		active = 0;
2499 
2500 	return active != c;
2501 }
2502 
2503 /*
2504  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2505  * agfl content cannot be trusted. Warn the user that a repair is required to
2506  * recover leaked blocks.
2507  *
2508  * The purpose of this mechanism is to handle filesystems affected by the agfl
2509  * header padding mismatch problem. A reset keeps the filesystem online with a
2510  * relatively minor free space accounting inconsistency rather than suffer the
2511  * inevitable crash from use of an invalid agfl block.
2512  */
2513 static void
2514 xfs_agfl_reset(
2515 	struct xfs_trans	*tp,
2516 	struct xfs_buf		*agbp,
2517 	struct xfs_perag	*pag)
2518 {
2519 	struct xfs_mount	*mp = tp->t_mountp;
2520 	struct xfs_agf		*agf = agbp->b_addr;
2521 
2522 	ASSERT(xfs_perag_agfl_needs_reset(pag));
2523 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2524 
2525 	xfs_warn(mp,
2526 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2527 	       "Please unmount and run xfs_repair.",
2528 	         pag->pag_agno, pag->pagf_flcount);
2529 
2530 	agf->agf_flfirst = 0;
2531 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2532 	agf->agf_flcount = 0;
2533 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2534 				    XFS_AGF_FLCOUNT);
2535 
2536 	pag->pagf_flcount = 0;
2537 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2538 }
2539 
2540 /*
2541  * Defer an AGFL block free. This is effectively equivalent to
2542  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2543  *
2544  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2545  * allocation operations in a transaction. AGFL frees are prone to this problem
2546  * because for one they are always freed one at a time. Further, an immediate
2547  * AGFL block free can cause a btree join and require another block free before
2548  * the real allocation can proceed. Deferring the free disconnects freeing up
2549  * the AGFL slot from freeing the block.
2550  */
2551 static int
2552 xfs_defer_agfl_block(
2553 	struct xfs_trans		*tp,
2554 	xfs_agnumber_t			agno,
2555 	xfs_agblock_t			agbno,
2556 	struct xfs_owner_info		*oinfo)
2557 {
2558 	struct xfs_mount		*mp = tp->t_mountp;
2559 	struct xfs_extent_free_item	*xefi;
2560 	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
2561 
2562 	ASSERT(xfs_extfree_item_cache != NULL);
2563 	ASSERT(oinfo != NULL);
2564 
2565 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
2566 		return -EFSCORRUPTED;
2567 
2568 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2569 			       GFP_KERNEL | __GFP_NOFAIL);
2570 	xefi->xefi_startblock = fsbno;
2571 	xefi->xefi_blockcount = 1;
2572 	xefi->xefi_owner = oinfo->oi_owner;
2573 	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
2574 
2575 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2576 
2577 	xfs_extent_free_get_group(mp, xefi);
2578 	xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
2579 	return 0;
2580 }
2581 
2582 /*
2583  * Add the extent to the list of extents to be free at transaction end.
2584  * The list is maintained sorted (by block number).
2585  */
2586 static int
2587 xfs_defer_extent_free(
2588 	struct xfs_trans		*tp,
2589 	xfs_fsblock_t			bno,
2590 	xfs_filblks_t			len,
2591 	const struct xfs_owner_info	*oinfo,
2592 	enum xfs_ag_resv_type		type,
2593 	bool				skip_discard,
2594 	struct xfs_defer_pending	**dfpp)
2595 {
2596 	struct xfs_extent_free_item	*xefi;
2597 	struct xfs_mount		*mp = tp->t_mountp;
2598 #ifdef DEBUG
2599 	xfs_agnumber_t			agno;
2600 	xfs_agblock_t			agbno;
2601 
2602 	ASSERT(bno != NULLFSBLOCK);
2603 	ASSERT(len > 0);
2604 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2605 	ASSERT(!isnullstartblock(bno));
2606 	agno = XFS_FSB_TO_AGNO(mp, bno);
2607 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2608 	ASSERT(agno < mp->m_sb.sb_agcount);
2609 	ASSERT(agbno < mp->m_sb.sb_agblocks);
2610 	ASSERT(len < mp->m_sb.sb_agblocks);
2611 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2612 #endif
2613 	ASSERT(xfs_extfree_item_cache != NULL);
2614 	ASSERT(type != XFS_AG_RESV_AGFL);
2615 
2616 	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2617 		return -EFSCORRUPTED;
2618 
2619 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2620 			       GFP_KERNEL | __GFP_NOFAIL);
2621 	xefi->xefi_startblock = bno;
2622 	xefi->xefi_blockcount = (xfs_extlen_t)len;
2623 	xefi->xefi_agresv = type;
2624 	if (skip_discard)
2625 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2626 	if (oinfo) {
2627 		ASSERT(oinfo->oi_offset == 0);
2628 
2629 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2630 			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2631 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2632 			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2633 		xefi->xefi_owner = oinfo->oi_owner;
2634 	} else {
2635 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2636 	}
2637 	trace_xfs_bmap_free_defer(mp,
2638 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2639 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2640 
2641 	xfs_extent_free_get_group(mp, xefi);
2642 	*dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
2643 	return 0;
2644 }
2645 
2646 int
2647 xfs_free_extent_later(
2648 	struct xfs_trans		*tp,
2649 	xfs_fsblock_t			bno,
2650 	xfs_filblks_t			len,
2651 	const struct xfs_owner_info	*oinfo,
2652 	enum xfs_ag_resv_type		type,
2653 	bool				skip_discard)
2654 {
2655 	struct xfs_defer_pending	*dontcare = NULL;
2656 
2657 	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
2658 			&dontcare);
2659 }
2660 
2661 /*
2662  * Set up automatic freeing of unwritten space in the filesystem.
2663  *
2664  * This function attached a paused deferred extent free item to the
2665  * transaction.  Pausing means that the EFI will be logged in the next
2666  * transaction commit, but the pending EFI will not be finished until the
2667  * pending item is unpaused.
2668  *
2669  * If the system goes down after the EFI has been persisted to the log but
2670  * before the pending item is unpaused, log recovery will find the EFI, fail to
2671  * find the EFD, and free the space.
2672  *
2673  * If the pending item is unpaused, the next transaction commit will log an EFD
2674  * without freeing the space.
2675  *
2676  * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
2677  * @args structure are set to the relevant values.
2678  */
2679 int
2680 xfs_alloc_schedule_autoreap(
2681 	const struct xfs_alloc_arg	*args,
2682 	bool				skip_discard,
2683 	struct xfs_alloc_autoreap	*aarp)
2684 {
2685 	int				error;
2686 
2687 	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
2688 			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
2689 	if (error)
2690 		return error;
2691 
2692 	xfs_defer_item_pause(args->tp, aarp->dfp);
2693 	return 0;
2694 }
2695 
2696 /*
2697  * Cancel automatic freeing of unwritten space in the filesystem.
2698  *
2699  * Earlier, we created a paused deferred extent free item and attached it to
2700  * this transaction so that we could automatically roll back a new space
2701  * allocation if the system went down.  Now we want to cancel the paused work
2702  * item by marking the EFI stale so we don't actually free the space, unpausing
2703  * the pending item and logging an EFD.
2704  *
2705  * The caller generally should have already mapped the space into the ondisk
2706  * filesystem.  If the reserved space was partially used, the caller must call
2707  * xfs_free_extent_later to create a new EFI to free the unused space.
2708  */
2709 void
2710 xfs_alloc_cancel_autoreap(
2711 	struct xfs_trans		*tp,
2712 	struct xfs_alloc_autoreap	*aarp)
2713 {
2714 	struct xfs_defer_pending	*dfp = aarp->dfp;
2715 	struct xfs_extent_free_item	*xefi;
2716 
2717 	if (!dfp)
2718 		return;
2719 
2720 	list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
2721 		xefi->xefi_flags |= XFS_EFI_CANCELLED;
2722 
2723 	xfs_defer_item_unpause(tp, dfp);
2724 }
2725 
2726 /*
2727  * Commit automatic freeing of unwritten space in the filesystem.
2728  *
2729  * This unpauses an earlier _schedule_autoreap and commits to freeing the
2730  * allocated space.  Call this if none of the reserved space was used.
2731  */
2732 void
2733 xfs_alloc_commit_autoreap(
2734 	struct xfs_trans		*tp,
2735 	struct xfs_alloc_autoreap	*aarp)
2736 {
2737 	if (aarp->dfp)
2738 		xfs_defer_item_unpause(tp, aarp->dfp);
2739 }
2740 
2741 #ifdef DEBUG
2742 /*
2743  * Check if an AGF has a free extent record whose length is equal to
2744  * args->minlen.
2745  */
2746 STATIC int
2747 xfs_exact_minlen_extent_available(
2748 	struct xfs_alloc_arg	*args,
2749 	struct xfs_buf		*agbp,
2750 	int			*stat)
2751 {
2752 	struct xfs_btree_cur	*cnt_cur;
2753 	xfs_agblock_t		fbno;
2754 	xfs_extlen_t		flen;
2755 	int			error = 0;
2756 
2757 	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
2758 					args->pag);
2759 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2760 	if (error)
2761 		goto out;
2762 
2763 	if (*stat == 0) {
2764 		xfs_btree_mark_sick(cnt_cur);
2765 		error = -EFSCORRUPTED;
2766 		goto out;
2767 	}
2768 
2769 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2770 	if (error)
2771 		goto out;
2772 
2773 	if (*stat == 1 && flen != args->minlen)
2774 		*stat = 0;
2775 
2776 out:
2777 	xfs_btree_del_cursor(cnt_cur, error);
2778 
2779 	return error;
2780 }
2781 #endif
2782 
2783 /*
2784  * Decide whether to use this allocation group for this allocation.
2785  * If so, fix up the btree freelist's size.
2786  */
2787 int			/* error */
2788 xfs_alloc_fix_freelist(
2789 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2790 	uint32_t		alloc_flags)
2791 {
2792 	struct xfs_mount	*mp = args->mp;
2793 	struct xfs_perag	*pag = args->pag;
2794 	struct xfs_trans	*tp = args->tp;
2795 	struct xfs_buf		*agbp = NULL;
2796 	struct xfs_buf		*agflbp = NULL;
2797 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2798 	xfs_agblock_t		bno;	/* freelist block */
2799 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2800 	int			error = 0;
2801 
2802 	/* deferred ops (AGFL block frees) require permanent transactions */
2803 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2804 
2805 	if (!xfs_perag_initialised_agf(pag)) {
2806 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2807 		if (error) {
2808 			/* Couldn't lock the AGF so skip this AG. */
2809 			if (error == -EAGAIN)
2810 				error = 0;
2811 			goto out_no_agbp;
2812 		}
2813 	}
2814 
2815 	/*
2816 	 * If this is a metadata preferred pag and we are user data then try
2817 	 * somewhere else if we are not being asked to try harder at this
2818 	 * point
2819 	 */
2820 	if (xfs_perag_prefers_metadata(pag) &&
2821 	    (args->datatype & XFS_ALLOC_USERDATA) &&
2822 	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2823 		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
2824 		goto out_agbp_relse;
2825 	}
2826 
2827 	need = xfs_alloc_min_freelist(mp, pag);
2828 	if (!xfs_alloc_space_available(args, need, alloc_flags |
2829 			XFS_ALLOC_FLAG_CHECK))
2830 		goto out_agbp_relse;
2831 
2832 	/*
2833 	 * Get the a.g. freespace buffer.
2834 	 * Can fail if we're not blocking on locks, and it's held.
2835 	 */
2836 	if (!agbp) {
2837 		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2838 		if (error) {
2839 			/* Couldn't lock the AGF so skip this AG. */
2840 			if (error == -EAGAIN)
2841 				error = 0;
2842 			goto out_no_agbp;
2843 		}
2844 	}
2845 
2846 	/* reset a padding mismatched agfl before final free space check */
2847 	if (xfs_perag_agfl_needs_reset(pag))
2848 		xfs_agfl_reset(tp, agbp, pag);
2849 
2850 	/* If there isn't enough total space or single-extent, reject it. */
2851 	need = xfs_alloc_min_freelist(mp, pag);
2852 	if (!xfs_alloc_space_available(args, need, alloc_flags))
2853 		goto out_agbp_relse;
2854 
2855 #ifdef DEBUG
2856 	if (args->alloc_minlen_only) {
2857 		int stat;
2858 
2859 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2860 		if (error || !stat)
2861 			goto out_agbp_relse;
2862 	}
2863 #endif
2864 	/*
2865 	 * Make the freelist shorter if it's too long.
2866 	 *
2867 	 * Note that from this point onwards, we will always release the agf and
2868 	 * agfl buffers on error. This handles the case where we error out and
2869 	 * the buffers are clean or may not have been joined to the transaction
2870 	 * and hence need to be released manually. If they have been joined to
2871 	 * the transaction, then xfs_trans_brelse() will handle them
2872 	 * appropriately based on the recursion count and dirty state of the
2873 	 * buffer.
2874 	 *
2875 	 * XXX (dgc): When we have lots of free space, does this buy us
2876 	 * anything other than extra overhead when we need to put more blocks
2877 	 * back on the free list? Maybe we should only do this when space is
2878 	 * getting low or the AGFL is more than half full?
2879 	 *
2880 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2881 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2882 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2883 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2884 	 * both required to ensure that rmaps are correctly recorded for the
2885 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2886 	 * repair/rmap.c in xfsprogs for details.
2887 	 */
2888 	memset(&targs, 0, sizeof(targs));
2889 	/* struct copy below */
2890 	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
2891 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2892 	else
2893 		targs.oinfo = XFS_RMAP_OINFO_AG;
2894 	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2895 			pag->pagf_flcount > need) {
2896 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2897 		if (error)
2898 			goto out_agbp_relse;
2899 
2900 		/* defer agfl frees */
2901 		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2902 		if (error)
2903 			goto out_agbp_relse;
2904 	}
2905 
2906 	targs.tp = tp;
2907 	targs.mp = mp;
2908 	targs.agbp = agbp;
2909 	targs.agno = args->agno;
2910 	targs.alignment = targs.minlen = targs.prod = 1;
2911 	targs.pag = pag;
2912 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2913 	if (error)
2914 		goto out_agbp_relse;
2915 
2916 	/* Make the freelist longer if it's too short. */
2917 	while (pag->pagf_flcount < need) {
2918 		targs.agbno = 0;
2919 		targs.maxlen = need - pag->pagf_flcount;
2920 		targs.resv = XFS_AG_RESV_AGFL;
2921 
2922 		/* Allocate as many blocks as possible at once. */
2923 		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
2924 		if (error)
2925 			goto out_agflbp_relse;
2926 
2927 		/*
2928 		 * Stop if we run out.  Won't happen if callers are obeying
2929 		 * the restrictions correctly.  Can happen for free calls
2930 		 * on a completely full ag.
2931 		 */
2932 		if (targs.agbno == NULLAGBLOCK) {
2933 			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
2934 				break;
2935 			goto out_agflbp_relse;
2936 		}
2937 
2938 		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2939 			error = xfs_rmap_alloc(tp, agbp, pag,
2940 				       targs.agbno, targs.len, &targs.oinfo);
2941 			if (error)
2942 				goto out_agflbp_relse;
2943 		}
2944 		error = xfs_alloc_update_counters(tp, agbp,
2945 						  -((long)(targs.len)));
2946 		if (error)
2947 			goto out_agflbp_relse;
2948 
2949 		/*
2950 		 * Put each allocated block on the list.
2951 		 */
2952 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2953 			error = xfs_alloc_put_freelist(pag, tp, agbp,
2954 							agflbp, bno, 0);
2955 			if (error)
2956 				goto out_agflbp_relse;
2957 		}
2958 	}
2959 	xfs_trans_brelse(tp, agflbp);
2960 	args->agbp = agbp;
2961 	return 0;
2962 
2963 out_agflbp_relse:
2964 	xfs_trans_brelse(tp, agflbp);
2965 out_agbp_relse:
2966 	if (agbp)
2967 		xfs_trans_brelse(tp, agbp);
2968 out_no_agbp:
2969 	args->agbp = NULL;
2970 	return error;
2971 }
2972 
2973 /*
2974  * Get a block from the freelist.
2975  * Returns with the buffer for the block gotten.
2976  */
2977 int
2978 xfs_alloc_get_freelist(
2979 	struct xfs_perag	*pag,
2980 	struct xfs_trans	*tp,
2981 	struct xfs_buf		*agbp,
2982 	xfs_agblock_t		*bnop,
2983 	int			btreeblk)
2984 {
2985 	struct xfs_agf		*agf = agbp->b_addr;
2986 	struct xfs_buf		*agflbp;
2987 	xfs_agblock_t		bno;
2988 	__be32			*agfl_bno;
2989 	int			error;
2990 	uint32_t		logflags;
2991 	struct xfs_mount	*mp = tp->t_mountp;
2992 
2993 	/*
2994 	 * Freelist is empty, give up.
2995 	 */
2996 	if (!agf->agf_flcount) {
2997 		*bnop = NULLAGBLOCK;
2998 		return 0;
2999 	}
3000 	/*
3001 	 * Read the array of free blocks.
3002 	 */
3003 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3004 	if (error)
3005 		return error;
3006 
3007 
3008 	/*
3009 	 * Get the block number and update the data structures.
3010 	 */
3011 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3012 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
3013 	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
3014 		return -EFSCORRUPTED;
3015 
3016 	be32_add_cpu(&agf->agf_flfirst, 1);
3017 	xfs_trans_brelse(tp, agflbp);
3018 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
3019 		agf->agf_flfirst = 0;
3020 
3021 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3022 	be32_add_cpu(&agf->agf_flcount, -1);
3023 	pag->pagf_flcount--;
3024 
3025 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
3026 	if (btreeblk) {
3027 		be32_add_cpu(&agf->agf_btreeblks, 1);
3028 		pag->pagf_btreeblks++;
3029 		logflags |= XFS_AGF_BTREEBLKS;
3030 	}
3031 
3032 	xfs_alloc_log_agf(tp, agbp, logflags);
3033 	*bnop = bno;
3034 
3035 	return 0;
3036 }
3037 
3038 /*
3039  * Log the given fields from the agf structure.
3040  */
3041 void
3042 xfs_alloc_log_agf(
3043 	struct xfs_trans	*tp,
3044 	struct xfs_buf		*bp,
3045 	uint32_t		fields)
3046 {
3047 	int	first;		/* first byte offset */
3048 	int	last;		/* last byte offset */
3049 	static const short	offsets[] = {
3050 		offsetof(xfs_agf_t, agf_magicnum),
3051 		offsetof(xfs_agf_t, agf_versionnum),
3052 		offsetof(xfs_agf_t, agf_seqno),
3053 		offsetof(xfs_agf_t, agf_length),
3054 		offsetof(xfs_agf_t, agf_bno_root),   /* also cnt/rmap root */
3055 		offsetof(xfs_agf_t, agf_bno_level),  /* also cnt/rmap levels */
3056 		offsetof(xfs_agf_t, agf_flfirst),
3057 		offsetof(xfs_agf_t, agf_fllast),
3058 		offsetof(xfs_agf_t, agf_flcount),
3059 		offsetof(xfs_agf_t, agf_freeblks),
3060 		offsetof(xfs_agf_t, agf_longest),
3061 		offsetof(xfs_agf_t, agf_btreeblks),
3062 		offsetof(xfs_agf_t, agf_uuid),
3063 		offsetof(xfs_agf_t, agf_rmap_blocks),
3064 		offsetof(xfs_agf_t, agf_refcount_blocks),
3065 		offsetof(xfs_agf_t, agf_refcount_root),
3066 		offsetof(xfs_agf_t, agf_refcount_level),
3067 		/* needed so that we don't log the whole rest of the structure: */
3068 		offsetof(xfs_agf_t, agf_spare64),
3069 		sizeof(xfs_agf_t)
3070 	};
3071 
3072 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
3073 
3074 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
3075 
3076 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
3077 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
3078 }
3079 
3080 /*
3081  * Put the block on the freelist for the allocation group.
3082  */
3083 int
3084 xfs_alloc_put_freelist(
3085 	struct xfs_perag	*pag,
3086 	struct xfs_trans	*tp,
3087 	struct xfs_buf		*agbp,
3088 	struct xfs_buf		*agflbp,
3089 	xfs_agblock_t		bno,
3090 	int			btreeblk)
3091 {
3092 	struct xfs_mount	*mp = tp->t_mountp;
3093 	struct xfs_agf		*agf = agbp->b_addr;
3094 	__be32			*blockp;
3095 	int			error;
3096 	uint32_t		logflags;
3097 	__be32			*agfl_bno;
3098 	int			startoff;
3099 
3100 	if (!agflbp) {
3101 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
3102 		if (error)
3103 			return error;
3104 	}
3105 
3106 	be32_add_cpu(&agf->agf_fllast, 1);
3107 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
3108 		agf->agf_fllast = 0;
3109 
3110 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
3111 	be32_add_cpu(&agf->agf_flcount, 1);
3112 	pag->pagf_flcount++;
3113 
3114 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
3115 	if (btreeblk) {
3116 		be32_add_cpu(&agf->agf_btreeblks, -1);
3117 		pag->pagf_btreeblks--;
3118 		logflags |= XFS_AGF_BTREEBLKS;
3119 	}
3120 
3121 	xfs_alloc_log_agf(tp, agbp, logflags);
3122 
3123 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
3124 
3125 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3126 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
3127 	*blockp = cpu_to_be32(bno);
3128 	startoff = (char *)blockp - (char *)agflbp->b_addr;
3129 
3130 	xfs_alloc_log_agf(tp, agbp, logflags);
3131 
3132 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
3133 	xfs_trans_log_buf(tp, agflbp, startoff,
3134 			  startoff + sizeof(xfs_agblock_t) - 1);
3135 	return 0;
3136 }
3137 
3138 /*
3139  * Check that this AGF/AGI header's sequence number and length matches the AG
3140  * number and size in fsblocks.
3141  */
3142 xfs_failaddr_t
3143 xfs_validate_ag_length(
3144 	struct xfs_buf		*bp,
3145 	uint32_t		seqno,
3146 	uint32_t		length)
3147 {
3148 	struct xfs_mount	*mp = bp->b_mount;
3149 	/*
3150 	 * During growfs operations, the perag is not fully initialised,
3151 	 * so we can't use it for any useful checking. growfs ensures we can't
3152 	 * use it by using uncached buffers that don't have the perag attached
3153 	 * so we can detect and avoid this problem.
3154 	 */
3155 	if (bp->b_pag && seqno != bp->b_pag->pag_agno)
3156 		return __this_address;
3157 
3158 	/*
3159 	 * Only the last AG in the filesystem is allowed to be shorter
3160 	 * than the AG size recorded in the superblock.
3161 	 */
3162 	if (length != mp->m_sb.sb_agblocks) {
3163 		/*
3164 		 * During growfs, the new last AG can get here before we
3165 		 * have updated the superblock. Give it a pass on the seqno
3166 		 * check.
3167 		 */
3168 		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3169 			return __this_address;
3170 		if (length < XFS_MIN_AG_BLOCKS)
3171 			return __this_address;
3172 		if (length > mp->m_sb.sb_agblocks)
3173 			return __this_address;
3174 	}
3175 
3176 	return NULL;
3177 }
3178 
3179 /*
3180  * Verify the AGF is consistent.
3181  *
3182  * We do not verify the AGFL indexes in the AGF are fully consistent here
3183  * because of issues with variable on-disk structure sizes. Instead, we check
3184  * the agfl indexes for consistency when we initialise the perag from the AGF
3185  * information after a read completes.
3186  *
3187  * If the index is inconsistent, then we mark the perag as needing an AGFL
3188  * reset. The first AGFL update performed then resets the AGFL indexes and
3189  * refills the AGFL with known good free blocks, allowing the filesystem to
3190  * continue operating normally at the cost of a few leaked free space blocks.
3191  */
3192 static xfs_failaddr_t
3193 xfs_agf_verify(
3194 	struct xfs_buf		*bp)
3195 {
3196 	struct xfs_mount	*mp = bp->b_mount;
3197 	struct xfs_agf		*agf = bp->b_addr;
3198 	xfs_failaddr_t		fa;
3199 	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
3200 	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
3201 
3202 	if (xfs_has_crc(mp)) {
3203 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
3204 			return __this_address;
3205 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
3206 			return __this_address;
3207 	}
3208 
3209 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
3210 		return __this_address;
3211 
3212 	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
3213 		return __this_address;
3214 
3215 	/*
3216 	 * Both agf_seqno and agf_length need to validated before anything else
3217 	 * block number related in the AGF or AGFL can be checked.
3218 	 */
3219 	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3220 	if (fa)
3221 		return fa;
3222 
3223 	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3224 		return __this_address;
3225 	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3226 		return __this_address;
3227 	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
3228 		return __this_address;
3229 
3230 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
3231 	    be32_to_cpu(agf->agf_freeblks) > agf_length)
3232 		return __this_address;
3233 
3234 	if (be32_to_cpu(agf->agf_bno_level) < 1 ||
3235 	    be32_to_cpu(agf->agf_cnt_level) < 1 ||
3236 	    be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
3237 	    be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
3238 		return __this_address;
3239 
3240 	if (xfs_has_lazysbcount(mp) &&
3241 	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
3242 		return __this_address;
3243 
3244 	if (xfs_has_rmapbt(mp)) {
3245 		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3246 			return __this_address;
3247 
3248 		if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
3249 		    be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
3250 			return __this_address;
3251 	}
3252 
3253 	if (xfs_has_reflink(mp)) {
3254 		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3255 			return __this_address;
3256 
3257 		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3258 		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3259 			return __this_address;
3260 	}
3261 
3262 	return NULL;
3263 }
3264 
3265 static void
3266 xfs_agf_read_verify(
3267 	struct xfs_buf	*bp)
3268 {
3269 	struct xfs_mount *mp = bp->b_mount;
3270 	xfs_failaddr_t	fa;
3271 
3272 	if (xfs_has_crc(mp) &&
3273 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3274 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3275 	else {
3276 		fa = xfs_agf_verify(bp);
3277 		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3278 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3279 	}
3280 }
3281 
3282 static void
3283 xfs_agf_write_verify(
3284 	struct xfs_buf	*bp)
3285 {
3286 	struct xfs_mount	*mp = bp->b_mount;
3287 	struct xfs_buf_log_item	*bip = bp->b_log_item;
3288 	struct xfs_agf		*agf = bp->b_addr;
3289 	xfs_failaddr_t		fa;
3290 
3291 	fa = xfs_agf_verify(bp);
3292 	if (fa) {
3293 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3294 		return;
3295 	}
3296 
3297 	if (!xfs_has_crc(mp))
3298 		return;
3299 
3300 	if (bip)
3301 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3302 
3303 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3304 }
3305 
3306 const struct xfs_buf_ops xfs_agf_buf_ops = {
3307 	.name = "xfs_agf",
3308 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3309 	.verify_read = xfs_agf_read_verify,
3310 	.verify_write = xfs_agf_write_verify,
3311 	.verify_struct = xfs_agf_verify,
3312 };
3313 
3314 /*
3315  * Read in the allocation group header (free/alloc section).
3316  */
3317 int
3318 xfs_read_agf(
3319 	struct xfs_perag	*pag,
3320 	struct xfs_trans	*tp,
3321 	int			flags,
3322 	struct xfs_buf		**agfbpp)
3323 {
3324 	struct xfs_mount	*mp = pag->pag_mount;
3325 	int			error;
3326 
3327 	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3328 
3329 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3330 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3331 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3332 	if (xfs_metadata_is_sick(error))
3333 		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
3334 	if (error)
3335 		return error;
3336 
3337 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3338 	return 0;
3339 }
3340 
3341 /*
3342  * Read in the allocation group header (free/alloc section) and initialise the
3343  * perag structure if necessary. If the caller provides @agfbpp, then return the
3344  * locked buffer to the caller, otherwise free it.
3345  */
3346 int
3347 xfs_alloc_read_agf(
3348 	struct xfs_perag	*pag,
3349 	struct xfs_trans	*tp,
3350 	int			flags,
3351 	struct xfs_buf		**agfbpp)
3352 {
3353 	struct xfs_buf		*agfbp;
3354 	struct xfs_agf		*agf;
3355 	int			error;
3356 	int			allocbt_blks;
3357 
3358 	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3359 
3360 	/* We don't support trylock when freeing. */
3361 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3362 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3363 	error = xfs_read_agf(pag, tp,
3364 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3365 			&agfbp);
3366 	if (error)
3367 		return error;
3368 
3369 	agf = agfbp->b_addr;
3370 	if (!xfs_perag_initialised_agf(pag)) {
3371 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3372 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3373 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3374 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3375 		pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
3376 		pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
3377 		pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
3378 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3379 		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3380 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3381 		else
3382 			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3383 
3384 		/*
3385 		 * Update the in-core allocbt counter. Filter out the rmapbt
3386 		 * subset of the btreeblks counter because the rmapbt is managed
3387 		 * by perag reservation. Subtract one for the rmapbt root block
3388 		 * because the rmap counter includes it while the btreeblks
3389 		 * counter only tracks non-root blocks.
3390 		 */
3391 		allocbt_blks = pag->pagf_btreeblks;
3392 		if (xfs_has_rmapbt(pag->pag_mount))
3393 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3394 		if (allocbt_blks > 0)
3395 			atomic64_add(allocbt_blks,
3396 					&pag->pag_mount->m_allocbt_blks);
3397 
3398 		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3399 	}
3400 #ifdef DEBUG
3401 	else if (!xfs_is_shutdown(pag->pag_mount)) {
3402 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3403 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3404 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3405 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3406 		ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
3407 		ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
3408 	}
3409 #endif
3410 	if (agfbpp)
3411 		*agfbpp = agfbp;
3412 	else
3413 		xfs_trans_brelse(tp, agfbp);
3414 	return 0;
3415 }
3416 
3417 /*
3418  * Pre-proces allocation arguments to set initial state that we don't require
3419  * callers to set up correctly, as well as bounds check the allocation args
3420  * that are set up.
3421  */
3422 static int
3423 xfs_alloc_vextent_check_args(
3424 	struct xfs_alloc_arg	*args,
3425 	xfs_fsblock_t		target,
3426 	xfs_agnumber_t		*minimum_agno)
3427 {
3428 	struct xfs_mount	*mp = args->mp;
3429 	xfs_agblock_t		agsize;
3430 
3431 	args->fsbno = NULLFSBLOCK;
3432 
3433 	*minimum_agno = 0;
3434 	if (args->tp->t_highest_agno != NULLAGNUMBER)
3435 		*minimum_agno = args->tp->t_highest_agno;
3436 
3437 	/*
3438 	 * Just fix this up, for the case where the last a.g. is shorter
3439 	 * (or there's only one a.g.) and the caller couldn't easily figure
3440 	 * that out (xfs_bmap_alloc).
3441 	 */
3442 	agsize = mp->m_sb.sb_agblocks;
3443 	if (args->maxlen > agsize)
3444 		args->maxlen = agsize;
3445 	if (args->alignment == 0)
3446 		args->alignment = 1;
3447 
3448 	ASSERT(args->minlen > 0);
3449 	ASSERT(args->maxlen > 0);
3450 	ASSERT(args->alignment > 0);
3451 	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3452 
3453 	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3454 	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3455 	ASSERT(args->minlen <= args->maxlen);
3456 	ASSERT(args->minlen <= agsize);
3457 	ASSERT(args->mod < args->prod);
3458 
3459 	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3460 	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3461 	    args->minlen > args->maxlen || args->minlen > agsize ||
3462 	    args->mod >= args->prod) {
3463 		trace_xfs_alloc_vextent_badargs(args);
3464 		return -ENOSPC;
3465 	}
3466 
3467 	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3468 		trace_xfs_alloc_vextent_skip_deadlock(args);
3469 		return -ENOSPC;
3470 	}
3471 	return 0;
3472 
3473 }
3474 
3475 /*
3476  * Prepare an AG for allocation. If the AG is not prepared to accept the
3477  * allocation, return failure.
3478  *
3479  * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3480  * modified to hold their own perag references.
3481  */
3482 static int
3483 xfs_alloc_vextent_prepare_ag(
3484 	struct xfs_alloc_arg	*args,
3485 	uint32_t		alloc_flags)
3486 {
3487 	bool			need_pag = !args->pag;
3488 	int			error;
3489 
3490 	if (need_pag)
3491 		args->pag = xfs_perag_get(args->mp, args->agno);
3492 
3493 	args->agbp = NULL;
3494 	error = xfs_alloc_fix_freelist(args, alloc_flags);
3495 	if (error) {
3496 		trace_xfs_alloc_vextent_nofix(args);
3497 		if (need_pag)
3498 			xfs_perag_put(args->pag);
3499 		args->agbno = NULLAGBLOCK;
3500 		return error;
3501 	}
3502 	if (!args->agbp) {
3503 		/* cannot allocate in this AG at all */
3504 		trace_xfs_alloc_vextent_noagbp(args);
3505 		args->agbno = NULLAGBLOCK;
3506 		return 0;
3507 	}
3508 	args->wasfromfl = 0;
3509 	return 0;
3510 }
3511 
3512 /*
3513  * Post-process allocation results to account for the allocation if it succeed
3514  * and set the allocated block number correctly for the caller.
3515  *
3516  * XXX: we should really be returning ENOSPC for ENOSPC, not
3517  * hiding it behind a "successful" NULLFSBLOCK allocation.
3518  */
3519 static int
3520 xfs_alloc_vextent_finish(
3521 	struct xfs_alloc_arg	*args,
3522 	xfs_agnumber_t		minimum_agno,
3523 	int			alloc_error,
3524 	bool			drop_perag)
3525 {
3526 	struct xfs_mount	*mp = args->mp;
3527 	int			error = 0;
3528 
3529 	/*
3530 	 * We can end up here with a locked AGF. If we failed, the caller is
3531 	 * likely going to try to allocate again with different parameters, and
3532 	 * that can widen the AGs that are searched for free space. If we have
3533 	 * to do BMBT block allocation, we have to do a new allocation.
3534 	 *
3535 	 * Hence leaving this function with the AGF locked opens up potential
3536 	 * ABBA AGF deadlocks because a future allocation attempt in this
3537 	 * transaction may attempt to lock a lower number AGF.
3538 	 *
3539 	 * We can't release the AGF until the transaction is commited, so at
3540 	 * this point we must update the "first allocation" tracker to point at
3541 	 * this AG if the tracker is empty or points to a lower AG. This allows
3542 	 * the next allocation attempt to be modified appropriately to avoid
3543 	 * deadlocks.
3544 	 */
3545 	if (args->agbp &&
3546 	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3547 	     args->agno > minimum_agno))
3548 		args->tp->t_highest_agno = args->agno;
3549 
3550 	/*
3551 	 * If the allocation failed with an error or we had an ENOSPC result,
3552 	 * preserve the returned error whilst also marking the allocation result
3553 	 * as "no extent allocated". This ensures that callers that fail to
3554 	 * capture the error will still treat it as a failed allocation.
3555 	 */
3556 	if (alloc_error || args->agbno == NULLAGBLOCK) {
3557 		args->fsbno = NULLFSBLOCK;
3558 		error = alloc_error;
3559 		goto out_drop_perag;
3560 	}
3561 
3562 	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3563 
3564 	ASSERT(args->len >= args->minlen);
3565 	ASSERT(args->len <= args->maxlen);
3566 	ASSERT(args->agbno % args->alignment == 0);
3567 	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3568 
3569 	/* if not file data, insert new block into the reverse map btree */
3570 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3571 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3572 				       args->agbno, args->len, &args->oinfo);
3573 		if (error)
3574 			goto out_drop_perag;
3575 	}
3576 
3577 	if (!args->wasfromfl) {
3578 		error = xfs_alloc_update_counters(args->tp, args->agbp,
3579 						  -((long)(args->len)));
3580 		if (error)
3581 			goto out_drop_perag;
3582 
3583 		ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3584 				args->len));
3585 	}
3586 
3587 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3588 
3589 	XFS_STATS_INC(mp, xs_allocx);
3590 	XFS_STATS_ADD(mp, xs_allocb, args->len);
3591 
3592 	trace_xfs_alloc_vextent_finish(args);
3593 
3594 out_drop_perag:
3595 	if (drop_perag && args->pag) {
3596 		xfs_perag_rele(args->pag);
3597 		args->pag = NULL;
3598 	}
3599 	return error;
3600 }
3601 
3602 /*
3603  * Allocate within a single AG only. This uses a best-fit length algorithm so if
3604  * you need an exact sized allocation without locality constraints, this is the
3605  * fastest way to do it.
3606  *
3607  * Caller is expected to hold a perag reference in args->pag.
3608  */
3609 int
3610 xfs_alloc_vextent_this_ag(
3611 	struct xfs_alloc_arg	*args,
3612 	xfs_agnumber_t		agno)
3613 {
3614 	struct xfs_mount	*mp = args->mp;
3615 	xfs_agnumber_t		minimum_agno;
3616 	uint32_t		alloc_flags = 0;
3617 	int			error;
3618 
3619 	ASSERT(args->pag != NULL);
3620 	ASSERT(args->pag->pag_agno == agno);
3621 
3622 	args->agno = agno;
3623 	args->agbno = 0;
3624 
3625 	trace_xfs_alloc_vextent_this_ag(args);
3626 
3627 	error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3628 			&minimum_agno);
3629 	if (error) {
3630 		if (error == -ENOSPC)
3631 			return 0;
3632 		return error;
3633 	}
3634 
3635 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3636 	if (!error && args->agbp)
3637 		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3638 
3639 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3640 }
3641 
3642 /*
3643  * Iterate all AGs trying to allocate an extent starting from @start_ag.
3644  *
3645  * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
3646  * allocation attempts in @start_agno have locality information. If we fail to
3647  * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
3648  * we attempt to allocation in as there is no locality optimisation possible for
3649  * those allocations.
3650  *
3651  * On return, args->pag may be left referenced if we finish before the "all
3652  * failed" return point. The allocation finish still needs the perag, and
3653  * so the caller will release it once they've finished the allocation.
3654  *
3655  * When we wrap the AG iteration at the end of the filesystem, we have to be
3656  * careful not to wrap into AGs below ones we already have locked in the
3657  * transaction if we are doing a blocking iteration. This will result in an
3658  * out-of-order locking of AGFs and hence can cause deadlocks.
3659  */
3660 static int
3661 xfs_alloc_vextent_iterate_ags(
3662 	struct xfs_alloc_arg	*args,
3663 	xfs_agnumber_t		minimum_agno,
3664 	xfs_agnumber_t		start_agno,
3665 	xfs_agblock_t		target_agbno,
3666 	uint32_t		alloc_flags)
3667 {
3668 	struct xfs_mount	*mp = args->mp;
3669 	xfs_agnumber_t		restart_agno = minimum_agno;
3670 	xfs_agnumber_t		agno;
3671 	int			error = 0;
3672 
3673 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
3674 		restart_agno = 0;
3675 restart:
3676 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3677 			mp->m_sb.sb_agcount, agno, args->pag) {
3678 		args->agno = agno;
3679 		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3680 		if (error)
3681 			break;
3682 		if (!args->agbp) {
3683 			trace_xfs_alloc_vextent_loopfailed(args);
3684 			continue;
3685 		}
3686 
3687 		/*
3688 		 * Allocation is supposed to succeed now, so break out of the
3689 		 * loop regardless of whether we succeed or not.
3690 		 */
3691 		if (args->agno == start_agno && target_agbno) {
3692 			args->agbno = target_agbno;
3693 			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3694 		} else {
3695 			args->agbno = 0;
3696 			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3697 		}
3698 		break;
3699 	}
3700 	if (error) {
3701 		xfs_perag_rele(args->pag);
3702 		args->pag = NULL;
3703 		return error;
3704 	}
3705 	if (args->agbp)
3706 		return 0;
3707 
3708 	/*
3709 	 * We didn't find an AG we can alloation from. If we were given
3710 	 * constraining flags by the caller, drop them and retry the allocation
3711 	 * without any constraints being set.
3712 	 */
3713 	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3714 		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
3715 		restart_agno = minimum_agno;
3716 		goto restart;
3717 	}
3718 
3719 	ASSERT(args->pag == NULL);
3720 	trace_xfs_alloc_vextent_allfailed(args);
3721 	return 0;
3722 }
3723 
3724 /*
3725  * Iterate from the AGs from the start AG to the end of the filesystem, trying
3726  * to allocate blocks. It starts with a near allocation attempt in the initial
3727  * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3728  * back to zero if allowed by previous allocations in this transaction,
3729  * otherwise will wrap back to the start AG and run a second blocking pass to
3730  * the end of the filesystem.
3731  */
3732 int
3733 xfs_alloc_vextent_start_ag(
3734 	struct xfs_alloc_arg	*args,
3735 	xfs_fsblock_t		target)
3736 {
3737 	struct xfs_mount	*mp = args->mp;
3738 	xfs_agnumber_t		minimum_agno;
3739 	xfs_agnumber_t		start_agno;
3740 	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3741 	bool			bump_rotor = false;
3742 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3743 	int			error;
3744 
3745 	ASSERT(args->pag == NULL);
3746 
3747 	args->agno = NULLAGNUMBER;
3748 	args->agbno = NULLAGBLOCK;
3749 
3750 	trace_xfs_alloc_vextent_start_ag(args);
3751 
3752 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3753 	if (error) {
3754 		if (error == -ENOSPC)
3755 			return 0;
3756 		return error;
3757 	}
3758 
3759 	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3760 	    xfs_is_inode32(mp)) {
3761 		target = XFS_AGB_TO_FSB(mp,
3762 				((mp->m_agfrotor / rotorstep) %
3763 				mp->m_sb.sb_agcount), 0);
3764 		bump_rotor = 1;
3765 	}
3766 
3767 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3768 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3769 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3770 
3771 	if (bump_rotor) {
3772 		if (args->agno == start_agno)
3773 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3774 				(mp->m_sb.sb_agcount * rotorstep);
3775 		else
3776 			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3777 				(mp->m_sb.sb_agcount * rotorstep);
3778 	}
3779 
3780 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3781 }
3782 
3783 /*
3784  * Iterate from the agno indicated via @target through to the end of the
3785  * filesystem attempting blocking allocation. This does not wrap or try a second
3786  * pass, so will not recurse into AGs lower than indicated by the target.
3787  */
3788 int
3789 xfs_alloc_vextent_first_ag(
3790 	struct xfs_alloc_arg	*args,
3791 	xfs_fsblock_t		target)
3792  {
3793 	struct xfs_mount	*mp = args->mp;
3794 	xfs_agnumber_t		minimum_agno;
3795 	xfs_agnumber_t		start_agno;
3796 	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3797 	int			error;
3798 
3799 	ASSERT(args->pag == NULL);
3800 
3801 	args->agno = NULLAGNUMBER;
3802 	args->agbno = NULLAGBLOCK;
3803 
3804 	trace_xfs_alloc_vextent_first_ag(args);
3805 
3806 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3807 	if (error) {
3808 		if (error == -ENOSPC)
3809 			return 0;
3810 		return error;
3811 	}
3812 
3813 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3814 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3815 			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3816 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3817 }
3818 
3819 /*
3820  * Allocate at the exact block target or fail. Caller is expected to hold a
3821  * perag reference in args->pag.
3822  */
3823 int
3824 xfs_alloc_vextent_exact_bno(
3825 	struct xfs_alloc_arg	*args,
3826 	xfs_fsblock_t		target)
3827 {
3828 	struct xfs_mount	*mp = args->mp;
3829 	xfs_agnumber_t		minimum_agno;
3830 	int			error;
3831 
3832 	ASSERT(args->pag != NULL);
3833 	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3834 
3835 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3836 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3837 
3838 	trace_xfs_alloc_vextent_exact_bno(args);
3839 
3840 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3841 	if (error) {
3842 		if (error == -ENOSPC)
3843 			return 0;
3844 		return error;
3845 	}
3846 
3847 	error = xfs_alloc_vextent_prepare_ag(args, 0);
3848 	if (!error && args->agbp)
3849 		error = xfs_alloc_ag_vextent_exact(args);
3850 
3851 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3852 }
3853 
3854 /*
3855  * Allocate an extent as close to the target as possible. If there are not
3856  * viable candidates in the AG, then fail the allocation.
3857  *
3858  * Caller may or may not have a per-ag reference in args->pag.
3859  */
3860 int
3861 xfs_alloc_vextent_near_bno(
3862 	struct xfs_alloc_arg	*args,
3863 	xfs_fsblock_t		target)
3864 {
3865 	struct xfs_mount	*mp = args->mp;
3866 	xfs_agnumber_t		minimum_agno;
3867 	bool			needs_perag = args->pag == NULL;
3868 	uint32_t		alloc_flags = 0;
3869 	int			error;
3870 
3871 	if (!needs_perag)
3872 		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3873 
3874 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3875 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3876 
3877 	trace_xfs_alloc_vextent_near_bno(args);
3878 
3879 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3880 	if (error) {
3881 		if (error == -ENOSPC)
3882 			return 0;
3883 		return error;
3884 	}
3885 
3886 	if (needs_perag)
3887 		args->pag = xfs_perag_grab(mp, args->agno);
3888 
3889 	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3890 	if (!error && args->agbp)
3891 		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3892 
3893 	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3894 }
3895 
3896 /* Ensure that the freelist is at full capacity. */
3897 int
3898 xfs_free_extent_fix_freelist(
3899 	struct xfs_trans	*tp,
3900 	struct xfs_perag	*pag,
3901 	struct xfs_buf		**agbp)
3902 {
3903 	struct xfs_alloc_arg	args;
3904 	int			error;
3905 
3906 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3907 	args.tp = tp;
3908 	args.mp = tp->t_mountp;
3909 	args.agno = pag->pag_agno;
3910 	args.pag = pag;
3911 
3912 	/*
3913 	 * validate that the block number is legal - the enables us to detect
3914 	 * and handle a silent filesystem corruption rather than crashing.
3915 	 */
3916 	if (args.agno >= args.mp->m_sb.sb_agcount)
3917 		return -EFSCORRUPTED;
3918 
3919 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3920 	if (error)
3921 		return error;
3922 
3923 	*agbp = args.agbp;
3924 	return 0;
3925 }
3926 
3927 /*
3928  * Free an extent.
3929  * Just break up the extent address and hand off to xfs_free_ag_extent
3930  * after fixing up the freelist.
3931  */
3932 int
3933 __xfs_free_extent(
3934 	struct xfs_trans		*tp,
3935 	struct xfs_perag		*pag,
3936 	xfs_agblock_t			agbno,
3937 	xfs_extlen_t			len,
3938 	const struct xfs_owner_info	*oinfo,
3939 	enum xfs_ag_resv_type		type,
3940 	bool				skip_discard)
3941 {
3942 	struct xfs_mount		*mp = tp->t_mountp;
3943 	struct xfs_buf			*agbp;
3944 	struct xfs_agf			*agf;
3945 	int				error;
3946 	unsigned int			busy_flags = 0;
3947 
3948 	ASSERT(len != 0);
3949 	ASSERT(type != XFS_AG_RESV_AGFL);
3950 
3951 	if (XFS_TEST_ERROR(false, mp,
3952 			XFS_ERRTAG_FREE_EXTENT))
3953 		return -EIO;
3954 
3955 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3956 	if (error) {
3957 		if (xfs_metadata_is_sick(error))
3958 			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3959 		return error;
3960 	}
3961 
3962 	agf = agbp->b_addr;
3963 
3964 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3965 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3966 		error = -EFSCORRUPTED;
3967 		goto err_release;
3968 	}
3969 
3970 	/* validate the extent size is legal now we have the agf locked */
3971 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3972 		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
3973 		error = -EFSCORRUPTED;
3974 		goto err_release;
3975 	}
3976 
3977 	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3978 			type);
3979 	if (error)
3980 		goto err_release;
3981 
3982 	if (skip_discard)
3983 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3984 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3985 	return 0;
3986 
3987 err_release:
3988 	xfs_trans_brelse(tp, agbp);
3989 	return error;
3990 }
3991 
3992 struct xfs_alloc_query_range_info {
3993 	xfs_alloc_query_range_fn	fn;
3994 	void				*priv;
3995 };
3996 
3997 /* Format btree record and pass to our callback. */
3998 STATIC int
3999 xfs_alloc_query_range_helper(
4000 	struct xfs_btree_cur		*cur,
4001 	const union xfs_btree_rec	*rec,
4002 	void				*priv)
4003 {
4004 	struct xfs_alloc_query_range_info	*query = priv;
4005 	struct xfs_alloc_rec_incore		irec;
4006 	xfs_failaddr_t				fa;
4007 
4008 	xfs_alloc_btrec_to_irec(rec, &irec);
4009 	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
4010 	if (fa)
4011 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
4012 
4013 	return query->fn(cur, &irec, query->priv);
4014 }
4015 
4016 /* Find all free space within a given range of blocks. */
4017 int
4018 xfs_alloc_query_range(
4019 	struct xfs_btree_cur			*cur,
4020 	const struct xfs_alloc_rec_incore	*low_rec,
4021 	const struct xfs_alloc_rec_incore	*high_rec,
4022 	xfs_alloc_query_range_fn		fn,
4023 	void					*priv)
4024 {
4025 	union xfs_btree_irec			low_brec = { .a = *low_rec };
4026 	union xfs_btree_irec			high_brec = { .a = *high_rec };
4027 	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
4028 
4029 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4030 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
4031 			xfs_alloc_query_range_helper, &query);
4032 }
4033 
4034 /* Find all free space records. */
4035 int
4036 xfs_alloc_query_all(
4037 	struct xfs_btree_cur			*cur,
4038 	xfs_alloc_query_range_fn		fn,
4039 	void					*priv)
4040 {
4041 	struct xfs_alloc_query_range_info	query;
4042 
4043 	ASSERT(xfs_btree_is_bno(cur->bc_ops));
4044 	query.priv = priv;
4045 	query.fn = fn;
4046 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
4047 }
4048 
4049 /*
4050  * Scan part of the keyspace of the free space and tell us if the area has no
4051  * records, is fully mapped by records, or is partially filled.
4052  */
4053 int
4054 xfs_alloc_has_records(
4055 	struct xfs_btree_cur	*cur,
4056 	xfs_agblock_t		bno,
4057 	xfs_extlen_t		len,
4058 	enum xbtree_recpacking	*outcome)
4059 {
4060 	union xfs_btree_irec	low;
4061 	union xfs_btree_irec	high;
4062 
4063 	memset(&low, 0, sizeof(low));
4064 	low.a.ar_startblock = bno;
4065 	memset(&high, 0xFF, sizeof(high));
4066 	high.a.ar_startblock = bno + len - 1;
4067 
4068 	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
4069 }
4070 
4071 /*
4072  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
4073  * error code or XFS_ITER_*.
4074  */
4075 int
4076 xfs_agfl_walk(
4077 	struct xfs_mount	*mp,
4078 	struct xfs_agf		*agf,
4079 	struct xfs_buf		*agflbp,
4080 	xfs_agfl_walk_fn	walk_fn,
4081 	void			*priv)
4082 {
4083 	__be32			*agfl_bno;
4084 	unsigned int		i;
4085 	int			error;
4086 
4087 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
4088 	i = be32_to_cpu(agf->agf_flfirst);
4089 
4090 	/* Nothing to walk in an empty AGFL. */
4091 	if (agf->agf_flcount == cpu_to_be32(0))
4092 		return 0;
4093 
4094 	/* Otherwise, walk from first to last, wrapping as needed. */
4095 	for (;;) {
4096 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
4097 		if (error)
4098 			return error;
4099 		if (i == be32_to_cpu(agf->agf_fllast))
4100 			break;
4101 		if (++i == xfs_agfl_size(mp))
4102 			i = 0;
4103 	}
4104 
4105 	return 0;
4106 }
4107 
4108 int __init
4109 xfs_extfree_intent_init_cache(void)
4110 {
4111 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
4112 			sizeof(struct xfs_extent_free_item),
4113 			0, 0, NULL);
4114 
4115 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
4116 }
4117 
4118 void
4119 xfs_extfree_intent_destroy_cache(void)
4120 {
4121 	kmem_cache_destroy(xfs_extfree_item_cache);
4122 	xfs_extfree_item_cache = NULL;
4123 }
4124