// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code.  The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs.  Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code.  Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type.  When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
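
/*
 * As an illustration only, a per-btree wrapper pair built on the helpers
 * below might look roughly like the sketch here.  The xfs_examplebt_* names
 * and AGF fields are hypothetical placeholders and not part of this file;
 * only xfs_btree_stage_afakeroot(), xfs_btree_commit_afakeroot(), and
 * xfs_alloc_log_agf() are real interfaces:
 *
 *	struct xfs_btree_cur *
 *	xfs_examplebt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		// Create a regular cursor, then flip it to staging mode.
 *		cur = xfs_examplebt_init_cursor(mp, NULL, NULL);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_examplebt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		struct xfs_agf		*agf = agbp->b_addr;
 *		struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
 *
 *		// Copy the new root and height into the AG header, log the
 *		// change, then convert the cursor back to a regular one.
 *		agf->agf_example_root = cpu_to_be32(afake->af_root);
 *		agf->agf_example_level = cpu_to_be32(afake->af_levels);
 *		xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */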

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	cur->bc_ag.afake = NULL;
	cur->bc_ag.agbp = agbp;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor.  This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
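
/*
 * A minimal setup sketch following the rules above.  The allocation helper
 * and fork-size computation shown are illustrative assumptions about the
 * caller's environment; only the field assignments themselves are prescribed
 * by this interface:
 *
 *	struct xbtree_ifakeroot	ifake = { };
 *
 *	ifake.if_fork = kmem_cache_zalloc(xfs_ifork_cache,
 *			GFP_KERNEL | __GFP_NOFAIL);
 *	ifake.if_fork_size = xfs_inode_fork_size(ip, whichfork);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 * The caller would then pass &ifake to the btree type's _stage_cursor
 * function; all other fields start out zeroed as required.
 */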

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.  The cursor is pointed at the staging fork (XFS_STAGING_FORK) so
 * that the new btree is built against the fake root instead of the on-disk
 * inode fork.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ino.forksize = ifake->if_fork_size;
	cur->bc_ino.whichfork = XFS_STAGING_FORK;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block).  When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor.  Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree.  See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block().  Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor.  The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure.  For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree.  For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs.  The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division.  If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
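
/*
 * Putting the steps above together, a caller's bulk load sequence looks
 * roughly like the sketch below.  The xfs_examplebt_* helpers, the
 * preallocation step, and the surrounding variables are hypothetical
 * placeholders; only the xfs_btree_bload* calls and the struct
 * xfs_btree_bload fields are defined by this interface:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xfs_examplebt_get_records,
 *		.claim_block	= xfs_examplebt_claim_block,
 *		.leaf_slack	= -1,	// let the loader pick ~75% fullness
 *		.node_slack	= -1,
 *	};
 *	struct xfs_btree_cur	*cur;
 *	int			error;
 *
 *	cur = xfs_examplebt_stage_cursor(mp, &afake);
 *
 *	// Step 3: compute the height and number of blocks needed.
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	if (error)
 *		goto out;
 *
 *	// Step 4: preallocate bbl.nr_blocks blocks for ->claim_block().
 *	error = xfs_examplebt_alloc_blocks(mp, bbl.nr_blocks);
 *	if (error)
 *		goto out;
 *
 *	// Step 5: build the new btree in those blocks.
 *	error = xfs_btree_bload(cur, &bbl, &example_load_state);
 *	if (error)
 *		goto out;
 *
 *	// Step 6: log the new root and convert the staging cursor.
 *	xfs_examplebt_commit_staged_btree(cur, tp, agbp);
 */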

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
				level, nr_this_block, cur->bc_ino.ip->i_ino);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}
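
/*
 * A hypothetical ->get_records implementation (the record source and the
 * xfs_examplebt_* names are illustrative assumptions) copies up to nr_wanted
 * records into the block starting at record index idx, and returns the number
 * of records actually placed or a negative errno:
 *
 *	STATIC int
 *	xfs_examplebt_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		struct xfs_examplebt_load *load = priv;
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			// Pull the next staged record into cur->bc_rec,
 *			// then format it into the new leaf block.
 *			if (!xfs_examplebt_next_record(load, &cur->bc_rec))
 *				break;
 *
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *
 *		return loaded;
 *	}
 */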

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.  A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr.  On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed.  LRU refs will be set on the block, which is
		 * desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree.  Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb.  The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level.  For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store in
	 * the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block.  Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks.  Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}
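
/*
 * Worked example with made-up numbers: suppose a level holds
 * nr_this_level = 1000 items, maxrecs = 100, minrecs = 50, and slack = 20,
 * so desired_npb = max(50, 100 - 20) = 80.  Then blocks = 1000 / 80 = 12
 * (rounded down) and npb = 1000 / 12 = 83 with blocks_with_extra = 4, i.e.
 * the four leftmost blocks hold 84 items and the remaining eight hold 83.
 * Since 84 <= maxrecs, no extra block is needed.
 */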

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
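
/*
 * For example (hypothetical numbers), with maxrecs = 100 and minrecs = 50 a
 * negative slack becomes 100 - ((100 + 50) >> 1) = 25, so blocks are loaded
 * with 75 of 100 records, i.e. the ~75% loading mentioned above.  A
 * caller-supplied slack of 70 would be clamped to maxrecs - minrecs = 50.
 */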

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 * cursor.  This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	const struct xfs_btree_ops *ops = cur->bc_ops;
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks.  Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (ops->type == XFS_BTREE_TYPE_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if ((level != 0 ||
			     (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) &&
			    nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1.  Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root.  Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (ops->type == XFS_BTREE_TYPE_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}
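
/*
 * As a worked example with made-up numbers: for an AG-rooted btree with
 * maxrecs = 100 at every level, zero slack, and nr_records = 1000000, the
 * loop above computes 10000 leaf blocks, then 100 node blocks above them,
 * then a single root block, so bbl->btree_height = 3 and
 * bbl->nr_blocks = 10101.  For an inode-rooted btree the final root lives in
 * the inode fork rather than in a filesystem block, which is why one block
 * is subtracted from nr_blocks at the end.
 */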

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases.  blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}