// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code. The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs. Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code. Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type. When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor. Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
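
/*
 * Illustrative sketch only: a hypothetical "foobt" AG btree type.  The
 * xfs_foobt_* names below are assumptions for this example, not part of the
 * XFS API; only xfs_btree_stage_afakeroot() and xfs_btree_commit_afakeroot()
 * are defined here.  Note that the cursor is created without a transaction,
 * since xfs_btree_stage_afakeroot() asserts that bc_tp is NULL:
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake,
 *		struct xfs_perag	*pag)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_cursor(mp, NULL, pag);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		// copy afake->af_root/af_levels/af_blocks from
 *		// cur->bc_ag.afake into the AG header and log agbp,
 *		// then convert the cursor back into a regular one:
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */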

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	cur->bc_ag.afake = NULL;
	cur->bc_ag.agbp = agbp;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor. This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
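
/*
 * Illustrative sketch only: setting up an xbtree_ifakeroot as described above
 * for a hypothetical inode-rooted "foobt".  The fork-allocation helpers shown
 * here are assumptions for the example; only xfs_btree_stage_ifakeroot() and
 * xfs_btree_commit_ifakeroot() are defined in this file:
 *
 *	struct xbtree_ifakeroot	ifake = { };
 *	struct xfs_btree_cur	*cur;
 *
 *	ifake.if_fork = kmem_cache_zalloc(xfs_ifork_cache, GFP_KERNEL);
 *	ifake.if_fork_size = xfs_inode_fork_size(ip, whichfork);
 *	// set the fork format to XFS_DINODE_FMT_BTREE per the rules above
 *
 *	cur = xfs_foobt_init_cursor(mp, NULL, ip, whichfork);
 *	xfs_btree_stage_ifakeroot(cur, &ifake);
 *
 * After bulk loading, the corresponding _commit_staged_btree() implementation
 * would swap the staged fork contents into the inode, log the inode core and
 * fork, and then call xfs_btree_commit_ifakeroot(cur, tp, whichfork).
 */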

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ino.forksize = ifake->if_fork_size;
	cur->bc_ino.whichfork = XFS_STAGING_FORK;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block). When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor. Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree. See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block(). Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor. The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure. For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree. For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs. The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division. If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
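
/*
 * Illustrative sketch only: a caller might drive the steps above roughly as
 * follows.  The xfs_foobt_* helpers and the block reservation are assumptions
 * for this example; only the xfs_btree_bload_* functions and the
 * xfs_btree_bload fields are defined by this code.  The priv argument is
 * whatever context the ->get_records and ->claim_block callbacks need:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xfs_foobt_get_records,
 *		.claim_block	= xfs_foobt_claim_block,
 *		.leaf_slack	= -1,	// negative means "use the default"
 *		.node_slack	= -1,
 *	};
 *
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	if (error)
 *		return error;
 *
 *	// Step four: reserve bbl.nr_blocks blocks now so that
 *	// ->claim_block() can never hit ENOSPC mid-rebuild.
 *	error = xfs_foobt_alloc_blocks(tp, bbl.nr_blocks, &resv);
 *	if (error)
 *		return error;
 *
 *	error = xfs_btree_bload(cur, &bbl, &resv);
 *	if (error)
 *		return error;
 *
 *	// Final step: log the new root and commit the staging cursor.
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);
 */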

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
				level, nr_this_block, cur->bc_ino.ip->i_ino);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}
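
/*
 * Illustrative sketch only: as the loop above shows, a ->get_records
 * implementation is called with the 1-based index of the next free record
 * slot and the number of records still wanted, and returns how many records
 * it placed (or a negative errno).  The "xfoo" record iterator below is an
 * assumption for the example:
 *
 *	STATIC int
 *	xfs_foobt_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		struct xfoo_rec_iter	*iter = priv;
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			if (!xfoo_rec_iter_next(iter, &cur->bc_rec))
 *				break;
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *		return loaded;
 *	}
 */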

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree. A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr. On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed. LRU refs will be set on the block, which is
		 * desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree. Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb. The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level. For leaf levels, we must populate an
 * empty root block even if there are no records, so there must be at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store in
	 * the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block. Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks. Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}
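
/*
 * Worked example of the computation above, assuming a leaf level with
 * maxrecs = 100, minrecs = 50, leaf_slack = 25 and nr_this_level = 1000:
 *
 *	desired_npb       = max(50, 100 - 25) = 75
 *	blocks            = max(1, 1000 / 75) = 13
 *	npb               = 1000 / 13         = 76, remainder 12
 *	blocks_with_extra = 12
 *
 * npb (76) does not exceed maxnr (100), so no extra block is needed: twelve
 * blocks receive 77 records each and the thirteenth receives 76, which
 * accounts for all 1000 records while keeping every block between minrecs
 * and maxrecs.
 */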

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs. Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
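
/*
 * For example, with maxrecs = 16 and minrecs = 8, xfs_btree_bload_ensure_slack
 * replaces a negative slack value with 16 - ((16 + 8) >> 1) = 4, so each block
 * is loaded with about 12 records, i.e. roughly 75% full.  The clamp to
 * maxr - minr = 8 keeps an oversized caller-supplied slack from dropping the
 * load below minrecs.
 */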

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl. Caller must ensure that the btree cursor is a staging
 * cursor. This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	const struct xfs_btree_ops *ops = cur->bc_ops;
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks. Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1). We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (ops->type == XFS_BTREE_TYPE_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if ((level != 0 ||
			     (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) &&
			    nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1. Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root. Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (ops->type == XFS_BTREE_TYPE_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases. blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk. If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail. This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}