// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code. The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs. Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code. Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type. When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor. Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
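
/*
 * As a rough illustration of that flow (not tied to any particular btree
 * type), a hypothetical _stage_cursor/_commit_staged_btree pair might look
 * like the sketch below. The xfs_foobt_* helpers are placeholders; only
 * xfs_btree_stage_afakeroot and xfs_btree_commit_afakeroot are real
 * functions from this file.
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_foobt_init_cursor(mp, NULL, agbp);	(hypothetical; note
 *							 the NULL transaction)
 *	xfs_btree_stage_afakeroot(cur, &afake);
 *
 *	(... bulk load records through the staging cursor ...)
 *
 *	(log the new root recorded in afake.af_root/af_levels/af_blocks, then:)
 *	xfs_foobt_log_new_root(tp, agbp, &afake);	(hypothetical)
 *	xfs_btree_commit_afakeroot(cur, tp, agbp);
 */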

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
        struct xfs_btree_cur    *cur,
        struct xbtree_afakeroot *afake)
{
        ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
        ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
        ASSERT(cur->bc_tp == NULL);

        cur->bc_ag.afake = afake;
        cur->bc_nlevels = afake->af_levels;
        cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
        struct xfs_btree_cur    *cur,
        struct xfs_trans        *tp,
        struct xfs_buf          *agbp)
{
        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
        ASSERT(cur->bc_tp == NULL);

        trace_xfs_btree_commit_afakeroot(cur);

        cur->bc_ag.afake = NULL;
        cur->bc_ag.agbp = agbp;
        cur->bc_flags &= ~XFS_BTREE_STAGING;
        cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor. This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
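
/*
 * A minimal sketch of the setup described above, assuming a hypothetical
 * inode-fork btree type. The xfs_foobt_* helper is illustrative only; the
 * fork setup follows the field descriptions above and is believed to match
 * the current libxfs helpers, but treat it as a sketch rather than a recipe.
 *
 *	struct xbtree_ifakeroot	ifake = { 0 };
 *
 *	ifake.if_fork_size = xfs_inode_fork_size(ip, whichfork);
 *	ifake.if_fork = kmem_cache_zalloc(xfs_ifork_cache, GFP_KERNEL);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_foobt_init_cursor(mp, NULL, ip, whichfork);	(hypothetical)
 *	xfs_btree_stage_ifakeroot(cur, &ifake);
 *
 *	(... bulk load records through the staging cursor ...)
 *
 *	(log the inode core and the new fork contents, then:)
 *	xfs_btree_commit_ifakeroot(cur, tp, whichfork);
 */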

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root. The btree cursor's bc_ops will be overridden as needed to make the
 * staging functionality work.
 */
void
xfs_btree_stage_ifakeroot(
        struct xfs_btree_cur    *cur,
        struct xbtree_ifakeroot *ifake)
{
        ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
        ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
        ASSERT(cur->bc_tp == NULL);

        cur->bc_ino.ifake = ifake;
        cur->bc_nlevels = ifake->if_levels;
        cur->bc_ino.forksize = ifake->if_fork_size;
        cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
        struct xfs_btree_cur    *cur,
        struct xfs_trans        *tp,
        int                     whichfork)
{
        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
        ASSERT(cur->bc_tp == NULL);

        trace_xfs_btree_commit_ifakeroot(cur);

        cur->bc_ino.ifake = NULL;
        cur->bc_ino.whichfork = whichfork;
        cur->bc_flags &= ~XFS_BTREE_STAGING;
        cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block). When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor. Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree. See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block(). Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor. The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * A sketch of this sequence and a worked example of the geometry computation
 * follow this comment block.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure. For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree. For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs. The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division. If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
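
/*
 * An illustrative sketch of that sequence (error handling and the actual
 * block preallocation are omitted; xfs_foo_get_records, xfs_foo_claim_block
 * and the commit helper are hypothetical callbacks, while the xfs_btree_bload*
 * calls are the real entry points below):
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xfs_foo_get_records,
 *		.claim_block	= xfs_foo_claim_block,
 *		.leaf_slack	= -1,	(negative means "use the default slack")
 *		.node_slack	= -1,
 *	};
 *
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	(... allocate bbl.nr_blocks blocks for ->claim_block() to hand out ...)
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);	(hypothetical)
 *
 * Worked example of the geometry math above: with maxrecs = 100, minrecs = 50,
 * a slack factor of 25, and nr_items = 1000 at some level, desired = 75,
 * blocks = floor(1000 / 75) = 13, and npb = 1000 / 13 = 76; the leftmost
 * 1000 - (13 * 76) = 12 blocks each receive one extra item (77 <= maxrecs).
 */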

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
        struct xfs_btree_bload  *bbl,
        struct list_head        *buffers_list,
        struct xfs_buf          **bpp)
{
        struct xfs_buf          *bp = *bpp;
        int                     error;

        if (!bp)
                return 0;

        /*
         * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
         * xfs_buf_read will not pointlessly reread the contents from the disk.
         */
        bp->b_flags |= XBF_DONE;

        xfs_buf_delwri_queue_here(bp, buffers_list);
        xfs_buf_relse(bp);
        *bpp = NULL;
        bbl->nr_dirty++;

        if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
                return 0;

        error = xfs_buf_delwri_submit(buffers_list);
        if (error)
                return error;

        bbl->nr_dirty = 0;
        return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
        struct xfs_btree_cur            *cur,
        struct xfs_btree_bload          *bbl,
        struct list_head                *buffers_list,
        unsigned int                    level,
        unsigned int                    nr_this_block,
        union xfs_btree_ptr             *ptrp, /* in/out */
        struct xfs_buf                  **bpp, /* in/out */
        struct xfs_btree_block          **blockp, /* in/out */
        void                            *priv)
{
        union xfs_btree_ptr             new_ptr;
        struct xfs_buf                  *new_bp;
        struct xfs_btree_block          *new_block;
        int                             ret;

        if (xfs_btree_at_iroot(cur, level)) {
                struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
                size_t                  new_size;

                ASSERT(*bpp == NULL);

                /* Allocate a new incore btree root block. */
                new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
                ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
                ifp->if_broot_bytes = (int)new_size;

                /* Initialize it and send it out. */
                xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
                                level, nr_this_block, cur->bc_ino.ip->i_ino);

                *bpp = NULL;
                *blockp = ifp->if_broot;
                xfs_btree_set_ptr_null(cur, ptrp);
                return 0;
        }

        /* Claim one of the caller's preallocated blocks. */
        xfs_btree_set_ptr_null(cur, &new_ptr);
        ret = bbl->claim_block(cur, &new_ptr, priv);
        if (ret)
                return ret;

        ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

        ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
        if (ret)
                return ret;

        /*
         * The previous block (if any) is the left sibling of the new block,
         * so set its right sibling pointer to the new block and drop it.
         */
        if (*blockp)
                xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

        ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
        if (ret)
                return ret;

        /* Initialize the new btree block. */
        xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
        xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

        /* Set the out parameters. */
        *bpp = new_bp;
        *blockp = new_block;
        xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
        return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
        struct xfs_btree_cur            *cur,
        unsigned int                    recs_this_block,
        xfs_btree_bload_get_records_fn  get_records,
        struct xfs_btree_block          *block,
        void                            *priv)
{
        unsigned int                    j = 1;
        int                             ret;

        /* Fill the leaf block with records. */
        while (j <= recs_this_block) {
                ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
                if (ret < 0)
                        return ret;
                j += ret;
        }

        return 0;
}
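
/*
 * The get_records callback is expected to fill the block with up to nr_wanted
 * records starting at index idx and to return how many it placed (or a
 * negative errno). A minimal sketch, assuming a caller-private iterator that
 * fills cur->bc_rec (the cursor's incore record); the foo_* pieces are
 * hypothetical, while xfs_btree_rec_addr and ->init_rec_from_cur are the
 * usual btree helpers for converting the incore record to its ondisk form:
 *
 *	STATIC int
 *	xfs_foo_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		struct foo_iter		*iter = priv;
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			if (!foo_iter_next(iter, &cur->bc_rec))
 *				break;
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *		return loaded;
 *	}
 */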

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree. A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr. On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
        struct xfs_btree_cur    *cur,
        unsigned int            recs_this_block,
        union xfs_btree_ptr     *child_ptr,
        struct xfs_btree_block  *block)
{
        unsigned int            j;
        int                     ret;

        /* Fill the node block with keys and pointers. */
        for (j = 1; j <= recs_this_block; j++) {
                union xfs_btree_key     child_key;
                union xfs_btree_ptr     *block_ptr;
                union xfs_btree_key     *block_key;
                struct xfs_btree_block  *child_block;
                struct xfs_buf          *child_bp;

                ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

                /*
                 * Read the lower-level block in case the buffer for it has
                 * been reclaimed. LRU refs will be set on the block, which is
                 * desirable if the new btree commits.
                 */
                ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
                                &child_bp);
                if (ret)
                        return ret;

                block_ptr = xfs_btree_ptr_addr(cur, j, block);
                xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

                block_key = xfs_btree_key_addr(cur, j, block);
                xfs_btree_get_keys(cur, child_block, &child_key);
                xfs_btree_copy_keys(cur, block_key, &child_key, 1);

                xfs_btree_get_sibling(cur, child_block, child_ptr,
                                XFS_BB_RIGHTSIB);
                xfs_buf_relse(child_bp);
        }

        return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree. Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
        struct xfs_btree_cur    *cur,
        struct xfs_btree_bload  *bbl,
        unsigned int            level)
{
        unsigned int            ret;

        if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
                return cur->bc_ops->get_dmaxrecs(cur, level);

        ret = cur->bc_ops->get_maxrecs(cur, level);
        if (level == 0)
                ret -= bbl->leaf_slack;
        else
                ret -= bbl->node_slack;
        return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb. The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
        struct xfs_btree_cur    *cur,
        struct xfs_btree_bload  *bbl,
        unsigned int            level)
{
        unsigned int            npb = xfs_btree_bload_max_npb(cur, bbl, level);

        /* Root blocks are not subject to minrecs rules. */
        if (level == cur->bc_nlevels - 1)
                return max(1U, npb);

        return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level. For leaf levels, we must populate an
 * empty root block even if there are no records, so there must always be at
 * least one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
        struct xfs_btree_cur    *cur,
        struct xfs_btree_bload  *bbl,
        unsigned int            level,
        uint64_t                nr_this_level,
        unsigned int            *avg_per_block,
        uint64_t                *blocks,
        uint64_t                *blocks_with_extra)
{
        uint64_t                npb;
        uint64_t                dontcare;
        unsigned int            desired_npb;
        unsigned int            maxnr;

        /*
         * Compute the absolute maximum number of records that we can store in
         * the ondisk block or inode root.
         */
        if (cur->bc_ops->get_dmaxrecs)
                maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
        else
                maxnr = cur->bc_ops->get_maxrecs(cur, level);

        /*
         * Compute the number of blocks we need to fill each block with the
         * desired number of records/keyptrs per block. Because desired_npb
         * could be minrecs, we use regular integer division (which rounds
         * the block count down) so that in the next step the effective # of
         * items per block will never be less than desired_npb.
         */
        desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
        *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
        *blocks = max(1ULL, *blocks);

        /*
         * Compute the number of records that we will actually put in each
         * block, assuming that we want to spread the records evenly between
         * the blocks. Take care that the effective # of items per block (npb)
         * won't exceed maxrecs even for the blocks that get an extra record,
         * since desired_npb could be maxrecs, and in the previous step we
         * rounded the block count down.
         */
        npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
        if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
                (*blocks)++;
                npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
        }

        *avg_per_block = min_t(uint64_t, npb, nr_this_level);

        trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
                        *avg_per_block, desired_npb, *blocks,
                        *blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs. Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
        struct xfs_btree_cur    *cur,
        int                     *slack,
        int                     level)
{
        int                     maxr;
        int                     minr;

        maxr = cur->bc_ops->get_maxrecs(cur, level);
        minr = cur->bc_ops->get_minrecs(cur, level);

        /*
         * If slack is negative, automatically set slack so that we load the
         * btree block approximately halfway between minrecs and maxrecs.
         * Generally, this will net us 75% loading; e.g. maxr = 100 and
         * minr = 50 yield a slack of 25, so blocks load with about 75 records.
         */
        if (*slack < 0)
                *slack = maxr - ((maxr + minr) >> 1);

        *slack = min(*slack, maxr - minr);
}

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl. Caller must ensure that the btree cursor is a staging
 * cursor. This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
        struct xfs_btree_cur    *cur,
        struct xfs_btree_bload  *bbl,
        uint64_t                nr_records)
{
        uint64_t                nr_blocks = 0;
        uint64_t                nr_this_level;

        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

        /*
         * Make sure that the slack values make sense for traditional leaf and
         * node blocks. Inode-rooted btrees will return different minrecs and
         * maxrecs values for the root block (level == bc_nlevels - 1). We're
         * checking levels 0 and 1 here, so set bc_nlevels such that the btree
         * code doesn't interpret either as the root level.
         */
        cur->bc_nlevels = cur->bc_maxlevels - 1;
        xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
        xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

        bbl->nr_records = nr_this_level = nr_records;
        for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
                uint64_t        level_blocks;
                uint64_t        dontcare64;
                unsigned int    level = cur->bc_nlevels - 1;
                unsigned int    avg_per_block;

                xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
                                &avg_per_block, &level_blocks, &dontcare64);

                if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
                        /*
                         * If all the items we want to store at this level
                         * would fit in the inode root block, then we have our
                         * btree root and are done.
                         *
                         * Note that bmap btrees forbid records in the root.
                         */
                        if (level != 0 && nr_this_level <= avg_per_block) {
                                nr_blocks++;
                                break;
                        }

                        /*
                         * Otherwise, we have to store all the items for this
                         * level in traditional btree blocks and therefore need
                         * another level of btree to point to those blocks.
                         *
                         * We have to re-compute the geometry for each level of
                         * an inode-rooted btree because the geometry differs
                         * between a btree root in an inode fork and a
                         * traditional btree block.
                         *
                         * This distinction is made in the btree code based on
                         * whether level == bc_nlevels - 1. Based on the
                         * previous root block size check against the root
                         * block geometry, we know that we aren't yet ready to
                         * populate the root. Increment bc_nlevels and
                         * recalculate the geometry for a traditional
                         * block-based btree level.
                         */
                        cur->bc_nlevels++;
                        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
                        xfs_btree_bload_level_geometry(cur, bbl, level,
                                        nr_this_level, &avg_per_block,
                                        &level_blocks, &dontcare64);
                } else {
                        /*
                         * If all the items we want to store at this level
                         * would fit in a single root block, we're done.
                         */
                        if (nr_this_level <= avg_per_block) {
                                nr_blocks++;
                                break;
                        }

                        /* Otherwise, we need another level of btree. */
                        cur->bc_nlevels++;
                        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
                }

                nr_blocks += level_blocks;
                nr_this_level = level_blocks;
        }

        if (cur->bc_nlevels > cur->bc_maxlevels)
                return -EOVERFLOW;

        bbl->btree_height = cur->bc_nlevels;
        if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
                bbl->nr_blocks = nr_blocks - 1;
        else
                bbl->nr_blocks = nr_blocks;
        return 0;
}


/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
        struct xfs_btree_cur            *cur,
        struct xfs_btree_bload          *bbl,
        void                            *priv)
{
        struct list_head                buffers_list;
        union xfs_btree_ptr             child_ptr;
        union xfs_btree_ptr             ptr;
        struct xfs_buf                  *bp = NULL;
        struct xfs_btree_block          *block = NULL;
        uint64_t                        nr_this_level = bbl->nr_records;
        uint64_t                        blocks;
        uint64_t                        i;
        uint64_t                        blocks_with_extra;
        uint64_t                        total_blocks = 0;
        unsigned int                    avg_per_block;
        unsigned int                    level = 0;
        int                             ret;

        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

        INIT_LIST_HEAD(&buffers_list);
        cur->bc_nlevels = bbl->btree_height;
        xfs_btree_set_ptr_null(cur, &child_ptr);
        xfs_btree_set_ptr_null(cur, &ptr);
        bbl->nr_dirty = 0;

        xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
                        &avg_per_block, &blocks, &blocks_with_extra);

        /* Load each leaf block. */
        for (i = 0; i < blocks; i++) {
                unsigned int            nr_this_block = avg_per_block;

                /*
                 * Due to rounding, btree blocks will not be evenly populated
                 * in most cases. blocks_with_extra tells us how many blocks
                 * will receive an extra record to distribute the excess across
                 * the current level as evenly as possible.
                 */
                if (i < blocks_with_extra)
                        nr_this_block++;

                ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
                                nr_this_block, &ptr, &bp, &block, priv);
                if (ret)
                        goto out;

                trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
                                nr_this_block);

                ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
                                block, priv);
                if (ret)
                        goto out;

                /*
                 * Record the leftmost leaf pointer so we know where to start
                 * with the first node level.
                 */
                if (i == 0)
                        xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
        }
        total_blocks += blocks;

        ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
        if (ret)
                goto out;

        /* Populate the internal btree nodes. */
        for (level = 1; level < cur->bc_nlevels; level++) {
                union xfs_btree_ptr     first_ptr;

                nr_this_level = blocks;
                block = NULL;
                xfs_btree_set_ptr_null(cur, &ptr);

                xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
                                &avg_per_block, &blocks, &blocks_with_extra);

                /* Load each node block. */
                for (i = 0; i < blocks; i++) {
                        unsigned int    nr_this_block = avg_per_block;

                        if (i < blocks_with_extra)
                                nr_this_block++;

                        ret = xfs_btree_bload_prep_block(cur, bbl,
                                        &buffers_list, level, nr_this_block,
                                        &ptr, &bp, &block, priv);
                        if (ret)
                                goto out;

                        trace_xfs_btree_bload_block(cur, level, i, blocks,
                                        &ptr, nr_this_block);

                        ret = xfs_btree_bload_node(cur, nr_this_block,
                                        &child_ptr, block);
                        if (ret)
                                goto out;

                        /*
                         * Record the leftmost node pointer so that we know
                         * where to start the next node level above this one.
                         */
                        if (i == 0)
                                xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
                }
                total_blocks += blocks;

                ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
                if (ret)
                        goto out;

                xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
        }

        /* Initialize the new root. */
        if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
                ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
                cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
                cur->bc_ino.ifake->if_blocks = total_blocks - 1;
        } else {
                cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
                cur->bc_ag.afake->af_levels = cur->bc_nlevels;
                cur->bc_ag.afake->af_blocks = total_blocks;
        }

        /*
         * Write the new blocks to disk. If the ordered list isn't empty after
         * that, then something went wrong and we have to fail. This should
         * never happen, but we'll check anyway.
         */
        ret = xfs_buf_delwri_submit(&buffers_list);
        if (ret)
                goto out;
        if (!list_empty(&buffers_list)) {
                ASSERT(list_empty(&buffers_list));
                ret = -EIO;
        }

out:
        xfs_buf_delwri_cancel(&buffers_list);
        if (bp)
                xfs_buf_relse(bp);
        return ret;
}