// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

/*
 * The leaf data grows from end-to-front in the node. This returns the address
 * of the start of the last item, which is the end of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}
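
/*
 * Illustrative sketch (not upstream documentation): items grow from the front
 * of the leaf while their data grows from the back, so the free space is the
 * gap in the middle:
 *
 *	[header][item 0][item 1]...[item N-1] ...free... [data N-1]...[data 1][data 0]
 *
 * Assuming a well-formed leaf (item 0's data ends at BTRFS_LEAF_DATA_SIZE),
 * the unused space is the distance between the end of the item array and the
 * start of the last item's data:
 *
 *	u32 nritems = btrfs_header_nritems(leaf);
 *	u32 free = leaf_data_end(leaf) - nritems * sizeof(struct btrfs_item);
 *
 * which is the same quantity btrfs_leaf_free_space() reports.
 */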

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving the items in
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item numbers.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
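
/*
 * Typical path lifecycle (illustrative sketch, not part of the upstream file):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { ... };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	// read-only search: NULL trans handle, no insertion/deletion hints
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 *	btrfs_free_path(path);	// drops locks and extent buffer refs, then frees
 */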

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * Safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node() for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (refcount_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
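
/*
 * Example usage (illustrative sketch): the buffer returned above carries a
 * reference the caller must drop, and it may no longer be the root by the
 * time it is used:
 *
 *	struct extent_buffer *eb = btrfs_root_node(root);
 *
 *	level = btrfs_header_level(eb);
 *	...
 *	free_extent_buffer(eb);
 */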

/*
 * Cow-only roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list. The transaction walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_inc_ref(trans, root, cow, 1);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	} else {
		ret = btrfs_inc_ref(trans, root, cow, 0);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
			       const struct btrfs_root *root,
			       const struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root. This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == btrfs_root_id(root) ||
		     btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			ret = btrfs_set_disk_extent_flags(trans, buf,
						BTRFS_BLOCK_FLAG_FULL_BACKREF);
			if (ret)
				return ret;
		} else {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work of COWing a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != btrfs_get_root_last_trans(root));

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     btrfs_root_id(root), &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, btrfs_root_id(root));

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_unlock_cow;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		refcount_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto error_unlock_cow;
			}
		}
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
					    parent_start, last_ref);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto error_unlock_cow;
		}
	}

	trace_btrfs_cow_block(root, buf, cow);
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;

error_unlock_cow:
	btrfs_tree_unlock(cow);
	free_extent_buffer(cow);
	return ret;
}

static inline int should_cow_block(const struct btrfs_trans_handle *trans,
				   const struct btrfs_root *root,
				   const struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		"attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				     cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Same as btrfs_comp_keys(), only with two CPU keys (struct btrfs_key).
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
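
/*
 * Keys order lexicographically by (objectid, type, offset). For example
 * (illustrative, any concrete key types behave the same way):
 *
 *	struct btrfs_key a = { .objectid = 5, .type = BTRFS_INODE_ITEM_KEY,
 *			       .offset = 0 };
 *	struct btrfs_key b = { .objectid = 5, .type = BTRFS_DIR_ITEM_KEY,
 *			       .offset = 0 };
 *
 * btrfs_comp_cpu_keys(&a, &b) returns -1 because the objectids match and
 * BTRFS_INODE_ITEM_KEY (1) sorts before BTRFS_DIR_ITEM_KEY (84).
 */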

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		"%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oil;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oil = get_eb_offset_in_folio(eb, offset);

		if (oil + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			oil = get_eb_offset_in_folio(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oil);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
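
/*
 * Interpreting the return value (illustrative sketch; use_item() and
 * insert_at() are hypothetical helpers, not btrfs functions):
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, 0, &key, &slot);
 *
 *	if (ret < 0)
 *		return ret;		// corrupted ordering, -EINVAL
 *	if (ret == 0)
 *		use_item(eb, slot);	// exact match at 'slot'
 *	else
 *		insert_at(eb, slot);	// ret == 1, 'slot' is the insertion
 *					// position, may equal nritems
 */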

static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and a slot number, this reads the block the slot points to.
 * The extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}
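
/*
 * Example usage (illustrative sketch): read a child block and drop the
 * reference once done with it.
 *
 *	struct extent_buffer *child = btrfs_read_node_slot(parent, slot);
 *
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 *	...
 *	free_extent_buffer(child);
 */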

/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
						    right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			refcount_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/*
 * Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else
				wret = push_node_left(trans, left, mid, 0);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else
				wret = balance_node_right(trans, right, mid);
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     const struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * When we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * Callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. So
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	struct extent_buffer *tmp = NULL;
	int ret = 0;
	int ret2;
	int parent_level;
	bool read_tmp = false;
	bool tmp_locked = false;
	bool path_released = false;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
	check.owner_root = btrfs_root_id(root);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, parent_level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp, &check)) {
				ret = -EUCLEAN;
				goto out;
			}
			*eb_ret = tmp;
			tmp = NULL;
			ret = 0;
			goto out;
		}

		if (p->nowait) {
			ret = -EAGAIN;
			goto out;
		}

		if (!p->skip_locking) {
			btrfs_unlock_up_safe(p, parent_level + 1);
			btrfs_maybe_reset_lockdep_class(root, tmp);
			tmp_locked = true;
			btrfs_tree_read_lock(tmp);
			btrfs_release_path(p);
			ret = -EAGAIN;
			path_released = true;
		}

		/* Now we're allowed to do a blocking uptodate check. */
		ret2 = btrfs_read_extent_buffer(tmp, &check);
		if (ret2) {
			ret = ret2;
			goto out;
		}

		if (ret == 0) {
			ASSERT(!tmp_locked);
			*eb_ret = tmp;
			tmp = NULL;
		}
		goto out;
	} else if (p->nowait) {
		ret = -EAGAIN;
		goto out;
	}

	if (!p->skip_locking) {
		btrfs_unlock_up_safe(p, parent_level + 1);
		ret = -EAGAIN;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, parent_level, slot, key->objectid);

	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
	if (IS_ERR(tmp)) {
		ret = PTR_ERR(tmp);
		tmp = NULL;
		goto out;
	}
	read_tmp = true;

	if (!p->skip_locking) {
		ASSERT(ret == -EAGAIN);
		btrfs_maybe_reset_lockdep_class(root, tmp);
		tmp_locked = true;
		btrfs_tree_read_lock(tmp);
		btrfs_release_path(p);
		path_released = true;
	}

	/* Now we're allowed to do a blocking uptodate check. */
	ret2 = btrfs_read_extent_buffer(tmp, &check);
	if (ret2) {
		ret = ret2;
		goto out;
	}

	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp)) {
		ret = -EIO;
		goto out;
	}

	if (ret == 0) {
		ASSERT(!tmp_locked);
		*eb_ret = tmp;
		tmp = NULL;
	}
out:
	if (tmp) {
		if (tmp_locked)
			btrfs_tree_read_unlock(tmp);
		if (read_tmp && ret && ret != -EAGAIN)
			free_extent_buffer_stale(tmp);
		else
			free_extent_buffer(tmp);
	}
	if (ret && !path_released)
		btrfs_release_path(p);

	return ret;
}

/*
 * Helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
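
/*
 * Example usage (illustrative sketch): find the first item with a given
 * objectid and type, regardless of its offset.
 *
 *	struct btrfs_key found;
 *	int ret;
 *
 *	ret = btrfs_find_item(root, path, ino, 0, BTRFS_INODE_REF_KEY, &found);
 *	if (ret < 0)		// search error
 *		return ret;
 *	if (ret > 0)		// no item with that objectid and type
 *		return -ENOENT;
 *	// ret == 0: 'found' holds the full key and path points at the item
 */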

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		refcount_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}
1807
search_for_key_slot(const struct extent_buffer * eb,int search_low_slot,const struct btrfs_key * key,int prev_cmp,int * slot)1808 static inline int search_for_key_slot(const struct extent_buffer *eb,
1809 int search_low_slot,
1810 const struct btrfs_key *key,
1811 int prev_cmp,
1812 int *slot)
1813 {
1814 /*
1815 * If a previous call to btrfs_bin_search() on a parent node returned an
1816 * exact match (prev_cmp == 0), we can safely assume the target key will
1817 * always be at slot 0 on lower levels, since each key pointer
1818 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1819 * subtree it points to. Thus we can skip searching lower levels.
1820 */
1821 if (prev_cmp == 0) {
1822 *slot = 0;
1823 return 0;
1824 }
1825
1826 return btrfs_bin_search(eb, search_low_slot, key, slot);
1827 }
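
/*
 * Illustrative sketch (hypothetical helper, not in the original file)
 * showing how the btrfs_bin_search() return convention is consumed:
 * 0 means @key was found at *slot, 1 means it was not found and *slot
 * is the position where it would be inserted, < 0 is an error.
 */
static int demo_find_slot_in_leaf(const struct extent_buffer *leaf,
				  const struct btrfs_key *key)
{
	int slot;
	int ret;

	ret = btrfs_bin_search(leaf, 0, key, &slot);
	if (ret < 0)
		return ret;		/* e.g. a corrupted block */
	if (ret == 0)
		return slot;		/* exact match at @slot */
	return -ENOENT;			/* key absent, would go at @slot */
}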
1828
1829 static int search_leaf(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 const struct btrfs_key *key,
1832 struct btrfs_path *path,
1833 int ins_len,
1834 int prev_cmp)
1835 {
1836 struct extent_buffer *leaf = path->nodes[0];
1837 int leaf_free_space = -1;
1838 int search_low_slot = 0;
1839 int ret;
1840 bool do_bin_search = true;
1841
1842 /*
1843 * If we are doing an insertion, the leaf has enough free space and the
1844 * destination slot for the key is not slot 0, then we can unlock our
1845 * write lock on the parent, and any other upper nodes, before doing the
1846 * binary search on the leaf (with search_for_key_slot()), allowing other
1847 * tasks to lock the parent and any other upper nodes.
1848 */
1849 if (ins_len > 0) {
1850 /*
1851 * Cache the leaf free space, since we will need it later and it
1852 * will not change until then.
1853 */
1854 leaf_free_space = btrfs_leaf_free_space(leaf);
1855
1856 /*
1857 * !path->locks[1] means we have a single node tree, the leaf is
1858 * the root of the tree.
1859 */
1860 if (path->locks[1] && leaf_free_space >= ins_len) {
1861 struct btrfs_disk_key first_key;
1862
1863 ASSERT(btrfs_header_nritems(leaf) > 0);
1864 btrfs_item_key(leaf, &first_key, 0);
1865
1866 /*
1867 * Doing the extra comparison with the first key is cheap,
1868 * taking into account that the first key is very likely
1869 * already in a cache line because it immediately follows
1870 * the extent buffer's header and we have recently accessed
1871 * the header's level field.
1872 */
1873 ret = btrfs_comp_keys(&first_key, key);
1874 if (ret < 0) {
1875 /*
1876 * The first key is smaller than the key we want
1877 * to insert, so we are safe to unlock all upper
1878 * nodes and we have to do the binary search.
1879 *
1880 * We do use btrfs_unlock_up_safe() and not
1881 * unlock_up() because the latter does not unlock
1882 * nodes with a slot of 0 - we can safely unlock
1883 * any node even if its slot is 0 since in this
1884 * case the key does not end up at slot 0 of the
1885 * leaf and there's no need to split the leaf.
1886 */
1887 btrfs_unlock_up_safe(path, 1);
1888 search_low_slot = 1;
1889 } else {
1890 /*
1891 * The first key is >= the key we want to
1892 * insert, so we can skip the binary search as
1893 * the target key will be at slot 0.
1894 *
1895 * We cannot unlock upper nodes when the key is
1896 * less than the first key, because we will need
1897 * to update the key at slot 0 of the parent node
1898 * and possibly of other upper nodes too.
1899 * If the key matches the first key, then we can
1900 * unlock all the upper nodes, using
1901 * btrfs_unlock_up_safe() instead of unlock_up()
1902 * as stated above.
1903 */
1904 if (ret == 0)
1905 btrfs_unlock_up_safe(path, 1);
1906 /*
1907 * ret is already 0 or 1, matching the result of
1908 * a btrfs_bin_search() call, so there is no need
1909 * to adjust it.
1910 */
1911 do_bin_search = false;
1912 path->slots[0] = 0;
1913 }
1914 }
1915 }
1916
1917 if (do_bin_search) {
1918 ret = search_for_key_slot(leaf, search_low_slot, key,
1919 prev_cmp, &path->slots[0]);
1920 if (ret < 0)
1921 return ret;
1922 }
1923
1924 if (ins_len > 0) {
1925 /*
1926 * Item key already exists. In this case, if we are allowed to
1927 * insert the item (for example, in dir_item case, item key
1928 * collision is allowed), it will be merged with the original
1929 * item. Only the item size grows, no new btrfs item will be
1930 * added. If search_for_extension is not set, ins_len already
1931 * accounts for the size of struct btrfs_item, so deduct it here
1932 * to make the leaf space check correct.
1933 */
1934 if (ret == 0 && !path->search_for_extension) {
1935 ASSERT(ins_len >= sizeof(struct btrfs_item));
1936 ins_len -= sizeof(struct btrfs_item);
1937 }
1938
1939 ASSERT(leaf_free_space >= 0);
1940
1941 if (leaf_free_space < ins_len) {
1942 int ret2;
1943
1944 ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
1945 ASSERT(ret2 <= 0);
1946 if (WARN_ON(ret2 > 0))
1947 ret2 = -EUCLEAN;
1948 if (ret2)
1949 ret = ret2;
1950 }
1951 }
1952
1953 return ret;
1954 }
1955
1956 /*
1957 * Look for a key in a tree and perform necessary modifications to preserve
1958 * tree invariants.
1959 *
1960 * @trans: Handle of transaction, used when modifying the tree
1961 * @p: Holds all btree nodes along the search path
1962 * @root: The root node of the tree
1963 * @key: The key we are looking for
1964 * @ins_len: Indicates purpose of search:
1965 * >0 for inserts, it's the size of the item inserted (*)
1966 * <0 for deletions
1967 * 0 for plain searches, not modifying the tree
1968 *
1969 * (*) If size of item inserted doesn't include
1970 * sizeof(struct btrfs_item), then p->search_for_extension must
1971 * be set.
1972 * @cow: whether CoW operations should be performed. Must always
1973 * be 1 when modifying the tree.
1974 *
1975 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1976 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1977 *
1978 * If @key is found, 0 is returned and you can find the item in the leaf level
1979 * of the path (level 0)
1980 *
1981 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1982 * points to the slot where it should be inserted
1983 *
1984 * If an error is encountered while searching the tree a negative error number
1985 * is returned
1986 */
1987 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1988 const struct btrfs_key *key, struct btrfs_path *p,
1989 int ins_len, int cow)
1990 {
1991 struct btrfs_fs_info *fs_info;
1992 struct extent_buffer *b;
1993 int slot;
1994 int ret;
1995 int level;
1996 int lowest_unlock = 1;
1997 /* everything at write_lock_level or lower must be write locked */
1998 int write_lock_level = 0;
1999 u8 lowest_level = 0;
2000 int min_write_lock_level;
2001 int prev_cmp;
2002
2003 if (!root)
2004 return -EINVAL;
2005
2006 fs_info = root->fs_info;
2007 might_sleep();
2008
2009 lowest_level = p->lowest_level;
2010 WARN_ON(lowest_level && ins_len > 0);
2011 WARN_ON(p->nodes[0] != NULL);
2012 BUG_ON(!cow && ins_len);
2013
2014 /*
2015 * For now only allow nowait for read only operations. There's no
2016 * strict reason why we couldn't support it for writes, but we only
2017 * need it for reads, so that's all that is implemented.
2018 */
2019 ASSERT(!p->nowait || !cow);
2020
2021 if (ins_len < 0) {
2022 lowest_unlock = 2;
2023
2024 /* when we are removing items, we might have to go up to level
2025 * two as we update tree pointers. Make sure we keep a write
2026 * lock on those levels as well.
2027 */
2028 write_lock_level = 2;
2029 } else if (ins_len > 0) {
2030 /*
2031 * for inserting items, make sure we have a write lock on
2032 * level 1 so we can update keys
2033 */
2034 write_lock_level = 1;
2035 }
2036
2037 if (!cow)
2038 write_lock_level = -1;
2039
2040 if (cow && (p->keep_locks || p->lowest_level))
2041 write_lock_level = BTRFS_MAX_LEVEL;
2042
2043 min_write_lock_level = write_lock_level;
2044
2045 if (p->need_commit_sem) {
2046 ASSERT(p->search_commit_root);
2047 if (p->nowait) {
2048 if (!down_read_trylock(&fs_info->commit_root_sem))
2049 return -EAGAIN;
2050 } else {
2051 down_read(&fs_info->commit_root_sem);
2052 }
2053 }
2054
2055 again:
2056 prev_cmp = -1;
2057 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2058 if (IS_ERR(b)) {
2059 ret = PTR_ERR(b);
2060 goto done;
2061 }
2062
2063 while (b) {
2064 int dec = 0;
2065 int ret2;
2066
2067 level = btrfs_header_level(b);
2068
2069 if (cow) {
2070 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2071
2072 /*
2073 * if we don't really need to cow this block
2074 * then we don't want to set the path blocking,
2075 * so we test it here
2076 */
2077 if (!should_cow_block(trans, root, b))
2078 goto cow_done;
2079
2080 /*
2081 * must have write locks on this node and the
2082 * parent
2083 */
2084 if (level > write_lock_level ||
2085 (level + 1 > write_lock_level &&
2086 level + 1 < BTRFS_MAX_LEVEL &&
2087 p->nodes[level + 1])) {
2088 write_lock_level = level + 1;
2089 btrfs_release_path(p);
2090 goto again;
2091 }
2092
2093 if (last_level)
2094 ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
2095 &b, BTRFS_NESTING_COW);
2096 else
2097 ret2 = btrfs_cow_block(trans, root, b,
2098 p->nodes[level + 1],
2099 p->slots[level + 1], &b,
2100 BTRFS_NESTING_COW);
2101 if (ret2) {
2102 ret = ret2;
2103 goto done;
2104 }
2105 }
2106 cow_done:
2107 p->nodes[level] = b;
2108
2109 /*
2110 * we have a lock on b and as long as we aren't changing
2111 * the tree, there is no way for the items in b to change.
2112 * It is safe to drop the lock on our parent before we
2113 * go through the expensive btree search on b.
2114 *
2115 * If we're inserting or deleting (ins_len != 0), then we might
2116 * be changing slot zero, which may require changing the parent.
2117 * So, we can't drop the lock until after we know which slot
2118 * we're operating on.
2119 */
2120 if (!ins_len && !p->keep_locks) {
2121 int u = level + 1;
2122
2123 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2124 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2125 p->locks[u] = 0;
2126 }
2127 }
2128
2129 if (level == 0) {
2130 if (ins_len > 0)
2131 ASSERT(write_lock_level >= 1);
2132
2133 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2134 if (!p->search_for_split)
2135 unlock_up(p, level, lowest_unlock,
2136 min_write_lock_level, NULL);
2137 goto done;
2138 }
2139
2140 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2141 if (ret < 0)
2142 goto done;
2143 prev_cmp = ret;
2144
2145 if (ret && slot > 0) {
2146 dec = 1;
2147 slot--;
2148 }
2149 p->slots[level] = slot;
2150 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2151 &write_lock_level);
2152 if (ret2 == -EAGAIN)
2153 goto again;
2154 if (ret2) {
2155 ret = ret2;
2156 goto done;
2157 }
2158 b = p->nodes[level];
2159 slot = p->slots[level];
2160
2161 /*
2162 * Slot 0 is special, if we change the key we have to update
2163 * the parent pointer which means we must have a write lock on
2164 * the parent
2165 */
2166 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2167 write_lock_level = level + 1;
2168 btrfs_release_path(p);
2169 goto again;
2170 }
2171
2172 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2173 &write_lock_level);
2174
2175 if (level == lowest_level) {
2176 if (dec)
2177 p->slots[level]++;
2178 goto done;
2179 }
2180
2181 ret2 = read_block_for_search(root, p, &b, slot, key);
2182 if (ret2 == -EAGAIN && !p->nowait)
2183 goto again;
2184 if (ret2) {
2185 ret = ret2;
2186 goto done;
2187 }
2188
2189 if (!p->skip_locking) {
2190 level = btrfs_header_level(b);
2191
2192 btrfs_maybe_reset_lockdep_class(root, b);
2193
2194 if (level <= write_lock_level) {
2195 btrfs_tree_lock(b);
2196 p->locks[level] = BTRFS_WRITE_LOCK;
2197 } else {
2198 if (p->nowait) {
2199 if (!btrfs_try_tree_read_lock(b)) {
2200 free_extent_buffer(b);
2201 ret = -EAGAIN;
2202 goto done;
2203 }
2204 } else {
2205 btrfs_tree_read_lock(b);
2206 }
2207 p->locks[level] = BTRFS_READ_LOCK;
2208 }
2209 p->nodes[level] = b;
2210 }
2211 }
2212 ret = 1;
2213 done:
2214 if (ret < 0 && !p->skip_release_on_error)
2215 btrfs_release_path(p);
2216
2217 if (p->need_commit_sem) {
2218 int ret2;
2219
2220 ret2 = finish_need_commit_sem_search(p);
2221 up_read(&fs_info->commit_root_sem);
2222 if (ret2)
2223 ret = ret2;
2224 }
2225
2226 return ret;
2227 }
2228 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
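
/*
 * Illustrative usage sketch (hypothetical function, not in the original
 * file): a plain read-only lookup. With ins_len == 0 and cow == 0 the
 * tree is not modified; ret == 0 means the key sits at
 * path->nodes[0]/path->slots[0], ret == 1 means it is absent and the
 * slot is the insert position.
 */
static int demo_lookup_key(struct btrfs_root *root,
			   const struct btrfs_key *key,
			   struct btrfs_key *found)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	else if (ret > 0)
		ret = -ENOENT;
	btrfs_free_path(path);
	return ret;
}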
2229
2230 /*
2231 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2232 * current state of the tree together with the operations recorded in the tree
2233 * modification log to search for the key in a previous version of this tree, as
2234 * denoted by the time_seq parameter.
2235 *
2236 * Naturally, there is no support for insert, delete or cow operations.
2237 *
2238 * The resulting path and return value will be set up as if we called
2239 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2240 */
2241 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2242 struct btrfs_path *p, u64 time_seq)
2243 {
2244 struct btrfs_fs_info *fs_info = root->fs_info;
2245 struct extent_buffer *b;
2246 int slot;
2247 int ret;
2248 int level;
2249 int lowest_unlock = 1;
2250 u8 lowest_level = 0;
2251
2252 lowest_level = p->lowest_level;
2253 WARN_ON(p->nodes[0] != NULL);
2254 ASSERT(!p->nowait);
2255
2256 if (p->search_commit_root) {
2257 BUG_ON(time_seq);
2258 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2259 }
2260
2261 again:
2262 b = btrfs_get_old_root(root, time_seq);
2263 if (!b) {
2264 ret = -EIO;
2265 goto done;
2266 }
2267 level = btrfs_header_level(b);
2268 p->locks[level] = BTRFS_READ_LOCK;
2269
2270 while (b) {
2271 int dec = 0;
2272 int ret2;
2273
2274 level = btrfs_header_level(b);
2275 p->nodes[level] = b;
2276
2277 /*
2278 * we have a lock on b and as long as we aren't changing
2279 * the tree, there is no way for the items in b to change.
2280 * It is safe to drop the lock on our parent before we
2281 * go through the expensive btree search on b.
2282 */
2283 btrfs_unlock_up_safe(p, level + 1);
2284
2285 ret = btrfs_bin_search(b, 0, key, &slot);
2286 if (ret < 0)
2287 goto done;
2288
2289 if (level == 0) {
2290 p->slots[level] = slot;
2291 unlock_up(p, level, lowest_unlock, 0, NULL);
2292 goto done;
2293 }
2294
2295 if (ret && slot > 0) {
2296 dec = 1;
2297 slot--;
2298 }
2299 p->slots[level] = slot;
2300 unlock_up(p, level, lowest_unlock, 0, NULL);
2301
2302 if (level == lowest_level) {
2303 if (dec)
2304 p->slots[level]++;
2305 goto done;
2306 }
2307
2308 ret2 = read_block_for_search(root, p, &b, slot, key);
2309 if (ret2 == -EAGAIN && !p->nowait)
2310 goto again;
2311 if (ret2) {
2312 ret = ret2;
2313 goto done;
2314 }
2315
2316 level = btrfs_header_level(b);
2317 btrfs_tree_read_lock(b);
2318 b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
2319 if (!b) {
2320 ret = -ENOMEM;
2321 goto done;
2322 }
2323 p->locks[level] = BTRFS_READ_LOCK;
2324 p->nodes[level] = b;
2325 }
2326 ret = 1;
2327 done:
2328 if (ret < 0)
2329 btrfs_release_path(p);
2330
2331 return ret;
2332 }
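
/*
 * Illustrative sketch (hypothetical, not in the original file): pinning
 * a tree mod log sequence number and replaying a search against that
 * version of the tree. Assumes the btrfs_get_tree_mod_seq() /
 * btrfs_put_tree_mod_seq() interface from tree-mod-log.h.
 */
static int demo_search_old_version(struct btrfs_root *root,
				   const struct btrfs_key *key,
				   struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	u64 time_seq;
	int ret;

	/* Pin the current sequence so rewind data is not freed. */
	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	btrfs_put_tree_mod_seq(fs_info, &elem);
	return ret;
}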
2333
2334 /*
2335 * Search the tree again to find a leaf with smaller keys.
2336 * Returns 0 if it found something.
2337 * Returns 1 if there are no smaller keys.
2338 * Returns < 0 on error.
2339 *
2340 * This may release the path, and so you may lose any locks held at the
2341 * time you call it.
2342 */
2343 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2344 {
2345 struct btrfs_key key;
2346 struct btrfs_key orig_key;
2347 struct btrfs_disk_key found_key;
2348 int ret;
2349
2350 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2351 orig_key = key;
2352
2353 if (key.offset > 0) {
2354 key.offset--;
2355 } else if (key.type > 0) {
2356 key.type--;
2357 key.offset = (u64)-1;
2358 } else if (key.objectid > 0) {
2359 key.objectid--;
2360 key.type = (u8)-1;
2361 key.offset = (u64)-1;
2362 } else {
2363 return 1;
2364 }
2365
2366 btrfs_release_path(path);
2367 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2368 if (ret <= 0)
2369 return ret;
2370
2371 /*
2372 * Previous key not found. Even if we were at slot 0 of the leaf we had
2373 * before releasing the path and calling btrfs_search_slot(), we now may
2374 * be in a slot pointing to the same original key - this can happen if
2375 * after we released the path, one or more items were moved from a
2376 * sibling leaf into the front of the leaf we had due to an insertion
2377 * (see push_leaf_right()).
2378 * If we hit this case and our slot is > 0, just decrement the slot
2379 * so that the caller does not process the same key again, which may or
2380 * may not break the caller, depending on its logic.
2381 */
2382 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2383 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2384 ret = btrfs_comp_keys(&found_key, &orig_key);
2385 if (ret == 0) {
2386 if (path->slots[0] > 0) {
2387 path->slots[0]--;
2388 return 0;
2389 }
2390 /*
2391 * At slot 0, same key as before, it means orig_key is
2392 * the lowest, leftmost, key in the tree. We're done.
2393 */
2394 return 1;
2395 }
2396 }
2397
2398 btrfs_item_key(path->nodes[0], &found_key, 0);
2399 ret = btrfs_comp_keys(&found_key, &key);
2400 /*
2401 * We might have had an item with the previous key in the tree right
2402 * before we released our path. And after we released our path, that
2403 * item might have been pushed to the first slot (0) of the leaf we
2404 * were holding due to a tree balance. Alternatively, an item with the
2405 * previous key can exist as the only element of a leaf (big fat item).
2406 * Therefore account for these 2 cases, so that our callers (like
2407 * btrfs_previous_item) don't miss an existing item with a key matching
2408 * the previous key we computed above.
2409 */
2410 if (ret <= 0)
2411 return 0;
2412 return 1;
2413 }
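
/*
 * Illustrative sketch (hypothetical helper, not in the original file) of
 * the key decrement performed above. Keys order by (objectid, type,
 * offset), so the predecessor of (X, T, 0) is (X, T - 1, (u64)-1) and
 * the predecessor of (X, 0, 0) is (X - 1, (u8)-1, (u64)-1).
 */
static bool demo_previous_key(struct btrfs_key *key)
{
	if (key->offset > 0) {
		key->offset--;
	} else if (key->type > 0) {
		key->type--;
		key->offset = (u64)-1;
	} else if (key->objectid > 0) {
		key->objectid--;
		key->type = (u8)-1;
		key->offset = (u64)-1;
	} else {
		return false;	/* (0, 0, 0) is the smallest possible key */
	}
	return true;
}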
2414
2415 /*
2416 * Helper to use instead of btrfs_search_slot() when no exact match is
2417 * needed, but instead the next or previous item should be returned.
2418 * When find_higher is true, the next higher item is returned, the next lower
2419 * otherwise.
2420 * When return_any and find_higher are both true, and no higher item is found,
2421 * return the next lower instead.
2422 * When return_any is true and find_higher is false, and no lower item is found,
2423 * return the next higher instead.
2424 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2425 * < 0 on error
2426 */
2427 int btrfs_search_slot_for_read(struct btrfs_root *root,
2428 const struct btrfs_key *key,
2429 struct btrfs_path *p, int find_higher,
2430 int return_any)
2431 {
2432 int ret;
2433 struct extent_buffer *leaf;
2434
2435 again:
2436 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2437 if (ret <= 0)
2438 return ret;
2439 /*
2440 * a return value of 1 means the path is at the position where the
2441 * item should be inserted. Normally this is the next bigger item,
2442 * but in case the previous item is the last in a leaf, path points
2443 * to the first free slot in the previous leaf, i.e. at an invalid
2444 * item.
2445 */
2446 leaf = p->nodes[0];
2447
2448 if (find_higher) {
2449 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2450 ret = btrfs_next_leaf(root, p);
2451 if (ret <= 0)
2452 return ret;
2453 if (!return_any)
2454 return 1;
2455 /*
2456 * no higher item found, return the next
2457 * lower instead
2458 */
2459 return_any = 0;
2460 find_higher = 0;
2461 btrfs_release_path(p);
2462 goto again;
2463 }
2464 } else {
2465 if (p->slots[0] == 0) {
2466 ret = btrfs_prev_leaf(root, p);
2467 if (ret < 0)
2468 return ret;
2469 if (!ret) {
2470 leaf = p->nodes[0];
2471 if (p->slots[0] == btrfs_header_nritems(leaf))
2472 p->slots[0]--;
2473 return 0;
2474 }
2475 if (!return_any)
2476 return 1;
2477 /*
2478 * no lower item found, return the next
2479 * higher instead
2480 */
2481 return_any = 0;
2482 find_higher = 1;
2483 btrfs_release_path(p);
2484 goto again;
2485 } else {
2486 --p->slots[0];
2487 }
2488 }
2489 return 0;
2490 }
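
/*
 * Illustrative usage sketch (hypothetical, not in the original file):
 * find the first item with a key >= @key, falling back to the closest
 * lower item when nothing higher exists (find_higher = 1,
 * return_any = 1).
 */
static int demo_find_nearest(struct btrfs_root *root,
			     const struct btrfs_key *key,
			     struct btrfs_key *found)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	btrfs_free_path(path);
	return ret;	/* 0 found, 1 tree empty, < 0 error */
}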
2491
2492 /*
2493 * Execute search and call btrfs_previous_item to traverse backwards if the item
2494 * was not found.
2495 *
2496 * Return 0 if found, 1 if not found and < 0 if error.
2497 */
2498 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2499 struct btrfs_path *path)
2500 {
2501 int ret;
2502
2503 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2504 if (ret > 0)
2505 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2506
2507 if (ret == 0)
2508 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2509
2510 return ret;
2511 }
2512
2513 /*
2514 * Search for a valid slot for the given path.
2515 *
2516 * @root: The root node of the tree.
2517 * @key: Will contain a valid item if found.
2518 * @path: The starting point to validate the slot.
2519 *
2520 * Return: 0 if the item is valid
2521 * 1 if not found
2522 * <0 if error.
2523 */
2524 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2525 struct btrfs_path *path)
2526 {
2527 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2528 int ret;
2529
2530 ret = btrfs_next_leaf(root, path);
2531 if (ret)
2532 return ret;
2533 }
2534
2535 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2536 return 0;
2537 }
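
/*
 * Illustrative sketch (hypothetical, not in the original file): the
 * forward-iteration pattern btrfs_get_next_valid_item() enables. Each
 * pass validates the current slot, hopping to the next leaf when the
 * slot runs past the leaf's items, then advances with path->slots[0]++.
 */
static int demo_walk_items(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	int ret;

	while (true) {
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret)
			return ret > 0 ? 0 : ret; /* 1 means no more items */
		/* ... process the item at path->slots[0] using @key ... */
		path->slots[0]++;
	}
}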
2538
2539 /*
2540 * adjust the pointers going up the tree, starting at level,
2541 * making sure the right key of each node points to 'key'.
2542 * This is used after shifting pointers to the left, so it stops
2543 * fixing up pointers when a given leaf/node is not in slot 0 of the
2544 * higher levels
2545 *
2546 */
2547 static void fixup_low_keys(struct btrfs_trans_handle *trans,
2548 const struct btrfs_path *path,
2549 const struct btrfs_disk_key *key, int level)
2550 {
2551 int i;
2552 struct extent_buffer *t;
2553 int ret;
2554
2555 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2556 int tslot = path->slots[i];
2557
2558 if (!path->nodes[i])
2559 break;
2560 t = path->nodes[i];
2561 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2562 BTRFS_MOD_LOG_KEY_REPLACE);
2563 BUG_ON(ret < 0);
2564 btrfs_set_node_key(t, key, tslot);
2565 btrfs_mark_buffer_dirty(trans, path->nodes[i]);
2566 if (tslot != 0)
2567 break;
2568 }
2569 }
2570
2571 /*
2572 * update item key.
2573 *
2574 * This function isn't completely safe. It's the caller's responsibility
2575 * that the new key won't break the order
2576 */
2577 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2578 const struct btrfs_path *path,
2579 const struct btrfs_key *new_key)
2580 {
2581 struct btrfs_fs_info *fs_info = trans->fs_info;
2582 struct btrfs_disk_key disk_key;
2583 struct extent_buffer *eb;
2584 int slot;
2585
2586 eb = path->nodes[0];
2587 slot = path->slots[0];
2588 if (slot > 0) {
2589 btrfs_item_key(eb, &disk_key, slot - 1);
2590 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
2591 btrfs_print_leaf(eb);
2592 btrfs_crit(fs_info,
2593 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2594 slot, btrfs_disk_key_objectid(&disk_key),
2595 btrfs_disk_key_type(&disk_key),
2596 btrfs_disk_key_offset(&disk_key),
2597 new_key->objectid, new_key->type,
2598 new_key->offset);
2599 BUG();
2600 }
2601 }
2602 if (slot < btrfs_header_nritems(eb) - 1) {
2603 btrfs_item_key(eb, &disk_key, slot + 1);
2604 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
2605 btrfs_print_leaf(eb);
2606 btrfs_crit(fs_info,
2607 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2608 slot, btrfs_disk_key_objectid(&disk_key),
2609 btrfs_disk_key_type(&disk_key),
2610 btrfs_disk_key_offset(&disk_key),
2611 new_key->objectid, new_key->type,
2612 new_key->offset);
2613 BUG();
2614 }
2615 }
2616
2617 btrfs_cpu_key_to_disk(&disk_key, new_key);
2618 btrfs_set_item_key(eb, &disk_key, slot);
2619 btrfs_mark_buffer_dirty(trans, eb);
2620 if (slot == 0)
2621 fixup_low_keys(trans, path, &disk_key, 1);
2622 }
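
/*
 * Illustrative sketch (hypothetical, not in the original file): the
 * typical btrfs_set_item_key_safe() pattern - rewriting an item's key in
 * place when the caller knows the new key still sorts between its
 * neighbors, e.g. moving a file extent item's offset forward while
 * trimming its front.
 */
static void demo_bump_key_offset(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;	/* must keep the leaf ordering valid */
	btrfs_set_item_key_safe(trans, path, &key);
}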
2623
2624 /*
2625 * Check key order of two sibling extent buffers.
2626 *
2627 * Return true if something is wrong.
2628 * Return false if everything is fine.
2629 *
2630 * Tree-checker only works inside one tree block, thus the following
2631 * corruption can not be detected by tree-checker:
2632 *
2633 * Leaf @left | Leaf @right
2634 * --------------------------------------------------------------
2635 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2636 *
2637 * Key f6 in leaf @left itself is valid, but not valid when the next
2638 * key in leaf @right is 7.
2639 * This can only be checked at tree block merge time.
2640 * And since tree checker has ensured all key order in each tree block
2641 * is correct, we only need to bother the last key of @left and the first
2642 * key of @right.
2643 */
2644 static bool check_sibling_keys(const struct extent_buffer *left,
2645 const struct extent_buffer *right)
2646 {
2647 struct btrfs_key left_last;
2648 struct btrfs_key right_first;
2649 int level = btrfs_header_level(left);
2650 int nr_left = btrfs_header_nritems(left);
2651 int nr_right = btrfs_header_nritems(right);
2652
2653 /* No key to check in one of the tree blocks */
2654 if (!nr_left || !nr_right)
2655 return false;
2656
2657 if (level) {
2658 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2659 btrfs_node_key_to_cpu(right, &right_first, 0);
2660 } else {
2661 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2662 btrfs_item_key_to_cpu(right, &right_first, 0);
2663 }
2664
2665 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2666 btrfs_crit(left->fs_info, "left extent buffer:");
2667 btrfs_print_tree(left, false);
2668 btrfs_crit(left->fs_info, "right extent buffer:");
2669 btrfs_print_tree(right, false);
2670 btrfs_crit(left->fs_info,
2671 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2672 left_last.objectid, left_last.type,
2673 left_last.offset, right_first.objectid,
2674 right_first.type, right_first.offset);
2675 return true;
2676 }
2677 return false;
2678 }
2679
2680 /*
2681 * try to push data from one node into the next node left in the
2682 * tree.
2683 *
2684 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2685 * error, and > 0 if there was no room in the left hand block.
2686 */
2687 static int push_node_left(struct btrfs_trans_handle *trans,
2688 struct extent_buffer *dst,
2689 struct extent_buffer *src, int empty)
2690 {
2691 struct btrfs_fs_info *fs_info = trans->fs_info;
2692 int push_items = 0;
2693 int src_nritems;
2694 int dst_nritems;
2695 int ret = 0;
2696
2697 src_nritems = btrfs_header_nritems(src);
2698 dst_nritems = btrfs_header_nritems(dst);
2699 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2700 WARN_ON(btrfs_header_generation(src) != trans->transid);
2701 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2702
2703 if (!empty && src_nritems <= 8)
2704 return 1;
2705
2706 if (push_items <= 0)
2707 return 1;
2708
2709 if (empty) {
2710 push_items = min(src_nritems, push_items);
2711 if (push_items < src_nritems) {
2712 /* leave at least 8 pointers in the node if
2713 * we aren't going to empty it
2714 */
2715 if (src_nritems - push_items < 8) {
2716 if (push_items <= 8)
2717 return 1;
2718 push_items -= 8;
2719 }
2720 }
2721 } else
2722 push_items = min(src_nritems - 8, push_items);
2723
2724 /* dst is the left eb, src is the middle eb */
2725 if (check_sibling_keys(dst, src)) {
2726 ret = -EUCLEAN;
2727 btrfs_abort_transaction(trans, ret);
2728 return ret;
2729 }
2730 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2731 if (ret) {
2732 btrfs_abort_transaction(trans, ret);
2733 return ret;
2734 }
2735 copy_extent_buffer(dst, src,
2736 btrfs_node_key_ptr_offset(dst, dst_nritems),
2737 btrfs_node_key_ptr_offset(src, 0),
2738 push_items * sizeof(struct btrfs_key_ptr));
2739
2740 if (push_items < src_nritems) {
2741 /*
2742 * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2743 * don't need to do an explicit tree mod log operation for it.
2744 */
2745 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2746 btrfs_node_key_ptr_offset(src, push_items),
2747 (src_nritems - push_items) *
2748 sizeof(struct btrfs_key_ptr));
2749 }
2750 btrfs_set_header_nritems(src, src_nritems - push_items);
2751 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2752 btrfs_mark_buffer_dirty(trans, src);
2753 btrfs_mark_buffer_dirty(trans, dst);
2754
2755 return ret;
2756 }
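
/*
 * Worked example for the push_items math above (illustrative; the exact
 * capacity depends on the nodesize). With BTRFS_NODEPTRS_PER_BLOCK() at
 * 493 pointers (16 KiB nodes), dst_nritems = 400 and src_nritems = 100:
 * push_items starts as 493 - 400 = 93. In the non-empty case it is
 * capped at src_nritems - 8 = 92 so the source keeps at least 8
 * pointers; in the empty case all 100 pointers may move if they fit.
 */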
2757
2758 /*
2759 * try to push data from one node into the next node right in the
2760 * tree.
2761 *
2762 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2763 * error, and > 0 if there was no room in the right hand block.
2764 *
2765 * this will only push up to 1/2 the contents of the left node over
2766 */
2767 static int balance_node_right(struct btrfs_trans_handle *trans,
2768 struct extent_buffer *dst,
2769 struct extent_buffer *src)
2770 {
2771 struct btrfs_fs_info *fs_info = trans->fs_info;
2772 int push_items = 0;
2773 int max_push;
2774 int src_nritems;
2775 int dst_nritems;
2776 int ret = 0;
2777
2778 WARN_ON(btrfs_header_generation(src) != trans->transid);
2779 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2780
2781 src_nritems = btrfs_header_nritems(src);
2782 dst_nritems = btrfs_header_nritems(dst);
2783 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2784 if (push_items <= 0)
2785 return 1;
2786
2787 if (src_nritems < 4)
2788 return 1;
2789
2790 max_push = src_nritems / 2 + 1;
2791 /* don't try to empty the node */
2792 if (max_push >= src_nritems)
2793 return 1;
2794
2795 if (max_push < push_items)
2796 push_items = max_push;
2797
2798 /* dst is the right eb, src is the middle eb */
2799 if (check_sibling_keys(src, dst)) {
2800 ret = -EUCLEAN;
2801 btrfs_abort_transaction(trans, ret);
2802 return ret;
2803 }
2804
2805 /*
2806 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2807 * need to do an explicit tree mod log operation for it.
2808 */
2809 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2810 btrfs_node_key_ptr_offset(dst, 0),
2811 (dst_nritems) *
2812 sizeof(struct btrfs_key_ptr));
2813
2814 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2815 push_items);
2816 if (ret) {
2817 btrfs_abort_transaction(trans, ret);
2818 return ret;
2819 }
2820 copy_extent_buffer(dst, src,
2821 btrfs_node_key_ptr_offset(dst, 0),
2822 btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2823 push_items * sizeof(struct btrfs_key_ptr));
2824
2825 btrfs_set_header_nritems(src, src_nritems - push_items);
2826 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2827
2828 btrfs_mark_buffer_dirty(trans, src);
2829 btrfs_mark_buffer_dirty(trans, dst);
2830
2831 return ret;
2832 }
2833
2834 /*
2835 * helper function to insert a new root level in the tree.
2836 * A new node is allocated, and a single item is inserted to
2837 * point to the existing root
2838 *
2839 * returns zero on success or < 0 on failure.
2840 */
2841 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2842 struct btrfs_root *root,
2843 struct btrfs_path *path, int level)
2844 {
2845 u64 lower_gen;
2846 struct extent_buffer *lower;
2847 struct extent_buffer *c;
2848 struct extent_buffer *old;
2849 struct btrfs_disk_key lower_key;
2850 int ret;
2851
2852 BUG_ON(path->nodes[level]);
2853 BUG_ON(path->nodes[level-1] != root->node);
2854
2855 lower = path->nodes[level-1];
2856 if (level == 1)
2857 btrfs_item_key(lower, &lower_key, 0);
2858 else
2859 btrfs_node_key(lower, &lower_key, 0);
2860
2861 c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
2862 &lower_key, level, root->node->start, 0,
2863 0, BTRFS_NESTING_NEW_ROOT);
2864 if (IS_ERR(c))
2865 return PTR_ERR(c);
2866
2867 root_add_used_bytes(root);
2868
2869 btrfs_set_header_nritems(c, 1);
2870 btrfs_set_node_key(c, &lower_key, 0);
2871 btrfs_set_node_blockptr(c, 0, lower->start);
2872 lower_gen = btrfs_header_generation(lower);
2873 WARN_ON(lower_gen != trans->transid);
2874
2875 btrfs_set_node_ptr_generation(c, 0, lower_gen);
2876
2877 btrfs_mark_buffer_dirty(trans, c);
2878
2879 old = root->node;
2880 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2881 if (ret < 0) {
2882 int ret2;
2883
2884 btrfs_clear_buffer_dirty(trans, c);
2885 ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2886 if (ret2 < 0)
2887 btrfs_abort_transaction(trans, ret2);
2888 btrfs_tree_unlock(c);
2889 free_extent_buffer(c);
2890 return ret;
2891 }
2892 rcu_assign_pointer(root->node, c);
2893
2894 /* the super has an extra ref to root->node */
2895 free_extent_buffer(old);
2896
2897 add_root_to_dirty_list(root);
2898 refcount_inc(&c->refs);
2899 path->nodes[level] = c;
2900 path->locks[level] = BTRFS_WRITE_LOCK;
2901 path->slots[level] = 0;
2902 return 0;
2903 }
2904
2905 /*
2906 * worker function to insert a single pointer in a node.
2907 * the node should have enough room for the pointer already
2908 *
2909 * slot and level indicate where you want the key to go, and
2910 * blocknr is the block the key points to.
2911 */
2912 static int insert_ptr(struct btrfs_trans_handle *trans,
2913 const struct btrfs_path *path,
2914 const struct btrfs_disk_key *key, u64 bytenr,
2915 int slot, int level)
2916 {
2917 struct extent_buffer *lower;
2918 int nritems;
2919 int ret;
2920
2921 BUG_ON(!path->nodes[level]);
2922 btrfs_assert_tree_write_locked(path->nodes[level]);
2923 lower = path->nodes[level];
2924 nritems = btrfs_header_nritems(lower);
2925 BUG_ON(slot > nritems);
2926 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2927 if (slot != nritems) {
2928 if (level) {
2929 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2930 slot, nritems - slot);
2931 if (ret < 0) {
2932 btrfs_abort_transaction(trans, ret);
2933 return ret;
2934 }
2935 }
2936 memmove_extent_buffer(lower,
2937 btrfs_node_key_ptr_offset(lower, slot + 1),
2938 btrfs_node_key_ptr_offset(lower, slot),
2939 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2940 }
2941 if (level) {
2942 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2943 BTRFS_MOD_LOG_KEY_ADD);
2944 if (ret < 0) {
2945 btrfs_abort_transaction(trans, ret);
2946 return ret;
2947 }
2948 }
2949 btrfs_set_node_key(lower, key, slot);
2950 btrfs_set_node_blockptr(lower, slot, bytenr);
2951 WARN_ON(trans->transid == 0);
2952 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2953 btrfs_set_header_nritems(lower, nritems + 1);
2954 btrfs_mark_buffer_dirty(trans, lower);
2955
2956 return 0;
2957 }
2958
2959 /*
2960 * split the node at the specified level in path in two.
2961 * The path is corrected to point to the appropriate node after the split
2962 *
2963 * Before splitting this tries to make some room in the node by pushing
2964 * left and right, if either one works, it returns right away.
2965 *
2966 * returns 0 on success and < 0 on failure
2967 */
2968 static noinline int split_node(struct btrfs_trans_handle *trans,
2969 struct btrfs_root *root,
2970 struct btrfs_path *path, int level)
2971 {
2972 struct btrfs_fs_info *fs_info = root->fs_info;
2973 struct extent_buffer *c;
2974 struct extent_buffer *split;
2975 struct btrfs_disk_key disk_key;
2976 int mid;
2977 int ret;
2978 u32 c_nritems;
2979
2980 c = path->nodes[level];
2981 WARN_ON(btrfs_header_generation(c) != trans->transid);
2982 if (c == root->node) {
2983 /*
2984 * trying to split the root, let's make a new one
2985 *
2986 * tree mod log: We don't log removal of the old root in
2987 * insert_new_root, because that root buffer will be kept as a
2988 * normal node. We are going to log removal of half of the
2989 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2990 * holding a tree lock on the buffer, which is why we cannot
2991 * race with other tree_mod_log users.
2992 */
2993 ret = insert_new_root(trans, root, path, level + 1);
2994 if (ret)
2995 return ret;
2996 } else {
2997 ret = push_nodes_for_insert(trans, root, path, level);
2998 c = path->nodes[level];
2999 if (!ret && btrfs_header_nritems(c) <
3000 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3001 return 0;
3002 if (ret < 0)
3003 return ret;
3004 }
3005
3006 c_nritems = btrfs_header_nritems(c);
3007 mid = (c_nritems + 1) / 2;
3008 btrfs_node_key(c, &disk_key, mid);
3009
3010 split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3011 &disk_key, level, c->start, 0,
3012 0, BTRFS_NESTING_SPLIT);
3013 if (IS_ERR(split))
3014 return PTR_ERR(split);
3015
3016 root_add_used_bytes(root);
3017 ASSERT(btrfs_header_level(c) == level);
3018
3019 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3020 if (ret) {
3021 btrfs_tree_unlock(split);
3022 free_extent_buffer(split);
3023 btrfs_abort_transaction(trans, ret);
3024 return ret;
3025 }
3026 copy_extent_buffer(split, c,
3027 btrfs_node_key_ptr_offset(split, 0),
3028 btrfs_node_key_ptr_offset(c, mid),
3029 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3030 btrfs_set_header_nritems(split, c_nritems - mid);
3031 btrfs_set_header_nritems(c, mid);
3032
3033 btrfs_mark_buffer_dirty(trans, c);
3034 btrfs_mark_buffer_dirty(trans, split);
3035
3036 ret = insert_ptr(trans, path, &disk_key, split->start,
3037 path->slots[level + 1] + 1, level + 1);
3038 if (ret < 0) {
3039 btrfs_tree_unlock(split);
3040 free_extent_buffer(split);
3041 return ret;
3042 }
3043
3044 if (path->slots[level] >= mid) {
3045 path->slots[level] -= mid;
3046 btrfs_tree_unlock(c);
3047 free_extent_buffer(c);
3048 path->nodes[level] = split;
3049 path->slots[level + 1] += 1;
3050 } else {
3051 btrfs_tree_unlock(split);
3052 free_extent_buffer(split);
3053 }
3054 return 0;
3055 }
3056
3057 /*
3058 * how many bytes are required to store the items in a leaf. start
3059 * and nr indicate which items in the leaf to check. This totals up the
3060 * space used both by the item structs and the item data
3061 */
3062 static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3063 {
3064 int data_len;
3065 int nritems = btrfs_header_nritems(l);
3066 int end = min(nritems, start + nr) - 1;
3067
3068 if (!nr)
3069 return 0;
3070 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3071 data_len = data_len - btrfs_item_offset(l, end);
3072 data_len += sizeof(struct btrfs_item) * nr;
3073 WARN_ON(data_len < 0);
3074 return data_len;
3075 }
3076
3077 /*
3078 * The space between the end of the leaf items and
3079 * the start of the leaf data. IOW, how much room
3080 * the leaf has left for both items and data
3081 */
3082 int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3083 {
3084 struct btrfs_fs_info *fs_info = leaf->fs_info;
3085 int nritems = btrfs_header_nritems(leaf);
3086 int ret;
3087
3088 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3089 if (ret < 0) {
3090 btrfs_crit(fs_info,
3091 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3092 ret,
3093 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3094 leaf_space_used(leaf, 0, nritems), nritems);
3095 }
3096 return ret;
3097 }
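
/*
 * Worked example for the two helpers above (illustrative, assuming a
 * 16 KiB nodesize, where BTRFS_LEAF_DATA_SIZE() is the nodesize minus
 * the header): a leaf holding 3 items of 100 data bytes each uses
 * 3 * sizeof(struct btrfs_item) bytes for the item array at the front
 * plus 300 bytes of data packed at the end, so leaf_space_used(l, 0, 3)
 * returns 300 + 3 * sizeof(struct btrfs_item) and
 * btrfs_leaf_free_space() returns BTRFS_LEAF_DATA_SIZE() minus that.
 */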
3098
3099 /*
3100 * min slot controls the lowest index we're willing to push to the
3101 * right. We'll push up to and including min_slot, but no lower
3102 */
3103 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3104 struct btrfs_path *path,
3105 int data_size, int empty,
3106 struct extent_buffer *right,
3107 int free_space, u32 left_nritems,
3108 u32 min_slot)
3109 {
3110 struct btrfs_fs_info *fs_info = right->fs_info;
3111 struct extent_buffer *left = path->nodes[0];
3112 struct extent_buffer *upper = path->nodes[1];
3113 struct btrfs_disk_key disk_key;
3114 int slot;
3115 u32 i;
3116 int push_space = 0;
3117 int push_items = 0;
3118 u32 nr;
3119 u32 right_nritems;
3120 u32 data_end;
3121 u32 this_item_size;
3122
3123 if (empty)
3124 nr = 0;
3125 else
3126 nr = max_t(u32, 1, min_slot);
3127
3128 if (path->slots[0] >= left_nritems)
3129 push_space += data_size;
3130
3131 slot = path->slots[1];
3132 i = left_nritems - 1;
3133 while (i >= nr) {
3134 if (!empty && push_items > 0) {
3135 if (path->slots[0] > i)
3136 break;
3137 if (path->slots[0] == i) {
3138 int space = btrfs_leaf_free_space(left);
3139
3140 if (space + push_space * 2 > free_space)
3141 break;
3142 }
3143 }
3144
3145 if (path->slots[0] == i)
3146 push_space += data_size;
3147
3148 this_item_size = btrfs_item_size(left, i);
3149 if (this_item_size + sizeof(struct btrfs_item) +
3150 push_space > free_space)
3151 break;
3152
3153 push_items++;
3154 push_space += this_item_size + sizeof(struct btrfs_item);
3155 if (i == 0)
3156 break;
3157 i--;
3158 }
3159
3160 if (push_items == 0)
3161 goto out_unlock;
3162
3163 WARN_ON(!empty && push_items == left_nritems);
3164
3165 /* push left to right */
3166 right_nritems = btrfs_header_nritems(right);
3167
3168 push_space = btrfs_item_data_end(left, left_nritems - push_items);
3169 push_space -= leaf_data_end(left);
3170
3171 /* make room in the right data area */
3172 data_end = leaf_data_end(right);
3173 memmove_leaf_data(right, data_end - push_space, data_end,
3174 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3175
3176 /* copy from the left data area */
3177 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3178 leaf_data_end(left), push_space);
3179
3180 memmove_leaf_items(right, push_items, 0, right_nritems);
3181
3182 /* copy the items from left to right */
3183 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3184
3185 /* update the item pointers */
3186 right_nritems += push_items;
3187 btrfs_set_header_nritems(right, right_nritems);
3188 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3189 for (i = 0; i < right_nritems; i++) {
3190 push_space -= btrfs_item_size(right, i);
3191 btrfs_set_item_offset(right, i, push_space);
3192 }
3193
3194 left_nritems -= push_items;
3195 btrfs_set_header_nritems(left, left_nritems);
3196
3197 if (left_nritems)
3198 btrfs_mark_buffer_dirty(trans, left);
3199 else
3200 btrfs_clear_buffer_dirty(trans, left);
3201
3202 btrfs_mark_buffer_dirty(trans, right);
3203
3204 btrfs_item_key(right, &disk_key, 0);
3205 btrfs_set_node_key(upper, &disk_key, slot + 1);
3206 btrfs_mark_buffer_dirty(trans, upper);
3207
3208 /* then fixup the leaf pointer in the path */
3209 if (path->slots[0] >= left_nritems) {
3210 path->slots[0] -= left_nritems;
3211 if (btrfs_header_nritems(path->nodes[0]) == 0)
3212 btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3213 btrfs_tree_unlock(path->nodes[0]);
3214 free_extent_buffer(path->nodes[0]);
3215 path->nodes[0] = right;
3216 path->slots[1] += 1;
3217 } else {
3218 btrfs_tree_unlock(right);
3219 free_extent_buffer(right);
3220 }
3221 return 0;
3222
3223 out_unlock:
3224 btrfs_tree_unlock(right);
3225 free_extent_buffer(right);
3226 return 1;
3227 }
3228
3229 /*
3230 * push some data in the path leaf to the right, trying to free up at
3231 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3232 *
3233 * returns 1 if the push failed because the other node didn't have enough
3234 * room, 0 if everything worked out and < 0 if there were major errors.
3235 *
3236 * this will push starting from min_slot to the end of the leaf. It won't
3237 * push any slot lower than min_slot
3238 */
3239 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3240 *root, struct btrfs_path *path,
3241 int min_data_size, int data_size,
3242 int empty, u32 min_slot)
3243 {
3244 struct extent_buffer *left = path->nodes[0];
3245 struct extent_buffer *right;
3246 struct extent_buffer *upper;
3247 int slot;
3248 int free_space;
3249 u32 left_nritems;
3250 int ret;
3251
3252 if (!path->nodes[1])
3253 return 1;
3254
3255 slot = path->slots[1];
3256 upper = path->nodes[1];
3257 if (slot >= btrfs_header_nritems(upper) - 1)
3258 return 1;
3259
3260 btrfs_assert_tree_write_locked(path->nodes[1]);
3261
3262 right = btrfs_read_node_slot(upper, slot + 1);
3263 if (IS_ERR(right))
3264 return PTR_ERR(right);
3265
3266 btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
3267
3268 free_space = btrfs_leaf_free_space(right);
3269 if (free_space < data_size)
3270 goto out_unlock;
3271
3272 ret = btrfs_cow_block(trans, root, right, upper,
3273 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3274 if (ret)
3275 goto out_unlock;
3276
3277 left_nritems = btrfs_header_nritems(left);
3278 if (left_nritems == 0)
3279 goto out_unlock;
3280
3281 if (check_sibling_keys(left, right)) {
3282 ret = -EUCLEAN;
3283 btrfs_abort_transaction(trans, ret);
3284 btrfs_tree_unlock(right);
3285 free_extent_buffer(right);
3286 return ret;
3287 }
3288 if (path->slots[0] == left_nritems && !empty) {
3289 /* Key greater than all keys in the leaf, right neighbor has
3290 * enough room for it and we're not emptying our leaf to delete
3291 * it, therefore use right neighbor to insert the new item and
3292 * no need to touch/dirty our left leaf. */
3293 btrfs_tree_unlock(left);
3294 free_extent_buffer(left);
3295 path->nodes[0] = right;
3296 path->slots[0] = 0;
3297 path->slots[1]++;
3298 return 0;
3299 }
3300
3301 return __push_leaf_right(trans, path, min_data_size, empty, right,
3302 free_space, left_nritems, min_slot);
3303 out_unlock:
3304 btrfs_tree_unlock(right);
3305 free_extent_buffer(right);
3306 return 1;
3307 }
3308
3309 /*
3310 * push some data in the path leaf to the left, trying to free up at
3311 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3312 *
3313 * max_slot can put a limit on how far into the leaf we'll push items. The
3314 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3315 * items
3316 */
3317 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3318 struct btrfs_path *path, int data_size,
3319 int empty, struct extent_buffer *left,
3320 int free_space, u32 right_nritems,
3321 u32 max_slot)
3322 {
3323 struct btrfs_fs_info *fs_info = left->fs_info;
3324 struct btrfs_disk_key disk_key;
3325 struct extent_buffer *right = path->nodes[0];
3326 int i;
3327 int push_space = 0;
3328 int push_items = 0;
3329 u32 old_left_nritems;
3330 u32 nr;
3331 int ret = 0;
3332 u32 this_item_size;
3333 u32 old_left_item_size;
3334
3335 if (empty)
3336 nr = min(right_nritems, max_slot);
3337 else
3338 nr = min(right_nritems - 1, max_slot);
3339
3340 for (i = 0; i < nr; i++) {
3341 if (!empty && push_items > 0) {
3342 if (path->slots[0] < i)
3343 break;
3344 if (path->slots[0] == i) {
3345 int space = btrfs_leaf_free_space(right);
3346
3347 if (space + push_space * 2 > free_space)
3348 break;
3349 }
3350 }
3351
3352 if (path->slots[0] == i)
3353 push_space += data_size;
3354
3355 this_item_size = btrfs_item_size(right, i);
3356 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3357 free_space)
3358 break;
3359
3360 push_items++;
3361 push_space += this_item_size + sizeof(struct btrfs_item);
3362 }
3363
3364 if (push_items == 0) {
3365 ret = 1;
3366 goto out;
3367 }
3368 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3369
3370 /* push data from right to left */
3371 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3372
3373 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3374 btrfs_item_offset(right, push_items - 1);
3375
3376 copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3377 btrfs_item_offset(right, push_items - 1), push_space);
3378 old_left_nritems = btrfs_header_nritems(left);
3379 BUG_ON(old_left_nritems <= 0);
3380
3381 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3382 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3383 u32 ioff;
3384
3385 ioff = btrfs_item_offset(left, i);
3386 btrfs_set_item_offset(left, i,
3387 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3388 }
3389 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3390
3391 /* fixup right node */
3392 if (push_items > right_nritems)
3393 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3394 right_nritems);
3395
3396 if (push_items < right_nritems) {
3397 push_space = btrfs_item_offset(right, push_items - 1) -
3398 leaf_data_end(right);
3399 memmove_leaf_data(right,
3400 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3401 leaf_data_end(right), push_space);
3402
3403 memmove_leaf_items(right, 0, push_items,
3404 btrfs_header_nritems(right) - push_items);
3405 }
3406
3407 right_nritems -= push_items;
3408 btrfs_set_header_nritems(right, right_nritems);
3409 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3410 for (i = 0; i < right_nritems; i++) {
3411 push_space = push_space - btrfs_item_size(right, i);
3412 btrfs_set_item_offset(right, i, push_space);
3413 }
3414
3415 btrfs_mark_buffer_dirty(trans, left);
3416 if (right_nritems)
3417 btrfs_mark_buffer_dirty(trans, right);
3418 else
3419 btrfs_clear_buffer_dirty(trans, right);
3420
3421 btrfs_item_key(right, &disk_key, 0);
3422 fixup_low_keys(trans, path, &disk_key, 1);
3423
3424 /* then fixup the leaf pointer in the path */
3425 if (path->slots[0] < push_items) {
3426 path->slots[0] += old_left_nritems;
3427 btrfs_tree_unlock(path->nodes[0]);
3428 free_extent_buffer(path->nodes[0]);
3429 path->nodes[0] = left;
3430 path->slots[1] -= 1;
3431 } else {
3432 btrfs_tree_unlock(left);
3433 free_extent_buffer(left);
3434 path->slots[0] -= push_items;
3435 }
3436 BUG_ON(path->slots[0] < 0);
3437 return ret;
3438 out:
3439 btrfs_tree_unlock(left);
3440 free_extent_buffer(left);
3441 return ret;
3442 }
3443
3444 /*
3445 * push some data in the path leaf to the left, trying to free up at
3446 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3447 *
3448 * max_slot can put a limit on how far into the leaf we'll push items. The
3449 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3450 * items
3451 */
3452 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3453 *root, struct btrfs_path *path, int min_data_size,
3454 int data_size, int empty, u32 max_slot)
3455 {
3456 struct extent_buffer *right = path->nodes[0];
3457 struct extent_buffer *left;
3458 int slot;
3459 int free_space;
3460 u32 right_nritems;
3461 int ret = 0;
3462
3463 slot = path->slots[1];
3464 if (slot == 0)
3465 return 1;
3466 if (!path->nodes[1])
3467 return 1;
3468
3469 right_nritems = btrfs_header_nritems(right);
3470 if (right_nritems == 0)
3471 return 1;
3472
3473 btrfs_assert_tree_write_locked(path->nodes[1]);
3474
3475 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3476 if (IS_ERR(left))
3477 return PTR_ERR(left);
3478
3479 btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
3480
3481 free_space = btrfs_leaf_free_space(left);
3482 if (free_space < data_size) {
3483 ret = 1;
3484 goto out;
3485 }
3486
3487 ret = btrfs_cow_block(trans, root, left,
3488 path->nodes[1], slot - 1, &left,
3489 BTRFS_NESTING_LEFT_COW);
3490 if (ret) {
3491 /* we hit -ENOSPC, but it isn't fatal here */
3492 if (ret == -ENOSPC)
3493 ret = 1;
3494 goto out;
3495 }
3496
3497 if (check_sibling_keys(left, right)) {
3498 ret = -EUCLEAN;
3499 btrfs_abort_transaction(trans, ret);
3500 goto out;
3501 }
3502 return __push_leaf_left(trans, path, min_data_size, empty, left,
3503 free_space, right_nritems, max_slot);
3504 out:
3505 btrfs_tree_unlock(left);
3506 free_extent_buffer(left);
3507 return ret;
3508 }
3509
3510 /*
3511 * split the path's leaf in two, making sure there is at least data_size
3512 * available for the resulting leaf level of the path.
3513 */
3514 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3515 struct btrfs_path *path,
3516 struct extent_buffer *l,
3517 struct extent_buffer *right,
3518 int slot, int mid, int nritems)
3519 {
3520 struct btrfs_fs_info *fs_info = trans->fs_info;
3521 int data_copy_size;
3522 int rt_data_off;
3523 int i;
3524 int ret;
3525 struct btrfs_disk_key disk_key;
3526
3527 nritems = nritems - mid;
3528 btrfs_set_header_nritems(right, nritems);
3529 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3530
3531 copy_leaf_items(right, l, 0, mid, nritems);
3532
3533 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3534 leaf_data_end(l), data_copy_size);
3535
3536 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3537
3538 for (i = 0; i < nritems; i++) {
3539 u32 ioff;
3540
3541 ioff = btrfs_item_offset(right, i);
3542 btrfs_set_item_offset(right, i, ioff + rt_data_off);
3543 }
3544
3545 btrfs_set_header_nritems(l, mid);
3546 btrfs_item_key(right, &disk_key, 0);
3547 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3548 if (ret < 0)
3549 return ret;
3550
3551 btrfs_mark_buffer_dirty(trans, right);
3552 btrfs_mark_buffer_dirty(trans, l);
3553 BUG_ON(path->slots[0] != slot);
3554
3555 if (mid <= slot) {
3556 btrfs_tree_unlock(path->nodes[0]);
3557 free_extent_buffer(path->nodes[0]);
3558 path->nodes[0] = right;
3559 path->slots[0] -= mid;
3560 path->slots[1] += 1;
3561 } else {
3562 btrfs_tree_unlock(right);
3563 free_extent_buffer(right);
3564 }
3565
3566 BUG_ON(path->slots[0] < 0);
3567
3568 return 0;
3569 }
3570
3571 /*
3572 * double splits happen when we need to insert a big item in the middle
3573 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3574 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3575 * A B C
3576 *
3577 * We avoid this by trying to push the items on either side of our target
3578 * into the adjacent leaves. If all goes well we can avoid the double split
3579 * completely.
3580 */
3581 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3582 struct btrfs_root *root,
3583 struct btrfs_path *path,
3584 int data_size)
3585 {
3586 int ret;
3587 int progress = 0;
3588 int slot;
3589 u32 nritems;
3590 int space_needed = data_size;
3591
3592 slot = path->slots[0];
3593 if (slot < btrfs_header_nritems(path->nodes[0]))
3594 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3595
3596 /*
3597 * try to push all the items after our slot into the
3598 * right leaf
3599 */
3600 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3601 if (ret < 0)
3602 return ret;
3603
3604 if (ret == 0)
3605 progress++;
3606
3607 nritems = btrfs_header_nritems(path->nodes[0]);
3608 /*
3609 * our goal is to get our slot at the start or end of a leaf. If
3610 * we've done so we're done
3611 */
3612 if (path->slots[0] == 0 || path->slots[0] == nritems)
3613 return 0;
3614
3615 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3616 return 0;
3617
3618 /* try to push all the items before our slot into the left leaf */
3619 slot = path->slots[0];
3620 space_needed = data_size;
3621 if (slot > 0)
3622 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3623 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3624 if (ret < 0)
3625 return ret;
3626
3627 if (ret == 0)
3628 progress++;
3629
3630 if (progress)
3631 return 0;
3632 return 1;
3633 }
3634
3635 /*
3636 * split the path's leaf in two, making sure there is at least data_size
3637 * available for the resulting leaf level of the path.
3638 *
3639 * returns 0 if all went well and < 0 on failure.
3640 */
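/*
 * A note on the 'split' variable used below: 1 means split the leaf at
 * @mid, 2 means a second split will still be needed after the first one,
 * and 0 means no items are moved at all: a new empty leaf is linked in
 * next to the current one and the new item will go there.
 */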
3641 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3642 struct btrfs_root *root,
3643 const struct btrfs_key *ins_key,
3644 struct btrfs_path *path, int data_size,
3645 int extend)
3646 {
3647 struct btrfs_disk_key disk_key;
3648 struct extent_buffer *l;
3649 u32 nritems;
3650 int mid;
3651 int slot;
3652 struct extent_buffer *right;
3653 struct btrfs_fs_info *fs_info = root->fs_info;
3654 int ret = 0;
3655 int wret;
3656 int split;
3657 int num_doubles = 0;
3658 int tried_avoid_double = 0;
3659
3660 l = path->nodes[0];
3661 slot = path->slots[0];
3662 if (extend && data_size + btrfs_item_size(l, slot) +
3663 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3664 return -EOVERFLOW;
3665
3666 /* first try to make some room by pushing left and right */
3667 if (data_size && path->nodes[1]) {
3668 int space_needed = data_size;
3669
3670 if (slot < btrfs_header_nritems(l))
3671 space_needed -= btrfs_leaf_free_space(l);
3672
3673 wret = push_leaf_right(trans, root, path, space_needed,
3674 space_needed, 0, 0);
3675 if (wret < 0)
3676 return wret;
3677 if (wret) {
3678 space_needed = data_size;
3679 if (slot > 0)
3680 space_needed -= btrfs_leaf_free_space(l);
3681 wret = push_leaf_left(trans, root, path, space_needed,
3682 space_needed, 0, (u32)-1);
3683 if (wret < 0)
3684 return wret;
3685 }
3686 l = path->nodes[0];
3687
3688 /* did the pushes work? */
3689 if (btrfs_leaf_free_space(l) >= data_size)
3690 return 0;
3691 }
3692
3693 if (!path->nodes[1]) {
3694 ret = insert_new_root(trans, root, path, 1);
3695 if (ret)
3696 return ret;
3697 }
3698 again:
3699 split = 1;
3700 l = path->nodes[0];
3701 slot = path->slots[0];
3702 nritems = btrfs_header_nritems(l);
3703 mid = (nritems + 1) / 2;
3704
3705 if (mid <= slot) {
3706 if (nritems == 1 ||
3707 leaf_space_used(l, mid, nritems - mid) + data_size >
3708 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3709 if (slot >= nritems) {
3710 split = 0;
3711 } else {
3712 mid = slot;
3713 if (mid != nritems &&
3714 leaf_space_used(l, mid, nritems - mid) +
3715 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3716 if (data_size && !tried_avoid_double)
3717 goto push_for_double;
3718 split = 2;
3719 }
3720 }
3721 }
3722 } else {
3723 if (leaf_space_used(l, 0, mid) + data_size >
3724 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3725 if (!extend && data_size && slot == 0) {
3726 split = 0;
3727 } else if ((extend || !data_size) && slot == 0) {
3728 mid = 1;
3729 } else {
3730 mid = slot;
3731 if (mid != nritems &&
3732 leaf_space_used(l, mid, nritems - mid) +
3733 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3734 if (data_size && !tried_avoid_double)
3735 goto push_for_double;
3736 split = 2;
3737 }
3738 }
3739 }
3740 }
3741
3742 if (split == 0)
3743 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3744 else
3745 btrfs_item_key(l, &disk_key, mid);
3746
3747 /*
3748 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3749 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3750 * subclasses, which is 8 at the time of this patch, and we've maxed it
3751 * out. In the future we could add a
3752 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3753 * use BTRFS_NESTING_NEW_ROOT.
3754 */
3755 right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
3756 &disk_key, 0, l->start, 0, 0,
3757 num_doubles ? BTRFS_NESTING_NEW_ROOT :
3758 BTRFS_NESTING_SPLIT);
3759 if (IS_ERR(right))
3760 return PTR_ERR(right);
3761
3762 root_add_used_bytes(root);
3763
3764 if (split == 0) {
3765 if (mid <= slot) {
3766 btrfs_set_header_nritems(right, 0);
3767 ret = insert_ptr(trans, path, &disk_key,
3768 right->start, path->slots[1] + 1, 1);
3769 if (ret < 0) {
3770 btrfs_tree_unlock(right);
3771 free_extent_buffer(right);
3772 return ret;
3773 }
3774 btrfs_tree_unlock(path->nodes[0]);
3775 free_extent_buffer(path->nodes[0]);
3776 path->nodes[0] = right;
3777 path->slots[0] = 0;
3778 path->slots[1] += 1;
3779 } else {
3780 btrfs_set_header_nritems(right, 0);
3781 ret = insert_ptr(trans, path, &disk_key,
3782 right->start, path->slots[1], 1);
3783 if (ret < 0) {
3784 btrfs_tree_unlock(right);
3785 free_extent_buffer(right);
3786 return ret;
3787 }
3788 btrfs_tree_unlock(path->nodes[0]);
3789 free_extent_buffer(path->nodes[0]);
3790 path->nodes[0] = right;
3791 path->slots[0] = 0;
3792 if (path->slots[1] == 0)
3793 fixup_low_keys(trans, path, &disk_key, 1);
3794 }
3795 /*
3796 * The new leaf 'right' was created empty, with room for the
3797 * required ins_len; the caller will copy the new item contents
3798 * into it and btrfs_mark_buffer_dirty() it afterwards.
3799 */
3800 return ret;
3801 }
3802
3803 ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3804 if (ret < 0) {
3805 btrfs_tree_unlock(right);
3806 free_extent_buffer(right);
3807 return ret;
3808 }
3809
3810 if (split == 2) {
3811 BUG_ON(num_doubles != 0);
3812 num_doubles++;
3813 goto again;
3814 }
3815
3816 return 0;
3817
3818 push_for_double:
3819 push_for_double_split(trans, root, path, data_size);
3820 tried_avoid_double = 1;
3821 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3822 return 0;
3823 goto again;
3824 }
3825
3826 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3827 struct btrfs_root *root,
3828 struct btrfs_path *path, int ins_len)
3829 {
3830 struct btrfs_key key;
3831 struct extent_buffer *leaf;
3832 struct btrfs_file_extent_item *fi;
3833 u64 extent_len = 0;
3834 u32 item_size;
3835 int ret;
3836
3837 leaf = path->nodes[0];
3838 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3839
3840 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3841 key.type != BTRFS_RAID_STRIPE_KEY &&
3842 key.type != BTRFS_EXTENT_CSUM_KEY);
3843
3844 if (btrfs_leaf_free_space(leaf) >= ins_len)
3845 return 0;
3846
3847 item_size = btrfs_item_size(leaf, path->slots[0]);
3848 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3849 fi = btrfs_item_ptr(leaf, path->slots[0],
3850 struct btrfs_file_extent_item);
3851 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3852 }
3853 btrfs_release_path(path);
3854
3855 path->keep_locks = 1;
3856 path->search_for_split = 1;
3857 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3858 path->search_for_split = 0;
3859 if (ret > 0)
3860 ret = -EAGAIN;
3861 if (ret < 0)
3862 goto err;
3863
3864 ret = -EAGAIN;
3865 leaf = path->nodes[0];
3866 /* if our item isn't there, return now */
3867 if (item_size != btrfs_item_size(leaf, path->slots[0]))
3868 goto err;
3869
3870 /* the leaf has changed, it now has room. return now */
3871 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3872 goto err;
3873
3874 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3875 fi = btrfs_item_ptr(leaf, path->slots[0],
3876 struct btrfs_file_extent_item);
3877 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3878 goto err;
3879 }
3880
3881 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3882 if (ret)
3883 goto err;
3884
3885 path->keep_locks = 0;
3886 btrfs_unlock_up_safe(path, 1);
3887 return 0;
3888 err:
3889 path->keep_locks = 0;
3890 return ret;
3891 }
3892
3893 static noinline int split_item(struct btrfs_trans_handle *trans,
3894 struct btrfs_path *path,
3895 const struct btrfs_key *new_key,
3896 unsigned long split_offset)
3897 {
3898 struct extent_buffer *leaf;
3899 int orig_slot, slot;
3900 char *buf;
3901 u32 nritems;
3902 u32 item_size;
3903 u32 orig_offset;
3904 struct btrfs_disk_key disk_key;
3905
3906 leaf = path->nodes[0];
3907 /*
3908 * Shouldn't happen because the caller must have previously called
3909 * setup_leaf_for_split() to make room for the new item in the leaf.
3910 */
3911 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
3912 return -ENOSPC;
3913
3914 orig_slot = path->slots[0];
3915 orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3916 item_size = btrfs_item_size(leaf, path->slots[0]);
3917
3918 buf = kmalloc(item_size, GFP_NOFS);
3919 if (!buf)
3920 return -ENOMEM;
3921
3922 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3923 path->slots[0]), item_size);
3924
3925 slot = path->slots[0] + 1;
3926 nritems = btrfs_header_nritems(leaf);
3927 if (slot != nritems) {
3928 /* shift the items */
3929 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
3930 }
3931
3932 btrfs_cpu_key_to_disk(&disk_key, new_key);
3933 btrfs_set_item_key(leaf, &disk_key, slot);
3934
3935 btrfs_set_item_offset(leaf, slot, orig_offset);
3936 btrfs_set_item_size(leaf, slot, item_size - split_offset);
3937
3938 btrfs_set_item_offset(leaf, orig_slot,
3939 orig_offset + item_size - split_offset);
3940 btrfs_set_item_size(leaf, orig_slot, split_offset);
3941
3942 btrfs_set_header_nritems(leaf, nritems + 1);
3943
3944 /* write the data for the start of the original item */
3945 write_extent_buffer(leaf, buf,
3946 btrfs_item_ptr_offset(leaf, path->slots[0]),
3947 split_offset);
3948
3949 /* write the data for the new item */
3950 write_extent_buffer(leaf, buf + split_offset,
3951 btrfs_item_ptr_offset(leaf, slot),
3952 item_size - split_offset);
3953 btrfs_mark_buffer_dirty(trans, leaf);
3954
3955 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3956 kfree(buf);
3957 return 0;
3958 }
3959
3960 /*
3961 * This function splits a single item into two items,
3962 * giving 'new_key' to the new item and splitting the
3963 * old one at split_offset (from the start of the item).
3964 *
3965 * The path may be released by this operation. After
3966 * the split, the path is pointing to the old item. The
3967 * new item is going to be in the same node as the old one.
3968 *
3969 * Note: the item being split must be small enough to live alone in
3970 * a tree block with room for one extra struct btrfs_item.
3971 *
3972 * This allows us to split the item in place, keeping a lock on the
3973 * leaf the entire time.
3974 */
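/*
 * Illustrative example (made-up numbers): splitting a 100 byte item at
 * data offset 3000 with split_offset 40 leaves the original item holding
 * its first 40 bytes at offset 3060, while a new item carrying new_key
 * and the remaining 60 bytes is created at offset 3000.
 */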
3975 int btrfs_split_item(struct btrfs_trans_handle *trans,
3976 struct btrfs_root *root,
3977 struct btrfs_path *path,
3978 const struct btrfs_key *new_key,
3979 unsigned long split_offset)
3980 {
3981 int ret;
3982 ret = setup_leaf_for_split(trans, root, path,
3983 sizeof(struct btrfs_item));
3984 if (ret)
3985 return ret;
3986
3987 ret = split_item(trans, path, new_key, split_offset);
3988 return ret;
3989 }
3990
3991 /*
3992 * make the item pointed to by the path smaller. new_size indicates
3993 * how small to make it, and from_end tells us if we just chop bytes
3994 * off the end of the item or if we shift the item to chop bytes off
3995 * the front.
3996 */
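/*
 * Illustrative example (made-up numbers): truncating a 100 byte item at
 * data offset 3000 to new_size 80 shifts its data up by size_diff 20, so
 * it ends up at offset 3020. With from_end set the last 20 bytes are cut
 * off; with from_end unset the first 20 bytes are dropped and the item's
 * key offset is bumped by 20.
 */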
3997 void btrfs_truncate_item(struct btrfs_trans_handle *trans,
3998 const struct btrfs_path *path, u32 new_size, int from_end)
3999 {
4000 int slot;
4001 struct extent_buffer *leaf;
4002 u32 nritems;
4003 unsigned int data_end;
4004 unsigned int old_data_start;
4005 unsigned int old_size;
4006 unsigned int size_diff;
4007 int i;
4008
4009 leaf = path->nodes[0];
4010 slot = path->slots[0];
4011
4012 old_size = btrfs_item_size(leaf, slot);
4013 if (old_size == new_size)
4014 return;
4015
4016 nritems = btrfs_header_nritems(leaf);
4017 data_end = leaf_data_end(leaf);
4018
4019 old_data_start = btrfs_item_offset(leaf, slot);
4020
4021 size_diff = old_size - new_size;
4022
4023 BUG_ON(slot < 0);
4024 BUG_ON(slot >= nritems);
4025
4026 /*
4027 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4028 */
4029 /* first correct the data pointers */
4030 for (i = slot; i < nritems; i++) {
4031 u32 ioff;
4032
4033 ioff = btrfs_item_offset(leaf, i);
4034 btrfs_set_item_offset(leaf, i, ioff + size_diff);
4035 }
4036
4037 /* shift the data */
4038 if (from_end) {
4039 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4040 old_data_start + new_size - data_end);
4041 } else {
4042 struct btrfs_disk_key disk_key;
4043 u64 offset;
4044
4045 btrfs_item_key(leaf, &disk_key, slot);
4046
4047 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4048 unsigned long ptr;
4049 struct btrfs_file_extent_item *fi;
4050
4051 fi = btrfs_item_ptr(leaf, slot,
4052 struct btrfs_file_extent_item);
4053 fi = (struct btrfs_file_extent_item *)(
4054 (unsigned long)fi - size_diff);
4055
4056 if (btrfs_file_extent_type(leaf, fi) ==
4057 BTRFS_FILE_EXTENT_INLINE) {
4058 ptr = btrfs_item_ptr_offset(leaf, slot);
4059 memmove_extent_buffer(leaf, ptr,
4060 (unsigned long)fi,
4061 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4062 }
4063 }
4064
4065 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4066 old_data_start - data_end);
4067
4068 offset = btrfs_disk_key_offset(&disk_key);
4069 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4070 btrfs_set_item_key(leaf, &disk_key, slot);
4071 if (slot == 0)
4072 fixup_low_keys(trans, path, &disk_key, 1);
4073 }
4074
4075 btrfs_set_item_size(leaf, slot, new_size);
4076 btrfs_mark_buffer_dirty(trans, leaf);
4077
4078 if (btrfs_leaf_free_space(leaf) < 0) {
4079 btrfs_print_leaf(leaf);
4080 BUG();
4081 }
4082 }
4083
4084 /*
4085 * make the item pointed to by the path bigger, data_size is the added size.
4086 */
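/*
 * Illustrative example (made-up numbers): growing a 100 byte item at
 * data offset 3000 by data_size 20 shifts its data down to offset 2980.
 * The added bytes sit at the end of the item and are left for the
 * caller to initialize.
 */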
4087 void btrfs_extend_item(struct btrfs_trans_handle *trans,
4088 const struct btrfs_path *path, u32 data_size)
4089 {
4090 int slot;
4091 struct extent_buffer *leaf;
4092 u32 nritems;
4093 unsigned int data_end;
4094 unsigned int old_data;
4095 unsigned int old_size;
4096 int i;
4097
4098 leaf = path->nodes[0];
4099
4100 nritems = btrfs_header_nritems(leaf);
4101 data_end = leaf_data_end(leaf);
4102
4103 if (btrfs_leaf_free_space(leaf) < data_size) {
4104 btrfs_print_leaf(leaf);
4105 BUG();
4106 }
4107 slot = path->slots[0];
4108 old_data = btrfs_item_data_end(leaf, slot);
4109
4110 BUG_ON(slot < 0);
4111 if (slot >= nritems) {
4112 btrfs_print_leaf(leaf);
4113 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4114 slot, nritems);
4115 BUG();
4116 }
4117
4118 /*
4119 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4120 */
4121 /* first correct the data pointers */
4122 for (i = slot; i < nritems; i++) {
4123 u32 ioff;
4124
4125 ioff = btrfs_item_offset(leaf, i);
4126 btrfs_set_item_offset(leaf, i, ioff - data_size);
4127 }
4128
4129 /* shift the data */
4130 memmove_leaf_data(leaf, data_end - data_size, data_end,
4131 old_data - data_end);
4132
4133 data_end = old_data;
4134 old_size = btrfs_item_size(leaf, slot);
4135 btrfs_set_item_size(leaf, slot, old_size + data_size);
4136 btrfs_mark_buffer_dirty(trans, leaf);
4137
4138 if (btrfs_leaf_free_space(leaf) < 0) {
4139 btrfs_print_leaf(leaf);
4140 BUG();
4141 }
4142 }
4143
4144 /*
4145 * Make space in the node before inserting one or more items.
4146 *
4147 * @trans: transaction handle
4148 * @root: root we are inserting items to
4149 * @path: points to the leaf/slot where we are going to insert new items
4150 * @batch: information about the batch of items to insert
4151 *
4152 * Main purpose is to save stack depth by doing the bulk of the work in a
4153 * function that doesn't call btrfs_search_slot
4154 */
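/*
 * Illustrative sketch of the layout change: item headers at and after
 * @slot shift right by batch->nr slots, their data moves down by
 * batch->total_data_size bytes (item offsets decrease by that amount),
 * and the new items' data is carved out of the space the shift opened up.
 */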
4155 static void setup_items_for_insert(struct btrfs_trans_handle *trans,
4156 struct btrfs_root *root, struct btrfs_path *path,
4157 const struct btrfs_item_batch *batch)
4158 {
4159 struct btrfs_fs_info *fs_info = root->fs_info;
4160 int i;
4161 u32 nritems;
4162 unsigned int data_end;
4163 struct btrfs_disk_key disk_key;
4164 struct extent_buffer *leaf;
4165 int slot;
4166 u32 total_size;
4167
4168 /*
4169 * Before anything else, update keys in the parent and other ancestors
4170 * if needed, then release the write locks on them, so that other tasks
4171 * can use them while we modify the leaf.
4172 */
4173 if (path->slots[0] == 0) {
4174 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4175 fixup_low_keys(trans, path, &disk_key, 1);
4176 }
4177 btrfs_unlock_up_safe(path, 1);
4178
4179 leaf = path->nodes[0];
4180 slot = path->slots[0];
4181
4182 nritems = btrfs_header_nritems(leaf);
4183 data_end = leaf_data_end(leaf);
4184 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4185
4186 if (btrfs_leaf_free_space(leaf) < total_size) {
4187 btrfs_print_leaf(leaf);
4188 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4189 total_size, btrfs_leaf_free_space(leaf));
4190 BUG();
4191 }
4192
4193 if (slot != nritems) {
4194 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4195
4196 if (old_data < data_end) {
4197 btrfs_print_leaf(leaf);
4198 btrfs_crit(fs_info,
4199 "item at slot %d with data offset %u beyond data end of leaf %u",
4200 slot, old_data, data_end);
4201 BUG();
4202 }
4203 /*
4204 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4205 */
4206 /* first correct the data pointers */
4207 for (i = slot; i < nritems; i++) {
4208 u32 ioff;
4209
4210 ioff = btrfs_item_offset(leaf, i);
4211 btrfs_set_item_offset(leaf, i,
4212 ioff - batch->total_data_size);
4213 }
4214 /* shift the items */
4215 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4216
4217 /* shift the data */
4218 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4219 data_end, old_data - data_end);
4220 data_end = old_data;
4221 }
4222
4223 /* setup the item for the new data */
4224 for (i = 0; i < batch->nr; i++) {
4225 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4226 btrfs_set_item_key(leaf, &disk_key, slot + i);
4227 data_end -= batch->data_sizes[i];
4228 btrfs_set_item_offset(leaf, slot + i, data_end);
4229 btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
4230 }
4231
4232 btrfs_set_header_nritems(leaf, nritems + batch->nr);
4233 btrfs_mark_buffer_dirty(trans, leaf);
4234
4235 if (btrfs_leaf_free_space(leaf) < 0) {
4236 btrfs_print_leaf(leaf);
4237 BUG();
4238 }
4239 }
4240
4241 /*
4242 * Insert a new item into a leaf.
4243 *
4244 * @trans: Transaction handle.
4245 * @root: The root of the btree.
4246 * @path: A path pointing to the target leaf and slot.
4247 * @key: The key of the new item.
4248 * @data_size: The size of the data associated with the new key.
4249 */
4250 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
4251 struct btrfs_root *root,
4252 struct btrfs_path *path,
4253 const struct btrfs_key *key,
4254 u32 data_size)
4255 {
4256 struct btrfs_item_batch batch;
4257
4258 batch.keys = key;
4259 batch.data_sizes = &data_size;
4260 batch.total_data_size = data_size;
4261 batch.nr = 1;
4262
4263 setup_items_for_insert(trans, root, path, &batch);
4264 }
4265
4266 /*
4267 * Given a key and some data, insert items into the tree.
4268 * This does all the path init required, making room in the tree if needed.
4269 *
4270 * Returns: 0 on success
4271 * -EEXIST if the first key already exists
4272 * < 0 on other errors
4273 */
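/*
 * Example sketch of a two item batch (keys and sizes are illustrative,
 * not from a real caller):
 *
 *	struct btrfs_key keys[2] = { ... };
 *	u32 sizes[2] = { 16, 32 };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = 48,
 *		.nr = 2,
 *	};
 *
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 */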
4274 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4275 struct btrfs_root *root,
4276 struct btrfs_path *path,
4277 const struct btrfs_item_batch *batch)
4278 {
4279 int ret = 0;
4280 int slot;
4281 u32 total_size;
4282
4283 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4284 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4285 if (ret == 0)
4286 return -EEXIST;
4287 if (ret < 0)
4288 return ret;
4289
4290 slot = path->slots[0];
4291 BUG_ON(slot < 0);
4292
4293 setup_items_for_insert(trans, root, path, batch);
4294 return 0;
4295 }
4296
4297 /*
4298 * Given a key and some data, insert an item into the tree.
4299 * This does all the path init required, making room in the tree if needed.
4300 */
4301 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4302 const struct btrfs_key *cpu_key, void *data,
4303 u32 data_size)
4304 {
4305 int ret = 0;
4306 BTRFS_PATH_AUTO_FREE(path);
4307 struct extent_buffer *leaf;
4308 unsigned long ptr;
4309
4310 path = btrfs_alloc_path();
4311 if (!path)
4312 return -ENOMEM;
4313 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4314 if (!ret) {
4315 leaf = path->nodes[0];
4316 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4317 write_extent_buffer(leaf, data, ptr, data_size);
4318 btrfs_mark_buffer_dirty(trans, leaf);
4319 }
4320 return ret;
4321 }
4322
4323 /*
4324 * This function duplicates an item, giving 'new_key' to the new item.
4325 * It guarantees both items live in the same tree leaf and the new item is
4326 * contiguous with the original item.
4327 *
4328 * This allows us to split a file extent in place, keeping a lock on the leaf
4329 * the entire time.
4330 */
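/*
 * Sketch of the common pattern (illustrative): to split a file extent
 * in two, duplicate the item under a key with the new file offset and
 * then trim each copy's btrfs_file_extent_item fields so each covers
 * its half of the range.
 */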
4331 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4332 struct btrfs_root *root,
4333 struct btrfs_path *path,
4334 const struct btrfs_key *new_key)
4335 {
4336 struct extent_buffer *leaf;
4337 int ret;
4338 u32 item_size;
4339
4340 leaf = path->nodes[0];
4341 item_size = btrfs_item_size(leaf, path->slots[0]);
4342 ret = setup_leaf_for_split(trans, root, path,
4343 item_size + sizeof(struct btrfs_item));
4344 if (ret)
4345 return ret;
4346
4347 path->slots[0]++;
4348 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
4349 leaf = path->nodes[0];
4350 memcpy_extent_buffer(leaf,
4351 btrfs_item_ptr_offset(leaf, path->slots[0]),
4352 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4353 item_size);
4354 return 0;
4355 }
4356
4357 /*
4358 * delete the pointer from a given node.
4359 *
4360 * the tree should have been previously balanced so the deletion does not
4361 * empty a node.
4362 *
4363 * This is exported for use inside btrfs-progs, don't un-export it.
4364 */
4365 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4366 struct btrfs_path *path, int level, int slot)
4367 {
4368 struct extent_buffer *parent = path->nodes[level];
4369 u32 nritems;
4370 int ret;
4371
4372 nritems = btrfs_header_nritems(parent);
4373 if (slot != nritems - 1) {
4374 if (level) {
4375 ret = btrfs_tree_mod_log_insert_move(parent, slot,
4376 slot + 1, nritems - slot - 1);
4377 if (ret < 0) {
4378 btrfs_abort_transaction(trans, ret);
4379 return ret;
4380 }
4381 }
4382 memmove_extent_buffer(parent,
4383 btrfs_node_key_ptr_offset(parent, slot),
4384 btrfs_node_key_ptr_offset(parent, slot + 1),
4385 sizeof(struct btrfs_key_ptr) *
4386 (nritems - slot - 1));
4387 } else if (level) {
4388 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4389 BTRFS_MOD_LOG_KEY_REMOVE);
4390 if (ret < 0) {
4391 btrfs_abort_transaction(trans, ret);
4392 return ret;
4393 }
4394 }
4395
4396 nritems--;
4397 btrfs_set_header_nritems(parent, nritems);
4398 if (nritems == 0 && parent == root->node) {
4399 BUG_ON(btrfs_header_level(root->node) != 1);
4400 /* just turn the root into a leaf and break */
4401 btrfs_set_header_level(root->node, 0);
4402 } else if (slot == 0) {
4403 struct btrfs_disk_key disk_key;
4404
4405 btrfs_node_key(parent, &disk_key, 0);
4406 fixup_low_keys(trans, path, &disk_key, level + 1);
4407 }
4408 btrfs_mark_buffer_dirty(trans, parent);
4409 return 0;
4410 }
4411
4412 /*
4413 * a helper function to delete the leaf pointed to by path->slots[1] and
4414 * path->nodes[1].
4415 *
4416 * This deletes the pointer in path->nodes[1] and frees the leaf
4417 * block extent. zero is returned if it all worked out, < 0 otherwise.
4418 *
4419 * The path must have already been setup for deleting the leaf, including
4420 * all the proper balancing. path->nodes[1] must be locked.
4421 */
4422 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4423 struct btrfs_root *root,
4424 struct btrfs_path *path,
4425 struct extent_buffer *leaf)
4426 {
4427 int ret;
4428
4429 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4430 ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4431 if (ret < 0)
4432 return ret;
4433
4434 /*
4435 * btrfs_free_extent is expensive, so we want to make sure we
4436 * aren't holding any locks when we call it.
4437 */
4438 btrfs_unlock_up_safe(path, 0);
4439
4440 root_sub_used_bytes(root);
4441
4442 refcount_inc(&leaf->refs);
4443 ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4444 free_extent_buffer_stale(leaf);
4445 if (ret < 0)
4446 btrfs_abort_transaction(trans, ret);
4447
4448 return ret;
4449 }

4450 /*
4451 * delete the item at the leaf level in path. If that empties
4452 * the leaf, remove it from the tree
4453 */
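/*
 * Illustrative example (made-up numbers): deleting one 50 byte item from
 * the middle of a leaf moves all item data between data_end and the
 * deleted item up by 50 bytes, bumps the offsets of the following items
 * by 50, and shifts their headers left by one slot.
 */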
4454 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4455 struct btrfs_path *path, int slot, int nr)
4456 {
4457 struct btrfs_fs_info *fs_info = root->fs_info;
4458 struct extent_buffer *leaf;
4459 int ret = 0;
4460 int wret;
4461 u32 nritems;
4462
4463 leaf = path->nodes[0];
4464 nritems = btrfs_header_nritems(leaf);
4465
4466 if (slot + nr != nritems) {
4467 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4468 const int data_end = leaf_data_end(leaf);
4469 u32 dsize = 0;
4470 int i;
4471
4472 for (i = 0; i < nr; i++)
4473 dsize += btrfs_item_size(leaf, slot + i);
4474
4475 memmove_leaf_data(leaf, data_end + dsize, data_end,
4476 last_off - data_end);
4477
4478 for (i = slot + nr; i < nritems; i++) {
4479 u32 ioff;
4480
4481 ioff = btrfs_item_offset(leaf, i);
4482 btrfs_set_item_offset(leaf, i, ioff + dsize);
4483 }
4484
4485 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4486 }
4487 btrfs_set_header_nritems(leaf, nritems - nr);
4488 nritems -= nr;
4489
4490 /* delete the leaf if we've emptied it */
4491 if (nritems == 0) {
4492 if (leaf == root->node) {
4493 btrfs_set_header_level(leaf, 0);
4494 } else {
4495 btrfs_clear_buffer_dirty(trans, leaf);
4496 ret = btrfs_del_leaf(trans, root, path, leaf);
4497 if (ret < 0)
4498 return ret;
4499 }
4500 } else {
4501 int used = leaf_space_used(leaf, 0, nritems);
4502 if (slot == 0) {
4503 struct btrfs_disk_key disk_key;
4504
4505 btrfs_item_key(leaf, &disk_key, 0);
4506 fixup_low_keys(trans, path, &disk_key, 1);
4507 }
4508
4509 /*
4510 * Try to delete the leaf if it is mostly empty. We do this by
4511 * trying to move all its items into its left and right neighbours.
4512 * If we can't move all the items, then we don't delete it - it's
4513 * not ideal, but future insertions might fill the leaf with more
4514 * items, or items from other leaves might be moved later into our
4515 * leaf due to deletions on those leaves.
4516 */
4517 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4518 u32 min_push_space;
4519
4520 /*
4521 * push_leaf_left fixes the path. Make sure the path still points
4522 * to our leaf for a possible call to btrfs_del_ptr below.
4523 */
4524 slot = path->slots[1];
4525 refcount_inc(&leaf->refs);
4526 /*
4527 * We want to be able to at least push one item to the
4528 * left neighbour leaf, and that's the first item.
4529 */
4530 min_push_space = sizeof(struct btrfs_item) +
4531 btrfs_item_size(leaf, 0);
4532 wret = push_leaf_left(trans, root, path, 0,
4533 min_push_space, 1, (u32)-1);
4534 if (wret < 0 && wret != -ENOSPC)
4535 ret = wret;
4536
4537 if (path->nodes[0] == leaf &&
4538 btrfs_header_nritems(leaf)) {
4539 /*
4540 * If we were not able to push all items from our
4541 * leaf to its left neighbour, then attempt to
4542 * either push all the remaining items to the
4543 * right neighbour or none. There's no advantage
4544 * in pushing only some items, instead of all, as
4545 * it's pointless to end up with a leaf having
4546 * too few items while the neighbours can be full
4547 * or nearly full.
4548 */
4549 nritems = btrfs_header_nritems(leaf);
4550 min_push_space = leaf_space_used(leaf, 0, nritems);
4551 wret = push_leaf_right(trans, root, path, 0,
4552 min_push_space, 1, 0);
4553 if (wret < 0 && wret != -ENOSPC)
4554 ret = wret;
4555 }
4556
4557 if (btrfs_header_nritems(leaf) == 0) {
4558 path->slots[1] = slot;
4559 ret = btrfs_del_leaf(trans, root, path, leaf);
4560 if (ret < 0)
4561 return ret;
4562 free_extent_buffer(leaf);
4563 ret = 0;
4564 } else {
4565 /*
4566 * If we're still in the path, make sure we're dirty. Otherwise,
4567 * one of the push_leaf functions must have already dirtied this
4568 * buffer.
4569 */
4570 if (path->nodes[0] == leaf)
4571 btrfs_mark_buffer_dirty(trans, leaf);
4572 free_extent_buffer(leaf);
4573 }
4574 } else {
4575 btrfs_mark_buffer_dirty(trans, leaf);
4576 }
4577 }
4578 return ret;
4579 }
4580
4581 /*
4582 * A helper function to walk down the tree starting at min_key, and looking
4583 * for leaves that have a minimum transaction id.
4584 * This is used by the btree defrag code, and tree logging
4585 *
4586 * This does not cow, but it does stuff the starting key it finds back
4587 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4588 * key and get a writable path.
4589 *
4590 * min_trans indicates the oldest transaction that you are interested
4591 * in walking through. Any nodes or leaves older than min_trans are
4592 * skipped over (without reading them).
4593 *
4594 * returns zero if something useful was found, < 0 on error and 1 if there
4595 * was nothing in the tree that matched the search criteria.
4596 */
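/*
 * Sketch of a typical caller loop (illustrative; process_leaf() is a
 * placeholder, and real callers advance the key more carefully):
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process_leaf(path);
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;
 *	}
 */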
4597 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4598 struct btrfs_path *path,
4599 u64 min_trans)
4600 {
4601 struct extent_buffer *cur;
4602 int slot;
4603 int sret;
4604 u32 nritems;
4605 int level;
4606 int ret = 1;
4607 int keep_locks = path->keep_locks;
4608
4609 ASSERT(!path->nowait);
4610 ASSERT(path->lowest_level == 0);
4611 path->keep_locks = 1;
4612 again:
4613 cur = btrfs_read_lock_root_node(root);
4614 level = btrfs_header_level(cur);
4615 WARN_ON(path->nodes[level]);
4616 path->nodes[level] = cur;
4617 path->locks[level] = BTRFS_READ_LOCK;
4618
4619 if (btrfs_header_generation(cur) < min_trans) {
4620 ret = 1;
4621 goto out;
4622 }
4623 while (1) {
4624 nritems = btrfs_header_nritems(cur);
4625 level = btrfs_header_level(cur);
4626 sret = btrfs_bin_search(cur, 0, min_key, &slot);
4627 if (sret < 0) {
4628 ret = sret;
4629 goto out;
4630 }
4631
4632 /* At level 0 we're done, setup the path and exit. */
4633 if (level == 0) {
4634 if (slot >= nritems)
4635 goto find_next_key;
4636 ret = 0;
4637 path->slots[level] = slot;
4638 /* Save our key to hand back to the caller. */
4639 btrfs_item_key_to_cpu(cur, min_key, slot);
4640 goto out;
4641 }
4642 if (sret && slot > 0)
4643 slot--;
4644 /*
4645 * check this node pointer against the min_trans parameter.
4646 * If it is too old, skip to the next one.
4647 */
4648 while (slot < nritems) {
4649 u64 gen;
4650
4651 gen = btrfs_node_ptr_generation(cur, slot);
4652 if (gen < min_trans) {
4653 slot++;
4654 continue;
4655 }
4656 break;
4657 }
4658 find_next_key:
4659 /*
4660 * we didn't find a candidate key in this node, walk forward
4661 * and find another one
4662 */
4663 path->slots[level] = slot;
4664 if (slot >= nritems) {
4665 sret = btrfs_find_next_key(root, path, min_key, level,
4666 min_trans);
4667 if (sret == 0) {
4668 btrfs_release_path(path);
4669 goto again;
4670 } else {
4671 goto out;
4672 }
4673 }
4674 cur = btrfs_read_node_slot(cur, slot);
4675 if (IS_ERR(cur)) {
4676 ret = PTR_ERR(cur);
4677 goto out;
4678 }
4679
4680 btrfs_tree_read_lock(cur);
4681
4682 path->locks[level - 1] = BTRFS_READ_LOCK;
4683 path->nodes[level - 1] = cur;
4684 unlock_up(path, level, 1, 0, NULL);
4685 }
4686 out:
4687 path->keep_locks = keep_locks;
4688 if (ret == 0)
4689 btrfs_unlock_up_safe(path, 1);
4690 return ret;
4691 }
4692
4693 /*
4694 * this is similar to btrfs_next_leaf, but does not try to preserve
4695 * and fixup the path. It looks for and returns the next key in the
4696 * tree based on the current path and the min_trans parameters.
4697 *
4698 * 0 is returned if another key is found, < 0 if there are any errors
4699 * and 1 is returned if there are no higher keys in the tree
4700 *
4701 * path->keep_locks should be set to 1 on the search made before
4702 * calling this function.
4703 */
4704 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4705 struct btrfs_key *key, int level, u64 min_trans)
4706 {
4707 int slot;
4708 struct extent_buffer *c;
4709
4710 WARN_ON(!path->keep_locks && !path->skip_locking);
4711 while (level < BTRFS_MAX_LEVEL) {
4712 if (!path->nodes[level])
4713 return 1;
4714
4715 slot = path->slots[level] + 1;
4716 c = path->nodes[level];
4717 next:
4718 if (slot >= btrfs_header_nritems(c)) {
4719 int ret;
4720 int orig_lowest;
4721 struct btrfs_key cur_key;
4722 if (level + 1 >= BTRFS_MAX_LEVEL ||
4723 !path->nodes[level + 1])
4724 return 1;
4725
4726 if (path->locks[level + 1] || path->skip_locking) {
4727 level++;
4728 continue;
4729 }
4730
4731 slot = btrfs_header_nritems(c) - 1;
4732 if (level == 0)
4733 btrfs_item_key_to_cpu(c, &cur_key, slot);
4734 else
4735 btrfs_node_key_to_cpu(c, &cur_key, slot);
4736
4737 orig_lowest = path->lowest_level;
4738 btrfs_release_path(path);
4739 path->lowest_level = level;
4740 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4741 0, 0);
4742 path->lowest_level = orig_lowest;
4743 if (ret < 0)
4744 return ret;
4745
4746 c = path->nodes[level];
4747 slot = path->slots[level];
4748 if (ret == 0)
4749 slot++;
4750 goto next;
4751 }
4752
4753 if (level == 0)
4754 btrfs_item_key_to_cpu(c, key, slot);
4755 else {
4756 u64 gen = btrfs_node_ptr_generation(c, slot);
4757
4758 if (gen < min_trans) {
4759 slot++;
4760 goto next;
4761 }
4762 btrfs_node_key_to_cpu(c, key, slot);
4763 }
4764 return 0;
4765 }
4766 return 1;
4767 }
4768
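/*
 * Step the path forward to the next leaf, honouring @time_seq for tree
 * mod log replay when it is non-zero.
 *
 * Returns 0 on success, 1 if there is no next leaf, < 0 on error.
 */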
4769 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4770 u64 time_seq)
4771 {
4772 int slot;
4773 int level;
4774 struct extent_buffer *c;
4775 struct extent_buffer *next;
4776 struct btrfs_fs_info *fs_info = root->fs_info;
4777 struct btrfs_key key;
4778 bool need_commit_sem = false;
4779 u32 nritems;
4780 int ret;
4781 int i;
4782
4783 /*
4784 * The nowait semantics are used only for write paths, where we don't
4785 * use the tree mod log and sequence numbers.
4786 */
4787 if (time_seq)
4788 ASSERT(!path->nowait);
4789
4790 nritems = btrfs_header_nritems(path->nodes[0]);
4791 if (nritems == 0)
4792 return 1;
4793
4794 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4795 again:
4796 level = 1;
4797 next = NULL;
4798 btrfs_release_path(path);
4799
4800 path->keep_locks = 1;
4801
4802 if (time_seq) {
4803 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4804 } else {
4805 if (path->need_commit_sem) {
4806 path->need_commit_sem = 0;
4807 need_commit_sem = true;
4808 if (path->nowait) {
4809 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4810 ret = -EAGAIN;
4811 goto done;
4812 }
4813 } else {
4814 down_read(&fs_info->commit_root_sem);
4815 }
4816 }
4817 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4818 }
4819 path->keep_locks = 0;
4820
4821 if (ret < 0)
4822 goto done;
4823
4824 nritems = btrfs_header_nritems(path->nodes[0]);
4825 /*
4826 * by releasing the path above we dropped all our locks. A balance
4827 * could have added more items next to the key that used to be
4828 * at the very end of the block. So, check again here and
4829 * advance the path if there are now more items available.
4830 */
4831 if (nritems > 0 && path->slots[0] < nritems - 1) {
4832 if (ret == 0)
4833 path->slots[0]++;
4834 ret = 0;
4835 goto done;
4836 }
4837 /*
4838 * So the above check misses one case:
4839 * - after releasing the path above, someone has removed the item that
4840 * used to be at the very end of the block, and balance between leafs
4841 * gets another one with bigger key.offset to replace it.
4842 *
4843 * This one should be returned as well, or we can get leaf corruption
4844 * later (especially in __btrfs_drop_extents()).
4845 *
4846 * A bit more explanation about this check:
4847 * with ret > 0, the key isn't found, the path points to the slot
4848 * where it should be inserted, so the path->slots[0] item must be the
4849 * bigger one.
4850 */
4851 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4852 ret = 0;
4853 goto done;
4854 }
4855
4856 while (level < BTRFS_MAX_LEVEL) {
4857 if (!path->nodes[level]) {
4858 ret = 1;
4859 goto done;
4860 }
4861
4862 slot = path->slots[level] + 1;
4863 c = path->nodes[level];
4864 if (slot >= btrfs_header_nritems(c)) {
4865 level++;
4866 if (level == BTRFS_MAX_LEVEL) {
4867 ret = 1;
4868 goto done;
4869 }
4870 continue;
4871 }
4872
4874 /*
4875 * Our current level is where we're going to start from, and to
4876 * make sure lockdep doesn't complain we need to drop our locks
4877 * and nodes from 0 to our current level.
4878 */
4879 for (i = 0; i < level; i++) {
4880 if (path->locks[i]) {
4881 btrfs_tree_read_unlock(path->nodes[i]);
4882 path->locks[i] = 0;
4883 }
4884 free_extent_buffer(path->nodes[i]);
4885 path->nodes[i] = NULL;
4886 }
4887
4888 next = c;
4889 ret = read_block_for_search(root, path, &next, slot, &key);
4890 if (ret == -EAGAIN && !path->nowait)
4891 goto again;
4892
4893 if (ret < 0) {
4894 btrfs_release_path(path);
4895 goto done;
4896 }
4897
4898 if (!path->skip_locking) {
4899 ret = btrfs_try_tree_read_lock(next);
4900 if (!ret && path->nowait) {
4901 ret = -EAGAIN;
4902 goto done;
4903 }
4904 if (!ret && time_seq) {
4905 /*
4906 * If we don't get the lock, we may be racing
4907 * with push_leaf_left, holding that lock while
4908 * itself waiting for the leaf we've currently
4909 * locked. To solve this situation, we give up
4910 * on our lock and cycle.
4911 */
4912 free_extent_buffer(next);
4913 btrfs_release_path(path);
4914 cond_resched();
4915 goto again;
4916 }
4917 if (!ret)
4918 btrfs_tree_read_lock(next);
4919 }
4920 break;
4921 }
4922 path->slots[level] = slot;
4923 while (1) {
4924 level--;
4925 path->nodes[level] = next;
4926 path->slots[level] = 0;
4927 if (!path->skip_locking)
4928 path->locks[level] = BTRFS_READ_LOCK;
4929 if (!level)
4930 break;
4931
4932 ret = read_block_for_search(root, path, &next, 0, &key);
4933 if (ret == -EAGAIN && !path->nowait)
4934 goto again;
4935
4936 if (ret < 0) {
4937 btrfs_release_path(path);
4938 goto done;
4939 }
4940
4941 if (!path->skip_locking) {
4942 if (path->nowait) {
4943 if (!btrfs_try_tree_read_lock(next)) {
4944 ret = -EAGAIN;
4945 goto done;
4946 }
4947 } else {
4948 btrfs_tree_read_lock(next);
4949 }
4950 }
4951 }
4952 ret = 0;
4953 done:
4954 unlock_up(path, 0, 1, 0, NULL);
4955 if (need_commit_sem) {
4956 int ret2;
4957
4958 path->need_commit_sem = 1;
4959 ret2 = finish_need_commit_sem_search(path);
4960 up_read(&fs_info->commit_root_sem);
4961 if (ret2)
4962 ret = ret2;
4963 }
4964
4965 return ret;
4966 }
4967
4968 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4969 {
4970 path->slots[0]++;
4971 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4972 return btrfs_next_old_leaf(root, path, time_seq);
4973 return 0;
4974 }
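/*
 * Illustrative iteration over consecutive items (handle_item() is a
 * placeholder):
 *
 *	while (1) {
 *		handle_item(path->nodes[0], path->slots[0]);
 *		ret = btrfs_next_old_item(root, path, time_seq);
 *		if (ret)
 *			break;
 *	}
 */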
4975
4976 /*
4977 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4978 * searching until it gets past min_objectid or finds an item of 'type'
4979 *
4980 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4981 */
4982 int btrfs_previous_item(struct btrfs_root *root,
4983 struct btrfs_path *path, u64 min_objectid,
4984 int type)
4985 {
4986 struct btrfs_key found_key;
4987 struct extent_buffer *leaf;
4988 u32 nritems;
4989 int ret;
4990
4991 while (1) {
4992 if (path->slots[0] == 0) {
4993 ret = btrfs_prev_leaf(root, path);
4994 if (ret != 0)
4995 return ret;
4996 } else {
4997 path->slots[0]--;
4998 }
4999 leaf = path->nodes[0];
5000 nritems = btrfs_header_nritems(leaf);
5001 if (nritems == 0)
5002 return 1;
5003 if (path->slots[0] == nritems)
5004 path->slots[0]--;
5005
5006 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5007 if (found_key.objectid < min_objectid)
5008 break;
5009 if (found_key.type == type)
5010 return 0;
5011 if (found_key.objectid == min_objectid &&
5012 found_key.type < type)
5013 break;
5014 }
5015 return 1;
5016 }
5017
5018 /*
5019 * search in the extent tree to find a previous Metadata/Data extent item
5020 * with min objectid.
5021 *
5022 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5023 */
5024 int btrfs_previous_extent_item(struct btrfs_root *root,
5025 struct btrfs_path *path, u64 min_objectid)
5026 {
5027 struct btrfs_key found_key;
5028 struct extent_buffer *leaf;
5029 u32 nritems;
5030 int ret;
5031
5032 while (1) {
5033 if (path->slots[0] == 0) {
5034 ret = btrfs_prev_leaf(root, path);
5035 if (ret != 0)
5036 return ret;
5037 } else {
5038 path->slots[0]--;
5039 }
5040 leaf = path->nodes[0];
5041 nritems = btrfs_header_nritems(leaf);
5042 if (nritems == 0)
5043 return 1;
5044 if (path->slots[0] == nritems)
5045 path->slots[0]--;
5046
5047 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5048 if (found_key.objectid < min_objectid)
5049 break;
5050 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5051 found_key.type == BTRFS_METADATA_ITEM_KEY)
5052 return 0;
5053 if (found_key.objectid == min_objectid &&
5054 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5055 break;
5056 }
5057 return 1;
5058 }
5059
5060 int __init btrfs_ctree_init(void)
5061 {
5062 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
5063 if (!btrfs_path_cachep)
5064 return -ENOMEM;
5065 return 0;
5066 }
5067
5068 void __cold btrfs_ctree_exit(void)
5069 {
5070 kmem_cache_destroy(btrfs_path_cachep);
5071 }
5072