1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2008 Oracle. All rights reserved.
4 */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "misc.h"
12 #include "ctree.h"
13 #include "tree-log.h"
14 #include "disk-io.h"
15 #include "locking.h"
16 #include "backref.h"
17 #include "compression.h"
18 #include "qgroup.h"
19 #include "block-group.h"
20 #include "space-info.h"
21 #include "inode-item.h"
22 #include "fs.h"
23 #include "accessors.h"
24 #include "extent-tree.h"
25 #include "root-tree.h"
26 #include "dir-item.h"
27 #include "file-item.h"
28 #include "file.h"
29 #include "orphan.h"
30 #include "tree-checker.h"
31
32 #define MAX_CONFLICT_INODES 10
33
34 /* magic values for the inode_only field in btrfs_log_inode:
35 *
36 * LOG_INODE_ALL means to log everything
37 * LOG_INODE_EXISTS means to log just enough to recreate the inode
38 * during log replay
39 */
40 enum {
41 LOG_INODE_ALL,
42 LOG_INODE_EXISTS,
43 };
44
45 /*
46 * directory trouble cases
47 *
48 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
49 * log, we must force a full commit before doing an fsync of the directory
50 * where the unlink was done.
51 * ---> record transid of last unlink/rename per directory
52 *
53 * mkdir foo/some_dir
54 * normal commit
55 * rename foo/some_dir foo2/some_dir
56 * mkdir foo/some_dir
57 * fsync foo/some_dir/some_file
58 *
59 * The fsync above will unlink the original some_dir without recording
60 * it in its new location (foo2). After a crash, some_dir will be gone
61 * unless the fsync of some_file forces a full commit
62 *
63 * 2) we must log any new names for any file or dir that is in the fsync
64 * log. ---> check inode while renaming/linking.
65 *
66 * 2a) we must log any new names for any file or dir during rename
67 * when the directory they are being removed from was logged.
68 * ---> check inode and old parent dir during rename
69 *
70 * 2a is actually the more important variant. Without the extra logging
71 * a crash might unlink the old name without recreating the new one.
72 *
73 * 3) after a crash, we must go through any directories with a link count
74 * of zero and redo the rm -rf
75 *
76 * mkdir f1/foo
77 * normal commit
78 * rm -rf f1/foo
79 * fsync(f1)
80 *
81 * The directory f1 was fully removed from the FS, but fsync was never
82 * called on f1, only its parent dir. After a crash the rm -rf must
83 * be replayed. This must be able to recurse down the entire
84 * directory tree. The inode link count fixup code takes care of the
85 * ugly details.
86 */
87
88 /*
89 * stages for the tree walking. The first
90 * stage (0) is to only pin down the blocks we find,
91 * the second stage (1) is to make sure that all the inodes
92 * we find in the log are created in the subvolume.
93 *
94 * The last stage is to deal with directories and links and extents
95 * and all the other fun semantics
96 */
97 enum {
98 LOG_WALK_PIN_ONLY,
99 LOG_WALK_REPLAY_INODES,
100 LOG_WALK_REPLAY_DIR_INDEX,
101 LOG_WALK_REPLAY_ALL,
102 };
103
104 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
105 struct btrfs_inode *inode,
106 int inode_only,
107 struct btrfs_log_ctx *ctx);
108 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root,
110 struct btrfs_path *path, u64 objectid);
111 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
112 struct btrfs_root *root,
113 struct btrfs_root *log,
114 struct btrfs_path *path,
115 u64 dirid, bool del_all);
116 static void wait_log_commit(struct btrfs_root *root, int transid);
117
118 /*
119 * tree logging is a special write ahead log used to make sure that
120 * fsyncs and O_SYNCs can happen without doing full tree commits.
121 *
122 * Full tree commits are expensive because they require commonly
123 * modified blocks to be recowed, creating many dirty pages in the
124 * extent tree and a 4x-6x higher write load than ext3.
125 *
126 * Instead of doing a tree commit on every fsync, we use the
127 * key ranges and transaction ids to find items for a given file or directory
128 * that have changed in this transaction. Those items are copied into
129 * a special tree (one per subvolume root), that tree is written to disk
130 * and then the fsync is considered complete.
131 *
132 * After a crash, items are copied out of the log-tree back into the
133 * subvolume tree. Any file data extents found are recorded in the extent
134 * allocation tree, and the log-tree freed.
135 *
136 * The log tree is read three times: once to pin down all the extents it is
137 * using in ram, once to create all the inodes logged in the tree
138 * and once to do all the other items.
139 */
140
141 static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
142 {
143 unsigned int nofs_flag;
144 struct btrfs_inode *inode;
145
146 /* Only meant to be called for subvolume roots and not for log roots. */
147 ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
148
149 /*
150 * We're holding a transaction handle whether we are logging or
151 * replaying a log tree, so we must make sure NOFS semantics apply
152 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
153 * to allocate an inode, which can recurse back into the filesystem and
154 * attempt a transaction commit, resulting in a deadlock.
155 */
156 nofs_flag = memalloc_nofs_save();
157 inode = btrfs_iget(objectid, root);
158 memalloc_nofs_restore(nofs_flag);
159
160 return inode;
161 }
162
163 /*
164 * start a sub transaction and set up the log tree.
165 * This increments the log tree writer count to make the people
166 * syncing the tree wait for us to finish
167 */
168 static int start_log_trans(struct btrfs_trans_handle *trans,
169 struct btrfs_root *root,
170 struct btrfs_log_ctx *ctx)
171 {
172 struct btrfs_fs_info *fs_info = root->fs_info;
173 struct btrfs_root *tree_root = fs_info->tree_root;
174 const bool zoned = btrfs_is_zoned(fs_info);
175 int ret = 0;
176 bool created = false;
177
178 /*
179 * First check if the log root tree was already created. If not, create
180 * it before locking the root's log_mutex, just to keep lockdep happy.
181 */
182 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
183 mutex_lock(&tree_root->log_mutex);
184 if (!fs_info->log_root_tree) {
185 ret = btrfs_init_log_root_tree(trans, fs_info);
186 if (!ret) {
187 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
188 created = true;
189 }
190 }
191 mutex_unlock(&tree_root->log_mutex);
192 if (ret)
193 return ret;
194 }
195
196 mutex_lock(&root->log_mutex);
197
198 again:
199 if (root->log_root) {
200 int index = (root->log_transid + 1) % 2;
201
202 if (btrfs_need_log_full_commit(trans)) {
203 ret = BTRFS_LOG_FORCE_COMMIT;
204 goto out;
205 }
206
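/*
 * On zoned filesystems, if the previous log transaction (at most two
 * can be in flight) is still committing, wait for it to finish so
 * writes to the log stay sequential.
 */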
207 if (zoned && atomic_read(&root->log_commit[index])) {
208 wait_log_commit(root, root->log_transid - 1);
209 goto again;
210 }
211
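/*
 * Record whether more than one task is using this log transaction, so
 * the log commit path can batch concurrent fsyncs.
 */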
212 if (!root->log_start_pid) {
213 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
214 root->log_start_pid = current->pid;
215 } else if (root->log_start_pid != current->pid) {
216 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
217 }
218 } else {
219 /*
220 * This means fs_info->log_root_tree was already created
221 * for some other FS trees. Do the full commit not to mix
222 * nodes from multiple log transactions to do sequential
223 * writing.
224 */
225 if (zoned && !created) {
226 ret = BTRFS_LOG_FORCE_COMMIT;
227 goto out;
228 }
229
230 ret = btrfs_add_log_tree(trans, root);
231 if (ret)
232 goto out;
233
234 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
235 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
236 root->log_start_pid = current->pid;
237 }
238
239 atomic_inc(&root->log_writers);
240 if (!ctx->logging_new_name) {
241 int index = root->log_transid % 2;
242 list_add_tail(&ctx->list, &root->log_ctxs[index]);
243 ctx->log_transid = root->log_transid;
244 }
245
246 out:
247 mutex_unlock(&root->log_mutex);
248 return ret;
249 }
250
251 /*
252 * returns 0 if there was a log transaction running and we were able
253 * to join, or returns -ENOENT if there were no transactions
254 * in progress
255 */
256 static int join_running_log_trans(struct btrfs_root *root)
257 {
258 const bool zoned = btrfs_is_zoned(root->fs_info);
259 int ret = -ENOENT;
260
261 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
262 return ret;
263
264 mutex_lock(&root->log_mutex);
265 again:
266 if (root->log_root) {
267 int index = (root->log_transid + 1) % 2;
268
269 ret = 0;
270 if (zoned && atomic_read(&root->log_commit[index])) {
271 wait_log_commit(root, root->log_transid - 1);
272 goto again;
273 }
274 atomic_inc(&root->log_writers);
275 }
276 mutex_unlock(&root->log_mutex);
277 return ret;
278 }
279
280 /*
281 * This either makes the current running log transaction wait
282 * until you call btrfs_end_log_trans() or it makes any future
283 * log transactions wait until you call btrfs_end_log_trans()
284 */
285 void btrfs_pin_log_trans(struct btrfs_root *root)
286 {
287 atomic_inc(&root->log_writers);
288 }
289
290 /*
291 * indicate we're done making changes to the log tree
292 * and wake up anyone waiting to do a sync
293 */
294 void btrfs_end_log_trans(struct btrfs_root *root)
295 {
296 if (atomic_dec_and_test(&root->log_writers)) {
297 /* atomic_dec_and_test implies a barrier */
298 cond_wake_up_nomb(&root->log_writer_wait);
299 }
300 }
301
302 /*
303 * the walk control struct is used to pass state down the chain when
304 * processing the log tree. The stage field tells us which part
305 * of the log tree processing we are currently doing. The others
306 * are state fields used for that specific part
307 */
308 struct walk_control {
309 /* should we free the extent on disk when done? This is used
310 * at transaction commit time while freeing a log tree
311 */
312 int free;
313
314 /* pin only walk, we record which extents on disk belong to the
315 * log trees
316 */
317 int pin;
318
319 /* what stage of the replay code we're currently in */
320 int stage;
321
322 /*
323 * Ignore any items from the inode currently being processed. Needs
324 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
325 * the LOG_WALK_REPLAY_INODES stage.
326 */
327 bool ignore_cur_inode;
328
329 /* the root we are currently replaying */
330 struct btrfs_root *replay_dest;
331
332 /* the trans handle for the current replay */
333 struct btrfs_trans_handle *trans;
334
335 /* the function that gets used to process blocks we find in the
336 * tree. Note the extent_buffer might not be up to date when it is
337 * passed in, and it must be checked or read if you need the data
338 * inside it
339 */
340 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
341 struct walk_control *wc, u64 gen, int level);
342 };
343
344 /*
345 * process_func used to pin down extents, write them or wait on them
346 */
347 static int process_one_buffer(struct btrfs_root *log,
348 struct extent_buffer *eb,
349 struct walk_control *wc, u64 gen, int level)
350 {
351 struct btrfs_fs_info *fs_info = log->fs_info;
352 int ret = 0;
353
354 /*
355 * If this fs is mixed then we need to be able to process the leaves to
356 * pin down any logged extents, so we have to read the block.
357 */
358 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
359 struct btrfs_tree_parent_check check = {
360 .level = level,
361 .transid = gen
362 };
363
364 ret = btrfs_read_extent_buffer(eb, &check);
365 if (ret)
366 return ret;
367 }
368
369 if (wc->pin) {
370 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
371 if (ret)
372 return ret;
373
374 if (btrfs_buffer_uptodate(eb, gen, 0) &&
375 btrfs_header_level(eb) == 0)
376 ret = btrfs_exclude_logged_extents(eb);
377 }
378 return ret;
379 }
380
381 /*
382 * Item overwrite used by log replay. The given eb, slot and key all refer to
383 * the source data we are copying out.
384 *
385 * The given root is for the tree we are copying into, and path is a scratch
386 * path for use in this function (it should be released on entry and will be
387 * released on exit).
388 *
389 * If the key is already in the destination tree the existing item is
390 * overwritten. If the existing item isn't big enough, it is extended.
391 * If it is too large, it is truncated.
392 *
393 * If the key isn't in the destination yet, a new item is inserted.
394 */
395 static int overwrite_item(struct btrfs_trans_handle *trans,
396 struct btrfs_root *root,
397 struct btrfs_path *path,
398 struct extent_buffer *eb, int slot,
399 struct btrfs_key *key)
400 {
401 int ret;
402 u32 item_size;
403 u64 saved_i_size = 0;
404 int save_old_i_size = 0;
405 unsigned long src_ptr;
406 unsigned long dst_ptr;
407 struct extent_buffer *dst_eb;
408 int dst_slot;
409 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
410
411 /*
412 * This is only used during log replay, so the root is always from a
413 * fs/subvolume tree. In case we ever need to support a log root, then
414 * we'll have to clone the leaf in the path, release the path and use
415 * the leaf before writing into the log tree. See the comments at
416 * copy_items() for more details.
417 */
418 ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
419
420 item_size = btrfs_item_size(eb, slot);
421 src_ptr = btrfs_item_ptr_offset(eb, slot);
422
423 /* Look for the key in the destination tree. */
424 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
425 if (ret < 0)
426 return ret;
427
428 dst_eb = path->nodes[0];
429 dst_slot = path->slots[0];
430
431 if (ret == 0) {
432 char *src_copy;
433 const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
434
435 if (dst_size != item_size)
436 goto insert;
437
438 if (item_size == 0) {
439 btrfs_release_path(path);
440 return 0;
441 }
442 src_copy = kmalloc(item_size, GFP_NOFS);
443 if (!src_copy) {
444 btrfs_release_path(path);
445 return -ENOMEM;
446 }
447
448 read_extent_buffer(eb, src_copy, src_ptr, item_size);
449 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
450 ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
451
452 kfree(src_copy);
453 /*
454 * If they have the same contents, just return. This saves
455 * us from cowing blocks in the destination tree and doing
456 * extra writes that may not have been done by a previous
457 * sync
458 */
459 if (ret == 0) {
460 btrfs_release_path(path);
461 return 0;
462 }
463
464 /*
465 * We need to load the old nbytes into the inode so when we
466 * replay the extents we've logged we get the right nbytes.
467 */
468 if (inode_item) {
469 struct btrfs_inode_item *item;
470 u64 nbytes;
471 u32 mode;
472
473 item = btrfs_item_ptr(dst_eb, dst_slot,
474 struct btrfs_inode_item);
475 nbytes = btrfs_inode_nbytes(dst_eb, item);
476 item = btrfs_item_ptr(eb, slot,
477 struct btrfs_inode_item);
478 btrfs_set_inode_nbytes(eb, item, nbytes);
479
480 /*
481 * If this is a directory we need to reset the i_size to
482 * 0 so that we can set it up properly when replaying
483 * the rest of the items in this log.
484 */
485 mode = btrfs_inode_mode(eb, item);
486 if (S_ISDIR(mode))
487 btrfs_set_inode_size(eb, item, 0);
488 }
489 } else if (inode_item) {
490 struct btrfs_inode_item *item;
491 u32 mode;
492
493 /*
494 * New inode, set nbytes to 0 so that the nbytes comes out
495 * properly when we replay the extents.
496 */
497 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
498 btrfs_set_inode_nbytes(eb, item, 0);
499
500 /*
501 * If this is a directory we need to reset the i_size to 0 so
502 * that we can set it up properly when replaying the rest of
503 * the items in this log.
504 */
505 mode = btrfs_inode_mode(eb, item);
506 if (S_ISDIR(mode))
507 btrfs_set_inode_size(eb, item, 0);
508 }
509 insert:
510 btrfs_release_path(path);
511 /* try to insert the key into the destination tree */
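/*
 * Keep the path on failure: with -EEXIST or -EOVERFLOW it still points
 * at the existing item, which is resized to item_size below.
 */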
512 path->skip_release_on_error = 1;
513 ret = btrfs_insert_empty_item(trans, root, path,
514 key, item_size);
515 path->skip_release_on_error = 0;
516
517 dst_eb = path->nodes[0];
518 dst_slot = path->slots[0];
519
520 /* make sure any existing item is the correct size */
521 if (ret == -EEXIST || ret == -EOVERFLOW) {
522 const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
523
524 if (found_size > item_size)
525 btrfs_truncate_item(trans, path, item_size, 1);
526 else if (found_size < item_size)
527 btrfs_extend_item(trans, path, item_size - found_size);
528 } else if (ret) {
529 return ret;
530 }
531 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
532
533 /* don't overwrite an existing inode if the generation number
534 * was logged as zero. This is done when the tree logging code
535 * is just logging an inode to make sure it exists after recovery.
536 *
537 * Also, don't overwrite i_size on directories during replay.
538 * log replay inserts and removes directory items based on the
539 * state of the tree found in the subvolume, and i_size is modified
540 * as it goes
541 */
542 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
543 struct btrfs_inode_item *src_item;
544 struct btrfs_inode_item *dst_item;
545
546 src_item = (struct btrfs_inode_item *)src_ptr;
547 dst_item = (struct btrfs_inode_item *)dst_ptr;
548
549 if (btrfs_inode_generation(eb, src_item) == 0) {
550 const u64 ino_size = btrfs_inode_size(eb, src_item);
551
552 /*
553 * For regular files an ino_size == 0 is used only when
554 * logging that an inode exists, as part of a directory
555 * fsync, and the inode wasn't fsynced before. In this
556 * case don't set the size of the inode in the fs/subvol
557 * tree, otherwise we would be throwing valid data away.
558 */
559 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
560 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
561 ino_size != 0)
562 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
563 goto no_copy;
564 }
565
566 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
567 S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
568 save_old_i_size = 1;
569 saved_i_size = btrfs_inode_size(dst_eb, dst_item);
570 }
571 }
572
573 copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
574
575 if (save_old_i_size) {
576 struct btrfs_inode_item *dst_item;
577
578 dst_item = (struct btrfs_inode_item *)dst_ptr;
579 btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
580 }
581
582 /* make sure the generation is filled in */
583 if (key->type == BTRFS_INODE_ITEM_KEY) {
584 struct btrfs_inode_item *dst_item;
585
586 dst_item = (struct btrfs_inode_item *)dst_ptr;
587 if (btrfs_inode_generation(dst_eb, dst_item) == 0)
588 btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
589 }
590 no_copy:
591 btrfs_release_path(path);
592 return 0;
593 }
594
595 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
596 struct fscrypt_str *name)
597 {
598 char *buf;
599
600 buf = kmalloc(len, GFP_NOFS);
601 if (!buf)
602 return -ENOMEM;
603
604 read_extent_buffer(eb, buf, (unsigned long)start, len);
605 name->name = buf;
606 name->len = len;
607 return 0;
608 }
609
610 /* replays a single extent in 'eb' at 'slot' with 'key' into the
611 * subvolume 'root'. path is released on entry and should be released
612 * on exit.
613 *
614 * extents in the log tree have not been allocated out of the extent
615 * tree yet. So, this completes the allocation, taking a reference
616 * as required if the extent already exists or creating a new extent
617 * if it isn't in the extent allocation tree yet.
618 *
619 * The extent is inserted into the file, dropping any existing extents
620 * from the file that overlap the new one.
621 */
622 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
623 struct btrfs_root *root,
624 struct btrfs_path *path,
625 struct extent_buffer *eb, int slot,
626 struct btrfs_key *key)
627 {
628 struct btrfs_drop_extents_args drop_args = { 0 };
629 struct btrfs_fs_info *fs_info = root->fs_info;
630 int found_type;
631 u64 extent_end;
632 u64 start = key->offset;
633 u64 nbytes = 0;
634 struct btrfs_file_extent_item *item;
635 struct btrfs_inode *inode = NULL;
636 unsigned long size;
637 int ret = 0;
638
639 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
640 found_type = btrfs_file_extent_type(eb, item);
641
642 if (found_type == BTRFS_FILE_EXTENT_REG ||
643 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
644 nbytes = btrfs_file_extent_num_bytes(eb, item);
645 extent_end = start + nbytes;
646
647 /*
648 * We don't add to the inode's nbytes if we are prealloc or a
649 * hole.
650 */
651 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
652 nbytes = 0;
653 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
654 size = btrfs_file_extent_ram_bytes(eb, item);
655 nbytes = btrfs_file_extent_ram_bytes(eb, item);
656 extent_end = ALIGN(start + size,
657 fs_info->sectorsize);
658 } else {
659 btrfs_err(fs_info,
660 "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
661 found_type, btrfs_root_id(root), key->objectid, key->offset);
662 return -EUCLEAN;
663 }
664
665 inode = btrfs_iget_logging(key->objectid, root);
666 if (IS_ERR(inode))
667 return PTR_ERR(inode);
668
669 /*
670 * first check to see if we already have this extent in the
671 * file. This must be done before the btrfs_drop_extents run
672 * so we don't try to drop this extent.
673 */
674 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
675
676 if (ret == 0 &&
677 (found_type == BTRFS_FILE_EXTENT_REG ||
678 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
679 struct btrfs_file_extent_item existing;
680 unsigned long ptr;
681
682 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
683 read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
684
685 /*
686 * we already have a pointer to this exact extent,
687 * we don't have to do anything
688 */
689 if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
690 sizeof(existing)) == 0) {
691 btrfs_release_path(path);
692 goto out;
693 }
694 }
695 btrfs_release_path(path);
696
697 /* drop any overlapping extents */
698 drop_args.start = start;
699 drop_args.end = extent_end;
700 drop_args.drop_cache = true;
701 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
702 if (ret)
703 goto out;
704
705 if (found_type == BTRFS_FILE_EXTENT_REG ||
706 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
707 u64 offset;
708 unsigned long dest_offset;
709 struct btrfs_key ins;
710
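/*
 * A zero disk_bytenr means a hole. With the NO_HOLES feature there is
 * no file extent item to insert for it, so just update the inode.
 */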
711 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
712 btrfs_fs_incompat(fs_info, NO_HOLES))
713 goto update_inode;
714
715 ret = btrfs_insert_empty_item(trans, root, path, key,
716 sizeof(*item));
717 if (ret)
718 goto out;
719 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
720 path->slots[0]);
721 copy_extent_buffer(path->nodes[0], eb, dest_offset,
722 (unsigned long)item, sizeof(*item));
723
724 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
725 ins.type = BTRFS_EXTENT_ITEM_KEY;
726 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
727 offset = key->offset - btrfs_file_extent_offset(eb, item);
728
729 /*
730 * Manually record the dirty extent, as here we did a shallow
731 * file extent item copy and skipped the normal backref update,
732 * modifying the extent tree all by ourselves.
733 * So we need to manually record the dirty extent for qgroup,
734 * as the owner of the file extent changed from the log tree
735 * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
736 */
737 ret = btrfs_qgroup_trace_extent(trans,
738 btrfs_file_extent_disk_bytenr(eb, item),
739 btrfs_file_extent_disk_num_bytes(eb, item));
740 if (ret < 0)
741 goto out;
742
743 if (ins.objectid > 0) {
744 u64 csum_start;
745 u64 csum_end;
746 LIST_HEAD(ordered_sums);
747
748 /*
749 * is this extent already allocated in the extent
750 * allocation tree? If so, just add a reference
751 */
752 ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
753 ins.offset);
754 if (ret < 0) {
755 goto out;
756 } else if (ret == 0) {
757 struct btrfs_ref ref = {
758 .action = BTRFS_ADD_DELAYED_REF,
759 .bytenr = ins.objectid,
760 .num_bytes = ins.offset,
761 .owning_root = btrfs_root_id(root),
762 .ref_root = btrfs_root_id(root),
763 };
764 btrfs_init_data_ref(&ref, key->objectid, offset,
765 0, false);
766 ret = btrfs_inc_extent_ref(trans, &ref);
767 if (ret)
768 goto out;
769 } else {
770 /*
771 * insert the extent pointer in the extent
772 * allocation tree
773 */
774 ret = btrfs_alloc_logged_file_extent(trans,
775 btrfs_root_id(root),
776 key->objectid, offset, &ins);
777 if (ret)
778 goto out;
779 }
780 btrfs_release_path(path);
781
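/*
 * Checksums are indexed by disk bytenr. A compressed file extent
 * always references the whole compressed extent on disk, while an
 * uncompressed one may reference only a sub-range of it.
 */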
782 if (btrfs_file_extent_compression(eb, item)) {
783 csum_start = ins.objectid;
784 csum_end = csum_start + ins.offset;
785 } else {
786 csum_start = ins.objectid +
787 btrfs_file_extent_offset(eb, item);
788 csum_end = csum_start +
789 btrfs_file_extent_num_bytes(eb, item);
790 }
791
792 ret = btrfs_lookup_csums_list(root->log_root,
793 csum_start, csum_end - 1,
794 &ordered_sums, false);
795 if (ret < 0)
796 goto out;
797 ret = 0;
798 /*
799 * Now delete all existing csums in the csum root that
800 * cover our range. We do this because we can have an
801 * extent that is completely referenced by one file
802 * extent item and partially referenced by another
803 * file extent item (like after using the clone or
804 * extent_same ioctls). In this case if we end up doing
805 * the replay of the one that partially references the
806 * extent first, and we do not do the csum deletion
807 * below, we can get 2 csum items in the csum tree that
808 * overlap each other. For example, imagine our log has
809 * the two following file extent items:
810 *
811 * key (257 EXTENT_DATA 409600)
812 * extent data disk byte 12845056 nr 102400
813 * extent data offset 20480 nr 20480 ram 102400
814 *
815 * key (257 EXTENT_DATA 819200)
816 * extent data disk byte 12845056 nr 102400
817 * extent data offset 0 nr 102400 ram 102400
818 *
819 * Where the second one fully references the 100K extent
820 * that starts at disk byte 12845056, and the log tree
821 * has a single csum item that covers the entire range
822 * of the extent:
823 *
824 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
825 *
826 * After the first file extent item is replayed, the
827 * csum tree gets the following csum item:
828 *
829 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
830 *
831 * Which covers the 20K sub-range starting at offset 20K
832 * of our extent. Now when we replay the second file
833 * extent item, if we do not delete existing csum items
834 * that cover any of its blocks, we end up getting two
835 * csum items in our csum tree that overlap each other:
836 *
837 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
838 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
839 *
840 * Which is a problem, because after this anyone trying
841 * to look up the checksum of any block of our
842 * extent starting at an offset of 40K or higher, will
843 * end up looking at the second csum item only, which
844 * does not contain the checksum for any block starting
845 * at offset 40K or higher of our extent.
846 */
847 while (!list_empty(&ordered_sums)) {
848 struct btrfs_ordered_sum *sums;
849 struct btrfs_root *csum_root;
850
851 sums = list_first_entry(&ordered_sums,
852 struct btrfs_ordered_sum,
853 list);
854 csum_root = btrfs_csum_root(fs_info,
855 sums->logical);
856 if (!ret)
857 ret = btrfs_del_csums(trans, csum_root,
858 sums->logical,
859 sums->len);
860 if (!ret)
861 ret = btrfs_csum_file_blocks(trans,
862 csum_root,
863 sums);
864 list_del(&sums->list);
865 kfree(sums);
866 }
867 if (ret)
868 goto out;
869 } else {
870 btrfs_release_path(path);
871 }
872 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
873 /* inline extents are easy, we just overwrite them */
874 ret = overwrite_item(trans, root, path, eb, slot, key);
875 if (ret)
876 goto out;
877 }
878
879 ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
880 if (ret)
881 goto out;
882
883 update_inode:
884 btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
885 ret = btrfs_update_inode(trans, inode);
886 out:
887 iput(&inode->vfs_inode);
888 return ret;
889 }
890
891 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
892 struct btrfs_inode *dir,
893 struct btrfs_inode *inode,
894 const struct fscrypt_str *name)
895 {
896 int ret;
897
898 ret = btrfs_unlink_inode(trans, dir, inode, name);
899 if (ret)
900 return ret;
901 /*
902 * Whenever we need to check if a name exists or not, we check the
903 * fs/subvolume tree. So after an unlink we must run delayed items, so
904 * that future checks for a name during log replay see that the name
905 * does not exist anymore.
906 */
907 return btrfs_run_delayed_items(trans);
908 }
909
910 /*
911 * when cleaning up conflicts between the directory names in the
912 * subvolume, directory names in the log and directory names in the
913 * inode back references, we may have to unlink inodes from directories.
914 *
915 * This is a helper function to do the unlink of a specific directory
916 * item
917 */
918 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
919 struct btrfs_path *path,
920 struct btrfs_inode *dir,
921 struct btrfs_dir_item *di)
922 {
923 struct btrfs_root *root = dir->root;
924 struct btrfs_inode *inode;
925 struct fscrypt_str name;
926 struct extent_buffer *leaf;
927 struct btrfs_key location;
928 int ret;
929
930 leaf = path->nodes[0];
931
932 btrfs_dir_item_key_to_cpu(leaf, di, &location);
933 ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
934 if (ret)
935 return -ENOMEM;
936
937 btrfs_release_path(path);
938
939 inode = btrfs_iget_logging(location.objectid, root);
940 if (IS_ERR(inode)) {
941 ret = PTR_ERR(inode);
942 inode = NULL;
943 goto out;
944 }
945
946 ret = link_to_fixup_dir(trans, root, path, location.objectid);
947 if (ret)
948 goto out;
949
950 ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
951 out:
952 kfree(name.name);
953 if (inode)
954 iput(&inode->vfs_inode);
955 return ret;
956 }
957
958 /*
959 * See if a given name and sequence number found in an inode back reference are
960 * already in a directory and correctly point to this inode.
961 *
962 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
963 * exists.
964 */
965 static noinline int inode_in_dir(struct btrfs_root *root,
966 struct btrfs_path *path,
967 u64 dirid, u64 objectid, u64 index,
968 struct fscrypt_str *name)
969 {
970 struct btrfs_dir_item *di;
971 struct btrfs_key location;
972 int ret = 0;
973
974 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
975 index, name, 0);
976 if (IS_ERR(di)) {
977 ret = PTR_ERR(di);
978 goto out;
979 } else if (di) {
980 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
981 if (location.objectid != objectid)
982 goto out;
983 } else {
984 goto out;
985 }
986
987 btrfs_release_path(path);
988 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
989 if (IS_ERR(di)) {
990 ret = PTR_ERR(di);
991 goto out;
992 } else if (di) {
993 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
994 if (location.objectid == objectid)
995 ret = 1;
996 }
997 out:
998 btrfs_release_path(path);
999 return ret;
1000 }
1001
1002 /*
1003 * helper function to check a log tree for a named back reference in
1004 * an inode. This is used to decide if a back reference that is
1005 * found in the subvolume conflicts with what we find in the log.
1006 *
1007 * inode backreferences may have multiple refs in a single item,
1008 * during replay we process one reference at a time, and we don't
1009 * want to delete valid links to a file from the subvolume if that
1010 * link is also in the log.
1011 */
1012 static noinline int backref_in_log(struct btrfs_root *log,
1013 struct btrfs_key *key,
1014 u64 ref_objectid,
1015 const struct fscrypt_str *name)
1016 {
1017 struct btrfs_path *path;
1018 int ret;
1019
1020 path = btrfs_alloc_path();
1021 if (!path)
1022 return -ENOMEM;
1023
1024 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1025 if (ret < 0) {
1026 goto out;
1027 } else if (ret == 1) {
1028 ret = 0;
1029 goto out;
1030 }
1031
1032 if (key->type == BTRFS_INODE_EXTREF_KEY)
1033 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1034 path->slots[0],
1035 ref_objectid, name);
1036 else
1037 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1038 path->slots[0], name);
1039 out:
1040 btrfs_free_path(path);
1041 return ret;
1042 }
1043
1044 static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
1045 struct btrfs_path *path,
1046 struct btrfs_root *log_root,
1047 struct btrfs_key *search_key,
1048 struct btrfs_inode *dir,
1049 struct btrfs_inode *inode,
1050 u64 parent_objectid)
1051 {
1052 struct extent_buffer *leaf = path->nodes[0];
1053 unsigned long ptr;
1054 unsigned long ptr_end;
1055
1056 /*
1057 * Check all the names in this back reference to see if they are in the
1058 * log. If so, we allow them to stay otherwise they must be unlinked as
1059 * a conflict.
1060 */
1061 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1062 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1063 while (ptr < ptr_end) {
1064 struct fscrypt_str victim_name;
1065 struct btrfs_inode_ref *victim_ref;
1066 int ret;
1067
1068 victim_ref = (struct btrfs_inode_ref *)ptr;
1069 ret = read_alloc_one_name(leaf, (victim_ref + 1),
1070 btrfs_inode_ref_name_len(leaf, victim_ref),
1071 &victim_name);
1072 if (ret)
1073 return ret;
1074
1075 ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
1076 if (ret) {
1077 kfree(victim_name.name);
1078 if (ret < 0)
1079 return ret;
1080 ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1081 continue;
1082 }
1083
1084 inc_nlink(&inode->vfs_inode);
1085 btrfs_release_path(path);
1086
1087 ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name);
1088 kfree(victim_name.name);
1089 if (ret)
1090 return ret;
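/* We released the path, tell the caller to restart the search. */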
1091 return -EAGAIN;
1092 }
1093
1094 return 0;
1095 }
1096
1097 static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans,
1098 struct btrfs_path *path,
1099 struct btrfs_root *root,
1100 struct btrfs_root *log_root,
1101 struct btrfs_key *search_key,
1102 struct btrfs_inode *inode,
1103 u64 inode_objectid,
1104 u64 parent_objectid)
1105 {
1106 struct extent_buffer *leaf = path->nodes[0];
1107 const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1108 const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
1109 u32 cur_offset = 0;
1110
1111 while (cur_offset < item_size) {
1112 struct btrfs_inode_extref *extref;
1113 struct btrfs_inode *victim_parent;
1114 struct fscrypt_str victim_name;
1115 int ret;
1116
1117 extref = (struct btrfs_inode_extref *)(base + cur_offset);
1118 victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
1119
1120 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1121 goto next;
1122
1123 ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
1124 &victim_name);
1125 if (ret)
1126 return ret;
1127
1128 search_key->objectid = inode_objectid;
1129 search_key->type = BTRFS_INODE_EXTREF_KEY;
1130 search_key->offset = btrfs_extref_hash(parent_objectid,
1131 victim_name.name,
1132 victim_name.len);
1133 ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
1134 if (ret) {
1135 kfree(victim_name.name);
1136 if (ret < 0)
1137 return ret;
1138 next:
1139 cur_offset += victim_name.len + sizeof(*extref);
1140 continue;
1141 }
1142
1143 victim_parent = btrfs_iget_logging(parent_objectid, root);
1144 if (IS_ERR(victim_parent)) {
1145 kfree(victim_name.name);
1146 return PTR_ERR(victim_parent);
1147 }
1148
1149 inc_nlink(&inode->vfs_inode);
1150 btrfs_release_path(path);
1151
1152 ret = unlink_inode_for_log_replay(trans, victim_parent, inode,
1153 &victim_name);
1154 iput(&victim_parent->vfs_inode);
1155 kfree(victim_name.name);
1156 if (ret)
1157 return ret;
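/* We released the path, tell the caller to restart the search. */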
1158 return -EAGAIN;
1159 }
1160
1161 return 0;
1162 }
1163
1164 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1165 struct btrfs_root *root,
1166 struct btrfs_path *path,
1167 struct btrfs_root *log_root,
1168 struct btrfs_inode *dir,
1169 struct btrfs_inode *inode,
1170 u64 inode_objectid, u64 parent_objectid,
1171 u64 ref_index, struct fscrypt_str *name)
1172 {
1173 int ret;
1174 struct btrfs_dir_item *di;
1175 struct btrfs_key search_key;
1176 struct btrfs_inode_extref *extref;
1177
1178 again:
1179 /* Search old style refs */
1180 search_key.objectid = inode_objectid;
1181 search_key.type = BTRFS_INODE_REF_KEY;
1182 search_key.offset = parent_objectid;
1183 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1184 if (ret < 0) {
1185 return ret;
1186 } else if (ret == 0) {
1187 /*
1188 * Are we trying to overwrite a back ref for the root directory?
1189 * If so, we're done.
1190 */
1191 if (search_key.objectid == search_key.offset)
1192 return 1;
1193
1194 ret = unlink_refs_not_in_log(trans, path, log_root, &search_key,
1195 dir, inode, parent_objectid);
1196 if (ret == -EAGAIN)
1197 goto again;
1198 else if (ret)
1199 return ret;
1200 }
1201 btrfs_release_path(path);
1202
1203 /* Same search but for extended refs */
1204 extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid);
1205 if (IS_ERR(extref)) {
1206 return PTR_ERR(extref);
1207 } else if (extref) {
1208 ret = unlink_extrefs_not_in_log(trans, path, root, log_root,
1209 &search_key, inode,
1210 inode_objectid, parent_objectid);
1211 if (ret == -EAGAIN)
1212 goto again;
1213 else if (ret)
1214 return ret;
1215 }
1216 btrfs_release_path(path);
1217
1218 /* look for a conflicting sequence number */
1219 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1220 ref_index, name, 0);
1221 if (IS_ERR(di)) {
1222 return PTR_ERR(di);
1223 } else if (di) {
1224 ret = drop_one_dir_item(trans, path, dir, di);
1225 if (ret)
1226 return ret;
1227 }
1228 btrfs_release_path(path);
1229
1230 /* look for a conflicting name */
1231 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1232 if (IS_ERR(di)) {
1233 return PTR_ERR(di);
1234 } else if (di) {
1235 ret = drop_one_dir_item(trans, path, dir, di);
1236 if (ret)
1237 return ret;
1238 }
1239 btrfs_release_path(path);
1240
1241 return 0;
1242 }
1243
1244 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1245 struct fscrypt_str *name, u64 *index,
1246 u64 *parent_objectid)
1247 {
1248 struct btrfs_inode_extref *extref;
1249 int ret;
1250
1251 extref = (struct btrfs_inode_extref *)ref_ptr;
1252
1253 ret = read_alloc_one_name(eb, &extref->name,
1254 btrfs_inode_extref_name_len(eb, extref), name);
1255 if (ret)
1256 return ret;
1257
1258 if (index)
1259 *index = btrfs_inode_extref_index(eb, extref);
1260 if (parent_objectid)
1261 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1262
1263 return 0;
1264 }
1265
1266 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1267 struct fscrypt_str *name, u64 *index)
1268 {
1269 struct btrfs_inode_ref *ref;
1270 int ret;
1271
1272 ref = (struct btrfs_inode_ref *)ref_ptr;
1273
1274 ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1275 name);
1276 if (ret)
1277 return ret;
1278
1279 if (index)
1280 *index = btrfs_inode_ref_index(eb, ref);
1281
1282 return 0;
1283 }
1284
1285 /*
1286 * Take an inode reference item from the log tree and iterate all names from the
1287 * inode reference item in the subvolume tree with the same key (if it exists).
1288 * For any name that is not in the inode reference item from the log tree, do a
1289 * proper unlink of that name (that is, remove its entry from the inode
1290 * reference item and both dir index keys).
1291 */
1292 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1293 struct btrfs_root *root,
1294 struct btrfs_path *path,
1295 struct btrfs_inode *inode,
1296 struct extent_buffer *log_eb,
1297 int log_slot,
1298 struct btrfs_key *key)
1299 {
1300 int ret;
1301 unsigned long ref_ptr;
1302 unsigned long ref_end;
1303 struct extent_buffer *eb;
1304
1305 again:
1306 btrfs_release_path(path);
1307 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1308 if (ret > 0) {
1309 ret = 0;
1310 goto out;
1311 }
1312 if (ret < 0)
1313 goto out;
1314
1315 eb = path->nodes[0];
1316 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1317 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1318 while (ref_ptr < ref_end) {
1319 struct fscrypt_str name;
1320 u64 parent_id;
1321
1322 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1323 ret = extref_get_fields(eb, ref_ptr, &name,
1324 NULL, &parent_id);
1325 } else {
1326 parent_id = key->offset;
1327 ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1328 }
1329 if (ret)
1330 goto out;
1331
1332 if (key->type == BTRFS_INODE_EXTREF_KEY)
1333 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1334 parent_id, &name);
1335 else
1336 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1337
1338 if (!ret) {
1339 struct btrfs_inode *dir;
1340
1341 btrfs_release_path(path);
1342 dir = btrfs_iget_logging(parent_id, root);
1343 if (IS_ERR(dir)) {
1344 ret = PTR_ERR(dir);
1345 kfree(name.name);
1346 goto out;
1347 }
1348 ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
1349 kfree(name.name);
1350 iput(&dir->vfs_inode);
1351 if (ret)
1352 goto out;
1353 goto again;
1354 }
1355
1356 kfree(name.name);
1357 ref_ptr += name.len;
1358 if (key->type == BTRFS_INODE_EXTREF_KEY)
1359 ref_ptr += sizeof(struct btrfs_inode_extref);
1360 else
1361 ref_ptr += sizeof(struct btrfs_inode_ref);
1362 }
1363 ret = 0;
1364 out:
1365 btrfs_release_path(path);
1366 return ret;
1367 }
1368
1369 /*
1370 * replay one inode back reference item found in the log tree.
1371 * eb, slot and key refer to the buffer and key found in the log tree.
1372 * root is the destination we are replaying into, and path is for temp
1373 * use by this function. (it should be released on return).
1374 */
1375 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1376 struct btrfs_root *root,
1377 struct btrfs_root *log,
1378 struct btrfs_path *path,
1379 struct extent_buffer *eb, int slot,
1380 struct btrfs_key *key)
1381 {
1382 struct btrfs_inode *dir = NULL;
1383 struct btrfs_inode *inode = NULL;
1384 unsigned long ref_ptr;
1385 unsigned long ref_end;
1386 struct fscrypt_str name = { 0 };
1387 int ret;
1388 const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
1389 u64 parent_objectid;
1390 u64 inode_objectid;
1391 u64 ref_index = 0;
1392 int ref_struct_size;
1393
1394 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1395 ref_end = ref_ptr + btrfs_item_size(eb, slot);
1396
1397 if (is_extref_item) {
1398 struct btrfs_inode_extref *r;
1399
1400 ref_struct_size = sizeof(struct btrfs_inode_extref);
1401 r = (struct btrfs_inode_extref *)ref_ptr;
1402 parent_objectid = btrfs_inode_extref_parent(eb, r);
1403 } else {
1404 ref_struct_size = sizeof(struct btrfs_inode_ref);
1405 parent_objectid = key->offset;
1406 }
1407 inode_objectid = key->objectid;
1408
1409 /*
1410 * it is possible that we didn't log all the parent directories
1411 * for a given inode. If we don't find the dir, just don't
1412 * copy the back ref in. The link count fixup code will take
1413 * care of the rest
1414 */
1415 dir = btrfs_iget_logging(parent_objectid, root);
1416 if (IS_ERR(dir)) {
1417 ret = PTR_ERR(dir);
1418 if (ret == -ENOENT)
1419 ret = 0;
1420 dir = NULL;
1421 goto out;
1422 }
1423
1424 inode = btrfs_iget_logging(inode_objectid, root);
1425 if (IS_ERR(inode)) {
1426 ret = PTR_ERR(inode);
1427 inode = NULL;
1428 goto out;
1429 }
1430
1431 while (ref_ptr < ref_end) {
1432 if (is_extref_item) {
1433 ret = extref_get_fields(eb, ref_ptr, &name,
1434 &ref_index, &parent_objectid);
1435 if (ret)
1436 goto out;
1437 /*
1438 * parent object can change from one array
1439 * item to another.
1440 */
1441 if (!dir) {
1442 dir = btrfs_iget_logging(parent_objectid, root);
1443 if (IS_ERR(dir)) {
1444 ret = PTR_ERR(dir);
1445 dir = NULL;
1446 /*
1447 * A new parent dir may have not been
1448 * logged and not exist in the subvolume
1449 * tree, see the comment above before
1450 * the loop when getting the first
1451 * parent dir.
1452 */
1453 if (ret == -ENOENT) {
1454 /*
1455 * The next extref may refer to
1456 * another parent dir that
1457 * exists, so continue.
1458 */
1459 ret = 0;
1460 goto next;
1461 }
1462 goto out;
1463 }
1464 }
1465 } else {
1466 ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1467 if (ret)
1468 goto out;
1469 }
1470
1471 ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1472 ref_index, &name);
1473 if (ret < 0) {
1474 goto out;
1475 } else if (ret == 0) {
1476 /*
1477 * look for a conflicting back reference in the
1478 * metadata. if we find one we have to unlink that name
1479 * of the file before we add our new link. Later on, we
1480 * overwrite any existing back reference, and we don't
1481 * want to create dangling pointers in the directory.
1482 */
1483 ret = __add_inode_ref(trans, root, path, log, dir, inode,
1484 inode_objectid, parent_objectid,
1485 ref_index, &name);
1486 if (ret) {
1487 if (ret == 1)
1488 ret = 0;
1489 goto out;
1490 }
1491
1492 /* insert our name */
1493 ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
1494 if (ret)
1495 goto out;
1496
1497 ret = btrfs_update_inode(trans, inode);
1498 if (ret)
1499 goto out;
1500 }
1501 /* Else, ret == 1, we already have a perfect match, we're done. */
1502
1503 next:
1504 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1505 kfree(name.name);
1506 name.name = NULL;
1507 if (is_extref_item && dir) {
1508 iput(&dir->vfs_inode);
1509 dir = NULL;
1510 }
1511 }
1512
1513 /*
1514 * Before we overwrite the inode reference item in the subvolume tree
1515 * with the item from the log tree, we must unlink all names from the
1516 * parent directory that are in the subvolume's tree inode reference
1517 * item, otherwise we end up with an inconsistent subvolume tree where
1518 * dir index entries exist for a name but there is no inode reference
1519 * item with the same name.
1520 */
1521 ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
1522 if (ret)
1523 goto out;
1524
1525 /* finally write the back reference in the inode */
1526 ret = overwrite_item(trans, root, path, eb, slot, key);
1527 out:
1528 btrfs_release_path(path);
1529 kfree(name.name);
1530 if (dir)
1531 iput(&dir->vfs_inode);
1532 if (inode)
1533 iput(&inode->vfs_inode);
1534 return ret;
1535 }
1536
1537 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1538 {
1539 int ret = 0;
1540 int name_len;
1541 unsigned int nlink = 0;
1542 u32 item_size;
1543 u32 cur_offset = 0;
1544 u64 inode_objectid = btrfs_ino(inode);
1545 u64 offset = 0;
1546 unsigned long ptr;
1547 struct btrfs_inode_extref *extref;
1548 struct extent_buffer *leaf;
1549
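/*
 * Iterate over all INODE_EXTREF items of this inode and count every
 * name packed into each item.
 */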
1550 while (1) {
1551 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1552 path, &extref, &offset);
1553 if (ret)
1554 break;
1555
1556 leaf = path->nodes[0];
1557 item_size = btrfs_item_size(leaf, path->slots[0]);
1558 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1559 cur_offset = 0;
1560
1561 while (cur_offset < item_size) {
1562 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1563 name_len = btrfs_inode_extref_name_len(leaf, extref);
1564
1565 nlink++;
1566
1567 cur_offset += name_len + sizeof(*extref);
1568 }
1569
1570 offset++;
1571 btrfs_release_path(path);
1572 }
1573 btrfs_release_path(path);
1574
1575 if (ret < 0 && ret != -ENOENT)
1576 return ret;
1577 return nlink;
1578 }
1579
1580 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1581 {
1582 int ret;
1583 struct btrfs_key key;
1584 unsigned int nlink = 0;
1585 unsigned long ptr;
1586 unsigned long ptr_end;
1587 int name_len;
1588 u64 ino = btrfs_ino(inode);
1589
1590 key.objectid = ino;
1591 key.type = BTRFS_INODE_REF_KEY;
1592 key.offset = (u64)-1;
1593
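/*
 * Start from the highest possible offset and walk backwards over all
 * INODE_REF items of this inode, counting every name in each item.
 */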
1594 while (1) {
1595 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1596 if (ret < 0)
1597 break;
1598 if (ret > 0) {
1599 if (path->slots[0] == 0)
1600 break;
1601 path->slots[0]--;
1602 }
1603 process_slot:
1604 btrfs_item_key_to_cpu(path->nodes[0], &key,
1605 path->slots[0]);
1606 if (key.objectid != ino ||
1607 key.type != BTRFS_INODE_REF_KEY)
1608 break;
1609 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1610 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1611 path->slots[0]);
1612 while (ptr < ptr_end) {
1613 struct btrfs_inode_ref *ref;
1614
1615 ref = (struct btrfs_inode_ref *)ptr;
1616 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1617 ref);
1618 ptr = (unsigned long)(ref + 1) + name_len;
1619 nlink++;
1620 }
1621
1622 if (key.offset == 0)
1623 break;
1624 if (path->slots[0] > 0) {
1625 path->slots[0]--;
1626 goto process_slot;
1627 }
1628 key.offset--;
1629 btrfs_release_path(path);
1630 }
1631 btrfs_release_path(path);
1632
1633 return nlink;
1634 }
1635
1636 /*
1637 * There are a few corners where the link count of the file can't
1638 * be properly maintained during replay. So, instead of adding
1639 * lots of complexity to the log code, we just scan the backrefs
1640 * for any file that has been through replay.
1641 *
1642 * The scan will update the link count on the inode to reflect the
1643 * number of back refs found. If it goes down to zero, the iput
1644 * will free the inode.
1645 */
1646 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1647 struct btrfs_inode *inode)
1648 {
1649 struct btrfs_root *root = inode->root;
1650 struct btrfs_path *path;
1651 int ret;
1652 u64 nlink = 0;
1653 const u64 ino = btrfs_ino(inode);
1654
1655 path = btrfs_alloc_path();
1656 if (!path)
1657 return -ENOMEM;
1658
1659 ret = count_inode_refs(inode, path);
1660 if (ret < 0)
1661 goto out;
1662
1663 nlink = ret;
1664
1665 ret = count_inode_extrefs(inode, path);
1666 if (ret < 0)
1667 goto out;
1668
1669 nlink += ret;
1670
1671 ret = 0;
1672
1673 if (nlink != inode->vfs_inode.i_nlink) {
1674 set_nlink(&inode->vfs_inode, nlink);
1675 ret = btrfs_update_inode(trans, inode);
1676 if (ret)
1677 goto out;
1678 }
1679 if (S_ISDIR(inode->vfs_inode.i_mode))
1680 inode->index_cnt = (u64)-1;
1681
1682 if (inode->vfs_inode.i_nlink == 0) {
1683 if (S_ISDIR(inode->vfs_inode.i_mode)) {
1684 ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
1685 if (ret)
1686 goto out;
1687 }
1688 ret = btrfs_insert_orphan_item(trans, root, ino);
1689 if (ret == -EEXIST)
1690 ret = 0;
1691 }
1692
1693 out:
1694 btrfs_free_path(path);
1695 return ret;
1696 }
1697
1698 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1699 struct btrfs_root *root,
1700 struct btrfs_path *path)
1701 {
1702 int ret;
1703 struct btrfs_key key;
1704
1705 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1706 key.type = BTRFS_ORPHAN_ITEM_KEY;
1707 key.offset = (u64)-1;
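/*
 * Walk all orphan items under BTRFS_TREE_LOG_FIXUP_OBJECTID (added by
 * link_to_fixup_dir() during replay), delete each one and recount the
 * link count of the inode it refers to (key.offset is its objectid).
 */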
1708 while (1) {
1709 struct btrfs_inode *inode;
1710
1711 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1712 if (ret < 0)
1713 break;
1714
1715 if (ret == 1) {
1716 ret = 0;
1717 if (path->slots[0] == 0)
1718 break;
1719 path->slots[0]--;
1720 }
1721
1722 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1723 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1724 key.type != BTRFS_ORPHAN_ITEM_KEY)
1725 break;
1726
1727 ret = btrfs_del_item(trans, root, path);
1728 if (ret)
1729 break;
1730
1731 btrfs_release_path(path);
1732 inode = btrfs_iget_logging(key.offset, root);
1733 if (IS_ERR(inode)) {
1734 ret = PTR_ERR(inode);
1735 break;
1736 }
1737
1738 ret = fixup_inode_link_count(trans, inode);
1739 iput(&inode->vfs_inode);
1740 if (ret)
1741 break;
1742
1743 /*
1744 * fixup on a directory may create new entries,
1745 * make sure we always look for the highest possible
1746 * offset
1747 */
1748 key.offset = (u64)-1;
1749 }
1750 btrfs_release_path(path);
1751 return ret;
1752 }
1753
1754
1755 /*
1756 * record a given inode in the fixup dir so we can check its link
1757 * count when replay is done. The link count is incremented here
1758 * so the inode won't go away until we check it
1759 */
1760 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1761 struct btrfs_root *root,
1762 struct btrfs_path *path,
1763 u64 objectid)
1764 {
1765 struct btrfs_key key;
1766 int ret = 0;
1767 struct btrfs_inode *inode;
1768 struct inode *vfs_inode;
1769
1770 inode = btrfs_iget_logging(objectid, root);
1771 if (IS_ERR(inode))
1772 return PTR_ERR(inode);
1773
1774 vfs_inode = &inode->vfs_inode;
1775 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1776 key.type = BTRFS_ORPHAN_ITEM_KEY;
1777 key.offset = objectid;
1778
1779 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1780
1781 btrfs_release_path(path);
1782 if (ret == 0) {
1783 if (!vfs_inode->i_nlink)
1784 set_nlink(vfs_inode, 1);
1785 else
1786 inc_nlink(vfs_inode);
1787 ret = btrfs_update_inode(trans, inode);
1788 } else if (ret == -EEXIST) {
1789 ret = 0;
1790 }
1791 iput(vfs_inode);
1792
1793 return ret;
1794 }
1795
1796 /*
1797 * when replaying the log for a directory, we only insert names
1798 * for inodes that actually exist. This means an fsync on a directory
1799 * does not implicitly fsync all the new files in it
1800 */
1801 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1802 struct btrfs_root *root,
1803 u64 dirid, u64 index,
1804 const struct fscrypt_str *name,
1805 struct btrfs_key *location)
1806 {
1807 struct btrfs_inode *inode;
1808 struct btrfs_inode *dir;
1809 int ret;
1810
1811 inode = btrfs_iget_logging(location->objectid, root);
1812 if (IS_ERR(inode))
1813 return PTR_ERR(inode);
1814
1815 dir = btrfs_iget_logging(dirid, root);
1816 if (IS_ERR(dir)) {
1817 iput(&inode->vfs_inode);
1818 return PTR_ERR(dir);
1819 }
1820
1821 ret = btrfs_add_link(trans, dir, inode, name, 1, index);
1822
1823 /* FIXME, put inode into FIXUP list */
1824
1825 iput(&inode->vfs_inode);
1826 iput(&dir->vfs_inode);
1827 return ret;
1828 }
1829
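/*
 * Check a dir entry in the subvolume that conflicts with an entry from the
 * log. Returns 1 if it already points to the same inode with the same flags
 * (nothing to do), 0 if it is kept because the inode for the logged entry
 * does not exist, otherwise the result of dropping the conflicting entry
 * (0 on success, < 0 on error).
 */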
1830 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1831 struct btrfs_inode *dir,
1832 struct btrfs_path *path,
1833 struct btrfs_dir_item *dst_di,
1834 const struct btrfs_key *log_key,
1835 u8 log_flags,
1836 bool exists)
1837 {
1838 struct btrfs_key found_key;
1839
1840 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1841 /* The existing dentry points to the same inode, don't delete it. */
1842 if (found_key.objectid == log_key->objectid &&
1843 found_key.type == log_key->type &&
1844 found_key.offset == log_key->offset &&
1845 btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1846 return 1;
1847
1848 /*
1849 * Don't drop the conflicting directory entry if the inode for the new
1850 * entry doesn't exist.
1851 */
1852 if (!exists)
1853 return 0;
1854
1855 return drop_one_dir_item(trans, path, dir, dst_di);
1856 }
1857
1858 /*
1859 * take a single entry in a log directory item and replay it into
1860 * the subvolume.
1861 *
1862 * if a conflicting item exists in the subdirectory already,
1863 * the inode it points to is unlinked and put into the link count
1864 * fix up tree.
1865 *
1866 * If a name from the log points to a file or directory that does
1867 * not exist in the FS, it is skipped. fsyncs on directories
1868 * do not force down inodes inside that directory, just changes to the
1869 * names or unlinks in a directory.
1870 *
1871 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1872 * non-existing inode) and 1 if the name was replayed.
1873 */
1874 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1875 struct btrfs_root *root,
1876 struct btrfs_path *path,
1877 struct extent_buffer *eb,
1878 struct btrfs_dir_item *di,
1879 struct btrfs_key *key)
1880 {
1881 struct fscrypt_str name = { 0 };
1882 struct btrfs_dir_item *dir_dst_di;
1883 struct btrfs_dir_item *index_dst_di;
1884 bool dir_dst_matches = false;
1885 bool index_dst_matches = false;
1886 struct btrfs_key log_key;
1887 struct btrfs_key search_key;
1888 struct btrfs_inode *dir;
1889 u8 log_flags;
1890 bool exists;
1891 int ret;
1892 bool update_size = true;
1893 bool name_added = false;
1894
1895 dir = btrfs_iget_logging(key->objectid, root);
1896 if (IS_ERR(dir))
1897 return PTR_ERR(dir);
1898
1899 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1900 if (ret)
1901 goto out;
1902
1903 log_flags = btrfs_dir_flags(eb, di);
1904 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1905 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1906 btrfs_release_path(path);
1907 if (ret < 0)
1908 goto out;
1909 exists = (ret == 0);
1910 ret = 0;
1911
1912 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1913 &name, 1);
1914 if (IS_ERR(dir_dst_di)) {
1915 ret = PTR_ERR(dir_dst_di);
1916 goto out;
1917 } else if (dir_dst_di) {
1918 ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
1919 &log_key, log_flags, exists);
1920 if (ret < 0)
1921 goto out;
1922 dir_dst_matches = (ret == 1);
1923 }
1924
1925 btrfs_release_path(path);
1926
1927 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1928 key->objectid, key->offset,
1929 &name, 1);
1930 if (IS_ERR(index_dst_di)) {
1931 ret = PTR_ERR(index_dst_di);
1932 goto out;
1933 } else if (index_dst_di) {
1934 ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
1935 &log_key, log_flags, exists);
1936 if (ret < 0)
1937 goto out;
1938 index_dst_matches = (ret == 1);
1939 }
1940
1941 btrfs_release_path(path);
1942
1943 if (dir_dst_matches && index_dst_matches) {
1944 ret = 0;
1945 update_size = false;
1946 goto out;
1947 }
1948
1949 /*
1950 * Check if the inode reference exists in the log for the given name,
1951 * inode and parent inode
1952 */
1953 search_key.objectid = log_key.objectid;
1954 search_key.type = BTRFS_INODE_REF_KEY;
1955 search_key.offset = key->objectid;
1956 ret = backref_in_log(root->log_root, &search_key, 0, &name);
1957 if (ret < 0) {
1958 goto out;
1959 } else if (ret) {
1960 /* The dentry will be added later. */
1961 ret = 0;
1962 update_size = false;
1963 goto out;
1964 }
1965
1966 search_key.objectid = log_key.objectid;
1967 search_key.type = BTRFS_INODE_EXTREF_KEY;
1968 search_key.offset = key->objectid;
1969 ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1970 if (ret < 0) {
1971 goto out;
1972 } else if (ret) {
1973 /* The dentry will be added later. */
1974 ret = 0;
1975 update_size = false;
1976 goto out;
1977 }
1978 btrfs_release_path(path);
1979 ret = insert_one_name(trans, root, key->objectid, key->offset,
1980 &name, &log_key);
1981 if (ret && ret != -ENOENT && ret != -EEXIST)
1982 goto out;
1983 if (!ret)
1984 name_added = true;
1985 update_size = false;
1986 ret = 0;
1987
1988 out:
1989 if (!ret && update_size) {
1990 btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
1991 ret = btrfs_update_inode(trans, dir);
1992 }
1993 kfree(name.name);
1994 iput(&dir->vfs_inode);
1995 if (!ret && name_added)
1996 ret = 1;
1997 return ret;
1998 }
1999
2000 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
2001 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2002 struct btrfs_root *root,
2003 struct btrfs_path *path,
2004 struct extent_buffer *eb, int slot,
2005 struct btrfs_key *key)
2006 {
2007 int ret;
2008 struct btrfs_dir_item *di;
2009
2010 /* We only log dir index keys, which only contain a single dir item. */
2011 ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
2012
2013 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2014 ret = replay_one_name(trans, root, path, eb, di, key);
2015 if (ret < 0)
2016 return ret;
2017
2018 /*
2019 * If this entry refers to a non-directory (directories can not have a
2020 * link count > 1) and it was added in the transaction that was not
2021 * committed, make sure we fixup the link count of the inode the entry
2022 * points to. Otherwise something like the following would result in a
2023 * directory pointing to an inode with a wrong link count that does not account
2024 * for this dir entry:
2025 *
2026 * mkdir testdir
2027 * touch testdir/foo
2028 * touch testdir/bar
2029 * sync
2030 *
2031 * ln testdir/bar testdir/bar_link
2032 * ln testdir/foo testdir/foo_link
2033 * xfs_io -c "fsync" testdir/bar
2034 *
2035 * <power failure>
2036 *
2037 * mount fs, log replay happens
2038 *
2039 * File foo would remain with a link count of 1 when it has two entries
2040 * pointing to it in the directory testdir. This would make it impossible
2041 * to ever delete the parent directory as it would result in stale
2042 * dentries that can never be deleted.
2043 */
2044 if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
2045 struct btrfs_path *fixup_path;
2046 struct btrfs_key di_key;
2047
2048 fixup_path = btrfs_alloc_path();
2049 if (!fixup_path)
2050 return -ENOMEM;
2051
2052 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2053 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2054 btrfs_free_path(fixup_path);
2055 }
2056
2057 return ret;
2058 }
2059
2060 /*
2061 * directory replay has two parts. There are the standard directory
2062 * items in the log copied from the subvolume, and range items
2063 * created in the log while the subvolume was logged.
2064 *
2065 * The range items tell us which parts of the key space the log
2066 * is authoritative for. During replay, if a key in the subvolume
2067 * directory is in a logged range item, but not actually in the log,
2068 * that means it was deleted from the directory before the fsync
2069 * and should be removed.
2070 */
2071 static noinline int find_dir_range(struct btrfs_root *root,
2072 struct btrfs_path *path,
2073 u64 dirid,
2074 u64 *start_ret, u64 *end_ret)
2075 {
2076 struct btrfs_key key;
2077 u64 found_end;
2078 struct btrfs_dir_log_item *item;
2079 int ret;
2080 int nritems;
2081
2082 if (*start_ret == (u64)-1)
2083 return 1;
2084
2085 key.objectid = dirid;
2086 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2087 key.offset = *start_ret;
2088
2089 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2090 if (ret < 0)
2091 goto out;
2092 if (ret > 0) {
2093 if (path->slots[0] == 0)
2094 goto out;
2095 path->slots[0]--;
2096 }
2097 if (ret != 0)
2098 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2099
2100 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2101 ret = 1;
2102 goto next;
2103 }
2104 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2105 struct btrfs_dir_log_item);
2106 found_end = btrfs_dir_log_end(path->nodes[0], item);
2107
2108 if (*start_ret >= key.offset && *start_ret <= found_end) {
2109 ret = 0;
2110 *start_ret = key.offset;
2111 *end_ret = found_end;
2112 goto out;
2113 }
2114 ret = 1;
2115 next:
2116 /* check the next slot in the tree to see if it is a valid item */
2117 nritems = btrfs_header_nritems(path->nodes[0]);
2118 path->slots[0]++;
2119 if (path->slots[0] >= nritems) {
2120 ret = btrfs_next_leaf(root, path);
2121 if (ret)
2122 goto out;
2123 }
2124
2125 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2126
2127 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2128 ret = 1;
2129 goto out;
2130 }
2131 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2132 struct btrfs_dir_log_item);
2133 found_end = btrfs_dir_log_end(path->nodes[0], item);
2134 *start_ret = key.offset;
2135 *end_ret = found_end;
2136 ret = 0;
2137 out:
2138 btrfs_release_path(path);
2139 return ret;
2140 }
2141
2142 /*
2143 * this looks for a given directory item in the log. If the directory
2144 * item is not in the log, the item is removed and the inode it points
2145 * to is unlinked
2146 */
2147 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2148 struct btrfs_root *log,
2149 struct btrfs_path *path,
2150 struct btrfs_path *log_path,
2151 struct btrfs_inode *dir,
2152 struct btrfs_key *dir_key)
2153 {
2154 struct btrfs_root *root = dir->root;
2155 int ret;
2156 struct extent_buffer *eb;
2157 int slot;
2158 struct btrfs_dir_item *di;
2159 struct fscrypt_str name = { 0 };
2160 struct btrfs_inode *inode = NULL;
2161 struct btrfs_key location;
2162
2163 /*
2164 * Currently we only log dir index keys. Even if we replay a log created
2165 * by an older kernel that logged both dir index and dir item keys, all
2166 * we need to do is process the dir index keys, we (and our caller) can
2167 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2168 */
2169 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2170
2171 eb = path->nodes[0];
2172 slot = path->slots[0];
2173 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2174 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2175 if (ret)
2176 goto out;
2177
2178 if (log) {
2179 struct btrfs_dir_item *log_di;
2180
2181 log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2182 dir_key->objectid,
2183 dir_key->offset, &name, 0);
2184 if (IS_ERR(log_di)) {
2185 ret = PTR_ERR(log_di);
2186 goto out;
2187 } else if (log_di) {
2188 /* The dentry exists in the log, we have nothing to do. */
2189 ret = 0;
2190 goto out;
2191 }
2192 }
2193
2194 btrfs_dir_item_key_to_cpu(eb, di, &location);
2195 btrfs_release_path(path);
2196 btrfs_release_path(log_path);
2197 inode = btrfs_iget_logging(location.objectid, root);
2198 if (IS_ERR(inode)) {
2199 ret = PTR_ERR(inode);
2200 inode = NULL;
2201 goto out;
2202 }
2203
2204 ret = link_to_fixup_dir(trans, root, path, location.objectid);
2205 if (ret)
2206 goto out;
2207
2208 inc_nlink(&inode->vfs_inode);
2209 ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
2210 /*
2211 * Unlike dir item keys, dir index keys can only have one name (entry) in
2212 * them, as there are no key collisions since each key has a unique offset
2213 * (an index number), so we're done.
2214 */
2215 out:
2216 btrfs_release_path(path);
2217 btrfs_release_path(log_path);
2218 kfree(name.name);
2219 if (inode)
2220 iput(&inode->vfs_inode);
2221 return ret;
2222 }
2223
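/*
 * delete any xattrs of the given inode from the subvolume tree that are not
 * present in the log tree, so that after replay the inode's xattrs match the
 * logged state
 */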
2224 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2225 struct btrfs_root *root,
2226 struct btrfs_root *log,
2227 struct btrfs_path *path,
2228 const u64 ino)
2229 {
2230 struct btrfs_key search_key;
2231 struct btrfs_path *log_path;
2232 int i;
2233 int nritems;
2234 int ret;
2235
2236 log_path = btrfs_alloc_path();
2237 if (!log_path)
2238 return -ENOMEM;
2239
2240 search_key.objectid = ino;
2241 search_key.type = BTRFS_XATTR_ITEM_KEY;
2242 search_key.offset = 0;
2243 again:
2244 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2245 if (ret < 0)
2246 goto out;
2247 process_leaf:
2248 nritems = btrfs_header_nritems(path->nodes[0]);
2249 for (i = path->slots[0]; i < nritems; i++) {
2250 struct btrfs_key key;
2251 struct btrfs_dir_item *di;
2252 struct btrfs_dir_item *log_di;
2253 u32 total_size;
2254 u32 cur;
2255
2256 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2257 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2258 ret = 0;
2259 goto out;
2260 }
2261
2262 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2263 total_size = btrfs_item_size(path->nodes[0], i);
2264 cur = 0;
2265 while (cur < total_size) {
2266 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2267 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2268 u32 this_len = sizeof(*di) + name_len + data_len;
2269 char *name;
2270
2271 name = kmalloc(name_len, GFP_NOFS);
2272 if (!name) {
2273 ret = -ENOMEM;
2274 goto out;
2275 }
2276 read_extent_buffer(path->nodes[0], name,
2277 (unsigned long)(di + 1), name_len);
2278
2279 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2280 name, name_len, 0);
2281 btrfs_release_path(log_path);
2282 if (!log_di) {
2283 /* Doesn't exist in log tree, so delete it. */
2284 btrfs_release_path(path);
2285 di = btrfs_lookup_xattr(trans, root, path, ino,
2286 name, name_len, -1);
2287 kfree(name);
2288 if (IS_ERR(di)) {
2289 ret = PTR_ERR(di);
2290 goto out;
2291 }
2292 ASSERT(di);
2293 ret = btrfs_delete_one_dir_name(trans, root,
2294 path, di);
2295 if (ret)
2296 goto out;
2297 btrfs_release_path(path);
2298 search_key = key;
2299 goto again;
2300 }
2301 kfree(name);
2302 if (IS_ERR(log_di)) {
2303 ret = PTR_ERR(log_di);
2304 goto out;
2305 }
2306 cur += this_len;
2307 di = (struct btrfs_dir_item *)((char *)di + this_len);
2308 }
2309 }
2310 ret = btrfs_next_leaf(root, path);
2311 if (ret > 0)
2312 ret = 0;
2313 else if (ret == 0)
2314 goto process_leaf;
2315 out:
2316 btrfs_free_path(log_path);
2317 btrfs_release_path(path);
2318 return ret;
2319 }
2320
2321
2322 /*
2323 * deletion replay happens before we copy any new directory items
2324 * out of the log or out of backreferences from inodes. It
2325 * scans the log to find ranges of keys that the log is authoritative for,
2326 * and then scans the directory to find items in those ranges that are
2327 * not present in the log.
2328 *
2329 * Anything we don't find in the log is unlinked and removed from the
2330 * directory.
2331 */
2332 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2333 struct btrfs_root *root,
2334 struct btrfs_root *log,
2335 struct btrfs_path *path,
2336 u64 dirid, bool del_all)
2337 {
2338 u64 range_start;
2339 u64 range_end;
2340 int ret = 0;
2341 struct btrfs_key dir_key;
2342 struct btrfs_key found_key;
2343 struct btrfs_path *log_path;
2344 struct btrfs_inode *dir;
2345
2346 dir_key.objectid = dirid;
2347 dir_key.type = BTRFS_DIR_INDEX_KEY;
2348 log_path = btrfs_alloc_path();
2349 if (!log_path)
2350 return -ENOMEM;
2351
2352 dir = btrfs_iget_logging(dirid, root);
2353 /*
2354 * It isn't an error if the inode isn't there, that can happen because
2355 * we replay the deletes before we copy in the inode item from the log.
2356 */
2357 if (IS_ERR(dir)) {
2358 btrfs_free_path(log_path);
2359 ret = PTR_ERR(dir);
2360 if (ret == -ENOENT)
2361 ret = 0;
2362 return ret;
2363 }
2364
2365 range_start = 0;
2366 range_end = 0;
2367 while (1) {
2368 if (del_all)
2369 range_end = (u64)-1;
2370 else {
2371 ret = find_dir_range(log, path, dirid,
2372 &range_start, &range_end);
2373 if (ret < 0)
2374 goto out;
2375 else if (ret > 0)
2376 break;
2377 }
2378
2379 dir_key.offset = range_start;
2380 while (1) {
2381 int nritems;
2382 ret = btrfs_search_slot(NULL, root, &dir_key, path,
2383 0, 0);
2384 if (ret < 0)
2385 goto out;
2386
2387 nritems = btrfs_header_nritems(path->nodes[0]);
2388 if (path->slots[0] >= nritems) {
2389 ret = btrfs_next_leaf(root, path);
2390 if (ret == 1)
2391 break;
2392 else if (ret < 0)
2393 goto out;
2394 }
2395 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2396 path->slots[0]);
2397 if (found_key.objectid != dirid ||
2398 found_key.type != dir_key.type) {
2399 ret = 0;
2400 goto out;
2401 }
2402
2403 if (found_key.offset > range_end)
2404 break;
2405
2406 ret = check_item_in_log(trans, log, path,
2407 log_path, dir,
2408 &found_key);
2409 if (ret)
2410 goto out;
2411 if (found_key.offset == (u64)-1)
2412 break;
2413 dir_key.offset = found_key.offset + 1;
2414 }
2415 btrfs_release_path(path);
2416 if (range_end == (u64)-1)
2417 break;
2418 range_start = range_end + 1;
2419 }
2420 ret = 0;
2421 out:
2422 btrfs_release_path(path);
2423 btrfs_free_path(log_path);
2424 iput(&dir->vfs_inode);
2425 return ret;
2426 }
2427
2428 /*
2429 * the process_func used to replay items from the log tree. This
2430 * gets called in two different stages. The first stage just looks
2431 * for inodes and makes sure they are all copied into the subvolume.
2432 *
2433 * The second stage copies all the other item types from the log into
2434 * the subvolume. The two stage approach is slower, but gets rid of
2435 * lots of complexity around inodes referencing other inodes that exist
2436 * only in the log (references come from either directory items or inode
2437 * back refs).
2438 */
2439 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2440 struct walk_control *wc, u64 gen, int level)
2441 {
2442 int nritems;
2443 struct btrfs_tree_parent_check check = {
2444 .transid = gen,
2445 .level = level
2446 };
2447 struct btrfs_path *path;
2448 struct btrfs_root *root = wc->replay_dest;
2449 struct btrfs_key key;
2450 int i;
2451 int ret;
2452
2453 ret = btrfs_read_extent_buffer(eb, &check);
2454 if (ret)
2455 return ret;
2456
2457 level = btrfs_header_level(eb);
2458
2459 if (level != 0)
2460 return 0;
2461
2462 path = btrfs_alloc_path();
2463 if (!path)
2464 return -ENOMEM;
2465
2466 nritems = btrfs_header_nritems(eb);
2467 for (i = 0; i < nritems; i++) {
2468 btrfs_item_key_to_cpu(eb, &key, i);
2469
2470 /* inode keys are done during the first stage */
2471 if (key.type == BTRFS_INODE_ITEM_KEY &&
2472 wc->stage == LOG_WALK_REPLAY_INODES) {
2473 struct btrfs_inode_item *inode_item;
2474 u32 mode;
2475
2476 inode_item = btrfs_item_ptr(eb, i,
2477 struct btrfs_inode_item);
2478 /*
2479 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2480 * and never got linked before the fsync, skip it, as
2481 * replaying it is pointless since it would be deleted
2482 * later. We skip logging tmpfiles, but it's always
2483 * possible we are replaying a log created with a kernel
2484 * that used to log tmpfiles.
2485 */
2486 if (btrfs_inode_nlink(eb, inode_item) == 0) {
2487 wc->ignore_cur_inode = true;
2488 continue;
2489 } else {
2490 wc->ignore_cur_inode = false;
2491 }
2492 ret = replay_xattr_deletes(wc->trans, root, log,
2493 path, key.objectid);
2494 if (ret)
2495 break;
2496 mode = btrfs_inode_mode(eb, inode_item);
2497 if (S_ISDIR(mode)) {
2498 ret = replay_dir_deletes(wc->trans, root, log, path,
2499 key.objectid, false);
2500 if (ret)
2501 break;
2502 }
2503 ret = overwrite_item(wc->trans, root, path,
2504 eb, i, &key);
2505 if (ret)
2506 break;
2507
2508 /*
2509 * Before replaying extents, truncate the inode to its
2510 * size. We need to do it now and not after log replay
2511 * because before an fsync we can have prealloc extents
2512 * added beyond the inode's i_size. If we did it after,
2513 * through orphan cleanup for example, we would drop
2514 * those prealloc extents just after replaying them.
2515 */
2516 if (S_ISREG(mode)) {
2517 struct btrfs_drop_extents_args drop_args = { 0 };
2518 struct btrfs_inode *inode;
2519 u64 from;
2520
2521 inode = btrfs_iget_logging(key.objectid, root);
2522 if (IS_ERR(inode)) {
2523 ret = PTR_ERR(inode);
2524 break;
2525 }
2526 from = ALIGN(i_size_read(&inode->vfs_inode),
2527 root->fs_info->sectorsize);
2528 drop_args.start = from;
2529 drop_args.end = (u64)-1;
2530 drop_args.drop_cache = true;
2531 ret = btrfs_drop_extents(wc->trans, root, inode,
2532 &drop_args);
2533 if (!ret) {
2534 inode_sub_bytes(&inode->vfs_inode,
2535 drop_args.bytes_found);
2536 /* Update the inode's nbytes. */
2537 ret = btrfs_update_inode(wc->trans, inode);
2538 }
2539 iput(&inode->vfs_inode);
2540 if (ret)
2541 break;
2542 }
2543
2544 ret = link_to_fixup_dir(wc->trans, root,
2545 path, key.objectid);
2546 if (ret)
2547 break;
2548 }
2549
2550 if (wc->ignore_cur_inode)
2551 continue;
2552
2553 if (key.type == BTRFS_DIR_INDEX_KEY &&
2554 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2555 ret = replay_one_dir_item(wc->trans, root, path,
2556 eb, i, &key);
2557 if (ret)
2558 break;
2559 }
2560
2561 if (wc->stage < LOG_WALK_REPLAY_ALL)
2562 continue;
2563
2564 /* these keys are simply copied */
2565 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2566 ret = overwrite_item(wc->trans, root, path,
2567 eb, i, &key);
2568 if (ret)
2569 break;
2570 } else if (key.type == BTRFS_INODE_REF_KEY ||
2571 key.type == BTRFS_INODE_EXTREF_KEY) {
2572 ret = add_inode_ref(wc->trans, root, log, path,
2573 eb, i, &key);
2574 if (ret)
2575 break;
2576 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2577 ret = replay_one_extent(wc->trans, root, path,
2578 eb, i, &key);
2579 if (ret)
2580 break;
2581 }
2582 /*
2583 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2584 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2585 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2586 * older kernel with such keys, ignore them.
2587 */
2588 }
2589 btrfs_free_path(path);
2590 return ret;
2591 }
2592
2593 /*
2594 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2595 */
2596 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2597 {
2598 struct btrfs_block_group *cache;
2599
2600 cache = btrfs_lookup_block_group(fs_info, start);
2601 if (!cache) {
2602 btrfs_err(fs_info, "unable to find block group for %llu", start);
2603 return;
2604 }
2605
2606 spin_lock(&cache->space_info->lock);
2607 spin_lock(&cache->lock);
2608 cache->reserved -= fs_info->nodesize;
2609 cache->space_info->bytes_reserved -= fs_info->nodesize;
2610 spin_unlock(&cache->lock);
2611 spin_unlock(&cache->space_info->lock);
2612
2613 btrfs_put_block_group(cache);
2614 }
2615
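/*
 * Clear the dirty state of a log tree extent buffer and wait for its
 * writeback to finish. Within a transaction the buffer's extent is pinned so
 * it is freed at transaction commit, otherwise its space reservation is
 * released via unaccount_log_buffer().
 */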
2616 static int clean_log_buffer(struct btrfs_trans_handle *trans,
2617 struct extent_buffer *eb)
2618 {
2619 int ret;
2620
2621 btrfs_tree_lock(eb);
2622 btrfs_clear_buffer_dirty(trans, eb);
2623 wait_on_extent_buffer_writeback(eb);
2624 btrfs_tree_unlock(eb);
2625
2626 if (trans) {
2627 ret = btrfs_pin_reserved_extent(trans, eb);
2628 if (ret)
2629 return ret;
2630 } else {
2631 unaccount_log_buffer(eb->fs_info, eb->start);
2632 }
2633
2634 return 0;
2635 }
2636
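/*
 * Walk down the log tree from the current node in 'path'. Children of level 1
 * nodes (the leaves) are handed to wc->process_func and, when wc->free is
 * set, cleaned up with clean_log_buffer(); children at higher levels are
 * descended into. Returns once all slots of the current node were visited.
 */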
2637 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2638 struct btrfs_root *root,
2639 struct btrfs_path *path, int *level,
2640 struct walk_control *wc)
2641 {
2642 struct btrfs_fs_info *fs_info = root->fs_info;
2643 u64 bytenr;
2644 u64 ptr_gen;
2645 struct extent_buffer *next;
2646 struct extent_buffer *cur;
2647 int ret = 0;
2648
2649 while (*level > 0) {
2650 struct btrfs_tree_parent_check check = { 0 };
2651
2652 cur = path->nodes[*level];
2653
2654 WARN_ON(btrfs_header_level(cur) != *level);
2655
2656 if (path->slots[*level] >=
2657 btrfs_header_nritems(cur))
2658 break;
2659
2660 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2661 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2662 check.transid = ptr_gen;
2663 check.level = *level - 1;
2664 check.has_first_key = true;
2665 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2666
2667 next = btrfs_find_create_tree_block(fs_info, bytenr,
2668 btrfs_header_owner(cur),
2669 *level - 1);
2670 if (IS_ERR(next))
2671 return PTR_ERR(next);
2672
2673 if (*level == 1) {
2674 ret = wc->process_func(root, next, wc, ptr_gen,
2675 *level - 1);
2676 if (ret) {
2677 free_extent_buffer(next);
2678 return ret;
2679 }
2680
2681 path->slots[*level]++;
2682 if (wc->free) {
2683 ret = btrfs_read_extent_buffer(next, &check);
2684 if (ret) {
2685 free_extent_buffer(next);
2686 return ret;
2687 }
2688
2689 ret = clean_log_buffer(trans, next);
2690 if (ret) {
2691 free_extent_buffer(next);
2692 return ret;
2693 }
2694 }
2695 free_extent_buffer(next);
2696 continue;
2697 }
2698 ret = btrfs_read_extent_buffer(next, &check);
2699 if (ret) {
2700 free_extent_buffer(next);
2701 return ret;
2702 }
2703
2704 if (path->nodes[*level-1])
2705 free_extent_buffer(path->nodes[*level-1]);
2706 path->nodes[*level-1] = next;
2707 *level = btrfs_header_level(next);
2708 path->slots[*level] = 0;
2709 cond_resched();
2710 }
2711 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2712
2713 cond_resched();
2714 return 0;
2715 }
2716
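/*
 * Walk back up the log tree after walk_down_log_tree() exhausted a node,
 * processing (and, when wc->free is set, cleaning up) each node on the way
 * up. Returns 0 when there is another slot to walk down into, or 1 once the
 * whole tree below the root has been visited.
 */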
2717 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2718 struct btrfs_root *root,
2719 struct btrfs_path *path, int *level,
2720 struct walk_control *wc)
2721 {
2722 int i;
2723 int slot;
2724 int ret;
2725
2726 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2727 slot = path->slots[i];
2728 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2729 path->slots[i]++;
2730 *level = i;
2731 WARN_ON(*level == 0);
2732 return 0;
2733 } else {
2734 ret = wc->process_func(root, path->nodes[*level], wc,
2735 btrfs_header_generation(path->nodes[*level]),
2736 *level);
2737 if (ret)
2738 return ret;
2739
2740 if (wc->free) {
2741 ret = clean_log_buffer(trans, path->nodes[*level]);
2742 if (ret)
2743 return ret;
2744 }
2745 free_extent_buffer(path->nodes[*level]);
2746 path->nodes[*level] = NULL;
2747 *level = i + 1;
2748 }
2749 }
2750 return 1;
2751 }
2752
2753 /*
2754 * walk all blocks of the log tree rooted at 'log', calling wc->process_func
2755 * on each one. If wc->free is set, the blocks are also cleaned up via
2756 * clean_log_buffer() as they are visited.
2757 */
2758 static int walk_log_tree(struct btrfs_trans_handle *trans,
2759 struct btrfs_root *log, struct walk_control *wc)
2760 {
2761 int ret = 0;
2762 int wret;
2763 int level;
2764 struct btrfs_path *path;
2765 int orig_level;
2766
2767 path = btrfs_alloc_path();
2768 if (!path)
2769 return -ENOMEM;
2770
2771 level = btrfs_header_level(log->node);
2772 orig_level = level;
2773 path->nodes[level] = log->node;
2774 refcount_inc(&log->node->refs);
2775 path->slots[level] = 0;
2776
2777 while (1) {
2778 wret = walk_down_log_tree(trans, log, path, &level, wc);
2779 if (wret > 0)
2780 break;
2781 if (wret < 0) {
2782 ret = wret;
2783 goto out;
2784 }
2785
2786 wret = walk_up_log_tree(trans, log, path, &level, wc);
2787 if (wret > 0)
2788 break;
2789 if (wret < 0) {
2790 ret = wret;
2791 goto out;
2792 }
2793 }
2794
2795 /* was the root node processed? if not, catch it here */
2796 if (path->nodes[orig_level]) {
2797 ret = wc->process_func(log, path->nodes[orig_level], wc,
2798 btrfs_header_generation(path->nodes[orig_level]),
2799 orig_level);
2800 if (ret)
2801 goto out;
2802 if (wc->free)
2803 ret = clean_log_buffer(trans, path->nodes[orig_level]);
2804 }
2805
2806 out:
2807 btrfs_free_path(path);
2808 return ret;
2809 }
2810
2811 /*
2812 * helper function to update the item for a given subvolumes log root
2813 * in the tree of log roots
2814 */
2815 static int update_log_root(struct btrfs_trans_handle *trans,
2816 struct btrfs_root *log,
2817 struct btrfs_root_item *root_item)
2818 {
2819 struct btrfs_fs_info *fs_info = log->fs_info;
2820 int ret;
2821
2822 if (log->log_transid == 1) {
2823 /* insert root item on the first sync */
2824 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2825 &log->root_key, root_item);
2826 } else {
2827 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2828 &log->root_key, root_item);
2829 }
2830 return ret;
2831 }
2832
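/*
 * Wait until the log transaction with the given transid is fully committed by
 * another task. Called with root->log_mutex held; the mutex is dropped while
 * sleeping and reacquired before returning.
 */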
2833 static void wait_log_commit(struct btrfs_root *root, int transid)
2834 {
2835 DEFINE_WAIT(wait);
2836 int index = transid % 2;
2837
2838 /*
2839 * we only allow two pending log transactions at a time,
2840 * so we know that if ours is more than 2 older than the
2841 * current transaction, we're done
2842 */
2843 for (;;) {
2844 prepare_to_wait(&root->log_commit_wait[index],
2845 &wait, TASK_UNINTERRUPTIBLE);
2846
2847 if (!(root->log_transid_committed < transid &&
2848 atomic_read(&root->log_commit[index])))
2849 break;
2850
2851 mutex_unlock(&root->log_mutex);
2852 schedule();
2853 mutex_lock(&root->log_mutex);
2854 }
2855 finish_wait(&root->log_commit_wait[index], &wait);
2856 }
2857
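/*
 * Wait until there are no more writers in the current log transaction.
 * Called with root->log_mutex held; the mutex is dropped while sleeping and
 * reacquired before returning.
 */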
2858 static void wait_for_writer(struct btrfs_root *root)
2859 {
2860 DEFINE_WAIT(wait);
2861
2862 for (;;) {
2863 prepare_to_wait(&root->log_writer_wait, &wait,
2864 TASK_UNINTERRUPTIBLE);
2865 if (!atomic_read(&root->log_writers))
2866 break;
2867
2868 mutex_unlock(&root->log_mutex);
2869 schedule();
2870 mutex_lock(&root->log_mutex);
2871 }
2872 finish_wait(&root->log_writer_wait, &wait);
2873 }
2874
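/*
 * Initialize a log context. 'inode' is the inode being logged and may be
 * NULL, e.g. for the context used when committing the log root tree.
 */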
2875 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
2876 {
2877 ctx->log_ret = 0;
2878 ctx->log_transid = 0;
2879 ctx->log_new_dentries = false;
2880 ctx->logging_new_name = false;
2881 ctx->logging_new_delayed_dentries = false;
2882 ctx->logged_before = false;
2883 ctx->inode = inode;
2884 INIT_LIST_HEAD(&ctx->list);
2885 INIT_LIST_HEAD(&ctx->ordered_extents);
2886 INIT_LIST_HEAD(&ctx->conflict_inodes);
2887 ctx->num_conflict_inodes = 0;
2888 ctx->logging_conflict_inodes = false;
2889 ctx->scratch_eb = NULL;
2890 }
2891
2892 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
2893 {
2894 struct btrfs_inode *inode = ctx->inode;
2895
2896 if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
2897 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
2898 return;
2899
2900 /*
2901 * Don't care about allocation failure. This is just an optimization;
2902 * if we fail to allocate here, we will try again later if needed.
2903 */
2904 ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
2905 }
2906
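/*
 * Drop the references on all ordered extents still attached to a log context.
 * The inode associated with the context must be locked by the caller.
 */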
2907 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
2908 {
2909 struct btrfs_ordered_extent *ordered;
2910 struct btrfs_ordered_extent *tmp;
2911
2912 btrfs_assert_inode_locked(ctx->inode);
2913
2914 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
2915 list_del_init(&ordered->log_list);
2916 btrfs_put_ordered_extent(ordered);
2917 }
2918 }
2919
2920
2921 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2922 struct btrfs_log_ctx *ctx)
2923 {
2924 mutex_lock(&root->log_mutex);
2925 list_del_init(&ctx->list);
2926 mutex_unlock(&root->log_mutex);
2927 }
2928
2929 /*
2930 * Invoked while holding the log mutex, or when it is certain that no other
2931 * task can access the list.
2932 */
2933 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2934 int index, int error)
2935 {
2936 struct btrfs_log_ctx *ctx;
2937 struct btrfs_log_ctx *safe;
2938
2939 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2940 list_del_init(&ctx->list);
2941 ctx->log_ret = error;
2942 }
2943 }
2944
2945 /*
2946 * Sends a given tree log down to the disk and updates the super blocks to
2947 * record it. When this call is done, you know that any inodes previously
2948 * logged are safely on disk only if it returns 0.
2949 *
2950 * Any other return value means you need to call btrfs_commit_transaction.
2951 * Some of the edge cases for fsyncing directories that have had unlinks
2952 * or renames done in the past mean that sometimes the only safe
2953 * fsync is to commit the whole FS. When btrfs_sync_log returns BTRFS_LOG_FORCE_COMMIT,
2954 * that has happened.
2955 */
2956 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2957 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2958 {
2959 int index1;
2960 int index2;
2961 int mark;
2962 int ret;
2963 struct btrfs_fs_info *fs_info = root->fs_info;
2964 struct btrfs_root *log = root->log_root;
2965 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2966 struct btrfs_root_item new_root_item;
2967 int log_transid = 0;
2968 struct btrfs_log_ctx root_log_ctx;
2969 struct blk_plug plug;
2970 u64 log_root_start;
2971 u64 log_root_level;
2972
2973 mutex_lock(&root->log_mutex);
2974 log_transid = ctx->log_transid;
2975 if (root->log_transid_committed >= log_transid) {
2976 mutex_unlock(&root->log_mutex);
2977 return ctx->log_ret;
2978 }
2979
2980 index1 = log_transid % 2;
2981 if (atomic_read(&root->log_commit[index1])) {
2982 wait_log_commit(root, log_transid);
2983 mutex_unlock(&root->log_mutex);
2984 return ctx->log_ret;
2985 }
2986 ASSERT(log_transid == root->log_transid);
2987 atomic_set(&root->log_commit[index1], 1);
2988
2989 /* wait for previous tree log sync to complete */
2990 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2991 wait_log_commit(root, log_transid - 1);
2992
2993 while (1) {
2994 int batch = atomic_read(&root->log_batch);
2995 /* when we're on an ssd, just kick the log commit out */
2996 if (!btrfs_test_opt(fs_info, SSD) &&
2997 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2998 mutex_unlock(&root->log_mutex);
2999 schedule_timeout_uninterruptible(1);
3000 mutex_lock(&root->log_mutex);
3001 }
3002 wait_for_writer(root);
3003 if (batch == atomic_read(&root->log_batch))
3004 break;
3005 }
3006
3007 /* bail out if we need to do a full commit */
3008 if (btrfs_need_log_full_commit(trans)) {
3009 ret = BTRFS_LOG_FORCE_COMMIT;
3010 mutex_unlock(&root->log_mutex);
3011 goto out;
3012 }
3013
3014 if (log_transid % 2 == 0)
3015 mark = EXTENT_DIRTY_LOG1;
3016 else
3017 mark = EXTENT_DIRTY_LOG2;
3018
3019 /* we start IO on all the marked extents here, but we don't actually
3020 * wait for them until later.
3021 */
3022 blk_start_plug(&plug);
3023 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3024 /*
3025 * -EAGAIN happens when someone, e.g., a concurrent transaction
3026 * commit, writes a dirty extent in this tree-log commit. This
3027 * concurrent write will create a hole writing out the extents,
3028 * and we cannot proceed on a zoned filesystem, requiring
3029 * sequential writing. We could bail out to a full commit
3030 * here, but instead we continue and hope the concurrent write
3031 * fills the hole.
3032 */
3033 if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3034 ret = 0;
3035 if (ret) {
3036 blk_finish_plug(&plug);
3037 btrfs_set_log_full_commit(trans);
3038 mutex_unlock(&root->log_mutex);
3039 goto out;
3040 }
3041
3042 /*
3043 * We _must_ update under the root->log_mutex in order to make sure we
3044 * have a consistent view of the log root we are trying to commit at
3045 * this moment.
3046 *
3047 * We _must_ copy this into a local copy, because we are not holding the
3048 * log_root_tree->log_mutex yet. This is important because when we
3049 * commit the log_root_tree we must have a consistent view of the
3050 * log_root_tree when we update the super block to point at the
3051 * log_root_tree bytenr. If we update the log_root_tree here we'll race
3052 * with the commit and possibly point at the new block which we may not
3053 * have written out.
3054 */
3055 btrfs_set_root_node(&log->root_item, log->node);
3056 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3057
3058 btrfs_set_root_log_transid(root, root->log_transid + 1);
3059 log->log_transid = root->log_transid;
3060 root->log_start_pid = 0;
3061 /*
3062 * IO has been started, blocks of the log tree have WRITTEN flag set
3063 * in their headers. new modifications of the log will be written to
3064 * new positions. so it's safe to allow log writers to go in.
3065 */
3066 mutex_unlock(&root->log_mutex);
3067
3068 if (btrfs_is_zoned(fs_info)) {
3069 mutex_lock(&fs_info->tree_root->log_mutex);
3070 if (!log_root_tree->node) {
3071 ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3072 if (ret) {
3073 mutex_unlock(&fs_info->tree_root->log_mutex);
3074 blk_finish_plug(&plug);
3075 goto out;
3076 }
3077 }
3078 mutex_unlock(&fs_info->tree_root->log_mutex);
3079 }
3080
3081 btrfs_init_log_ctx(&root_log_ctx, NULL);
3082
3083 mutex_lock(&log_root_tree->log_mutex);
3084
3085 index2 = log_root_tree->log_transid % 2;
3086 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3087 root_log_ctx.log_transid = log_root_tree->log_transid;
3088
3089 /*
3090 * Now we are safe to update the log_root_tree because we're under the
3091 * log_mutex, and we're a current writer so we're holding the commit
3092 * open until we drop the log_mutex.
3093 */
3094 ret = update_log_root(trans, log, &new_root_item);
3095 if (ret) {
3096 list_del_init(&root_log_ctx.list);
3097 blk_finish_plug(&plug);
3098 btrfs_set_log_full_commit(trans);
3099 if (ret != -ENOSPC)
3100 btrfs_err(fs_info,
3101 "failed to update log for root %llu ret %d",
3102 btrfs_root_id(root), ret);
3103 btrfs_wait_tree_log_extents(log, mark);
3104 mutex_unlock(&log_root_tree->log_mutex);
3105 goto out;
3106 }
3107
3108 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3109 blk_finish_plug(&plug);
3110 list_del_init(&root_log_ctx.list);
3111 mutex_unlock(&log_root_tree->log_mutex);
3112 ret = root_log_ctx.log_ret;
3113 goto out;
3114 }
3115
3116 if (atomic_read(&log_root_tree->log_commit[index2])) {
3117 blk_finish_plug(&plug);
3118 ret = btrfs_wait_tree_log_extents(log, mark);
3119 wait_log_commit(log_root_tree,
3120 root_log_ctx.log_transid);
3121 mutex_unlock(&log_root_tree->log_mutex);
3122 if (!ret)
3123 ret = root_log_ctx.log_ret;
3124 goto out;
3125 }
3126 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3127 atomic_set(&log_root_tree->log_commit[index2], 1);
3128
3129 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3130 wait_log_commit(log_root_tree,
3131 root_log_ctx.log_transid - 1);
3132 }
3133
3134 /*
3135 * now that we've moved on to the tree of log tree roots,
3136 * check the full commit flag again
3137 */
3138 if (btrfs_need_log_full_commit(trans)) {
3139 blk_finish_plug(&plug);
3140 btrfs_wait_tree_log_extents(log, mark);
3141 mutex_unlock(&log_root_tree->log_mutex);
3142 ret = BTRFS_LOG_FORCE_COMMIT;
3143 goto out_wake_log_root;
3144 }
3145
3146 ret = btrfs_write_marked_extents(fs_info,
3147 &log_root_tree->dirty_log_pages,
3148 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3149 blk_finish_plug(&plug);
3150 /*
3151 * As described above, -EAGAIN indicates a hole in the extents. We
3152 * cannot wait for these writeouts since waiting would cause a
3153 * deadlock. Bail out to the full commit instead.
3154 */
3155 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3156 btrfs_set_log_full_commit(trans);
3157 btrfs_wait_tree_log_extents(log, mark);
3158 mutex_unlock(&log_root_tree->log_mutex);
3159 goto out_wake_log_root;
3160 } else if (ret) {
3161 btrfs_set_log_full_commit(trans);
3162 mutex_unlock(&log_root_tree->log_mutex);
3163 goto out_wake_log_root;
3164 }
3165 ret = btrfs_wait_tree_log_extents(log, mark);
3166 if (!ret)
3167 ret = btrfs_wait_tree_log_extents(log_root_tree,
3168 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3169 if (ret) {
3170 btrfs_set_log_full_commit(trans);
3171 mutex_unlock(&log_root_tree->log_mutex);
3172 goto out_wake_log_root;
3173 }
3174
3175 log_root_start = log_root_tree->node->start;
3176 log_root_level = btrfs_header_level(log_root_tree->node);
3177 log_root_tree->log_transid++;
3178 mutex_unlock(&log_root_tree->log_mutex);
3179
3180 /*
3181 * Here we are guaranteed that nobody is going to write the superblock
3182 * for the current transaction before us, and that we will not write
3183 * our superblock before the previous transaction finishes its commit
3184 * and writes its superblock, because:
3185 *
3186 * 1) We are holding a handle on the current transaction, so nobody
3187 * can commit it until we release the handle;
3188 *
3189 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3190 * if the previous transaction is still committing, and hasn't yet
3191 * written its superblock, we wait for it to do it, because a
3192 * transaction commit acquires the tree_log_mutex when the commit
3193 * begins and releases it only after writing its superblock.
3194 */
3195 mutex_lock(&fs_info->tree_log_mutex);
3196
3197 /*
3198 * The previous transaction writeout phase could have failed, and thus
3199 * marked the fs in an error state. We must not commit here, as we
3200 * could have updated our generation in the super_for_commit and
3201 * writing the super here would result in transid mismatches. If there
3202 * is an error here just bail.
3203 */
3204 if (BTRFS_FS_ERROR(fs_info)) {
3205 ret = -EIO;
3206 btrfs_set_log_full_commit(trans);
3207 btrfs_abort_transaction(trans, ret);
3208 mutex_unlock(&fs_info->tree_log_mutex);
3209 goto out_wake_log_root;
3210 }
3211
3212 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3213 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3214 ret = write_all_supers(fs_info, 1);
3215 mutex_unlock(&fs_info->tree_log_mutex);
3216 if (ret) {
3217 btrfs_set_log_full_commit(trans);
3218 btrfs_abort_transaction(trans, ret);
3219 goto out_wake_log_root;
3220 }
3221
3222 /*
3223 * We know there can only be one task here, since we have not yet set
3224 * root->log_commit[index1] to 0 and any task attempting to sync the
3225 * log must wait for the previous log transaction to commit if it's
3226 * still in progress or wait for the current log transaction commit if
3227 * someone else already started it. We use <= and not < because the
3228 * first log transaction has an ID of 0.
3229 */
3230 ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
3231 btrfs_set_root_last_log_commit(root, log_transid);
3232
3233 out_wake_log_root:
3234 mutex_lock(&log_root_tree->log_mutex);
3235 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3236
3237 log_root_tree->log_transid_committed++;
3238 atomic_set(&log_root_tree->log_commit[index2], 0);
3239 mutex_unlock(&log_root_tree->log_mutex);
3240
3241 /*
3242 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3243 * all the updates above are seen by the woken threads. It might not be
3244 * necessary, but proving that seems to be hard.
3245 */
3246 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3247 out:
3248 mutex_lock(&root->log_mutex);
3249 btrfs_remove_all_log_ctxs(root, index1, ret);
3250 root->log_transid_committed++;
3251 atomic_set(&root->log_commit[index1], 0);
3252 mutex_unlock(&root->log_mutex);
3253
3254 /*
3255 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3256 * all the updates above are seen by the woken threads. It might not be
3257 * necessary, but proving that seems to be hard.
3258 */
3259 cond_wake_up(&root->log_commit_wait[index1]);
3260 return ret;
3261 }
3262
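/*
 * Release all extent buffers of a log tree by walking it with wc.free set and
 * drop the last reference on the log root. If the walk fails, flush and wait
 * for any remaining dirty log extents, set BTRFS_FS_STATE_LOG_CLEANUP_ERROR
 * and abort the transaction (or flag a filesystem error when no transaction
 * handle is given).
 */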
3263 static void free_log_tree(struct btrfs_trans_handle *trans,
3264 struct btrfs_root *log)
3265 {
3266 int ret;
3267 struct walk_control wc = {
3268 .free = 1,
3269 .process_func = process_one_buffer
3270 };
3271
3272 if (log->node) {
3273 ret = walk_log_tree(trans, log, &wc);
3274 if (ret) {
3275 /*
3276 * We weren't able to traverse the entire log tree, the
3277 * typical scenario is getting an -EIO when reading an
3278 * extent buffer of the tree, due to a previous writeback
3279 * failure of it.
3280 */
3281 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3282 &log->fs_info->fs_state);
3283
3284 /*
3285 * Some extent buffers of the log tree may still be dirty
3286 * and not yet written back to storage, because we may
3287 * have updates to a log tree without syncing a log tree,
3288 * such as during rename and link operations. So flush
3289 * them out and wait for their writeback to complete, so
3290 * that we properly cleanup their state and pages.
3291 */
3292 btrfs_write_marked_extents(log->fs_info,
3293 &log->dirty_log_pages,
3294 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3295 btrfs_wait_tree_log_extents(log,
3296 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3297
3298 if (trans)
3299 btrfs_abort_transaction(trans, ret);
3300 else
3301 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3302 }
3303 }
3304
3305 btrfs_extent_io_tree_release(&log->dirty_log_pages);
3306 btrfs_extent_io_tree_release(&log->log_csum_range);
3307
3308 btrfs_put_root(log);
3309 }
3310
3311 /*
3312 * free all the extents used by the tree log. This should be called
3313 * at commit time of the full transaction
3314 */
3315 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3316 {
3317 if (root->log_root) {
3318 free_log_tree(trans, root->log_root);
3319 root->log_root = NULL;
3320 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3321 }
3322 return 0;
3323 }
3324
3325 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3326 struct btrfs_fs_info *fs_info)
3327 {
3328 if (fs_info->log_root_tree) {
3329 free_log_tree(trans, fs_info->log_root_tree);
3330 fs_info->log_root_tree = NULL;
3331 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3332 }
3333 return 0;
3334 }
3335
3336 /*
3337 * Check if an inode was logged in the current transaction. This correctly deals
3338 * with the case where the inode was logged but has a logged_trans of 0, which
3339 * happens if the inode is evicted and loaded again, as logged_trans is an in
3340 * memory only field (not persisted).
3341 *
3342 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3343 * and < 0 on error.
3344 */
3345 static int inode_logged(const struct btrfs_trans_handle *trans,
3346 struct btrfs_inode *inode,
3347 struct btrfs_path *path_in)
3348 {
3349 struct btrfs_path *path = path_in;
3350 struct btrfs_key key;
3351 int ret;
3352
3353 if (inode->logged_trans == trans->transid)
3354 return 1;
3355
3356 /*
3357 * If logged_trans is not 0, then we know the inode was not logged
3358 * in this transaction, so we can return false right away.
3359 */
3360 if (inode->logged_trans > 0)
3361 return 0;
3362
3363 /*
3364 * If no log tree was created for this root in this transaction, then
3365 * the inode can not have been logged in this transaction. In that case
3366 * set logged_trans to anything greater than 0 and less than the current
3367 * transaction's ID, to avoid the search below in a future call in case
3368 * a log tree gets created after this.
3369 */
3370 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3371 inode->logged_trans = trans->transid - 1;
3372 return 0;
3373 }
3374
3375 /*
3376 * We have a log tree and the inode's logged_trans is 0. We can't tell
3377 * for sure if the inode was logged before in this transaction by looking
3378 * only at logged_trans. We could be pessimistic and assume it was, but
3379 * that can lead to unnecessarily logging an inode during rename and link
3380 * operations, and then further updating the log in followup rename and
3381 * link operations, especially if it's a directory, which adds latency
3382 * visible to applications doing a series of rename or link operations.
3383 *
3384 * A logged_trans of 0 here can mean several things:
3385 *
3386 * 1) The inode was never logged since the filesystem was mounted, and may
3387 * or may not have been evicted and loaded again;
3388 *
3389 * 2) The inode was logged in a previous transaction, then evicted and
3390 * then loaded again;
3391 *
3392 * 3) The inode was logged in the current transaction, then evicted and
3393 * then loaded again.
3394 *
3395 * For cases 1) and 2) we don't want to return true, but we need to detect
3396 * case 3) and return true. So we do a search in the log root for the inode
3397 * item.
3398 */
3399 key.objectid = btrfs_ino(inode);
3400 key.type = BTRFS_INODE_ITEM_KEY;
3401 key.offset = 0;
3402
3403 if (!path) {
3404 path = btrfs_alloc_path();
3405 if (!path)
3406 return -ENOMEM;
3407 }
3408
3409 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3410
3411 if (path_in)
3412 btrfs_release_path(path);
3413 else
3414 btrfs_free_path(path);
3415
3416 /*
3417 * Logging an inode always results in logging its inode item. So if we
3418 * did not find the item we know the inode was not logged for sure.
3419 */
3420 if (ret < 0) {
3421 return ret;
3422 } else if (ret > 0) {
3423 /*
3424 * Set logged_trans to a value greater than 0 and less than the
3425 * current transaction to avoid doing the search in future calls.
3426 */
3427 inode->logged_trans = trans->transid - 1;
3428 return 0;
3429 }
3430
3431 /*
3432 * The inode was previously logged and then evicted, set logged_trans to
3433 * the current transaction's ID, to avoid future tree searches as long as
3434 * the inode is not evicted again.
3435 */
3436 inode->logged_trans = trans->transid;
3437
3438 /*
3439 * If it's a directory, then we must set last_dir_index_offset to the
3440 * maximum possible value, so that the next attempt to log the inode does
3441 * not skip checking if dir index keys found in modified subvolume tree
3442 * leaves have been logged before, otherwise it would result in attempts
3443 * to insert duplicate dir index keys in the log tree. This must be done
3444 * because last_dir_index_offset is an in-memory only field, not persisted
3445 * in the inode item or any other on-disk structure, so its value is lost
3446 * once the inode is evicted.
3447 */
3448 if (S_ISDIR(inode->vfs_inode.i_mode))
3449 inode->last_dir_index_offset = (u64)-1;
3450
3451 return 1;
3452 }
3453
3454 /*
3455 * Delete a directory entry from the log if it exists.
3456 *
3457 * Returns < 0 on error
3458 * 1 if the entry does not exist
3459 * 0 if the entry existed and was successfully deleted
3460 */
3461 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3462 struct btrfs_root *log,
3463 struct btrfs_path *path,
3464 u64 dir_ino,
3465 const struct fscrypt_str *name,
3466 u64 index)
3467 {
3468 struct btrfs_dir_item *di;
3469
3470 /*
3471 * We only log dir index items of a directory, so we don't need to look
3472 * for dir item keys.
3473 */
3474 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3475 index, name, -1);
3476 if (IS_ERR(di))
3477 return PTR_ERR(di);
3478 else if (!di)
3479 return 1;
3480
3481 /*
3482 * We do not need to update the size field of the directory's
3483 * inode item because on log replay we update the field to reflect
3484 * all existing entries in the directory (see overwrite_item()).
3485 */
3486 return btrfs_del_item(trans, log, path);
3487 }
3488
3489 /*
3490 * If both a file and directory are logged, and unlinks or renames are
3491 * mixed in, we have a few interesting corners:
3492 *
3493 * create file X in dir Y
3494 * link file X to X.link in dir Y
3495 * fsync file X
3496 * unlink file X but leave X.link
3497 * fsync dir Y
3498 *
3499 * After a crash we would expect only X.link to exist. But file X
3500 * didn't get fsync'd again so the log has back refs for X and X.link.
3501 *
3502 * We solve this by removing directory entries and inode backrefs from the
3503 * log when a file that was logged in the current transaction is
3504 * unlinked. Any later fsync will include the updated log entries, and
3505 * we'll be able to reconstruct the proper directory items from backrefs.
3506 *
3507 * This optimization allows us to avoid relogging the entire inode
3508 * or the entire directory.
3509 */
3510 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3511 struct btrfs_root *root,
3512 const struct fscrypt_str *name,
3513 struct btrfs_inode *dir, u64 index)
3514 {
3515 struct btrfs_path *path;
3516 int ret;
3517
3518 ret = inode_logged(trans, dir, NULL);
3519 if (ret == 0)
3520 return;
3521 else if (ret < 0) {
3522 btrfs_set_log_full_commit(trans);
3523 return;
3524 }
3525
3526 path = btrfs_alloc_path();
3527 if (!path) {
3528 btrfs_set_log_full_commit(trans);
3529 return;
3530 }
3531
3532 ret = join_running_log_trans(root);
3533 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3534 if (WARN_ON(ret))
3535 goto out;
3536
3537 mutex_lock(&dir->log_mutex);
3538
3539 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3540 name, index);
3541 mutex_unlock(&dir->log_mutex);
3542 if (ret < 0)
3543 btrfs_set_log_full_commit(trans);
3544 btrfs_end_log_trans(root);
3545 out:
3546 btrfs_free_path(path);
3547 }
3548
3549 /* see comments for btrfs_del_dir_entries_in_log */
3550 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3551 struct btrfs_root *root,
3552 const struct fscrypt_str *name,
3553 struct btrfs_inode *inode, u64 dirid)
3554 {
3555 struct btrfs_root *log;
3556 int ret;
3557
3558 ret = inode_logged(trans, inode, NULL);
3559 if (ret == 0)
3560 return;
3561 else if (ret < 0) {
3562 btrfs_set_log_full_commit(trans);
3563 return;
3564 }
3565
3566 ret = join_running_log_trans(root);
3567 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3568 if (WARN_ON(ret))
3569 return;
3570 log = root->log_root;
3571 mutex_lock(&inode->log_mutex);
3572
3573 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL);
3574 mutex_unlock(&inode->log_mutex);
3575 if (ret < 0 && ret != -ENOENT)
3576 btrfs_set_log_full_commit(trans);
3577 btrfs_end_log_trans(root);
3578 }
3579
3580 /*
3581 * creates a range item in the log for 'dirid'. first_offset and
3582 * last_offset tell us which parts of the key space the log should
3583 * be considered authoritative for.
3584 */
3585 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3586 struct btrfs_root *log,
3587 struct btrfs_path *path,
3588 u64 dirid,
3589 u64 first_offset, u64 last_offset)
3590 {
3591 int ret;
3592 struct btrfs_key key;
3593 struct btrfs_dir_log_item *item;
3594
3595 key.objectid = dirid;
3596 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3597 key.offset = first_offset;
3598 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3599 /*
3600 * -EEXIST is fine and can happen sporadically when we are logging a
3601 * directory and have concurrent insertions in the subvolume's tree for
3602 * items from other inodes that result in pushing off some dir items
3603 * from one leaf to another in order to accommodate the new items.
3604 * This results in logging the same dir index range key.
3605 */
3606 if (ret && ret != -EEXIST)
3607 return ret;
3608
3609 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3610 struct btrfs_dir_log_item);
3611 if (ret == -EEXIST) {
3612 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3613
3614 /*
3615 * btrfs_del_dir_entries_in_log() might have been called during
3616 * an unlink between the initial insertion of this key and the
3617 * current update, or we might be logging a single entry deletion
3618 * during a rename, so set the new last_offset to the max value.
3619 */
3620 last_offset = max(last_offset, curr_end);
3621 }
3622 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3623 btrfs_release_path(path);
3624 return 0;
3625 }
3626
3627 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3628 struct btrfs_inode *inode,
3629 struct extent_buffer *src,
3630 struct btrfs_path *dst_path,
3631 int start_slot,
3632 int count)
3633 {
3634 struct btrfs_root *log = inode->root->log_root;
3635 char *ins_data = NULL;
3636 struct btrfs_item_batch batch;
3637 struct extent_buffer *dst;
3638 unsigned long src_offset;
3639 unsigned long dst_offset;
3640 u64 last_index;
3641 struct btrfs_key key;
3642 u32 item_size;
3643 int ret;
3644 int i;
3645
3646 ASSERT(count > 0);
3647 batch.nr = count;
3648
3649 if (count == 1) {
3650 btrfs_item_key_to_cpu(src, &key, start_slot);
3651 item_size = btrfs_item_size(src, start_slot);
3652 batch.keys = &key;
3653 batch.data_sizes = &item_size;
3654 batch.total_data_size = item_size;
3655 } else {
3656 struct btrfs_key *ins_keys;
3657 u32 *ins_sizes;
3658
3659 ins_data = kmalloc(count * sizeof(u32) +
3660 count * sizeof(struct btrfs_key), GFP_NOFS);
3661 if (!ins_data)
3662 return -ENOMEM;
3663
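/*
 * The ins_data buffer backs both arrays in a single allocation: the u32
 * item sizes come first, followed by the array of keys, matching the
 * kmalloc() size computed above.
 */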
3664 ins_sizes = (u32 *)ins_data;
3665 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3666 batch.keys = ins_keys;
3667 batch.data_sizes = ins_sizes;
3668 batch.total_data_size = 0;
3669
3670 for (i = 0; i < count; i++) {
3671 const int slot = start_slot + i;
3672
3673 btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3674 ins_sizes[i] = btrfs_item_size(src, slot);
3675 batch.total_data_size += ins_sizes[i];
3676 }
3677 }
3678
3679 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3680 if (ret)
3681 goto out;
3682
3683 dst = dst_path->nodes[0];
3684 /*
3685 * Copy all the items in bulk, in a single copy operation. Item data is
3686 * organized such that it's placed at the end of a leaf and from right
3687 * to left. For example, the data for the second item ends at an offset
3688 * that matches the offset where the data for the first item starts, the
3689 * data for the third item ends at an offset that matches the offset
3690 * where the data of the second item starts, and so on.
3691 * Therefore our source and destination start offsets for copy match the
3692 * offsets of the last items (highest slots).
3693 */
3694 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3695 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3696 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3697 btrfs_release_path(dst_path);
3698
3699 last_index = batch.keys[count - 1].offset;
3700 ASSERT(last_index > inode->last_dir_index_offset);
3701
3702 /*
3703 * If for some unexpected reason the last item's index is not greater
3704 * than the last index we logged, warn and force a transaction commit.
3705 */
3706 if (WARN_ON(last_index <= inode->last_dir_index_offset))
3707 ret = BTRFS_LOG_FORCE_COMMIT;
3708 else
3709 inode->last_dir_index_offset = last_index;
3710
3711 if (btrfs_get_first_dir_index_to_log(inode) == 0)
3712 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3713 out:
3714 kfree(ins_data);
3715
3716 return ret;
3717 }
3718
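/*
 * Clone the leaf at path->slots[0] into ctx->scratch_eb (allocating the
 * scratch extent buffer on first use, reusing it afterwards) and make
 * the path point at the clone, so the original leaf and its lock can be
 * released while we keep reading the copied items.
 */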
3719 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
3720 {
3721 const int slot = path->slots[0];
3722
3723 if (ctx->scratch_eb) {
3724 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
3725 } else {
3726 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
3727 if (!ctx->scratch_eb)
3728 return -ENOMEM;
3729 }
3730
3731 btrfs_release_path(path);
3732 path->nodes[0] = ctx->scratch_eb;
3733 path->slots[0] = slot;
3734 /*
3735 * Add extra ref to scratch eb so that it is not freed when callers
3736 * release the path, so we can reuse it later if needed.
3737 */
3738 refcount_inc(&ctx->scratch_eb->refs);
3739
3740 return 0;
3741 }
3742
3743 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3744 struct btrfs_inode *inode,
3745 struct btrfs_path *path,
3746 struct btrfs_path *dst_path,
3747 struct btrfs_log_ctx *ctx,
3748 u64 *last_old_dentry_offset)
3749 {
3750 struct btrfs_root *log = inode->root->log_root;
3751 struct extent_buffer *src;
3752 const int nritems = btrfs_header_nritems(path->nodes[0]);
3753 const u64 ino = btrfs_ino(inode);
3754 bool last_found = false;
3755 int batch_start = 0;
3756 int batch_size = 0;
3757 int ret;
3758
3759 /*
3760 * We need to clone the leaf, release the read lock on it, and use the
3761 * clone before modifying the log tree. See the comment at copy_items()
3762 * about why we need to do this.
3763 */
3764 ret = clone_leaf(path, ctx);
3765 if (ret < 0)
3766 return ret;
3767
3768 src = path->nodes[0];
3769
3770 for (int i = path->slots[0]; i < nritems; i++) {
3771 struct btrfs_dir_item *di;
3772 struct btrfs_key key;
3773 int ret;
3774
3775 btrfs_item_key_to_cpu(src, &key, i);
3776
3777 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3778 last_found = true;
3779 break;
3780 }
3781
3782 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3783
3784 /*
3785 * Skip ranges of items that consist only of dir item keys created
3786 * in past transactions. However if we find a gap, we must log a
3787 * dir index range item for that gap, so that index keys in that
3788 * gap are deleted during log replay.
3789 */
3790 if (btrfs_dir_transid(src, di) < trans->transid) {
3791 if (key.offset > *last_old_dentry_offset + 1) {
3792 ret = insert_dir_log_key(trans, log, dst_path,
3793 ino, *last_old_dentry_offset + 1,
3794 key.offset - 1);
3795 if (ret < 0)
3796 return ret;
3797 }
3798
3799 *last_old_dentry_offset = key.offset;
3800 continue;
3801 }
3802
3803 /* If we logged this dir index item before, we can skip it. */
3804 if (key.offset <= inode->last_dir_index_offset)
3805 continue;
3806
3807 /*
3808 * We must make sure that when we log a directory entry, the
3809 * corresponding inode, after log replay, has a matching link
3810 * count. For example:
3811 *
3812 * touch foo
3813 * mkdir mydir
3814 * sync
3815 * ln foo mydir/bar
3816 * xfs_io -c "fsync" mydir
3817 * <crash>
3818 * <mount fs and log replay>
3819 *
3820 * Would result in an fsync log that, when replayed, leaves our file
3821 * inode with a link count of 1 but two directory entries
3822 * pointing to the same inode. After removing one of the names,
3823 * it would not be possible to remove the other name, which
3824 * always resulted in stale file handle errors, and it would not be
3825 * possible to rmdir the parent directory, since its i_size could
3826 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3827 * resulting in -ENOTEMPTY errors.
3828 */
3829 if (!ctx->log_new_dentries) {
3830 struct btrfs_key di_key;
3831
3832 btrfs_dir_item_key_to_cpu(src, di, &di_key);
3833 if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3834 ctx->log_new_dentries = true;
3835 }
3836
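		/* Start a new batch of consecutive dir index items to copy to the log. */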
3837 if (batch_size == 0)
3838 batch_start = i;
3839 batch_size++;
3840 }
3841
3842 if (batch_size > 0) {
3843 int ret;
3844
3845 ret = flush_dir_items_batch(trans, inode, src, dst_path,
3846 batch_start, batch_size);
3847 if (ret < 0)
3848 return ret;
3849 }
3850
3851 return last_found ? 1 : 0;
3852 }
3853
3854 /*
3855 * log all the items included in the current transaction for a given
3856 * directory. This also creates the range items in the log tree required
3857 * to replay anything deleted before the fsync
3858 */
3859 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3860 struct btrfs_inode *inode,
3861 struct btrfs_path *path,
3862 struct btrfs_path *dst_path,
3863 struct btrfs_log_ctx *ctx,
3864 u64 min_offset, u64 *last_offset_ret)
3865 {
3866 struct btrfs_key min_key;
3867 struct btrfs_root *root = inode->root;
3868 struct btrfs_root *log = root->log_root;
3869 int ret;
3870 u64 last_old_dentry_offset = min_offset - 1;
3871 u64 last_offset = (u64)-1;
3872 u64 ino = btrfs_ino(inode);
3873
3874 min_key.objectid = ino;
3875 min_key.type = BTRFS_DIR_INDEX_KEY;
3876 min_key.offset = min_offset;
3877
3878 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3879
3880 /*
3881 * we didn't find anything from this transaction, see if there
3882 * is anything at all
3883 */
3884 if (ret != 0 || min_key.objectid != ino ||
3885 min_key.type != BTRFS_DIR_INDEX_KEY) {
3886 min_key.objectid = ino;
3887 min_key.type = BTRFS_DIR_INDEX_KEY;
3888 min_key.offset = (u64)-1;
3889 btrfs_release_path(path);
3890 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3891 if (ret < 0) {
3892 btrfs_release_path(path);
3893 return ret;
3894 }
3895 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3896
3897 /* if ret == 0 there are items for this type,
3898 * create a range to tell us the last key of this type.
3899 * otherwise, there are no items in this directory after
3900 * *min_offset, and we create a range to indicate that.
3901 */
3902 if (ret == 0) {
3903 struct btrfs_key tmp;
3904
3905 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3906 path->slots[0]);
3907 if (tmp.type == BTRFS_DIR_INDEX_KEY)
3908 last_old_dentry_offset = tmp.offset;
3909 } else if (ret > 0) {
3910 ret = 0;
3911 }
3912
3913 goto done;
3914 }
3915
3916 /* go backward to find any previous key */
3917 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3918 if (ret == 0) {
3919 struct btrfs_key tmp;
3920
3921 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3922 /*
3923 * The dir index key before the first one we found that needs to
3924 * be logged might be in a previous leaf, and there might be a
3925 * gap between these keys, meaning that we had deletions that
3926 * happened. So the key range item we log (key type
3927 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3928 * previous key's offset plus 1, so that those deletes are replayed.
3929 */
3930 if (tmp.type == BTRFS_DIR_INDEX_KEY)
3931 last_old_dentry_offset = tmp.offset;
3932 } else if (ret < 0) {
3933 goto done;
3934 }
3935
3936 btrfs_release_path(path);
3937
3938 /*
3939 * Find the first key from this transaction again or the one we were at
3940 * in the loop below in case we had to reschedule. We may be logging the
3941 * directory without holding its VFS lock, which happens when logging new
3942 * dentries (through log_new_dir_dentries()) or in some cases when we
3943 * need to log the parent directory of an inode. This means a dir index
3944 * key might be deleted from the inode's root, and therefore we may not
3945 * find it anymore. If we can't find it, just move to the next key. We
3946 * can not bail out and ignore, because if we do that we will simply
3947 * not log dir index keys that come after the one that was just deleted
3948 * and we can end up logging a dir index range that ends at (u64)-1
3949 * (@last_offset is initialized to that), resulting in removing dir
3950 * entries we should not remove at log replay time.
3951 */
3952 search:
3953 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3954 if (ret > 0) {
3955 ret = btrfs_next_item(root, path);
3956 if (ret > 0) {
3957 /* There are no more keys in the inode's root. */
3958 ret = 0;
3959 goto done;
3960 }
3961 }
3962 if (ret < 0)
3963 goto done;
3964
3965 /*
3966 * we have a block from this transaction, log every item in it
3967 * from our directory
3968 */
3969 while (1) {
3970 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3971 &last_old_dentry_offset);
3972 if (ret != 0) {
3973 if (ret > 0)
3974 ret = 0;
3975 goto done;
3976 }
3977 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3978
3979 /*
3980 * look ahead to the next item and see if it is also
3981 * from this directory and from this transaction
3982 */
3983 ret = btrfs_next_leaf(root, path);
3984 if (ret) {
3985 if (ret == 1) {
3986 last_offset = (u64)-1;
3987 ret = 0;
3988 }
3989 goto done;
3990 }
3991 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3992 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3993 last_offset = (u64)-1;
3994 goto done;
3995 }
3996 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3997 /*
3998 * The next leaf was not changed in the current transaction
3999 * and has at least one dir index key.
4000 * We check for the next key because there might have been
4001 * one or more deletions between the last key we logged and
4002 * that next key. So the key range item we log (key type
4003 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
4004 * offset minus 1, so that those deletes are replayed.
4005 */
4006 last_offset = min_key.offset - 1;
4007 goto done;
4008 }
4009 if (need_resched()) {
4010 btrfs_release_path(path);
4011 cond_resched();
4012 goto search;
4013 }
4014 }
4015 done:
4016 btrfs_release_path(path);
4017 btrfs_release_path(dst_path);
4018
4019 if (ret == 0) {
4020 *last_offset_ret = last_offset;
4021 /*
4022 * In case the leaf was changed in the current transaction but
4023 * all its dir items are from a past transaction, the last item
4024 * in the leaf is a dir item and there's no gap between that last
4025 * dir item and the first one on the next leaf (which did not
4026 * change in the current transaction), then we don't need to log
4027 * a range, since last_old_dentry_offset is equal to last_offset.
4028 */
4029 ASSERT(last_old_dentry_offset <= last_offset);
4030 if (last_old_dentry_offset < last_offset)
4031 ret = insert_dir_log_key(trans, log, path, ino,
4032 last_old_dentry_offset + 1,
4033 last_offset);
4034 }
4035
4036 return ret;
4037 }
4038
4039 /*
4040 * If the inode was logged before and it was evicted, then its
4041 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
4042 * key offset. If that's the case, search for it and update the inode. This
4043 * is to avoid lookups in the log tree every time we try to insert a dir index
4044 * key from a leaf changed in the current transaction, and to allow us to always
4045 * do batch insertions of dir index keys.
4046 */
4047 static int update_last_dir_index_offset(struct btrfs_inode *inode,
4048 struct btrfs_path *path,
4049 const struct btrfs_log_ctx *ctx)
4050 {
4051 const u64 ino = btrfs_ino(inode);
4052 struct btrfs_key key;
4053 int ret;
4054
4055 lockdep_assert_held(&inode->log_mutex);
4056
4057 if (inode->last_dir_index_offset != (u64)-1)
4058 return 0;
4059
4060 if (!ctx->logged_before) {
4061 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4062 return 0;
4063 }
4064
4065 key.objectid = ino;
4066 key.type = BTRFS_DIR_INDEX_KEY;
4067 key.offset = (u64)-1;
4068
4069 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4070 /*
4071 * An error happened or we actually have an index key with an offset
4072 * value of (u64)-1. Bail out, we're done.
4073 */
4074 if (ret <= 0)
4075 goto out;
4076
4077 ret = 0;
4078 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4079
4080 /*
4081 * No dir index items, bail out and leave last_dir_index_offset with
4082 * the value right before the first valid index value.
4083 */
4084 if (path->slots[0] == 0)
4085 goto out;
4086
4087 /*
4088 * btrfs_search_slot() left us at one slot beyond the slot with the last
4089 * index key, or beyond the last key of the directory that is not an
4090 * index key. If we have an index key before, set last_dir_index_offset
4091 * to its offset value, otherwise leave it with a value right before the
4092 * first valid index value, as it means we have an empty directory.
4093 */
4094 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4095 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4096 inode->last_dir_index_offset = key.offset;
4097
4098 out:
4099 btrfs_release_path(path);
4100
4101 return ret;
4102 }
4103
4104 /*
4105 * logging directories is very similar to logging inodes. We find all the items
4106 * from the current transaction and write them to the log.
4107 *
4108 * The recovery code scans the directory in the subvolume, and if it finds a
4109 * key in the range logged that is not present in the log tree, then it means
4110 * that dir entry was unlinked during the transaction.
4111 *
4112 * In order for that scan to work, we must include one key smaller than
4113 * the smallest logged by this transaction and one key larger than the largest
4114 * key logged by this transaction.
4115 */
4116 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4117 struct btrfs_inode *inode,
4118 struct btrfs_path *path,
4119 struct btrfs_path *dst_path,
4120 struct btrfs_log_ctx *ctx)
4121 {
4122 u64 min_key;
4123 u64 max_key;
4124 int ret;
4125
4126 ret = update_last_dir_index_offset(inode, path, ctx);
4127 if (ret)
4128 return ret;
4129
4130 min_key = BTRFS_DIR_START_INDEX;
4131 max_key = 0;
4132
4133 while (1) {
4134 ret = log_dir_items(trans, inode, path, dst_path,
4135 ctx, min_key, &max_key);
4136 if (ret)
4137 return ret;
4138 if (max_key == (u64)-1)
4139 break;
4140 min_key = max_key + 1;
4141 }
4142
4143 return 0;
4144 }
4145
4146 /*
4147 * a helper function to drop items from the log before we relog an
4148 * inode. max_key_type indicates the highest item type to remove.
4149 * This cannot be run for file data extents because it does not
4150 * free the extents they point to.
4151 */
4152 static int drop_inode_items(struct btrfs_trans_handle *trans,
4153 struct btrfs_root *log,
4154 struct btrfs_path *path,
4155 struct btrfs_inode *inode,
4156 int max_key_type)
4157 {
4158 int ret;
4159 struct btrfs_key key;
4160 struct btrfs_key found_key;
4161 int start_slot;
4162
4163 key.objectid = btrfs_ino(inode);
4164 key.type = max_key_type;
4165 key.offset = (u64)-1;
4166
4167 while (1) {
4168 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4169 if (ret < 0) {
4170 break;
4171 } else if (ret > 0) {
4172 if (path->slots[0] == 0)
4173 break;
4174 path->slots[0]--;
4175 }
4176
4177 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4178 path->slots[0]);
4179
4180 if (found_key.objectid != key.objectid)
4181 break;
4182
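		/*
		 * Reset type and offset to zero and binary search the leaf so
		 * that start_slot points to the first item of this inode in
		 * the leaf, allowing us to delete the whole contiguous run of
		 * items in one call below.
		 */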
4183 found_key.offset = 0;
4184 found_key.type = 0;
4185 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4186 if (ret < 0)
4187 break;
4188
4189 ret = btrfs_del_items(trans, log, path, start_slot,
4190 path->slots[0] - start_slot + 1);
4191 /*
4192 * If start slot isn't 0 then we don't need to re-search, we've
4193 * found the last guy with the objectid in this tree.
4194 */
4195 if (ret || start_slot != 0)
4196 break;
4197 btrfs_release_path(path);
4198 }
4199 btrfs_release_path(path);
4200 if (ret > 0)
4201 ret = 0;
4202 return ret;
4203 }
4204
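/*
 * Helper to truncate items of an inode in the log tree. Extent reference
 * updates are skipped (skip_ref_updates) since the log tree does not hold
 * references on the data extents its file extent items point to.
 */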
4205 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4206 struct btrfs_root *log_root,
4207 struct btrfs_inode *inode,
4208 u64 new_size, u32 min_type)
4209 {
4210 struct btrfs_truncate_control control = {
4211 .new_size = new_size,
4212 .ino = btrfs_ino(inode),
4213 .min_type = min_type,
4214 .skip_ref_updates = true,
4215 };
4216
4217 return btrfs_truncate_inode_items(trans, log_root, &control);
4218 }
4219
4220 static void fill_inode_item(struct btrfs_trans_handle *trans,
4221 struct extent_buffer *leaf,
4222 struct btrfs_inode_item *item,
4223 struct inode *inode, int log_inode_only,
4224 u64 logged_isize)
4225 {
4226 u64 flags;
4227
4228 if (log_inode_only) {
4229 /* set the generation to zero so the recovery code
4230 * can tell the difference between a logging
4231 * just to say 'this inode exists' and a logging
4232 * to say 'update this inode with these values'
4233 */
4234 btrfs_set_inode_generation(leaf, item, 0);
4235 btrfs_set_inode_size(leaf, item, logged_isize);
4236 } else {
4237 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
4238 btrfs_set_inode_size(leaf, item, inode->i_size);
4239 }
4240
4241 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
4242 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
4243 btrfs_set_inode_mode(leaf, item, inode->i_mode);
4244 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
4245
4246 btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
4247 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
4248
4249 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
4250 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
4251
4252 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
4253 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
4254
4255 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
4256 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4257
4258 /*
4259 * We do not need to set the nbytes field, in fact during a fast fsync
4260 * its value may not even be correct, since a fast fsync does not wait
4261 * for ordered extent completion, which is where we update nbytes, it
4262 * only waits for writeback to complete. During log replay as we find
4263 * file extent items and replay them, we adjust the nbytes field of the
4264 * inode item in subvolume tree as needed (see overwrite_item()).
4265 */
4266
4267 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
4268 btrfs_set_inode_transid(leaf, item, trans->transid);
4269 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
4270 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4271 BTRFS_I(inode)->ro_flags);
4272 btrfs_set_inode_flags(leaf, item, flags);
4273 btrfs_set_inode_block_group(leaf, item, 0);
4274 }
4275
4276 static int log_inode_item(struct btrfs_trans_handle *trans,
4277 struct btrfs_root *log, struct btrfs_path *path,
4278 struct btrfs_inode *inode, bool inode_item_dropped)
4279 {
4280 struct btrfs_inode_item *inode_item;
4281 struct btrfs_key key;
4282 int ret;
4283
4284 btrfs_get_inode_key(inode, &key);
4285 /*
4286 * If we are doing a fast fsync and the inode was logged before in the
4287 * current transaction, then we know the inode was previously logged and
4288 * it exists in the log tree. For performance reasons, in this case use
4289 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4290 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4291 * contention in case there are concurrent fsyncs for other inodes of the
4292 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4293 * already exists can also result in unnecessarily splitting a leaf.
4294 */
4295 if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4296 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
4297 ASSERT(ret <= 0);
4298 if (ret > 0)
4299 ret = -ENOENT;
4300 } else {
4301 /*
4302 * This means it is the first fsync in the current transaction,
4303 * so the inode item is not in the log and we need to insert it.
4304 * We can never get -EEXIST because we are only called for a fast
4305 * fsync and in case an inode eviction happens after the inode was
4306 * logged before in the current transaction, when we load again
4307 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4308 * flags and set ->logged_trans to 0.
4309 */
4310 ret = btrfs_insert_empty_item(trans, log, path, &key,
4311 sizeof(*inode_item));
4312 ASSERT(ret != -EEXIST);
4313 }
4314 if (ret)
4315 return ret;
4316 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4317 struct btrfs_inode_item);
4318 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4319 0, 0);
4320 btrfs_release_path(path);
4321 return 0;
4322 }
4323
4324 static int log_csums(struct btrfs_trans_handle *trans,
4325 struct btrfs_inode *inode,
4326 struct btrfs_root *log_root,
4327 struct btrfs_ordered_sum *sums)
4328 {
4329 const u64 lock_end = sums->logical + sums->len - 1;
4330 struct extent_state *cached_state = NULL;
4331 int ret;
4332
4333 /*
4334 * If this inode was not used for reflink operations in the current
4335 * transaction with new extents, then do the fast path, no need to
4336 * worry about logging checksum items with overlapping ranges.
4337 */
4338 if (inode->last_reflink_trans < trans->transid)
4339 return btrfs_csum_file_blocks(trans, log_root, sums);
4340
4341 /*
4342 * Serialize logging for checksums. This is to avoid racing with the
4343 * same checksum being logged by another task that is logging another
4344 * file which happens to refer to the same extent as well. Such races
4345 * can leave checksum items in the log with overlapping ranges.
4346 */
4347 ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4348 &cached_state);
4349 if (ret)
4350 return ret;
4351 /*
4352 * Due to extent cloning, we might have logged a csum item that covers a
4353 * subrange of a cloned extent, and later we can end up logging a csum
4354 * item for a larger subrange of the same extent or the entire range.
4355 * This would leave csum items in the log tree that cover the same range
4356 * and break the searches for checksums in the log tree, resulting in
4357 * some checksums missing in the fs/subvolume tree. So just delete (or
4358 * trim and adjust) any existing csum items in the log for this range.
4359 */
4360 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4361 if (!ret)
4362 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4363
4364 btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4365 &cached_state);
4366
4367 return ret;
4368 }
4369
4370 static noinline int copy_items(struct btrfs_trans_handle *trans,
4371 struct btrfs_inode *inode,
4372 struct btrfs_path *dst_path,
4373 struct btrfs_path *src_path,
4374 int start_slot, int nr, int inode_only,
4375 u64 logged_isize, struct btrfs_log_ctx *ctx)
4376 {
4377 struct btrfs_root *log = inode->root->log_root;
4378 struct btrfs_file_extent_item *extent;
4379 struct extent_buffer *src;
4380 int ret;
4381 struct btrfs_key *ins_keys;
4382 u32 *ins_sizes;
4383 struct btrfs_item_batch batch;
4384 char *ins_data;
4385 int dst_index;
4386 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4387 const u64 i_size = i_size_read(&inode->vfs_inode);
4388
4389 /*
4390 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4391 * use the clone. This is because otherwise we would be changing the log
4392 * tree, to insert items from the subvolume tree or insert csum items,
4393 * while holding a read lock on a leaf from the subvolume tree, which
4394 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4395 *
4396 * 1) Modifying the log tree triggers an extent buffer allocation while
4397 * holding a write lock on a parent extent buffer from the log tree.
4398 * Allocating the pages for an extent buffer, or the extent buffer
4399 * struct, can trigger inode eviction and finally the inode eviction
4400 * will trigger a release/remove of a delayed node, which requires
4401 * taking the delayed node's mutex;
4402 *
4403 * 2) Allocating a metadata extent for a log tree can trigger the async
4404 * reclaim thread and make us wait for it to release enough space and
4405 * unblock our reservation ticket. The reclaim thread can start
4406 * flushing delayed items, and that in turn results in the need to
4407 * lock delayed node mutexes and in the need to write lock extent
4408 * buffers of a subvolume tree - all this while holding a write lock
4409 * on the parent extent buffer in the log tree.
4410 *
4411 * So one task in scenario 1) running in parallel with another task in
4412 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4413 * node mutex while having a read lock on a leaf from the subvolume,
4414 * while the other is holding the delayed node's mutex and wants to
4415 * write lock the same subvolume leaf for flushing delayed items.
4416 */
4417 ret = clone_leaf(src_path, ctx);
4418 if (ret < 0)
4419 return ret;
4420
4421 src = src_path->nodes[0];
4422
4423 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4424 nr * sizeof(u32), GFP_NOFS);
4425 if (!ins_data)
4426 return -ENOMEM;
4427
4428 ins_sizes = (u32 *)ins_data;
4429 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4430 batch.keys = ins_keys;
4431 batch.data_sizes = ins_sizes;
4432 batch.total_data_size = 0;
4433 batch.nr = 0;
4434
4435 dst_index = 0;
4436 for (int i = 0; i < nr; i++) {
4437 const int src_slot = start_slot + i;
4438 struct btrfs_root *csum_root;
4439 struct btrfs_ordered_sum *sums;
4440 struct btrfs_ordered_sum *sums_next;
4441 LIST_HEAD(ordered_sums);
4442 u64 disk_bytenr;
4443 u64 disk_num_bytes;
4444 u64 extent_offset;
4445 u64 extent_num_bytes;
4446 bool is_old_extent;
4447
4448 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4449
4450 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4451 goto add_to_batch;
4452
4453 extent = btrfs_item_ptr(src, src_slot,
4454 struct btrfs_file_extent_item);
4455
4456 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4457 trans->transid);
4458
4459 /*
4460 * Don't copy extents from past generations. That would make us
4461 * log a lot more metadata for common cases like doing only a
4462 * few random writes into a file and then fsync it for the first
4463 * time or after the full sync flag is set on the inode. We can
4464 * get leaves full of extent items, most of which are from past
4465 * generations, so we can skip them - as long as the inode has
4466 * not been the target of a reflink operation in this transaction,
4467 * as in that case it might have had file extent items with old
4468 * generations copied into it. We also must always log prealloc
4469 * extents that start at or beyond eof, otherwise we would lose
4470 * them on log replay.
4471 */
4472 if (is_old_extent &&
4473 ins_keys[dst_index].offset < i_size &&
4474 inode->last_reflink_trans < trans->transid)
4475 continue;
4476
4477 if (skip_csum)
4478 goto add_to_batch;
4479
4480 /* Only regular extents have checksums. */
4481 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4482 goto add_to_batch;
4483
4484 /*
4485 * If it's an extent created in a past transaction, then its
4486 * checksums are already accessible from the committed csum tree,
4487 * no need to log them.
4488 */
4489 if (is_old_extent)
4490 goto add_to_batch;
4491
4492 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4493 /* If it's an explicit hole, there are no checksums. */
4494 if (disk_bytenr == 0)
4495 goto add_to_batch;
4496
4497 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4498
4499 if (btrfs_file_extent_compression(src, extent)) {
4500 extent_offset = 0;
4501 extent_num_bytes = disk_num_bytes;
4502 } else {
4503 extent_offset = btrfs_file_extent_offset(src, extent);
4504 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4505 }
4506
4507 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4508 disk_bytenr += extent_offset;
4509 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4510 disk_bytenr + extent_num_bytes - 1,
4511 &ordered_sums, false);
4512 if (ret < 0)
4513 goto out;
4514 ret = 0;
4515
4516 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4517 if (!ret)
4518 ret = log_csums(trans, inode, log, sums);
4519 list_del(&sums->list);
4520 kfree(sums);
4521 }
4522 if (ret)
4523 goto out;
4524
4525 add_to_batch:
4526 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4527 batch.total_data_size += ins_sizes[dst_index];
4528 batch.nr++;
4529 dst_index++;
4530 }
4531
4532 /*
4533 * We have a leaf full of old extent items that don't need to be logged,
4534 * so we don't need to do anything.
4535 */
4536 if (batch.nr == 0)
4537 goto out;
4538
4539 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4540 if (ret)
4541 goto out;
4542
4543 dst_index = 0;
4544 for (int i = 0; i < nr; i++) {
4545 const int src_slot = start_slot + i;
4546 const int dst_slot = dst_path->slots[0] + dst_index;
4547 struct btrfs_key key;
4548 unsigned long src_offset;
4549 unsigned long dst_offset;
4550
4551 /*
4552 * We're done, all the remaining items in the source leaf
4553 * correspond to old file extent items.
4554 */
4555 if (dst_index >= batch.nr)
4556 break;
4557
4558 btrfs_item_key_to_cpu(src, &key, src_slot);
4559
4560 if (key.type != BTRFS_EXTENT_DATA_KEY)
4561 goto copy_item;
4562
4563 extent = btrfs_item_ptr(src, src_slot,
4564 struct btrfs_file_extent_item);
4565
4566 /* See the comment in the previous loop, same logic. */
4567 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4568 key.offset < i_size &&
4569 inode->last_reflink_trans < trans->transid)
4570 continue;
4571
4572 copy_item:
4573 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4574 src_offset = btrfs_item_ptr_offset(src, src_slot);
4575
4576 if (key.type == BTRFS_INODE_ITEM_KEY) {
4577 struct btrfs_inode_item *inode_item;
4578
4579 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4580 struct btrfs_inode_item);
4581 fill_inode_item(trans, dst_path->nodes[0], inode_item,
4582 &inode->vfs_inode,
4583 inode_only == LOG_INODE_EXISTS,
4584 logged_isize);
4585 } else {
4586 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4587 src_offset, ins_sizes[dst_index]);
4588 }
4589
4590 dst_index++;
4591 }
4592
4593 btrfs_release_path(dst_path);
4594 out:
4595 kfree(ins_data);
4596
4597 return ret;
4598 }
4599
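/* List sort comparator: order extent maps by their file offset (em->start). */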
4600 static int extent_cmp(void *priv, const struct list_head *a,
4601 const struct list_head *b)
4602 {
4603 const struct extent_map *em1, *em2;
4604
4605 em1 = list_entry(a, struct extent_map, list);
4606 em2 = list_entry(b, struct extent_map, list);
4607
4608 if (em1->start < em2->start)
4609 return -1;
4610 else if (em1->start > em2->start)
4611 return 1;
4612 return 0;
4613 }
4614
4615 static int log_extent_csums(struct btrfs_trans_handle *trans,
4616 struct btrfs_inode *inode,
4617 struct btrfs_root *log_root,
4618 const struct extent_map *em,
4619 struct btrfs_log_ctx *ctx)
4620 {
4621 struct btrfs_ordered_extent *ordered;
4622 struct btrfs_root *csum_root;
4623 u64 block_start;
4624 u64 csum_offset;
4625 u64 csum_len;
4626 u64 mod_start = em->start;
4627 u64 mod_len = em->len;
4628 LIST_HEAD(ordered_sums);
4629 int ret = 0;
4630
4631 if (inode->flags & BTRFS_INODE_NODATASUM ||
4632 (em->flags & EXTENT_FLAG_PREALLOC) ||
4633 em->disk_bytenr == EXTENT_MAP_HOLE)
4634 return 0;
4635
4636 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4637 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4638 const u64 mod_end = mod_start + mod_len;
4639 struct btrfs_ordered_sum *sums;
4640
4641 if (mod_len == 0)
4642 break;
4643
4644 if (ordered_end <= mod_start)
4645 continue;
4646 if (mod_end <= ordered->file_offset)
4647 break;
4648
4649 /*
4650 * We are going to copy all the csums on this ordered extent, so
4651 * go ahead and adjust mod_start and mod_len in case this ordered
4652 * extent has already been logged.
4653 */
4654 if (ordered->file_offset > mod_start) {
4655 if (ordered_end >= mod_end)
4656 mod_len = ordered->file_offset - mod_start;
4657 /*
4658 * If we have this case
4659 *
4660 * |--------- logged extent ---------|
4661 * |----- ordered extent ----|
4662 *
4663 * Just don't mess with mod_start and mod_len, we'll
4664 * just end up logging more csums than we need and it
4665 * will be ok.
4666 */
4667 } else {
4668 if (ordered_end < mod_end) {
4669 mod_len = mod_end - ordered_end;
4670 mod_start = ordered_end;
4671 } else {
4672 mod_len = 0;
4673 }
4674 }
4675
4676 /*
4677 * To keep us from looping for the above case of an ordered
4678 * extent that falls inside of the logged extent.
4679 */
4680 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4681 continue;
4682
4683 list_for_each_entry(sums, &ordered->list, list) {
4684 ret = log_csums(trans, inode, log_root, sums);
4685 if (ret)
4686 return ret;
4687 }
4688 }
4689
4690 /* We're done, found all csums in the ordered extents. */
4691 if (mod_len == 0)
4692 return 0;
4693
4694 /* If we're compressed we have to save the entire range of csums. */
4695 if (btrfs_extent_map_is_compressed(em)) {
4696 csum_offset = 0;
4697 csum_len = em->disk_num_bytes;
4698 } else {
4699 csum_offset = mod_start - em->start;
4700 csum_len = mod_len;
4701 }
4702
4703 /* block start is already adjusted for the file extent offset. */
4704 block_start = btrfs_extent_map_block_start(em);
4705 csum_root = btrfs_csum_root(trans->fs_info, block_start);
4706 ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
4707 block_start + csum_offset + csum_len - 1,
4708 &ordered_sums, false);
4709 if (ret < 0)
4710 return ret;
4711 ret = 0;
4712
4713 while (!list_empty(&ordered_sums)) {
4714 struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
4715 struct btrfs_ordered_sum,
4716 list);
4717 if (!ret)
4718 ret = log_csums(trans, inode, log_root, sums);
4719 list_del(&sums->list);
4720 kfree(sums);
4721 }
4722
4723 return ret;
4724 }
4725
4726 static int log_one_extent(struct btrfs_trans_handle *trans,
4727 struct btrfs_inode *inode,
4728 const struct extent_map *em,
4729 struct btrfs_path *path,
4730 struct btrfs_log_ctx *ctx)
4731 {
4732 struct btrfs_drop_extents_args drop_args = { 0 };
4733 struct btrfs_root *log = inode->root->log_root;
4734 struct btrfs_file_extent_item fi = { 0 };
4735 struct extent_buffer *leaf;
4736 struct btrfs_key key;
4737 enum btrfs_compression_type compress_type;
4738 u64 extent_offset = em->offset;
4739 u64 block_start = btrfs_extent_map_block_start(em);
4740 u64 block_len;
4741 int ret;
4742
4743 btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4744 if (em->flags & EXTENT_FLAG_PREALLOC)
4745 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4746 else
4747 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4748
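	/*
	 * For compressed extents the file extent item must reference the whole
	 * compressed extent on disk, so use the block start as is. For regular
	 * (non-hole) extents, btrfs_extent_map_block_start() already includes
	 * the extent offset, so subtract it to point back at the start of the
	 * extent on disk.
	 */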
4749 block_len = em->disk_num_bytes;
4750 compress_type = btrfs_extent_map_compression(em);
4751 if (compress_type != BTRFS_COMPRESS_NONE) {
4752 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
4753 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4754 } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
4755 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
4756 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4757 }
4758
4759 btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4760 btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4761 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4762 btrfs_set_stack_file_extent_compression(&fi, compress_type);
4763
4764 ret = log_extent_csums(trans, inode, log, em, ctx);
4765 if (ret)
4766 return ret;
4767
4768 /*
4769 * If this is the first time we are logging the inode in the current
4770 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4771 * because it does a deletion search, which always acquires write locks
4772 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4773 * but also adds significant contention in a log tree, since log trees
4774 * are small, with a root at level 2 or 3 at most, due to their short
4775 * life span.
4776 */
4777 if (ctx->logged_before) {
4778 drop_args.path = path;
4779 drop_args.start = em->start;
4780 drop_args.end = em->start + em->len;
4781 drop_args.replace_extent = true;
4782 drop_args.extent_item_size = sizeof(fi);
4783 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4784 if (ret)
4785 return ret;
4786 }
4787
4788 if (!drop_args.extent_inserted) {
4789 key.objectid = btrfs_ino(inode);
4790 key.type = BTRFS_EXTENT_DATA_KEY;
4791 key.offset = em->start;
4792
4793 ret = btrfs_insert_empty_item(trans, log, path, &key,
4794 sizeof(fi));
4795 if (ret)
4796 return ret;
4797 }
4798 leaf = path->nodes[0];
4799 write_extent_buffer(leaf, &fi,
4800 btrfs_item_ptr_offset(leaf, path->slots[0]),
4801 sizeof(fi));
4802
4803 btrfs_release_path(path);
4804
4805 return ret;
4806 }
4807
4808 /*
4809 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4810 * lose them after doing a full/fast fsync and replaying the log. We scan the
4811 * subvolume's root instead of iterating the inode's extent map tree because
4812 * otherwise we can log incorrect extent items based on extent map conversion.
4813 * That can happen because extent maps are merged when they
4814 * are not in the extent map tree's list of modified extents.
4815 */
4816 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4817 struct btrfs_inode *inode,
4818 struct btrfs_path *path,
4819 struct btrfs_log_ctx *ctx)
4820 {
4821 struct btrfs_root *root = inode->root;
4822 struct btrfs_key key;
4823 const u64 i_size = i_size_read(&inode->vfs_inode);
4824 const u64 ino = btrfs_ino(inode);
4825 struct btrfs_path *dst_path = NULL;
4826 bool dropped_extents = false;
4827 u64 truncate_offset = i_size;
4828 struct extent_buffer *leaf;
4829 int slot;
4830 int ins_nr = 0;
4831 int start_slot = 0;
4832 int ret;
4833
4834 if (!(inode->flags & BTRFS_INODE_PREALLOC))
4835 return 0;
4836
4837 key.objectid = ino;
4838 key.type = BTRFS_EXTENT_DATA_KEY;
4839 key.offset = i_size;
4840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4841 if (ret < 0)
4842 goto out;
4843
4844 /*
4845 * We must check if there is a prealloc extent that starts before the
4846 * i_size and crosses the i_size boundary. This is to ensure later we
4847 * truncate down to the end of that extent and not to the i_size, as
4848 * otherwise we end up losing part of the prealloc extent after a log
4849 * replay and with an implicit hole if there is another prealloc extent
4850 * that starts at an offset beyond i_size.
4851 */
4852 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4853 if (ret < 0)
4854 goto out;
4855
4856 if (ret == 0) {
4857 struct btrfs_file_extent_item *ei;
4858
4859 leaf = path->nodes[0];
4860 slot = path->slots[0];
4861 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4862
4863 if (btrfs_file_extent_type(leaf, ei) ==
4864 BTRFS_FILE_EXTENT_PREALLOC) {
4865 u64 extent_end;
4866
4867 btrfs_item_key_to_cpu(leaf, &key, slot);
4868 extent_end = key.offset +
4869 btrfs_file_extent_num_bytes(leaf, ei);
4870
4871 if (extent_end > i_size)
4872 truncate_offset = extent_end;
4873 }
4874 } else {
4875 ret = 0;
4876 }
4877
4878 while (true) {
4879 leaf = path->nodes[0];
4880 slot = path->slots[0];
4881
4882 if (slot >= btrfs_header_nritems(leaf)) {
4883 if (ins_nr > 0) {
4884 ret = copy_items(trans, inode, dst_path, path,
4885 start_slot, ins_nr, 1, 0, ctx);
4886 if (ret < 0)
4887 goto out;
4888 ins_nr = 0;
4889 }
4890 ret = btrfs_next_leaf(root, path);
4891 if (ret < 0)
4892 goto out;
4893 if (ret > 0) {
4894 ret = 0;
4895 break;
4896 }
4897 continue;
4898 }
4899
4900 btrfs_item_key_to_cpu(leaf, &key, slot);
4901 if (key.objectid > ino)
4902 break;
4903 if (WARN_ON_ONCE(key.objectid < ino) ||
4904 key.type < BTRFS_EXTENT_DATA_KEY ||
4905 key.offset < i_size) {
4906 path->slots[0]++;
4907 continue;
4908 }
4909 /*
4910 * Avoid overlapping items in the log tree. The first time we
4911 * get here, get rid of everything from a past fsync. After
4912 * that, if the current extent starts before the end of the last
4913 * extent we copied, truncate the last one. This can happen if
4914 * an ordered extent completion modifies the subvolume tree
4915 * while btrfs_next_leaf() has the tree unlocked.
4916 */
4917 if (!dropped_extents || key.offset < truncate_offset) {
4918 ret = truncate_inode_items(trans, root->log_root, inode,
4919 min(key.offset, truncate_offset),
4920 BTRFS_EXTENT_DATA_KEY);
4921 if (ret)
4922 goto out;
4923 dropped_extents = true;
4924 }
4925 truncate_offset = btrfs_file_extent_end(path);
4926 if (ins_nr == 0)
4927 start_slot = slot;
4928 ins_nr++;
4929 path->slots[0]++;
4930 if (!dst_path) {
4931 dst_path = btrfs_alloc_path();
4932 if (!dst_path) {
4933 ret = -ENOMEM;
4934 goto out;
4935 }
4936 }
4937 }
4938 if (ins_nr > 0)
4939 ret = copy_items(trans, inode, dst_path, path,
4940 start_slot, ins_nr, 1, 0, ctx);
4941 out:
4942 btrfs_release_path(path);
4943 btrfs_free_path(dst_path);
4944 return ret;
4945 }
4946
4947 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4948 struct btrfs_inode *inode,
4949 struct btrfs_path *path,
4950 struct btrfs_log_ctx *ctx)
4951 {
4952 struct btrfs_ordered_extent *ordered;
4953 struct btrfs_ordered_extent *tmp;
4954 struct extent_map *em, *n;
4955 LIST_HEAD(extents);
4956 struct extent_map_tree *tree = &inode->extent_tree;
4957 int ret = 0;
4958 int num = 0;
4959
4960 write_lock(&tree->lock);
4961
4962 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4963 list_del_init(&em->list);
4964 /*
4965 * Just an arbitrary number, this can be really CPU intensive
4966 * once we start getting a lot of extents, and really once we
4967 * have a bunch of extents we just want to commit since it will
4968 * be faster.
4969 */
4970 if (++num > 32768) {
4971 list_del_init(&tree->modified_extents);
4972 ret = -EFBIG;
4973 goto process;
4974 }
4975
4976 if (em->generation < trans->transid)
4977 continue;
4978
4979 /* We log prealloc extents beyond eof later. */
4980 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
4981 em->start >= i_size_read(&inode->vfs_inode))
4982 continue;
4983
4984 /* Need a ref to keep it from getting evicted from cache */
4985 refcount_inc(&em->refs);
4986 em->flags |= EXTENT_FLAG_LOGGING;
4987 list_add_tail(&em->list, &extents);
4988 num++;
4989 }
4990
4991 list_sort(NULL, &extents, extent_cmp);
4992 process:
4993 while (!list_empty(&extents)) {
4994 em = list_first_entry(&extents, struct extent_map, list);
4995
4996 list_del_init(&em->list);
4997
4998 /*
4999 * If we had an error we just need to delete everybody from our
5000 * private list.
5001 */
5002 if (ret) {
5003 btrfs_clear_em_logging(inode, em);
5004 btrfs_free_extent_map(em);
5005 continue;
5006 }
5007
5008 write_unlock(&tree->lock);
5009
5010 ret = log_one_extent(trans, inode, em, path, ctx);
5011 write_lock(&tree->lock);
5012 btrfs_clear_em_logging(inode, em);
5013 btrfs_free_extent_map(em);
5014 }
5015 WARN_ON(!list_empty(&extents));
5016 write_unlock(&tree->lock);
5017
5018 if (!ret)
5019 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
5020 if (ret)
5021 return ret;
5022
5023 /*
5024 * We have logged all extents successfully, now make sure the commit of
5025 * the current transaction waits for the ordered extents to complete
5026 * before it commits and wipes out the log trees, otherwise we would
5027 * lose data if an ordered extent completes after the transaction
5028 * commits and a power failure happens after the transaction commit.
5029 */
5030 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5031 list_del_init(&ordered->log_list);
5032 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
5033
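		/*
		 * Check BTRFS_ORDERED_COMPLETE twice: first locklessly to avoid
		 * taking the lock when the ordered extent already completed,
		 * then again under ordered_tree_lock so we don't race with its
		 * completion before marking it as pending for the commit.
		 */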
5034 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5035 spin_lock_irq(&inode->ordered_tree_lock);
5036 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5037 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
5038 atomic_inc(&trans->transaction->pending_ordered);
5039 }
5040 spin_unlock_irq(&inode->ordered_tree_lock);
5041 }
5042 btrfs_put_ordered_extent(ordered);
5043 }
5044
5045 return 0;
5046 }
5047
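/*
 * Read the inode size currently stored in the log tree for @inode, or 0 if
 * its inode item was not logged yet, clamped to the in-memory i_size (see
 * the comment below about shrinking truncates).
 */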
5048 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5049 struct btrfs_path *path, u64 *size_ret)
5050 {
5051 struct btrfs_key key;
5052 int ret;
5053
5054 key.objectid = btrfs_ino(inode);
5055 key.type = BTRFS_INODE_ITEM_KEY;
5056 key.offset = 0;
5057
5058 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5059 if (ret < 0) {
5060 return ret;
5061 } else if (ret > 0) {
5062 *size_ret = 0;
5063 } else {
5064 struct btrfs_inode_item *item;
5065
5066 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5067 struct btrfs_inode_item);
5068 *size_ret = btrfs_inode_size(path->nodes[0], item);
5069 /*
5070 * If the in-memory inode's i_size is smaller than the inode
5071 * size stored in the btree, return the inode's i_size, so
5072 * that we get a correct inode size after replaying the log
5073 * when before a power failure we had a shrinking truncate
5074 * followed by addition of a new name (rename / new hard link).
5075 * Otherwise return the inode size from the btree, to avoid
5076 * data loss when replaying a log due to previously doing a
5077 * write that expands the inode's size and logging a new name
5078 * immediately after.
5079 */
5080 if (*size_ret > inode->vfs_inode.i_size)
5081 *size_ret = inode->vfs_inode.i_size;
5082 }
5083
5084 btrfs_release_path(path);
5085 return 0;
5086 }
5087
5088 /*
5089 * At the moment we always log all xattrs. This is to figure out at log replay
5090 * time which xattrs must have their deletion replayed. If an xattr is missing
5091 * in the log tree and exists in the fs/subvol tree, we delete it. This is
5092 * because if an xattr is deleted, the inode is fsynced and a power failure
5093 * happens, causing the log to be replayed the next time the fs is mounted,
5094 * we want the xattr to not exist anymore (same behaviour as other filesystems
5095 * with a journal, ext3/4, xfs, f2fs, etc).
5096 */
5097 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5098 struct btrfs_inode *inode,
5099 struct btrfs_path *path,
5100 struct btrfs_path *dst_path,
5101 struct btrfs_log_ctx *ctx)
5102 {
5103 struct btrfs_root *root = inode->root;
5104 int ret;
5105 struct btrfs_key key;
5106 const u64 ino = btrfs_ino(inode);
5107 int ins_nr = 0;
5108 int start_slot = 0;
5109 bool found_xattrs = false;
5110
5111 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5112 return 0;
5113
5114 key.objectid = ino;
5115 key.type = BTRFS_XATTR_ITEM_KEY;
5116 key.offset = 0;
5117
5118 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5119 if (ret < 0)
5120 return ret;
5121
5122 while (true) {
5123 int slot = path->slots[0];
5124 struct extent_buffer *leaf = path->nodes[0];
5125 int nritems = btrfs_header_nritems(leaf);
5126
5127 if (slot >= nritems) {
5128 if (ins_nr > 0) {
5129 ret = copy_items(trans, inode, dst_path, path,
5130 start_slot, ins_nr, 1, 0, ctx);
5131 if (ret < 0)
5132 return ret;
5133 ins_nr = 0;
5134 }
5135 ret = btrfs_next_leaf(root, path);
5136 if (ret < 0)
5137 return ret;
5138 else if (ret > 0)
5139 break;
5140 continue;
5141 }
5142
5143 btrfs_item_key_to_cpu(leaf, &key, slot);
5144 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5145 break;
5146
5147 if (ins_nr == 0)
5148 start_slot = slot;
5149 ins_nr++;
5150 path->slots[0]++;
5151 found_xattrs = true;
5152 cond_resched();
5153 }
5154 if (ins_nr > 0) {
5155 ret = copy_items(trans, inode, dst_path, path,
5156 start_slot, ins_nr, 1, 0, ctx);
5157 if (ret < 0)
5158 return ret;
5159 }
5160
5161 if (!found_xattrs)
5162 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5163
5164 return 0;
5165 }
5166
5167 /*
5168 * When using the NO_HOLES feature if we punched a hole that causes the
5169 * deletion of entire leafs or all the extent items of the first leaf (the one
5170 * that contains the inode item and references) we may end up not processing
5171 * any extents, because there are no leafs with a generation matching the
5172 * current transaction that have extent items for our inode. So we need to find
5173 * if any holes exist and then log them. We also need to log holes after any
5174 * truncate operation that changes the inode's size.
5175 */
5176 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5177 struct btrfs_inode *inode,
5178 struct btrfs_path *path)
5179 {
5180 struct btrfs_root *root = inode->root;
5181 struct btrfs_fs_info *fs_info = root->fs_info;
5182 struct btrfs_key key;
5183 const u64 ino = btrfs_ino(inode);
5184 const u64 i_size = i_size_read(&inode->vfs_inode);
5185 u64 prev_extent_end = 0;
5186 int ret;
5187
5188 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5189 return 0;
5190
5191 key.objectid = ino;
5192 key.type = BTRFS_EXTENT_DATA_KEY;
5193 key.offset = 0;
5194
5195 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5196 if (ret < 0)
5197 return ret;
5198
5199 while (true) {
5200 struct extent_buffer *leaf = path->nodes[0];
5201
5202 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5203 ret = btrfs_next_leaf(root, path);
5204 if (ret < 0)
5205 return ret;
5206 if (ret > 0) {
5207 ret = 0;
5208 break;
5209 }
5210 leaf = path->nodes[0];
5211 }
5212
5213 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5214 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5215 break;
5216
5217 /* We have a hole, log it. */
5218 if (prev_extent_end < key.offset) {
5219 const u64 hole_len = key.offset - prev_extent_end;
5220
5221 /*
5222 * Release the path to avoid deadlocks with other code
5223 * paths that search the root while holding locks on
5224 * leaves from the log root.
5225 */
5226 btrfs_release_path(path);
5227 ret = btrfs_insert_hole_extent(trans, root->log_root,
5228 ino, prev_extent_end,
5229 hole_len);
5230 if (ret < 0)
5231 return ret;
5232
5233 /*
5234 * Search for the same key again in the root. Since it's
5235 * an extent item and we are holding the inode lock, the
5236 * key must still exist. If it doesn't, just emit a warning
5237 * and return an error to fall back to a transaction
5238 * commit.
5239 */
5240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5241 if (ret < 0)
5242 return ret;
5243 if (WARN_ON(ret > 0))
5244 return -ENOENT;
5245 leaf = path->nodes[0];
5246 }
5247
5248 prev_extent_end = btrfs_file_extent_end(path);
5249 path->slots[0]++;
5250 cond_resched();
5251 }
5252
5253 if (prev_extent_end < i_size) {
5254 u64 hole_len;
5255
5256 btrfs_release_path(path);
5257 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5258 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5259 prev_extent_end, hole_len);
5260 if (ret < 0)
5261 return ret;
5262 }
5263
5264 return 0;
5265 }
5266
5267 /*
5268 * When we are logging a new inode X, check whether it has a reference that
5269 * matches a reference of some other inode Y created in a past transaction
5270 * and renamed in the current transaction. If we don't do this, then at
5271 * log replay time we can lose inode Y (and all its files if it's a directory):
5272 *
5273 * mkdir /mnt/x
5274 * echo "hello world" > /mnt/x/foobar
5275 * sync
5276 * mv /mnt/x /mnt/y
5277 * mkdir /mnt/x # or touch /mnt/x
5278 * xfs_io -c fsync /mnt/x
5279 * <power fail>
5280 * mount fs, trigger log replay
5281 *
5282 * After the log replay procedure, we would lose the first directory and all its
5283 * files (file foobar).
5284 * For the case where inode Y is not a directory we simply end up losing it:
5285 *
5286 * echo "123" > /mnt/foo
5287 * sync
5288 * mv /mnt/foo /mnt/bar
5289 * echo "abc" > /mnt/foo
5290 * xfs_io -c fsync /mnt/foo
5291 * <power fail>
5292 *
5293 * We also need this for cases where a snapshot entry is replaced by some other
5294 * entry (file or directory) otherwise we end up with an unreplayable log due to
5295 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5296 * if it were a regular entry:
5297 *
5298 * mkdir /mnt/x
5299 * btrfs subvolume snapshot /mnt /mnt/x/snap
5300 * btrfs subvolume delete /mnt/x/snap
5301 * rmdir /mnt/x
5302 * mkdir /mnt/x
5303 * fsync /mnt/x or fsync some new file inside it
5304 * <power fail>
5305 *
5306 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5307 * the same transaction.
5308 */
5309 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5310 const int slot,
5311 const struct btrfs_key *key,
5312 struct btrfs_inode *inode,
5313 u64 *other_ino, u64 *other_parent)
5314 {
5315 int ret;
5316 struct btrfs_path *search_path;
5317 char *name = NULL;
5318 u32 name_len = 0;
5319 u32 item_size = btrfs_item_size(eb, slot);
5320 u32 cur_offset = 0;
5321 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5322
5323 search_path = btrfs_alloc_path();
5324 if (!search_path)
5325 return -ENOMEM;
5326 search_path->search_commit_root = 1;
5327 search_path->skip_locking = 1;
5328
5329 while (cur_offset < item_size) {
5330 u64 parent;
5331 u32 this_name_len;
5332 u32 this_len;
5333 unsigned long name_ptr;
5334 struct btrfs_dir_item *di;
5335 struct fscrypt_str name_str;
5336
5337 if (key->type == BTRFS_INODE_REF_KEY) {
5338 struct btrfs_inode_ref *iref;
5339
5340 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5341 parent = key->offset;
5342 this_name_len = btrfs_inode_ref_name_len(eb, iref);
5343 name_ptr = (unsigned long)(iref + 1);
5344 this_len = sizeof(*iref) + this_name_len;
5345 } else {
5346 struct btrfs_inode_extref *extref;
5347
5348 extref = (struct btrfs_inode_extref *)(ptr +
5349 cur_offset);
5350 parent = btrfs_inode_extref_parent(eb, extref);
5351 this_name_len = btrfs_inode_extref_name_len(eb, extref);
5352 name_ptr = (unsigned long)&extref->name;
5353 this_len = sizeof(*extref) + this_name_len;
5354 }
5355
5356 if (this_name_len > name_len) {
5357 char *new_name;
5358
5359 new_name = krealloc(name, this_name_len, GFP_NOFS);
5360 if (!new_name) {
5361 ret = -ENOMEM;
5362 goto out;
5363 }
5364 name_len = this_name_len;
5365 name = new_name;
5366 }
5367
5368 read_extent_buffer(eb, name, name_ptr, this_name_len);
5369
5370 name_str.name = name;
5371 name_str.len = this_name_len;
5372 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5373 parent, &name_str, 0);
5374 if (di && !IS_ERR(di)) {
5375 struct btrfs_key di_key;
5376
5377 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5378 di, &di_key);
5379 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5380 if (di_key.objectid != key->objectid) {
5381 ret = 1;
5382 *other_ino = di_key.objectid;
5383 *other_parent = parent;
5384 } else {
5385 ret = 0;
5386 }
5387 } else {
5388 ret = -EAGAIN;
5389 }
5390 goto out;
5391 } else if (IS_ERR(di)) {
5392 ret = PTR_ERR(di);
5393 goto out;
5394 }
5395 btrfs_release_path(search_path);
5396
5397 cur_offset += this_len;
5398 }
5399 ret = 0;
5400 out:
5401 btrfs_free_path(search_path);
5402 kfree(name);
5403 return ret;
5404 }
5405
5406 /*
5407 * Check if we need to log an inode. This is used in contexts where while
5408 * logging an inode we need to log another inode (either that it exists or in
5409 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5410 * requires the inode to be in the log and have the log transaction committed,
5411 * while here we do not care if the log transaction was already committed - our
5412 * caller will commit the log later - and we want to avoid logging an inode
5413 * multiple times when multiple tasks have joined the same log transaction.
5414 */
5415 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5416 struct btrfs_inode *inode)
5417 {
5418 /*
5419 * If a directory was not modified, no dentries added or removed, we can
5420 * and should avoid logging it.
5421 */
5422 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5423 return false;
5424
5425 /*
5426 * If this inode does not have new/updated/deleted xattrs since the last
5427 * time it was logged and is flagged as logged in the current transaction,
5428 * we can skip logging it. As for new/deleted names, those are updated in
5429 * the log by link/unlink/rename operations.
5430 * In case the inode was logged and then evicted and reloaded, its
5431 * logged_trans will be 0, in which case we have to fully log it since
5432 * logged_trans is a transient field, not persisted.
5433 */
5434 if (inode_logged(trans, inode, NULL) == 1 &&
5435 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5436 return false;
5437
5438 return true;
5439 }
5440
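/*
 * Entry of the list of directories pending to have their new dentries logged
 * by log_new_dir_dentries().
 */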
5441 struct btrfs_dir_list {
5442 u64 ino;
5443 struct list_head list;
5444 };
5445
5446 /*
5447 * Log the inodes of the new dentries of a directory.
5448 * See process_dir_items_leaf() for details about why it is needed.
5449 * This is a recursive operation - if an existing dentry corresponds to a
5450 * directory, that directory's new entries are logged too (same behaviour as
5451 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5452 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5453 * complains about the following circular lock dependency / possible deadlock:
5454 *
5455 * CPU0 CPU1
5456 * ---- ----
5457 * lock(&type->i_mutex_dir_key#3/2);
5458 * lock(sb_internal#2);
5459 * lock(&type->i_mutex_dir_key#3/2);
5460 * lock(&sb->s_type->i_mutex_key#14);
5461 *
5462 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5463 * sb_start_intwrite() in btrfs_start_transaction().
5464 * Not acquiring the VFS lock of the inodes is still safe because:
5465 *
5466 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5467 * that while logging the inode new references (names) are added or removed
5468 * from the inode, leaving the logged inode item with a link count that does
5469 * not match the number of logged inode reference items. This is fine because
5470 * at log replay time we compute the real number of links and correct the
5471 * link count in the inode item (see replay_one_buffer() and
5472 * link_to_fixup_dir());
5473 *
5474 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5475 * while logging the inode's items new index items (key type
5476 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5477 * has a size that doesn't match the sum of the lengths of all the logged
5478 * names - this is ok, not a problem, because at log replay time we set the
5479 * directory's i_size to the correct value (see replay_one_name() and
5480 * overwrite_item()).
5481 */
5482 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5483 struct btrfs_inode *start_inode,
5484 struct btrfs_log_ctx *ctx)
5485 {
5486 struct btrfs_root *root = start_inode->root;
5487 struct btrfs_path *path;
5488 LIST_HEAD(dir_list);
5489 struct btrfs_dir_list *dir_elem;
5490 u64 ino = btrfs_ino(start_inode);
5491 struct btrfs_inode *curr_inode = start_inode;
5492 int ret = 0;
5493
5494 /*
5495 * If we are logging a new name, as part of a link or rename operation,
5496 * don't bother logging new dentries, as we just want to log the names
5497 * of an inode and that any new parents exist.
5498 */
5499 if (ctx->logging_new_name)
5500 return 0;
5501
5502 path = btrfs_alloc_path();
5503 if (!path)
5504 return -ENOMEM;
5505
5506 /* Pairs with btrfs_add_delayed_iput below. */
5507 ihold(&curr_inode->vfs_inode);
5508
5509 while (true) {
5510 struct btrfs_key key;
5511 struct btrfs_key found_key;
5512 u64 next_index;
5513 bool continue_curr_inode = true;
5514 int iter_ret;
5515
5516 key.objectid = ino;
5517 key.type = BTRFS_DIR_INDEX_KEY;
5518 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5519 next_index = key.offset;
5520 again:
5521 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5522 struct extent_buffer *leaf = path->nodes[0];
5523 struct btrfs_dir_item *di;
5524 struct btrfs_key di_key;
5525 struct btrfs_inode *di_inode;
5526 int log_mode = LOG_INODE_EXISTS;
5527 int type;
5528
5529 if (found_key.objectid != ino ||
5530 found_key.type != BTRFS_DIR_INDEX_KEY) {
5531 continue_curr_inode = false;
5532 break;
5533 }
5534
5535 next_index = found_key.offset + 1;
5536
5537 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5538 type = btrfs_dir_ftype(leaf, di);
5539 if (btrfs_dir_transid(leaf, di) < trans->transid)
5540 continue;
5541 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5542 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5543 continue;
5544
5545 btrfs_release_path(path);
5546 di_inode = btrfs_iget_logging(di_key.objectid, root);
5547 if (IS_ERR(di_inode)) {
5548 ret = PTR_ERR(di_inode);
5549 goto out;
5550 }
5551
5552 if (!need_log_inode(trans, di_inode)) {
5553 btrfs_add_delayed_iput(di_inode);
5554 break;
5555 }
5556
5557 ctx->log_new_dentries = false;
5558 if (type == BTRFS_FT_DIR)
5559 log_mode = LOG_INODE_ALL;
5560 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
5561 btrfs_add_delayed_iput(di_inode);
5562 if (ret)
5563 goto out;
5564 if (ctx->log_new_dentries) {
5565 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5566 if (!dir_elem) {
5567 ret = -ENOMEM;
5568 goto out;
5569 }
5570 dir_elem->ino = di_key.objectid;
5571 list_add_tail(&dir_elem->list, &dir_list);
5572 }
5573 break;
5574 }
5575
5576 btrfs_release_path(path);
5577
5578 if (iter_ret < 0) {
5579 ret = iter_ret;
5580 goto out;
5581 } else if (iter_ret > 0) {
5582 continue_curr_inode = false;
5583 } else {
5584 key = found_key;
5585 }
5586
5587 if (continue_curr_inode && key.offset < (u64)-1) {
5588 key.offset++;
5589 goto again;
5590 }
5591
5592 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5593
5594 if (list_empty(&dir_list))
5595 break;
5596
5597 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5598 ino = dir_elem->ino;
5599 list_del(&dir_elem->list);
5600 kfree(dir_elem);
5601
5602 btrfs_add_delayed_iput(curr_inode);
5603
5604 curr_inode = btrfs_iget_logging(ino, root);
5605 if (IS_ERR(curr_inode)) {
5606 ret = PTR_ERR(curr_inode);
5607 curr_inode = NULL;
5608 break;
5609 }
5610 }
5611 out:
5612 btrfs_free_path(path);
5613 if (curr_inode)
5614 btrfs_add_delayed_iput(curr_inode);
5615
5616 if (ret) {
5617 struct btrfs_dir_list *next;
5618
5619 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5620 kfree(dir_elem);
5621 }
5622
5623 return ret;
5624 }
5625
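/*
 * Entry of the list of conflicting inodes (ctx->conflict_inodes), tracking an
 * inode number and its parent directory, processed later by
 * log_conflicting_inodes().
 */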
5626 struct btrfs_ino_list {
5627 u64 ino;
5628 u64 parent;
5629 struct list_head list;
5630 };
5631
5632 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5633 {
5634 struct btrfs_ino_list *curr;
5635 struct btrfs_ino_list *next;
5636
5637 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5638 list_del(&curr->list);
5639 kfree(curr);
5640 }
5641 }
5642
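/*
 * Check in the commit root whether the inode with number @ino is a directory.
 * Returns 1 if it is a directory, 0 if it is not, and a negative errno on
 * error (-ENOENT if the inode item is unexpectedly missing, so that the
 * caller falls back to a transaction commit).
 */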
5643 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5644 struct btrfs_path *path)
5645 {
5646 struct btrfs_key key;
5647 int ret;
5648
5649 key.objectid = ino;
5650 key.type = BTRFS_INODE_ITEM_KEY;
5651 key.offset = 0;
5652
5653 path->search_commit_root = 1;
5654 path->skip_locking = 1;
5655
5656 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5657 if (WARN_ON_ONCE(ret > 0)) {
5658 /*
5659 * We have previously found the inode through the commit root
5660 * so this should not happen. If it does, just error out and
5661 * fall back to a transaction commit.
5662 */
5663 ret = -ENOENT;
5664 } else if (ret == 0) {
5665 struct btrfs_inode_item *item;
5666
5667 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5668 struct btrfs_inode_item);
5669 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5670 ret = 1;
5671 }
5672
5673 btrfs_release_path(path);
5674 path->search_commit_root = 0;
5675 path->skip_locking = 0;
5676
5677 return ret;
5678 }
5679
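/*
 * Record an inode that has a dir entry conflicting with a name of the inode we
 * are logging, so that it is logged later by log_conflicting_inodes().
 * Returns 0 on success, BTRFS_LOG_FORCE_COMMIT if too many conflicting inodes
 * were already collected, or a negative errno on failure.
 */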
5680 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5681 struct btrfs_root *root,
5682 struct btrfs_path *path,
5683 u64 ino, u64 parent,
5684 struct btrfs_log_ctx *ctx)
5685 {
5686 struct btrfs_ino_list *ino_elem;
5687 struct btrfs_inode *inode;
5688
5689 /*
5690 * It's rare to have a lot of conflicting inodes, in practice it is not
5691 * common to have more than 1 or 2. We don't want to collect too many,
5692 * as we could end up logging too many inodes (even if only in
5693 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5694 * commits.
5695 */
5696 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5697 return BTRFS_LOG_FORCE_COMMIT;
5698
5699 inode = btrfs_iget_logging(ino, root);
5700 /*
5701 * If the other inode that had a conflicting dir entry was deleted in
5702 * the current transaction then we either:
5703 *
5704 * 1) Log the parent directory (later after adding it to the list) if
5705 * the inode is a directory. This is because it may be a deleted
5706 * subvolume/snapshot or it may be a regular directory that had
5707 * deleted subvolumes/snapshots (or subdirectories that had them),
5708 * and at the moment we can't deal with dropping subvolumes/snapshots
5709 * during log replay. So we just log the parent, which will result in
5710 * a fallback to a transaction commit if we are dealing with those
5711 * cases (last_unlink_trans will match the current transaction);
5712 *
5713 * 2) Do nothing if it's not a directory. During log replay we simply
5714 * unlink the conflicting dentry from the parent directory and then
5715 * add the dentry for our inode. Like this we can avoid logging the
5716 * parent directory (and maybe fallback to a transaction commit in
5717 * case it has a last_unlink_trans == trans->transid, due to moving
5718 * some inode from it to some other directory).
5719 */
5720 if (IS_ERR(inode)) {
5721 int ret = PTR_ERR(inode);
5722
5723 if (ret != -ENOENT)
5724 return ret;
5725
5726 ret = conflicting_inode_is_dir(root, ino, path);
5727 /* Not a directory or we got an error. */
5728 if (ret <= 0)
5729 return ret;
5730
5731 /* Conflicting inode is a directory, so we'll log its parent. */
5732 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5733 if (!ino_elem)
5734 return -ENOMEM;
5735 ino_elem->ino = ino;
5736 ino_elem->parent = parent;
5737 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5738 ctx->num_conflict_inodes++;
5739
5740 return 0;
5741 }
5742
5743 /*
5744 * If the inode was already logged skip it - otherwise we can hit an
5745 * infinite loop. Example:
5746 *
5747 * From the commit root (previous transaction) we have the following
5748 * inodes:
5749 *
5750 * inode 257 a directory
5751 * inode 258 with references "zz" and "zz_link" on inode 257
5752 * inode 259 with reference "a" on inode 257
5753 *
5754 * And in the current (uncommitted) transaction we have:
5755 *
5756 * inode 257 a directory, unchanged
5757 * inode 258 with references "a" and "a2" on inode 257
5758 * inode 259 with reference "zz_link" on inode 257
5759 * inode 261 with reference "zz" on inode 257
5760 *
5761 * When logging inode 261 the following infinite loop could
5762 * happen if we don't skip already logged inodes:
5763 *
5764 * - we detect inode 258 as a conflicting inode, with inode 261
5765 * on reference "zz", and log it;
5766 *
5767 * - we detect inode 259 as a conflicting inode, with inode 258
5768 * on reference "a", and log it;
5769 *
5770 * - we detect inode 258 as a conflicting inode, with inode 259
5771 * on reference "zz_link", and log it - again! After this we
5772 * repeat the above steps forever.
5773 *
5774 * Here we can use need_log_inode() because we only need to log the
5775 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5776 * so that the log ends up with the new name and without the old name.
5777 */
5778 if (!need_log_inode(trans, inode)) {
5779 btrfs_add_delayed_iput(inode);
5780 return 0;
5781 }
5782
5783 btrfs_add_delayed_iput(inode);
5784
5785 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5786 if (!ino_elem)
5787 return -ENOMEM;
5788 ino_elem->ino = ino;
5789 ino_elem->parent = parent;
5790 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5791 ctx->num_conflict_inodes++;
5792
5793 return 0;
5794 }
5795
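/*
 * Log all inodes collected in ctx->conflict_inodes, in LOG_INODE_EXISTS mode,
 * or log their parent directory in LOG_INODE_ALL mode if they were deleted in
 * the current transaction. Only the outermost call processes the list, to
 * avoid unbounded recursion of btrfs_log_inode().
 */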
5796 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5797 struct btrfs_root *root,
5798 struct btrfs_log_ctx *ctx)
5799 {
5800 int ret = 0;
5801
5802 /*
5803 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5804 * otherwise we could have unbounded recursion of btrfs_log_inode()
5805 * calls. This check guarantees we can have only 1 level of recursion.
5806 */
5807 if (ctx->logging_conflict_inodes)
5808 return 0;
5809
5810 ctx->logging_conflict_inodes = true;
5811
5812 /*
5813 * New conflicting inodes may be found and added to the list while we
5814 * are logging a conflicting inode, so keep iterating while the list is
5815 * not empty.
5816 */
5817 while (!list_empty(&ctx->conflict_inodes)) {
5818 struct btrfs_ino_list *curr;
5819 struct btrfs_inode *inode;
5820 u64 ino;
5821 u64 parent;
5822
5823 curr = list_first_entry(&ctx->conflict_inodes,
5824 struct btrfs_ino_list, list);
5825 ino = curr->ino;
5826 parent = curr->parent;
5827 list_del(&curr->list);
5828 kfree(curr);
5829
5830 inode = btrfs_iget_logging(ino, root);
5831 /*
5832 * If the other inode that had a conflicting dir entry was
5833 * deleted in the current transaction, we need to log its parent
5834 * directory. See the comment at add_conflicting_inode().
5835 */
5836 if (IS_ERR(inode)) {
5837 ret = PTR_ERR(inode);
5838 if (ret != -ENOENT)
5839 break;
5840
5841 inode = btrfs_iget_logging(parent, root);
5842 if (IS_ERR(inode)) {
5843 ret = PTR_ERR(inode);
5844 break;
5845 }
5846
5847 /*
5848 * Always log the directory, we cannot make this
5849 * conditional on need_log_inode() because the directory
5850 * might have been logged in LOG_INODE_EXISTS mode or
5851 * the dir index of the conflicting inode is not in a
5852 * dir index key range logged for the directory. So we
5853 * must make sure the deletion is recorded.
5854 */
5855 ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
5856 btrfs_add_delayed_iput(inode);
5857 if (ret)
5858 break;
5859 continue;
5860 }
5861
5862 /*
5863 * Here we can use need_log_inode() because we only need to log
5864 * the inode in LOG_INODE_EXISTS mode and rename operations
5865 * update the log, so that the log ends up with the new name and
5866 * without the old name.
5867 *
5868 * We did this check at add_conflicting_inode(), but here we do
5869 * it again because if some other task logged the inode after
5870 * that, we can avoid doing it again.
5871 */
5872 if (!need_log_inode(trans, inode)) {
5873 btrfs_add_delayed_iput(inode);
5874 continue;
5875 }
5876
5877 /*
5878 * We are safe logging the other inode without acquiring its
5879 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5880 * are safe against concurrent renames of the other inode as
5881 * well because during a rename we pin the log and update the
5882 * log with the new name before we unpin it.
5883 */
5884 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
5885 btrfs_add_delayed_iput(inode);
5886 if (ret)
5887 break;
5888 }
5889
5890 ctx->logging_conflict_inodes = false;
5891 if (ret)
5892 free_conflicting_inodes(ctx);
5893
5894 return ret;
5895 }
5896
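/*
 * Copy the inode's items modified in the current transaction from the
 * subvolume tree to the log tree, iterating the key range [min_key, max_key].
 * Xattrs are skipped (logged later by btrfs_log_all_xattrs()) and file extent
 * items at or beyond i_size stop the iteration (prealloc extents beyond i_size
 * are logged by btrfs_log_prealloc_extents()). If the inode item was copied,
 * *need_log_inode_item is set to false.
 */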
5897 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5898 struct btrfs_inode *inode,
5899 struct btrfs_key *min_key,
5900 const struct btrfs_key *max_key,
5901 struct btrfs_path *path,
5902 struct btrfs_path *dst_path,
5903 const u64 logged_isize,
5904 const int inode_only,
5905 struct btrfs_log_ctx *ctx,
5906 bool *need_log_inode_item)
5907 {
5908 const u64 i_size = i_size_read(&inode->vfs_inode);
5909 struct btrfs_root *root = inode->root;
5910 int ins_start_slot = 0;
5911 int ins_nr = 0;
5912 int ret;
5913
5914 while (1) {
5915 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5916 if (ret < 0)
5917 return ret;
5918 if (ret > 0) {
5919 ret = 0;
5920 break;
5921 }
5922 again:
5923 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5924 if (min_key->objectid != max_key->objectid)
5925 break;
5926 if (min_key->type > max_key->type)
5927 break;
5928
5929 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5930 *need_log_inode_item = false;
5931 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5932 min_key->offset >= i_size) {
5933 /*
5934 * Extents at and beyond eof are logged with
5935 * btrfs_log_prealloc_extents().
5936 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5937 * and no keys greater than that, so bail out.
5938 */
5939 break;
5940 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5941 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5942 (inode->generation == trans->transid ||
5943 ctx->logging_conflict_inodes)) {
5944 u64 other_ino = 0;
5945 u64 other_parent = 0;
5946
5947 ret = btrfs_check_ref_name_override(path->nodes[0],
5948 path->slots[0], min_key, inode,
5949 &other_ino, &other_parent);
5950 if (ret < 0) {
5951 return ret;
5952 } else if (ret > 0 &&
5953 other_ino != btrfs_ino(ctx->inode)) {
5954 if (ins_nr > 0) {
5955 ins_nr++;
5956 } else {
5957 ins_nr = 1;
5958 ins_start_slot = path->slots[0];
5959 }
5960 ret = copy_items(trans, inode, dst_path, path,
5961 ins_start_slot, ins_nr,
5962 inode_only, logged_isize, ctx);
5963 if (ret < 0)
5964 return ret;
5965 ins_nr = 0;
5966
5967 btrfs_release_path(path);
5968 ret = add_conflicting_inode(trans, root, path,
5969 other_ino,
5970 other_parent, ctx);
5971 if (ret)
5972 return ret;
5973 goto next_key;
5974 }
5975 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5976 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5977 if (ins_nr == 0)
5978 goto next_slot;
5979 ret = copy_items(trans, inode, dst_path, path,
5980 ins_start_slot,
5981 ins_nr, inode_only, logged_isize, ctx);
5982 if (ret < 0)
5983 return ret;
5984 ins_nr = 0;
5985 goto next_slot;
5986 }
5987
5988 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5989 ins_nr++;
5990 goto next_slot;
5991 } else if (!ins_nr) {
5992 ins_start_slot = path->slots[0];
5993 ins_nr = 1;
5994 goto next_slot;
5995 }
5996
5997 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5998 ins_nr, inode_only, logged_isize, ctx);
5999 if (ret < 0)
6000 return ret;
6001 ins_nr = 1;
6002 ins_start_slot = path->slots[0];
6003 next_slot:
6004 path->slots[0]++;
6005 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
6006 btrfs_item_key_to_cpu(path->nodes[0], min_key,
6007 path->slots[0]);
6008 goto again;
6009 }
6010 if (ins_nr) {
6011 ret = copy_items(trans, inode, dst_path, path,
6012 ins_start_slot, ins_nr, inode_only,
6013 logged_isize, ctx);
6014 if (ret < 0)
6015 return ret;
6016 ins_nr = 0;
6017 }
6018 btrfs_release_path(path);
6019 next_key:
6020 if (min_key->offset < (u64)-1) {
6021 min_key->offset++;
6022 } else if (min_key->type < max_key->type) {
6023 min_key->type++;
6024 min_key->offset = 0;
6025 } else {
6026 break;
6027 }
6028
6029 /*
6030 * We may process many leaves full of items for our inode, so
6031 * avoid monopolizing a cpu for too long by rescheduling while
6032 * not holding locks on any tree.
6033 */
6034 cond_resched();
6035 }
6036 if (ins_nr) {
6037 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6038 ins_nr, inode_only, logged_isize, ctx);
6039 if (ret)
6040 return ret;
6041 }
6042
6043 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6044 /*
6045 * Release the path because otherwise we might attempt to double
6046 * lock the same leaf with btrfs_log_prealloc_extents() below.
6047 */
6048 btrfs_release_path(path);
6049 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6050 }
6051
6052 return ret;
6053 }
6054
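/*
 * Insert a batch of consecutive dir index items, coming from delayed items,
 * into the log tree and copy their data from the delayed items.
 */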
6055 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6056 struct btrfs_root *log,
6057 struct btrfs_path *path,
6058 const struct btrfs_item_batch *batch,
6059 const struct btrfs_delayed_item *first_item)
6060 {
6061 const struct btrfs_delayed_item *curr = first_item;
6062 int ret;
6063
6064 ret = btrfs_insert_empty_items(trans, log, path, batch);
6065 if (ret)
6066 return ret;
6067
6068 for (int i = 0; i < batch->nr; i++) {
6069 char *data_ptr;
6070
6071 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
6072 write_extent_buffer(path->nodes[0], &curr->data,
6073 (unsigned long)data_ptr, curr->data_len);
6074 curr = list_next_entry(curr, log_list);
6075 path->slots[0]++;
6076 }
6077
6078 btrfs_release_path(path);
6079
6080 return 0;
6081 }
6082
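/*
 * Log the dir index items from the inode's list of delayed insertion items,
 * skipping any that were already copied to the log tree, and batching the
 * insertions so that each batch fits in a single log tree leaf.
 */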
6083 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6084 struct btrfs_inode *inode,
6085 struct btrfs_path *path,
6086 const struct list_head *delayed_ins_list,
6087 struct btrfs_log_ctx *ctx)
6088 {
6089 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6090 const int max_batch_size = 195;
6091 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6092 const u64 ino = btrfs_ino(inode);
6093 struct btrfs_root *log = inode->root->log_root;
6094 struct btrfs_item_batch batch = {
6095 .nr = 0,
6096 .total_data_size = 0,
6097 };
6098 const struct btrfs_delayed_item *first = NULL;
6099 const struct btrfs_delayed_item *curr;
6100 char *ins_data;
6101 struct btrfs_key *ins_keys;
6102 u32 *ins_sizes;
6103 u64 curr_batch_size = 0;
6104 int batch_idx = 0;
6105 int ret;
6106
6107 /* We are adding dir index items to the log tree. */
6108 lockdep_assert_held(&inode->log_mutex);
6109
6110 /*
6111 * We collect delayed items before copying index keys from the subvolume
6112 * to the log tree. However just after we collected them, they may have
6113 * been flushed (all of them or just some of them), and therefore we
6114 * could have copied them from the subvolume tree to the log tree.
6115 * So find the first delayed item that was not yet logged (they are
6116 * sorted by index number).
6117 */
6118 list_for_each_entry(curr, delayed_ins_list, log_list) {
6119 if (curr->index > inode->last_dir_index_offset) {
6120 first = curr;
6121 break;
6122 }
6123 }
6124
6125 /* Empty list or all delayed items were already logged. */
6126 if (!first)
6127 return 0;
6128
6129 ins_data = kmalloc(max_batch_size * sizeof(u32) +
6130 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6131 if (!ins_data)
6132 return -ENOMEM;
6133 ins_sizes = (u32 *)ins_data;
6134 batch.data_sizes = ins_sizes;
6135 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6136 batch.keys = ins_keys;
6137
6138 curr = first;
6139 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6140 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6141
6142 if (curr_batch_size + curr_size > leaf_data_size ||
6143 batch.nr == max_batch_size) {
6144 ret = insert_delayed_items_batch(trans, log, path,
6145 &batch, first);
6146 if (ret)
6147 goto out;
6148 batch_idx = 0;
6149 batch.nr = 0;
6150 batch.total_data_size = 0;
6151 curr_batch_size = 0;
6152 first = curr;
6153 }
6154
6155 ins_sizes[batch_idx] = curr->data_len;
6156 ins_keys[batch_idx].objectid = ino;
6157 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6158 ins_keys[batch_idx].offset = curr->index;
6159 curr_batch_size += curr_size;
6160 batch.total_data_size += curr->data_len;
6161 batch.nr++;
6162 batch_idx++;
6163 curr = list_next_entry(curr, log_list);
6164 }
6165
6166 ASSERT(batch.nr >= 1);
6167 ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6168
6169 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6170 log_list);
6171 inode->last_dir_index_offset = curr->index;
6172 out:
6173 kfree(ins_data);
6174
6175 return ret;
6176 }
6177
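/*
 * Log ranges of deleted dir index items from the inode's list of delayed
 * deletion items, for the case where the directory was not previously logged
 * in the current transaction. Consecutive index numbers are merged into a
 * single dir log range item.
 */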
6178 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6179 struct btrfs_inode *inode,
6180 struct btrfs_path *path,
6181 const struct list_head *delayed_del_list,
6182 struct btrfs_log_ctx *ctx)
6183 {
6184 const u64 ino = btrfs_ino(inode);
6185 const struct btrfs_delayed_item *curr;
6186
6187 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6188 log_list);
6189
6190 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6191 u64 first_dir_index = curr->index;
6192 u64 last_dir_index;
6193 const struct btrfs_delayed_item *next;
6194 int ret;
6195
6196 /*
6197 * Find a range of consecutive dir index items to delete. Like
6198 * this we log a single dir range item spanning several contiguous
6199 * dir items instead of logging one range item per dir index item.
6200 */
6201 next = list_next_entry(curr, log_list);
6202 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6203 if (next->index != curr->index + 1)
6204 break;
6205 curr = next;
6206 next = list_next_entry(next, log_list);
6207 }
6208
6209 last_dir_index = curr->index;
6210 ASSERT(last_dir_index >= first_dir_index);
6211
6212 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6213 ino, first_dir_index, last_dir_index);
6214 if (ret)
6215 return ret;
6216 curr = list_next_entry(curr, log_list);
6217 }
6218
6219 return 0;
6220 }
6221
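/*
 * Starting at the slot the path points to, delete from the log tree all dir
 * index items that consecutively match the following delayed deletion items.
 * *last_ret is updated to the last delayed item covered by the deletion.
 */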
6222 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6223 struct btrfs_inode *inode,
6224 struct btrfs_path *path,
6225 const struct list_head *delayed_del_list,
6226 const struct btrfs_delayed_item *first,
6227 const struct btrfs_delayed_item **last_ret)
6228 {
6229 const struct btrfs_delayed_item *next;
6230 struct extent_buffer *leaf = path->nodes[0];
6231 const int last_slot = btrfs_header_nritems(leaf) - 1;
6232 int slot = path->slots[0] + 1;
6233 const u64 ino = btrfs_ino(inode);
6234
6235 next = list_next_entry(first, log_list);
6236
6237 while (slot < last_slot &&
6238 !list_entry_is_head(next, delayed_del_list, log_list)) {
6239 struct btrfs_key key;
6240
6241 btrfs_item_key_to_cpu(leaf, &key, slot);
6242 if (key.objectid != ino ||
6243 key.type != BTRFS_DIR_INDEX_KEY ||
6244 key.offset != next->index)
6245 break;
6246
6247 slot++;
6248 *last_ret = next;
6249 next = list_next_entry(next, log_list);
6250 }
6251
6252 return btrfs_del_items(trans, inode->root->log_root, path,
6253 path->slots[0], slot - path->slots[0]);
6254 }
6255
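/*
 * Log delayed dir index deletion items for a directory that was previously
 * logged in the current transaction: delete matching dir index items from the
 * log tree and, for index numbers not present in the log tree, insert dir log
 * range items (extending the previous range when they are contiguous).
 */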
6256 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6257 struct btrfs_inode *inode,
6258 struct btrfs_path *path,
6259 const struct list_head *delayed_del_list,
6260 struct btrfs_log_ctx *ctx)
6261 {
6262 struct btrfs_root *log = inode->root->log_root;
6263 const struct btrfs_delayed_item *curr;
6264 u64 last_range_start = 0;
6265 u64 last_range_end = 0;
6266 struct btrfs_key key;
6267
6268 key.objectid = btrfs_ino(inode);
6269 key.type = BTRFS_DIR_INDEX_KEY;
6270 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6271 log_list);
6272
6273 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6274 const struct btrfs_delayed_item *last = curr;
6275 u64 first_dir_index = curr->index;
6276 u64 last_dir_index;
6277 bool deleted_items = false;
6278 int ret;
6279
6280 key.offset = curr->index;
6281 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6282 if (ret < 0) {
6283 return ret;
6284 } else if (ret == 0) {
6285 ret = batch_delete_dir_index_items(trans, inode, path,
6286 delayed_del_list, curr,
6287 &last);
6288 if (ret)
6289 return ret;
6290 deleted_items = true;
6291 }
6292
6293 btrfs_release_path(path);
6294
6295 /*
6296 * If we deleted items from the leaf, it means we have a range
6297 * item logging their range, so no need to add one or update an
6298 * existing one. Otherwise we have to log a dir range item.
6299 */
6300 if (deleted_items)
6301 goto next_batch;
6302
6303 last_dir_index = last->index;
6304 ASSERT(last_dir_index >= first_dir_index);
6305 /*
6306 * If this range starts right after where the previous one ends,
6307 * then we want to reuse the previous range item and change its
6308 * end offset to the end of this range. This is just to minimize
6309 * leaf space usage, by avoiding adding a new range item.
6310 */
6311 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6312 first_dir_index = last_range_start;
6313
6314 ret = insert_dir_log_key(trans, log, path, key.objectid,
6315 first_dir_index, last_dir_index);
6316 if (ret)
6317 return ret;
6318
6319 last_range_start = first_dir_index;
6320 last_range_end = last_dir_index;
6321 next_batch:
6322 curr = list_next_entry(last, log_list);
6323 }
6324
6325 return 0;
6326 }
6327
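/*
 * Log the inode's delayed dir index deletion items, using the incremental
 * variant if the directory was logged before in the current transaction, or
 * the full variant otherwise.
 */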
6328 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6329 struct btrfs_inode *inode,
6330 struct btrfs_path *path,
6331 const struct list_head *delayed_del_list,
6332 struct btrfs_log_ctx *ctx)
6333 {
6334 /*
6335 * We are deleting dir index items from the log tree or adding range
6336 * items to it.
6337 */
6338 lockdep_assert_held(&inode->log_mutex);
6339
6340 if (list_empty(delayed_del_list))
6341 return 0;
6342
6343 if (ctx->logged_before)
6344 return log_delayed_deletions_incremental(trans, inode, path,
6345 delayed_del_list, ctx);
6346
6347 return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6348 ctx);
6349 }
6350
6351 /*
6352 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6353 * items instead of the subvolume tree.
6354 */
6355 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6356 struct btrfs_inode *inode,
6357 const struct list_head *delayed_ins_list,
6358 struct btrfs_log_ctx *ctx)
6359 {
6360 const bool orig_log_new_dentries = ctx->log_new_dentries;
6361 struct btrfs_delayed_item *item;
6362 int ret = 0;
6363
6364 /*
6365 * No need to hold the log mutex here, and not holding it also avoids
6366 * potential deadlocks or lockdep warnings due to nesting of delayed inode
6367 * mutexes and log mutexes.
6368 */
6369 lockdep_assert_not_held(&inode->log_mutex);
6370
6371 ASSERT(!ctx->logging_new_delayed_dentries);
6372 ctx->logging_new_delayed_dentries = true;
6373
6374 list_for_each_entry(item, delayed_ins_list, log_list) {
6375 struct btrfs_dir_item *dir_item;
6376 struct btrfs_inode *di_inode;
6377 struct btrfs_key key;
6378 int log_mode = LOG_INODE_EXISTS;
6379
6380 dir_item = (struct btrfs_dir_item *)item->data;
6381 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6382
6383 if (key.type == BTRFS_ROOT_ITEM_KEY)
6384 continue;
6385
6386 di_inode = btrfs_iget_logging(key.objectid, inode->root);
6387 if (IS_ERR(di_inode)) {
6388 ret = PTR_ERR(di_inode);
6389 break;
6390 }
6391
6392 if (!need_log_inode(trans, di_inode)) {
6393 btrfs_add_delayed_iput(di_inode);
6394 continue;
6395 }
6396
6397 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6398 log_mode = LOG_INODE_ALL;
6399
6400 ctx->log_new_dentries = false;
6401 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
6402
6403 if (!ret && ctx->log_new_dentries)
6404 ret = log_new_dir_dentries(trans, di_inode, ctx);
6405
6406 btrfs_add_delayed_iput(di_inode);
6407
6408 if (ret)
6409 break;
6410 }
6411
6412 ctx->log_new_dentries = orig_log_new_dentries;
6413 ctx->logging_new_delayed_dentries = false;
6414
6415 return ret;
6416 }
6417
6418 /* log a single inode in the tree log.
6419 * At least one parent directory for this inode must exist in the tree
6420 * or be logged already.
6421 *
6422 * Any items from this inode changed by the current transaction are copied
6423 * to the log tree. An extra reference is taken on any extents in this
6424 * file, allowing us to avoid a whole pile of corner cases around logging
6425 * blocks that have been removed from the tree.
6426 *
6427 * See LOG_INODE_ALL and related defines for a description of what inode_only
6428 * does.
6429 *
6430 * This handles both files and directories.
6431 */
6432 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6433 struct btrfs_inode *inode,
6434 int inode_only,
6435 struct btrfs_log_ctx *ctx)
6436 {
6437 struct btrfs_path *path;
6438 struct btrfs_path *dst_path;
6439 struct btrfs_key min_key;
6440 struct btrfs_key max_key;
6441 struct btrfs_root *log = inode->root->log_root;
6442 int ret;
6443 bool fast_search = false;
6444 u64 ino = btrfs_ino(inode);
6445 struct extent_map_tree *em_tree = &inode->extent_tree;
6446 u64 logged_isize = 0;
6447 bool need_log_inode_item = true;
6448 bool xattrs_logged = false;
6449 bool inode_item_dropped = true;
6450 bool full_dir_logging = false;
6451 LIST_HEAD(delayed_ins_list);
6452 LIST_HEAD(delayed_del_list);
6453
6454 path = btrfs_alloc_path();
6455 if (!path)
6456 return -ENOMEM;
6457 dst_path = btrfs_alloc_path();
6458 if (!dst_path) {
6459 btrfs_free_path(path);
6460 return -ENOMEM;
6461 }
6462
6463 min_key.objectid = ino;
6464 min_key.type = BTRFS_INODE_ITEM_KEY;
6465 min_key.offset = 0;
6466
6467 max_key.objectid = ino;
6468
6469
6470 /* today the code can only do partial logging of directories */
6471 if (S_ISDIR(inode->vfs_inode.i_mode) ||
6472 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6473 &inode->runtime_flags) &&
6474 inode_only >= LOG_INODE_EXISTS))
6475 max_key.type = BTRFS_XATTR_ITEM_KEY;
6476 else
6477 max_key.type = (u8)-1;
6478 max_key.offset = (u64)-1;
6479
6480 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6481 full_dir_logging = true;
6482
6483 /*
6484 * If we are logging a directory while we are logging dentries of the
6485 * delayed items of some other inode, then we need to flush the delayed
6486 * items of this directory and not log the delayed items directly. This
6487 * is to prevent more than one level of recursion into btrfs_log_inode()
6488 * by having something like this:
6489 *
6490 * $ mkdir -p a/b/c/d/e/f/g/h/...
6491 * $ xfs_io -c "fsync" a
6492 *
6493 * Where all directories in the path did not exist before and are
6494 * created in the current transaction.
6495 * So in such a case we directly log the delayed items of the main
6496 * directory ("a") without flushing them first, while for each of its
6497 * subdirectories we flush their delayed items before logging them.
6498 * This prevents a potential unbounded recursion like this:
6499 *
6500 * btrfs_log_inode()
6501 * log_new_delayed_dentries()
6502 * btrfs_log_inode()
6503 * log_new_delayed_dentries()
6504 * btrfs_log_inode()
6505 * log_new_delayed_dentries()
6506 * (...)
6507 *
6508 * We have thresholds for the maximum number of delayed items to have in
6509 * memory, and once they are hit, the items are flushed asynchronously.
6510 * However the limit is quite high, so let's prevent deep levels of
6511 * recursion from happening by limiting the maximum depth to 1.
6512 */
6513 if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6514 ret = btrfs_commit_inode_delayed_items(trans, inode);
6515 if (ret)
6516 goto out;
6517 }
6518
6519 mutex_lock(&inode->log_mutex);
6520
6521 /*
6522 * For symlinks, we must always log their content, which is stored in an
6523 * inline extent, otherwise we could end up with an empty symlink after
6524 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6525 * one attempts to create an empty symlink).
6526 * We don't need to worry about flushing delalloc, because we create
6527 * the inline extent when the symlink is created (we never have delalloc
6528 * for symlinks).
6529 */
6530 if (S_ISLNK(inode->vfs_inode.i_mode))
6531 inode_only = LOG_INODE_ALL;
6532
6533 /*
6534 * Before logging the inode item, cache the value returned by
6535 * inode_logged(), because later we will need to know if the inode was
6536 * previously logged in this transaction.
6537 */
6538 ret = inode_logged(trans, inode, path);
6539 if (ret < 0)
6540 goto out_unlock;
6541 ctx->logged_before = (ret == 1);
6542 ret = 0;
6543
6544 /*
6545 * This is for cases where logging a directory could result in losing
6546 * a file after replaying the log. For example, if we move a file from a
6547 * directory A to a directory B, then fsync directory A, we have no way
6548 * to know the file was moved from A to B, so logging just A would
6549 * result in losing the file after a log replay.
6550 */
6551 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6552 ret = BTRFS_LOG_FORCE_COMMIT;
6553 goto out_unlock;
6554 }
6555
6556 /*
6557 * a brute force approach to making sure we get the most uptodate
6558 * copies of everything.
6559 */
6560 if (S_ISDIR(inode->vfs_inode.i_mode)) {
6561 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6562 if (ctx->logged_before)
6563 ret = drop_inode_items(trans, log, path, inode,
6564 BTRFS_XATTR_ITEM_KEY);
6565 } else {
6566 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6567 /*
6568 * Make sure the new inode item we write to the log has
6569 * the same isize as the current one (if it exists).
6570 * This is necessary to prevent data loss after log
6571 * replay, and also to prevent doing a wrong expanding
6572 * truncate - for e.g. create file, write 4K into offset
6573 * 0, fsync, write 4K into offset 4096, add hard link,
6574 * fsync some other file (to sync log), power fail - if
6575 * we use the inode's current i_size, after log replay
6576 * we get an 8Kb file, with the last 4Kb extent as a hole
6577 * (zeroes), as if an expanding truncate happened,
6578 * instead of getting a file of 4Kb only.
6579 */
6580 ret = logged_inode_size(log, inode, path, &logged_isize);
6581 if (ret)
6582 goto out_unlock;
6583 }
6584 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6585 &inode->runtime_flags)) {
6586 if (inode_only == LOG_INODE_EXISTS) {
6587 max_key.type = BTRFS_XATTR_ITEM_KEY;
6588 if (ctx->logged_before)
6589 ret = drop_inode_items(trans, log, path,
6590 inode, max_key.type);
6591 } else {
6592 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6593 &inode->runtime_flags);
6594 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6595 &inode->runtime_flags);
6596 if (ctx->logged_before)
6597 ret = truncate_inode_items(trans, log,
6598 inode, 0, 0);
6599 }
6600 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6601 &inode->runtime_flags) ||
6602 inode_only == LOG_INODE_EXISTS) {
6603 if (inode_only == LOG_INODE_ALL)
6604 fast_search = true;
6605 max_key.type = BTRFS_XATTR_ITEM_KEY;
6606 if (ctx->logged_before)
6607 ret = drop_inode_items(trans, log, path, inode,
6608 max_key.type);
6609 } else {
6610 if (inode_only == LOG_INODE_ALL)
6611 fast_search = true;
6612 inode_item_dropped = false;
6613 goto log_extents;
6614 }
6615
6616 }
6617 if (ret)
6618 goto out_unlock;
6619
6620 /*
6621 * If we are logging a directory in full mode, collect the delayed items
6622 * before iterating the subvolume tree, so that we don't miss any new
6623 * dir index items in case they get flushed while or right after we are
6624 * iterating the subvolume tree.
6625 */
6626 if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6627 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6628 &delayed_del_list);
6629
6630 /*
6631 * If we are fsyncing a file with 0 hard links, then commit the delayed
6632 * inode because the last inode ref (or extref) item may still be in the
6633 * subvolume tree and if we log it the file will still exist after a log
6634 * replay. So commit the delayed inode to delete that last ref and we
6635 * skip logging it.
6636 */
6637 if (inode->vfs_inode.i_nlink == 0) {
6638 ret = btrfs_commit_inode_delayed_inode(inode);
6639 if (ret)
6640 goto out_unlock;
6641 }
6642
6643 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6644 path, dst_path, logged_isize,
6645 inode_only, ctx,
6646 &need_log_inode_item);
6647 if (ret)
6648 goto out_unlock;
6649
6650 btrfs_release_path(path);
6651 btrfs_release_path(dst_path);
6652 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6653 if (ret)
6654 goto out_unlock;
6655 xattrs_logged = true;
6656 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6657 btrfs_release_path(path);
6658 btrfs_release_path(dst_path);
6659 ret = btrfs_log_holes(trans, inode, path);
6660 if (ret)
6661 goto out_unlock;
6662 }
6663 log_extents:
6664 btrfs_release_path(path);
6665 btrfs_release_path(dst_path);
6666 if (need_log_inode_item) {
6667 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6668 if (ret)
6669 goto out_unlock;
6670 /*
6671 * If we are doing a fast fsync and the inode was logged before
6672 * in this transaction, we don't need to log the xattrs because
6673 * they were logged before. If xattrs were added, changed or
6674 * deleted since the last time we logged the inode, then we have
6675 * already logged them because the inode had the runtime flag
6676 * BTRFS_INODE_COPY_EVERYTHING set.
6677 */
6678 if (!xattrs_logged && inode->logged_trans < trans->transid) {
6679 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6680 if (ret)
6681 goto out_unlock;
6682 btrfs_release_path(path);
6683 }
6684 }
6685 if (fast_search) {
6686 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6687 if (ret)
6688 goto out_unlock;
6689 } else if (inode_only == LOG_INODE_ALL) {
6690 struct extent_map *em, *n;
6691
6692 write_lock(&em_tree->lock);
6693 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6694 list_del_init(&em->list);
6695 write_unlock(&em_tree->lock);
6696 }
6697
6698 if (full_dir_logging) {
6699 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6700 if (ret)
6701 goto out_unlock;
6702 ret = log_delayed_insertion_items(trans, inode, path,
6703 &delayed_ins_list, ctx);
6704 if (ret)
6705 goto out_unlock;
6706 ret = log_delayed_deletion_items(trans, inode, path,
6707 &delayed_del_list, ctx);
6708 if (ret)
6709 goto out_unlock;
6710 }
6711
6712 spin_lock(&inode->lock);
6713 inode->logged_trans = trans->transid;
6714 /*
6715 * Don't update last_log_commit if we logged that an inode exists.
6716 * We do this for three reasons:
6717 *
6718 * 1) We might have had buffered writes to this inode that were
6719 * flushed and had their ordered extents completed in this
6720 * transaction, but we did not previously log the inode with
6721 * LOG_INODE_ALL. Later the inode was evicted and after that
6722 * it was loaded again and this LOG_INODE_EXISTS log operation
6723 * happened. We must make sure that if an explicit fsync against
6724 * the inode is performed later, it logs the new extents, an
6725 * updated inode item, etc, and syncs the log. The same logic
6726 * applies to direct IO writes instead of buffered writes.
6727 *
6728 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6729 * is logged with an i_size of 0 or whatever value was logged
6730 * before. If later the i_size of the inode is increased by a
6731 * truncate operation, the log is synced through an fsync of
6732 * some other inode and then finally an explicit fsync against
6733 * this inode is made, we must make sure this fsync logs the
6734 * inode with the new i_size, the hole between old i_size and
6735 * the new i_size, and syncs the log.
6736 *
6737 * 3) If we are logging that an ancestor inode exists as part of
6738 * logging a new name from a link or rename operation, don't update
6739 * its last_log_commit - otherwise if an explicit fsync is made
6740 * against an ancestor, the fsync considers the inode in the log
6741 * and doesn't sync the log, resulting in the ancestor missing after
6742 * a power failure unless the log was synced as part of an fsync
6743 * against any other unrelated inode.
6744 */
6745 if (inode_only != LOG_INODE_EXISTS)
6746 inode->last_log_commit = inode->last_sub_trans;
6747 spin_unlock(&inode->lock);
6748
6749 /*
6750 * Reset the last_reflink_trans so that the next fsync does not need to
6751 * go through the slower path when logging extents and their checksums.
6752 */
6753 if (inode_only == LOG_INODE_ALL)
6754 inode->last_reflink_trans = 0;
6755
6756 out_unlock:
6757 mutex_unlock(&inode->log_mutex);
6758 out:
6759 btrfs_free_path(path);
6760 btrfs_free_path(dst_path);
6761
6762 if (ret)
6763 free_conflicting_inodes(ctx);
6764 else
6765 ret = log_conflicting_inodes(trans, inode->root, ctx);
6766
6767 if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6768 if (!ret)
6769 ret = log_new_delayed_dentries(trans, inode,
6770 &delayed_ins_list, ctx);
6771
6772 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6773 &delayed_del_list);
6774 }
6775
6776 return ret;
6777 }
6778
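/*
 * Log, in full mode, all the directories that have a dentry pointing to the
 * given inode, found by iterating its inode ref and extref items from the
 * commit root. If a parent directory no longer exists, an error is returned
 * so that the caller falls back to a transaction commit.
 */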
6779 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6780 struct btrfs_inode *inode,
6781 struct btrfs_log_ctx *ctx)
6782 {
6783 int ret;
6784 struct btrfs_path *path;
6785 struct btrfs_key key;
6786 struct btrfs_root *root = inode->root;
6787 const u64 ino = btrfs_ino(inode);
6788
6789 path = btrfs_alloc_path();
6790 if (!path)
6791 return -ENOMEM;
6792 path->skip_locking = 1;
6793 path->search_commit_root = 1;
6794
6795 key.objectid = ino;
6796 key.type = BTRFS_INODE_REF_KEY;
6797 key.offset = 0;
6798 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6799 if (ret < 0)
6800 goto out;
6801
6802 while (true) {
6803 struct extent_buffer *leaf = path->nodes[0];
6804 int slot = path->slots[0];
6805 u32 cur_offset = 0;
6806 u32 item_size;
6807 unsigned long ptr;
6808
6809 if (slot >= btrfs_header_nritems(leaf)) {
6810 ret = btrfs_next_leaf(root, path);
6811 if (ret < 0)
6812 goto out;
6813 else if (ret > 0)
6814 break;
6815 continue;
6816 }
6817
6818 btrfs_item_key_to_cpu(leaf, &key, slot);
6819 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6820 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6821 break;
6822
6823 item_size = btrfs_item_size(leaf, slot);
6824 ptr = btrfs_item_ptr_offset(leaf, slot);
6825 while (cur_offset < item_size) {
6826 struct btrfs_key inode_key;
6827 struct btrfs_inode *dir_inode;
6828
6829 inode_key.type = BTRFS_INODE_ITEM_KEY;
6830 inode_key.offset = 0;
6831
6832 if (key.type == BTRFS_INODE_EXTREF_KEY) {
6833 struct btrfs_inode_extref *extref;
6834
6835 extref = (struct btrfs_inode_extref *)
6836 (ptr + cur_offset);
6837 inode_key.objectid = btrfs_inode_extref_parent(
6838 leaf, extref);
6839 cur_offset += sizeof(*extref);
6840 cur_offset += btrfs_inode_extref_name_len(leaf,
6841 extref);
6842 } else {
6843 inode_key.objectid = key.offset;
6844 cur_offset = item_size;
6845 }
6846
6847 dir_inode = btrfs_iget_logging(inode_key.objectid, root);
6848 /*
6849 * If the parent inode was deleted, return an error to
6850 * fall back to a transaction commit. This is to prevent
6851 * getting an inode that was moved from one parent A to
6852 * a parent B, got its former parent A deleted and then
6853 * it got fsync'ed, from existing at both parents after
6854 * a log replay (and the old parent still existing).
6855 * Example:
6856 *
6857 * mkdir /mnt/A
6858 * mkdir /mnt/B
6859 * touch /mnt/B/bar
6860 * sync
6861 * mv /mnt/B/bar /mnt/A/bar
6862 * mv -T /mnt/A /mnt/B
6863 * fsync /mnt/B/bar
6864 * <power fail>
6865 *
6866 * If we ignore the old parent B which got deleted,
6867 * after a log replay we would have file bar linked
6868 * at both parents and the old parent B would still
6869 * exist.
6870 */
6871 if (IS_ERR(dir_inode)) {
6872 ret = PTR_ERR(dir_inode);
6873 goto out;
6874 }
6875
6876 if (!need_log_inode(trans, dir_inode)) {
6877 btrfs_add_delayed_iput(dir_inode);
6878 continue;
6879 }
6880
6881 ctx->log_new_dentries = false;
6882 ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
6883 if (!ret && ctx->log_new_dentries)
6884 ret = log_new_dir_dentries(trans, dir_inode, ctx);
6885 btrfs_add_delayed_iput(dir_inode);
6886 if (ret)
6887 goto out;
6888 }
6889 path->slots[0]++;
6890 }
6891 ret = 0;
6892 out:
6893 btrfs_free_path(path);
6894 return ret;
6895 }
6896
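/*
 * Starting from the inode ref key currently pointed to by @path, walk up the
 * ancestor chain in the subvolume tree: log each ancestor directory with
 * LOG_INODE_EXISTS (if it was changed in the current transaction) and then
 * follow that ancestor's own BTRFS_INODE_REF_KEY to the next parent, until
 * the subvolume root (BTRFS_FIRST_FREE_OBJECTID) is reached.
 */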
6897 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6898 struct btrfs_root *root,
6899 struct btrfs_path *path,
6900 struct btrfs_log_ctx *ctx)
6901 {
6902 struct btrfs_key found_key;
6903
6904 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6905
6906 while (true) {
6907 struct extent_buffer *leaf;
6908 int slot;
6909 struct btrfs_key search_key;
6910 struct btrfs_inode *inode;
6911 u64 ino;
6912 int ret = 0;
6913
6914 btrfs_release_path(path);
6915
6916 ino = found_key.offset;
6917
6918 search_key.objectid = found_key.offset;
6919 search_key.type = BTRFS_INODE_ITEM_KEY;
6920 search_key.offset = 0;
6921 inode = btrfs_iget_logging(ino, root);
6922 if (IS_ERR(inode))
6923 return PTR_ERR(inode);
6924
6925 if (inode->generation >= trans->transid &&
6926 need_log_inode(trans, inode))
6927 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
6928 btrfs_add_delayed_iput(inode);
6929 if (ret)
6930 return ret;
6931
6932 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6933 break;
6934
6935 search_key.type = BTRFS_INODE_REF_KEY;
6936 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6937 if (ret < 0)
6938 return ret;
6939
6940 leaf = path->nodes[0];
6941 slot = path->slots[0];
6942 if (slot >= btrfs_header_nritems(leaf)) {
6943 ret = btrfs_next_leaf(root, path);
6944 if (ret < 0)
6945 return ret;
6946 else if (ret > 0)
6947 return -ENOENT;
6948 leaf = path->nodes[0];
6949 slot = path->slots[0];
6950 }
6951
6952 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6953 if (found_key.objectid != search_key.objectid ||
6954 found_key.type != BTRFS_INODE_REF_KEY)
6955 return -ENOENT;
6956 }
6957 return 0;
6958 }
6959
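/*
 * Fast path for inodes with a single hard link: instead of searching the
 * subvolume tree, walk up the dentry chain with dget_parent() and log every
 * ancestor directory (LOG_INODE_EXISTS) that was changed in the current
 * transaction, stopping at the subvolume root or when crossing into another
 * root or superblock.
 */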
6960 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6961 struct btrfs_inode *inode,
6962 struct dentry *parent,
6963 struct btrfs_log_ctx *ctx)
6964 {
6965 struct btrfs_root *root = inode->root;
6966 struct dentry *old_parent = NULL;
6967 struct super_block *sb = inode->vfs_inode.i_sb;
6968 int ret = 0;
6969
6970 while (true) {
6971 if (!parent || d_really_is_negative(parent) ||
6972 sb != parent->d_sb)
6973 break;
6974
6975 inode = BTRFS_I(d_inode(parent));
6976 if (root != inode->root)
6977 break;
6978
6979 if (inode->generation >= trans->transid &&
6980 need_log_inode(trans, inode)) {
6981 ret = btrfs_log_inode(trans, inode,
6982 LOG_INODE_EXISTS, ctx);
6983 if (ret)
6984 break;
6985 }
6986 if (IS_ROOT(parent))
6987 break;
6988
6989 parent = dget_parent(parent);
6990 dput(old_parent);
6991 old_parent = parent;
6992 }
6993 dput(old_parent);
6994
6995 return ret;
6996 }
6997
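/*
 * Make sure all new ancestors of @inode (its parent directories, their
 * parents, and so on, for every hard link) end up in the log. Inodes with a
 * single link use the dentry based fast path above; otherwise we iterate all
 * of the inode's ref items and log the ancestor chain for each parent.
 * Extended refs are not handled here and make us return -EMLINK to force a
 * fallback to a transaction commit.
 */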
6998 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6999 struct btrfs_inode *inode,
7000 struct dentry *parent,
7001 struct btrfs_log_ctx *ctx)
7002 {
7003 struct btrfs_root *root = inode->root;
7004 const u64 ino = btrfs_ino(inode);
7005 struct btrfs_path *path;
7006 struct btrfs_key search_key;
7007 int ret;
7008
7009 /*
7010 * For a single hard link case, go through a fast path that does not
7011 * need to iterate the fs/subvolume tree.
7012 */
7013 if (inode->vfs_inode.i_nlink < 2)
7014 return log_new_ancestors_fast(trans, inode, parent, ctx);
7015
7016 path = btrfs_alloc_path();
7017 if (!path)
7018 return -ENOMEM;
7019
7020 search_key.objectid = ino;
7021 search_key.type = BTRFS_INODE_REF_KEY;
7022 search_key.offset = 0;
7023 again:
7024 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
7025 if (ret < 0)
7026 goto out;
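/*
 * An exact match means we are resuming from a ref key that was already
 * processed (we jump back to the "again" label after logging the ancestors
 * of each ref), so move to the next slot to avoid handling the same ref
 * twice.
 */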
7027 if (ret == 0)
7028 path->slots[0]++;
7029
7030 while (true) {
7031 struct extent_buffer *leaf = path->nodes[0];
7032 int slot = path->slots[0];
7033 struct btrfs_key found_key;
7034
7035 if (slot >= btrfs_header_nritems(leaf)) {
7036 ret = btrfs_next_leaf(root, path);
7037 if (ret < 0)
7038 goto out;
7039 else if (ret > 0)
7040 break;
7041 continue;
7042 }
7043
7044 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7045 if (found_key.objectid != ino ||
7046 found_key.type > BTRFS_INODE_EXTREF_KEY)
7047 break;
7048
7049 /*
7050 * Don't deal with extended references because they are rare
7051 * cases and too complex to deal with (we would need to keep
7052 * track of which subitem we are processing for each item in
7053 * this loop, etc). So just return some error to fall back to
7054 * a transaction commit.
7055 */
7056 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
7057 ret = -EMLINK;
7058 goto out;
7059 }
7060
7061 /*
7062 * Logging ancestors needs to do more searches on the fs/subvol
7063 * tree, so it releases the path as needed to avoid deadlocks.
7064 * Keep track of the last inode ref key and resume from that key
7065 * after logging all new ancestors for the current hard link.
7066 */
7067 memcpy(&search_key, &found_key, sizeof(search_key));
7068
7069 ret = log_new_ancestors(trans, root, path, ctx);
7070 if (ret)
7071 goto out;
7072 btrfs_release_path(path);
7073 goto again;
7074 }
7075 ret = 0;
7076 out:
7077 btrfs_free_path(path);
7078 return ret;
7079 }
7080
7081 /*
7082 * Helper function around btrfs_log_inode() to make sure newly created
7083 * parent directories also end up in the log. Only minimal logging (the
7084 * inode item and back references) is done for any parent directories that
7085 * are older than the last committed transaction.
7086 */
7087 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7088 struct btrfs_inode *inode,
7089 struct dentry *parent,
7090 int inode_only,
7091 struct btrfs_log_ctx *ctx)
7092 {
7093 struct btrfs_root *root = inode->root;
7094 struct btrfs_fs_info *fs_info = root->fs_info;
7095 int ret = 0;
7096 bool log_dentries;
7097
7098 if (btrfs_test_opt(fs_info, NOTREELOG))
7099 return BTRFS_LOG_FORCE_COMMIT;
7100
7101 if (btrfs_root_refs(&root->root_item) == 0)
7102 return BTRFS_LOG_FORCE_COMMIT;
7103
7104 /*
7105 * If we're logging an inode from a subvolume created in the current
7106 * transaction we must force a commit since the root is not persisted.
7107 */
7108 if (btrfs_root_generation(&root->root_item) == trans->transid)
7109 return BTRFS_LOG_FORCE_COMMIT;
7110
7111 /* Skip inodes that were already logged and that have no new extents. */
7112 if (btrfs_inode_in_log(inode, trans->transid) &&
7113 list_empty(&ctx->ordered_extents))
7114 return BTRFS_NO_LOG_SYNC;
7115
7116 ret = start_log_trans(trans, root, ctx);
7117 if (ret)
7118 return ret;
7119
7120 ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7121 if (ret)
7122 goto end_trans;
7123
7124 /*
7125 * For a regular file, if its inode is already on disk, we don't
7126 * have to worry about the parents at all. This is because
7127 * we can use the last_unlink_trans field to record renames
7128 * and other fun in this file.
7129 */
7130 if (S_ISREG(inode->vfs_inode.i_mode) &&
7131 inode->generation < trans->transid &&
7132 inode->last_unlink_trans < trans->transid) {
7133 ret = 0;
7134 goto end_trans;
7135 }
7136
7137 /*
7138 * Track if we need to log dentries because ctx->log_new_dentries can
7139 * be modified in the call chains below.
7140 */
7141 log_dentries = ctx->log_new_dentries;
7142
7143 /*
7144 * On unlink we must make sure all our current and old parent directory
7145 * inodes are fully logged. This is to prevent leaving dangling
7146 * directory index entries in directories that were our parents but are
7147 * not anymore. Not doing this results in the old parent directory being
7148 * impossible to delete after log replay (rmdir will always fail with
7149 * error -ENOTEMPTY).
7150 *
7151 * Example 1:
7152 *
7153 * mkdir testdir
7154 * touch testdir/foo
7155 * ln testdir/foo testdir/bar
7156 * sync
7157 * unlink testdir/bar
7158 * xfs_io -c fsync testdir/foo
7159 * <power failure>
7160 * mount fs, triggers log replay
7161 *
7162 * If we don't log the parent directory (testdir), after log replay the
7163 * directory still has an entry pointing to the file inode using the bar
7164 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7165 * the file inode has a link count of 1.
7166 *
7167 * Example 2:
7168 *
7169 * mkdir testdir
7170 * touch foo
7171 * ln foo testdir/foo2
7172 * ln foo testdir/foo3
7173 * sync
7174 * unlink testdir/foo3
7175 * xfs_io -c fsync foo
7176 * <power failure>
7177 * mount fs, triggers log replay
7178 *
7179 * As in the first example, after log replay the parent directory
7180 * testdir still has an entry pointing to the file inode using the name foo3,
7181 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7182 * and has a link count of 2.
7183 */
7184 if (inode->last_unlink_trans >= trans->transid) {
7185 ret = btrfs_log_all_parents(trans, inode, ctx);
7186 if (ret)
7187 goto end_trans;
7188 }
7189
7190 ret = log_all_new_ancestors(trans, inode, parent, ctx);
7191 if (ret)
7192 goto end_trans;
7193
7194 if (log_dentries)
7195 ret = log_new_dir_dentries(trans, inode, ctx);
7196 end_trans:
7197 if (ret < 0) {
7198 btrfs_set_log_full_commit(trans);
7199 ret = BTRFS_LOG_FORCE_COMMIT;
7200 }
7201
7202 if (ret)
7203 btrfs_remove_log_ctx(root, ctx);
7204 btrfs_end_log_trans(root);
7205
7206 return ret;
7207 }
7208
7209 /*
7210 * It is not safe to log a dentry if the chunk root has added new
7211 * chunks. This returns 0 if the dentry was logged, and a nonzero value
7212 * otherwise. If this returns BTRFS_LOG_FORCE_COMMIT or an error, you must
7213 * commit the transaction to safely get your data on disk.
7214 */
7215 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7216 struct dentry *dentry,
7217 struct btrfs_log_ctx *ctx)
7218 {
7219 struct dentry *parent = dget_parent(dentry);
7220 int ret;
7221
7222 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7223 LOG_INODE_ALL, ctx);
7224 dput(parent);
7225
7226 return ret;
7227 }
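
/*
 * A rough sketch (not the exact code) of how the fsync path is expected to
 * consume the return value above; see btrfs_sync_file() in fs/btrfs/file.c
 * for the real logic and error handling:
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == BTRFS_NO_LOG_SYNC)
 *		return btrfs_end_transaction(trans);	// nothing to sync
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx); // sync only the log tree
 *	if (ret == 0)
 *		return btrfs_end_transaction(trans);
 *	// BTRFS_LOG_FORCE_COMMIT, or logging/syncing the log failed
 *	return btrfs_commit_transaction(trans);
 */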
7228
7229 /*
7230 * Should be called during mount to recover and replay any log trees
7231 * from the FS.
7232 */
7233 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7234 {
7235 int ret;
7236 struct btrfs_path *path;
7237 struct btrfs_trans_handle *trans;
7238 struct btrfs_key key;
7239 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7240 struct walk_control wc = {
7241 .process_func = process_one_buffer,
7242 .stage = LOG_WALK_PIN_ONLY,
7243 };
7244
7245 path = btrfs_alloc_path();
7246 if (!path)
7247 return -ENOMEM;
7248
7249 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7250
7251 trans = btrfs_start_transaction(fs_info->tree_root, 0);
7252 if (IS_ERR(trans)) {
7253 ret = PTR_ERR(trans);
7254 goto error;
7255 }
7256
7257 wc.trans = trans;
7258 wc.pin = 1;
7259
7260 ret = walk_log_tree(trans, log_root_tree, &wc);
7261 if (ret) {
7262 btrfs_abort_transaction(trans, ret);
7263 goto error;
7264 }
7265
7266 again:
7267 key.objectid = BTRFS_TREE_LOG_OBJECTID;
7268 key.type = BTRFS_ROOT_ITEM_KEY;
7269 key.offset = (u64)-1;
7270
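/*
 * Iterate over all log tree roots stored in the log root tree, from the
 * highest subvolume id (key offset) down to the lowest, replaying each log
 * into its corresponding subvolume tree (wc.replay_dest).
 */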
7271 while (1) {
7272 struct btrfs_root *log;
7273 struct btrfs_key found_key;
7274
7275 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7276
7277 if (ret < 0) {
7278 btrfs_abort_transaction(trans, ret);
7279 goto error;
7280 }
7281 if (ret > 0) {
7282 if (path->slots[0] == 0)
7283 break;
7284 path->slots[0]--;
7285 }
7286 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7287 path->slots[0]);
7288 btrfs_release_path(path);
7289 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7290 break;
7291
7292 log = btrfs_read_tree_root(log_root_tree, &found_key);
7293 if (IS_ERR(log)) {
7294 ret = PTR_ERR(log);
7295 btrfs_abort_transaction(trans, ret);
7296 goto error;
7297 }
7298
7299 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7300 true);
7301 if (IS_ERR(wc.replay_dest)) {
7302 ret = PTR_ERR(wc.replay_dest);
7303 wc.replay_dest = NULL;
7304 if (ret != -ENOENT) {
7305 btrfs_put_root(log);
7306 btrfs_abort_transaction(trans, ret);
7307 goto error;
7308 }
7309
7310 /*
7311 * We didn't find the subvol, likely because it was
7312 * deleted. This is ok, simply skip this log and go to
7313 * the next one.
7314 *
7315 * We need to exclude the root because we can't have
7316 * other log replays overwriting this log as we'll read
7317 * it back in a few more times. This will keep our
7318 * block from being modified, and we'll just bail for
7319 * each subsequent pass.
7320 */
7321 ret = btrfs_pin_extent_for_log_replay(trans, log->node);
7322 if (ret) {
7323 btrfs_put_root(log);
7324 btrfs_abort_transaction(trans, ret);
7325 goto error;
7326 }
7327 goto next;
7328 }
7329
7330 wc.replay_dest->log_root = log;
7331 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7332 if (ret) {
7333 btrfs_abort_transaction(trans, ret);
7334 goto next;
7335 }
7336
7337 ret = walk_log_tree(trans, log, &wc);
7338 if (ret) {
7339 btrfs_abort_transaction(trans, ret);
7340 goto next;
7341 }
7342
7343 if (wc.stage == LOG_WALK_REPLAY_ALL) {
7344 struct btrfs_root *root = wc.replay_dest;
7345
7346 ret = fixup_inode_link_counts(trans, wc.replay_dest, path);
7347 if (ret) {
7348 btrfs_abort_transaction(trans, ret);
7349 goto next;
7350 }
7351 /*
7352 * We have just replayed everything, and the highest
7353 * objectid of fs roots probably has changed in case
7354 * some inode_item's got replayed.
7355 *
7356 * root->objectid_mutex is not acquired as log replay
7357 * could only happen during mount.
7358 */
7359 ret = btrfs_init_root_free_objectid(root);
7360 if (ret) {
7361 btrfs_abort_transaction(trans, ret);
7362 goto next;
7363 }
7364 }
7365 next:
7366 if (wc.replay_dest) {
7367 wc.replay_dest->log_root = NULL;
7368 btrfs_put_root(wc.replay_dest);
7369 }
7370 btrfs_put_root(log);
7371
7372 if (ret)
7373 goto error;
7374 if (found_key.offset == 0)
7375 break;
7376 key.offset = found_key.offset - 1;
7377 }
7378 btrfs_release_path(path);
7379
7380 /* step one is to pin it all, step two is to replay just inodes */
7381 if (wc.pin) {
7382 wc.pin = 0;
7383 wc.process_func = replay_one_buffer;
7384 wc.stage = LOG_WALK_REPLAY_INODES;
7385 goto again;
7386 }
7387 /* step three is to replay everything */
7388 if (wc.stage < LOG_WALK_REPLAY_ALL) {
7389 wc.stage++;
7390 goto again;
7391 }
7392
7393 btrfs_free_path(path);
7394
7395 /* step 4: commit the transaction, which also unpins the blocks */
7396 ret = btrfs_commit_transaction(trans);
7397 if (ret)
7398 return ret;
7399
7400 log_root_tree->log_root = NULL;
7401 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7402 btrfs_put_root(log_root_tree);
7403
7404 return 0;
7405 error:
7406 if (wc.trans)
7407 btrfs_end_transaction(wc.trans);
7408 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7409 btrfs_free_path(path);
7410 return ret;
7411 }
7412
7413 /*
7414 * there are some corner cases where we want to force a full
7415 * commit instead of allowing a directory to be logged.
7416 *
7417 * They revolve around files that were unlinked from the directory, and
7418 * this function updates the parent directory so that a full commit is
7419 * properly done if it is fsync'd later after the unlinks are done.
7420 *
7421 * Must be called before the unlink operations (updates to the subvolume tree,
7422 * inodes, etc) are done.
7423 */
7424 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7425 struct btrfs_inode *dir, struct btrfs_inode *inode,
7426 bool for_rename)
7427 {
7428 /*
7429 * when we're logging a file, if it hasn't been renamed
7430 * or unlinked, and its inode is fully committed on disk,
7431 * we don't have to worry about walking up the directory chain
7432 * to log its parents.
7433 *
7434 * So, we use the last_unlink_trans field to put this transid
7435 * into the file. When the file is logged we check it and
7436 * don't log the parents if the file is fully on disk.
7437 */
7438 mutex_lock(&inode->log_mutex);
7439 inode->last_unlink_trans = trans->transid;
7440 mutex_unlock(&inode->log_mutex);
7441
7442 if (!for_rename)
7443 return;
7444
7445 /*
7446 * If this directory was already logged, any new names will be logged
7447 * with btrfs_log_new_name() and old names will be deleted from the log
7448 * tree with btrfs_del_dir_entries_in_log() or with
7449 * btrfs_del_inode_ref_in_log().
7450 */
7451 if (inode_logged(trans, dir, NULL) == 1)
7452 return;
7453
7454 /*
7455 * If the inode we're about to unlink was logged before, the log will be
7456 * properly updated with the new name with btrfs_log_new_name() and the
7457 * old name removed with btrfs_del_dir_entries_in_log() or with
7458 * btrfs_del_inode_ref_in_log().
7459 */
7460 if (inode_logged(trans, inode, NULL) == 1)
7461 return;
7462
7463 /*
7464 * when renaming files across directories, if the directory
7465 * we're unlinking from gets fsync'd later on, there's
7466 * no way to find the destination directory later and fsync it
7467 * properly. So, we have to be conservative and force commits
7468 * so the new name gets discovered.
7469 */
7470 mutex_lock(&dir->log_mutex);
7471 dir->last_unlink_trans = trans->transid;
7472 mutex_unlock(&dir->log_mutex);
7473 }
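
/*
 * Roughly how the unlink and rename paths are expected to use this, before
 * they modify the subvolume tree (simplified, BTRFS_I() conversions omitted;
 * see btrfs_unlink() and btrfs_rename() in fs/btrfs/inode.c):
 *
 *	// unlink: only the unlinked inode gets last_unlink_trans updated
 *	btrfs_record_unlink_dir(trans, dir, inode, false);
 *	...delete the dir entry and inode ref from the subvolume tree...
 *
 *	// rename across directories: also record the transid in the source
 *	// directory (see the comment just above the dir->log_mutex update)
 *	btrfs_record_unlink_dir(trans, old_dir, inode, true);
 *	...remove the old name, add the new one, then btrfs_log_new_name()...
 */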
7474
7475 /*
7476 * Make sure that if someone attempts to fsync the parent directory of a deleted
7477 * snapshot, it ends up triggering a transaction commit. This is to guarantee
7478 * that after replaying the log tree of the parent directory's root we will not
7479 * see the snapshot anymore and at log replay time we will not see any log tree
7480 * corresponding to the deleted snapshot's root, which could lead to replaying
7481 * it after replaying the log tree of the parent directory (which would replay
7482 * the snapshot delete operation).
7483 *
7484 * Must be called before the actual snapshot destroy operation (updates to the
7485 * parent root and tree of tree roots trees, etc) are done.
7486 */
7487 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7488 struct btrfs_inode *dir)
7489 {
7490 mutex_lock(&dir->log_mutex);
7491 dir->last_unlink_trans = trans->transid;
7492 mutex_unlock(&dir->log_mutex);
7493 }
7494
7495 /*
7496 * Call this when creating a subvolume in a directory.
7497 * Because we don't commit a transaction when creating a subvolume, we can't
7498 * allow the directory pointing to the subvolume to be logged with an entry that
7499 * points to an unpersisted root if we are still in the transaction used to
7500 * create the subvolume, so make any attempt to log the directory to result in a
7501 * full log sync.
7502 * Also we don't need to worry with renames, since btrfs_rename() marks the log
7503 * for full commit when renaming a subvolume.
7504 *
7505 * Must be called before creating the subvolume entry in its parent directory.
7506 */
7507 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7508 struct btrfs_inode *dir)
7509 {
7510 mutex_lock(&dir->log_mutex);
7511 dir->last_unlink_trans = trans->transid;
7512 mutex_unlock(&dir->log_mutex);
7513 }
7514
7515 /*
7516 * Update the log after adding a new name for an inode.
7517 *
7518 * @trans: Transaction handle.
7519 * @old_dentry: The dentry associated with the old name and the old
7520 * parent directory.
7521 * @old_dir: The inode of the previous parent directory for the case
7522 * of a rename. For a link operation, it must be NULL.
7523 * @old_dir_index: The index number associated with the old name, meaningful
7524 * only for rename operations (when @old_dir is not NULL).
7525 * Ignored for link operations.
7526 * @parent: The dentry associated with the directory under which the
7527 * new name is located.
7528 *
7529 * Call this after adding a new name for an inode, as a result of a link or
7530 * rename operation, and it will properly update the log to reflect the new name.
7531 */
7532 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7533 struct dentry *old_dentry, struct btrfs_inode *old_dir,
7534 u64 old_dir_index, struct dentry *parent)
7535 {
7536 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7537 struct btrfs_root *root = inode->root;
7538 struct btrfs_log_ctx ctx;
7539 bool log_pinned = false;
7540 int ret;
7541
7542 btrfs_init_log_ctx(&ctx, inode);
7543 ctx.logging_new_name = true;
7544
7545 /*
7546 * this will force the logging code to walk the dentry chain
7547 * up for the file
7548 */
7549 if (!S_ISDIR(inode->vfs_inode.i_mode))
7550 inode->last_unlink_trans = trans->transid;
7551
7552 /*
7553 * if this inode hasn't been logged and the directory we're renaming it
7554 * from hasn't been logged, we don't need to log it
7555 */
7556 ret = inode_logged(trans, inode, NULL);
7557 if (ret < 0) {
7558 goto out;
7559 } else if (ret == 0) {
7560 if (!old_dir)
7561 return;
7562 /*
7563 * If the inode was not logged and we are doing a rename (old_dir is not
7564 * NULL), check if old_dir was logged - if it was not we can return and
7565 * do nothing.
7566 */
7567 ret = inode_logged(trans, old_dir, NULL);
7568 if (ret < 0)
7569 goto out;
7570 else if (ret == 0)
7571 return;
7572 }
7573 ret = 0;
7574
7575 /*
7576 * Now that we know we need to update the log, allocate the scratch eb
7577 * for the context before joining a log transaction below, as this can
7578 * take time and therefore we could delay log commits from other tasks.
7579 */
7580 btrfs_init_log_ctx_scratch_eb(&ctx);
7581
7582 /*
7583 * If we are doing a rename (old_dir is not NULL) from a directory that
7584 * was previously logged, make sure that on log replay we get the old
7585 * dir entry deleted. This is needed because we will also log the new
7586 * name of the renamed inode, so we need to make sure that after log
7587 * replay we don't end up with both the new and old dir entries existing.
7588 */
7589 if (old_dir && old_dir->logged_trans == trans->transid) {
7590 struct btrfs_root *log = old_dir->root->log_root;
7591 struct btrfs_path *path;
7592 struct fscrypt_name fname;
7593
7594 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7595
7596 ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7597 &old_dentry->d_name, 0, &fname);
7598 if (ret)
7599 goto out;
7600
7601 path = btrfs_alloc_path();
7602 if (!path) {
7603 ret = -ENOMEM;
7604 fscrypt_free_filename(&fname);
7605 goto out;
7606 }
7607
7608 /*
7609 * We have two inodes to update in the log, the old directory and
7610 * the inode that got renamed, so we must pin the log to prevent
7611 * anyone from syncing the log until we have updated both inodes
7612 * in the log.
7613 */
7614 ret = join_running_log_trans(root);
7615 /*
7616 * At least one of the inodes was logged before, so this should
7617 * not fail, but if it does, it's not serious, just bail out and
7618 * mark the log for a full commit.
7619 */
7620 if (WARN_ON_ONCE(ret < 0)) {
7621 btrfs_free_path(path);
7622 fscrypt_free_filename(&fname);
7623 goto out;
7624 }
7625
7626 log_pinned = true;
7627
7628 /*
7629 * Another concurrent task might be logging the old directory,
7630 * as that can be triggered when logging another inode that had or
7631 * still has a dentry in the old directory. We lock the old
7632 * directory's log_mutex to ensure the deletion of the old
7633 * name is persisted, because during directory logging we
7634 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7635 * the old name's dir index item is in the delayed items, so
7636 * it could be missed by an in progress directory logging.
7637 */
7638 mutex_lock(&old_dir->log_mutex);
7639 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7640 &fname.disk_name, old_dir_index);
7641 if (ret > 0) {
7642 /*
7643 * The dentry does not exist in the log, so record its
7644 * deletion.
7645 */
7646 btrfs_release_path(path);
7647 ret = insert_dir_log_key(trans, log, path,
7648 btrfs_ino(old_dir),
7649 old_dir_index, old_dir_index);
7650 }
7651 mutex_unlock(&old_dir->log_mutex);
7652
7653 btrfs_free_path(path);
7654 fscrypt_free_filename(&fname);
7655 if (ret < 0)
7656 goto out;
7657 }
7658
7659 /*
7660 * We don't care about the return value. If we fail to log the new name
7661 * then we know the next attempt to sync the log will fall back to a full
7662 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7663 * we don't need to worry about getting a log committed that has an
7664 * inconsistent state after a rename operation.
7665 */
7666 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
7667 ASSERT(list_empty(&ctx.conflict_inodes));
7668 out:
7669 /*
7670 * If an error happened mark the log for a full commit because it's not
7671 * consistent and up to date or we couldn't find out if one of the
7672 * inodes was logged before in this transaction. Do it before unpinning
7673 * the log, to avoid any races with someone else trying to commit it.
7674 */
7675 if (ret < 0)
7676 btrfs_set_log_full_commit(trans);
7677 if (log_pinned)
7678 btrfs_end_log_trans(root);
7679 free_extent_buffer(ctx.scratch_eb);
7680 }
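
/*
 * Illustrative only (simplified from fs/btrfs/inode.c, BTRFS_I() conversions
 * omitted): how the link and rename paths are expected to call the function
 * above once the new name exists in the subvolume tree:
 *
 *	// link: there is no old directory, so old_dir is NULL and the old
 *	// dir index number is ignored
 *	btrfs_log_new_name(trans, dentry, NULL, 0, dentry->d_parent);
 *
 *	// rename: pass the old parent directory and the dir index number the
 *	// old name had in it
 *	btrfs_log_new_name(trans, old_dentry, old_dir, old_dir_index,
 *			   new_dentry->d_parent);
 */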
7681
7682