1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2008 Oracle. All rights reserved.
4 */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "misc.h"
12 #include "ctree.h"
13 #include "tree-log.h"
14 #include "disk-io.h"
15 #include "locking.h"
16 #include "backref.h"
17 #include "compression.h"
18 #include "qgroup.h"
19 #include "block-group.h"
20 #include "space-info.h"
21 #include "inode-item.h"
22 #include "fs.h"
23 #include "accessors.h"
24 #include "extent-tree.h"
25 #include "root-tree.h"
26 #include "dir-item.h"
27 #include "file-item.h"
28 #include "file.h"
29 #include "orphan.h"
30 #include "print-tree.h"
31 #include "tree-checker.h"
32
33 #define MAX_CONFLICT_INODES 10
34
35 /* magic values for the inode_only field in btrfs_log_inode:
36 *
37 * LOG_INODE_ALL means to log everything
38 * LOG_INODE_EXISTS means to log just enough to recreate the inode
39 * during log replay
40 */
41 enum {
42 LOG_INODE_ALL,
43 LOG_INODE_EXISTS,
44 };
45
46 /*
47 * directory trouble cases
48 *
49 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
50 * log, we must force a full commit before doing an fsync of the directory
51 * where the unlink was done.
52 * ---> record transid of last unlink/rename per directory
53 *
54 * mkdir foo/some_dir
55 * normal commit
56 * rename foo/some_dir foo2/some_dir
57 * mkdir foo/some_dir
58 * fsync foo/some_dir/some_file
59 *
60 * The fsync above will unlink the original some_dir without recording
61 * it in its new location (foo2). After a crash, some_dir will be gone
62 * unless the fsync of some_file forces a full commit
63 *
64 * 2) we must log any new names for any file or dir that is in the fsync
65 * log. ---> check inode while renaming/linking.
66 *
67 * 2a) we must log any new names for any file or dir during rename
68 * when the directory they are being removed from was logged.
69 * ---> check inode and old parent dir during rename
70 *
71 * 2a is actually the more important variant. With the extra logging
72 * a crash might unlink the old name without recreating the new one
73 *
74 * 3) after a crash, we must go through any directories with a link count
75 * of zero and redo the rm -rf
76 *
77 * mkdir f1/foo
78 * normal commit
79 * rm -rf f1/foo
80 * fsync(f1)
81 *
82 * The directory f1 was fully removed from the FS, but fsync was never
83 * called on f1, only its parent dir. After a crash the rm -rf must
84 * be replayed. This must be able to recurse down the entire
85 * directory tree. The inode link count fixup code takes care of the
86 * ugly details.
87 */
88
89 /*
90 * stages for the tree walking. The first
91 * stage (0) is to only pin down the blocks we find
92 * the second stage (1) is to make sure that all the inodes
93 * we find in the log are created in the subvolume.
94 *
95 * The last stage is to deal with directories and links and extents
96 * and all the other fun semantics
97 */
98 enum {
99 LOG_WALK_PIN_ONLY,
100 LOG_WALK_REPLAY_INODES,
101 LOG_WALK_REPLAY_DIR_INDEX,
102 LOG_WALK_REPLAY_ALL,
103 };
104
105 /*
106 * The walk control struct is used to pass state down the chain when processing
107 * the log tree. The stage field tells us which part of the log tree processing
108 * we are currently doing.
109 */
110 struct walk_control {
111 /*
112 * Signal that we are freeing the metadata extents of a log tree.
113 * This is used at transaction commit time while freeing a log tree.
114 */
115 bool free;
116
117 /*
118 * Signal that we are pinning the metadata extents of a log tree and the
119 * data extents its leaves point to (if using mixed block groups).
120 * This happens in the first stage of log replay to ensure that during
121 * replay, while we are modifying subvolume trees, we don't overwrite
122 * the metadata extents of log trees.
123 */
124 bool pin;
125
126 /* What stage of the replay code we're currently in. */
127 int stage;
128
129 /*
130 * Ignore any items from the inode currently being processed. Needs
131 * to be set every time we find a BTRFS_INODE_ITEM_KEY.
132 */
133 bool ignore_cur_inode;
134
135 /*
136 * The root we are currently replaying to. This is NULL for the replay
137 * stage LOG_WALK_PIN_ONLY.
138 */
139 struct btrfs_root *root;
140
141 /* The log tree we are currently processing (not NULL for any stage). */
142 struct btrfs_root *log;
143
144 /* The transaction handle used for replaying all log trees. */
145 struct btrfs_trans_handle *trans;
146
147 /*
148 * The function that gets used to process blocks we find in the tree.
149 * Note the extent_buffer might not be up to date when it is passed in,
150 * and it must be checked or read if you need the data inside it.
151 */
152 int (*process_func)(struct extent_buffer *eb,
153 struct walk_control *wc, u64 gen, int level);
154
155 /*
156 * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
157 * and by the replay_one_buffer() callback.
158 */
159
160 /* The current log leaf being processed. */
161 struct extent_buffer *log_leaf;
162 /* The key being processed of the current log leaf. */
163 struct btrfs_key log_key;
164 /* The slot being processed of the current log leaf. */
165 int log_slot;
166
167 /* A path used for searches and modifications to subvolume trees. */
168 struct btrfs_path *subvol_path;
169 };
170
do_abort_log_replay(struct walk_control * wc,const char * function,unsigned int line,int error,const char * fmt,...)171 static void do_abort_log_replay(struct walk_control *wc, const char *function,
172 unsigned int line, int error, const char *fmt, ...)
173 {
174 struct btrfs_fs_info *fs_info = wc->trans->fs_info;
175 struct va_format vaf;
176 va_list args;
177
178 /*
179 * Do nothing if we already aborted, to avoid dumping leaves again which
180 * can be verbose. Further more, only the first call is useful since it
181 * is where we have a problem. Note that we do not use the flag
182 * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
183 * are outside of tree-log.c that can abort transactions (such as
184 * btrfs_add_link() for example), so if that happens we still want to
185 * dump all log replay specific information below.
186 */
187 if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
188 return;
189
190 btrfs_abort_transaction(wc->trans, error);
191
192 if (wc->subvol_path->nodes[0]) {
193 btrfs_crit(fs_info,
194 "subvolume (root %llu) leaf currently being processed:",
195 btrfs_root_id(wc->root));
196 btrfs_print_leaf(wc->subvol_path->nodes[0]);
197 }
198
199 if (wc->log_leaf) {
200 btrfs_crit(fs_info,
201 "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):",
202 btrfs_root_id(wc->root), wc->log_slot,
203 wc->log_key.objectid, wc->log_key.type, wc->log_key.offset);
204 btrfs_print_leaf(wc->log_leaf);
205 }
206
207 va_start(args, fmt);
208 vaf.fmt = fmt;
209 vaf.va = &args;
210
211 btrfs_crit(fs_info,
212 "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
213 function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
214
215 va_end(args);
216 }
217
218 /*
219 * Use this for aborting a transaction during log replay while we are down the
220 * call chain of replay_one_buffer(), so that we get a lot more useful
221 * information for debugging issues when compared to a plain call to
222 * btrfs_abort_transaction().
223 */
224 #define btrfs_abort_log_replay(wc, error, fmt, args...) \
225 do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
226
227 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
228 struct btrfs_inode *inode,
229 int inode_only,
230 struct btrfs_log_ctx *ctx);
231 static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
232 static noinline int replay_dir_deletes(struct walk_control *wc,
233 u64 dirid, bool del_all);
234 static void wait_log_commit(struct btrfs_root *root, int transid);
235
236 /*
237 * tree logging is a special write ahead log used to make sure that
238 * fsyncs and O_SYNCs can happen without doing full tree commits.
239 *
240 * Full tree commits are expensive because they require commonly
241 * modified blocks to be recowed, creating many dirty pages in the
242 * extent tree an 4x-6x higher write load than ext3.
243 *
244 * Instead of doing a tree commit on every fsync, we use the
245 * key ranges and transaction ids to find items for a given file or directory
246 * that have changed in this transaction. Those items are copied into
247 * a special tree (one per subvolume root), that tree is written to disk
248 * and then the fsync is considered complete.
249 *
250 * After a crash, items are copied out of the log-tree back into the
251 * subvolume tree. Any file data extents found are recorded in the extent
252 * allocation tree, and the log-tree freed.
253 *
254 * The log tree is read three times, once to pin down all the extents it is
255 * using in ram and once, once to create all the inodes logged in the tree
256 * and once to do all the other items.
257 */
258
btrfs_iget_logging(u64 objectid,struct btrfs_root * root)259 static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
260 {
261 unsigned int nofs_flag;
262 struct btrfs_inode *inode;
263
264 /* Only meant to be called for subvolume roots and not for log roots. */
265 ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
266
267 /*
268 * We're holding a transaction handle whether we are logging or
269 * replaying a log tree, so we must make sure NOFS semantics apply
270 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
271 * to allocate an inode, which can recurse back into the filesystem and
272 * attempt a transaction commit, resulting in a deadlock.
273 */
274 nofs_flag = memalloc_nofs_save();
275 inode = btrfs_iget(objectid, root);
276 memalloc_nofs_restore(nofs_flag);
277
278 return inode;
279 }
280
281 /*
282 * start a sub transaction and setup the log tree
283 * this increments the log tree writer count to make the people
284 * syncing the tree wait for us to finish
285 */
start_log_trans(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)286 static int start_log_trans(struct btrfs_trans_handle *trans,
287 struct btrfs_root *root,
288 struct btrfs_log_ctx *ctx)
289 {
290 struct btrfs_fs_info *fs_info = root->fs_info;
291 struct btrfs_root *tree_root = fs_info->tree_root;
292 const bool zoned = btrfs_is_zoned(fs_info);
293 int ret = 0;
294 bool created = false;
295
296 /*
297 * First check if the log root tree was already created. If not, create
298 * it before locking the root's log_mutex, just to keep lockdep happy.
299 */
300 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
301 mutex_lock(&tree_root->log_mutex);
302 if (!fs_info->log_root_tree) {
303 ret = btrfs_init_log_root_tree(trans, fs_info);
304 if (!ret) {
305 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
306 created = true;
307 }
308 }
309 mutex_unlock(&tree_root->log_mutex);
310 if (ret)
311 return ret;
312 }
313
314 mutex_lock(&root->log_mutex);
315
316 again:
317 if (root->log_root) {
318 int index = (root->log_transid + 1) % 2;
319
320 if (btrfs_need_log_full_commit(trans)) {
321 ret = BTRFS_LOG_FORCE_COMMIT;
322 goto out;
323 }
324
325 if (zoned && atomic_read(&root->log_commit[index])) {
326 wait_log_commit(root, root->log_transid - 1);
327 goto again;
328 }
329
330 if (!root->log_start_pid) {
331 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
332 root->log_start_pid = current->pid;
333 } else if (root->log_start_pid != current->pid) {
334 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
335 }
336 } else {
337 /*
338 * This means fs_info->log_root_tree was already created
339 * for some other FS trees. Do the full commit not to mix
340 * nodes from multiple log transactions to do sequential
341 * writing.
342 */
343 if (zoned && !created) {
344 ret = BTRFS_LOG_FORCE_COMMIT;
345 goto out;
346 }
347
348 ret = btrfs_add_log_tree(trans, root);
349 if (ret)
350 goto out;
351
352 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
353 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
354 root->log_start_pid = current->pid;
355 }
356
357 atomic_inc(&root->log_writers);
358 if (!ctx->logging_new_name) {
359 int index = root->log_transid % 2;
360 list_add_tail(&ctx->list, &root->log_ctxs[index]);
361 ctx->log_transid = root->log_transid;
362 }
363
364 out:
365 mutex_unlock(&root->log_mutex);
366 return ret;
367 }
368
369 /*
370 * returns 0 if there was a log transaction running and we were able
371 * to join, or returns -ENOENT if there were not transactions
372 * in progress
373 */
join_running_log_trans(struct btrfs_root * root)374 static int join_running_log_trans(struct btrfs_root *root)
375 {
376 const bool zoned = btrfs_is_zoned(root->fs_info);
377 int ret = -ENOENT;
378
379 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
380 return ret;
381
382 mutex_lock(&root->log_mutex);
383 again:
384 if (root->log_root) {
385 int index = (root->log_transid + 1) % 2;
386
387 ret = 0;
388 if (zoned && atomic_read(&root->log_commit[index])) {
389 wait_log_commit(root, root->log_transid - 1);
390 goto again;
391 }
392 atomic_inc(&root->log_writers);
393 }
394 mutex_unlock(&root->log_mutex);
395 return ret;
396 }
397
398 /*
399 * This either makes the current running log transaction wait
400 * until you call btrfs_end_log_trans() or it makes any future
401 * log transactions wait until you call btrfs_end_log_trans()
402 */
btrfs_pin_log_trans(struct btrfs_root * root)403 void btrfs_pin_log_trans(struct btrfs_root *root)
404 {
405 atomic_inc(&root->log_writers);
406 }
407
408 /*
409 * indicate we're done making changes to the log tree
410 * and wake up anyone waiting to do a sync
411 */
btrfs_end_log_trans(struct btrfs_root * root)412 void btrfs_end_log_trans(struct btrfs_root *root)
413 {
414 if (atomic_dec_and_test(&root->log_writers)) {
415 /* atomic_dec_and_test implies a barrier */
416 cond_wake_up_nomb(&root->log_writer_wait);
417 }
418 }
419
420 /*
421 * process_func used to pin down extents, write them or wait on them
422 */
process_one_buffer(struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)423 static int process_one_buffer(struct extent_buffer *eb,
424 struct walk_control *wc, u64 gen, int level)
425 {
426 struct btrfs_root *log = wc->log;
427 struct btrfs_trans_handle *trans = wc->trans;
428 struct btrfs_fs_info *fs_info = log->fs_info;
429 int ret = 0;
430
431 /*
432 * If this fs is mixed then we need to be able to process the leaves to
433 * pin down any logged extents, so we have to read the block.
434 */
435 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
436 struct btrfs_tree_parent_check check = {
437 .level = level,
438 .transid = gen
439 };
440
441 ret = btrfs_read_extent_buffer(eb, &check);
442 if (unlikely(ret)) {
443 if (trans)
444 btrfs_abort_transaction(trans, ret);
445 else
446 btrfs_handle_fs_error(fs_info, ret, NULL);
447 return ret;
448 }
449 }
450
451 if (wc->pin) {
452 ASSERT(trans != NULL);
453 ret = btrfs_pin_extent_for_log_replay(trans, eb);
454 if (unlikely(ret)) {
455 btrfs_abort_transaction(trans, ret);
456 return ret;
457 }
458
459 if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
460 ret = btrfs_exclude_logged_extents(eb);
461 if (ret)
462 btrfs_abort_transaction(trans, ret);
463 }
464 }
465 return ret;
466 }
467
468 /*
469 * Item overwrite used by log replay. The given log tree leaf, slot and key
470 * from the walk_control structure all refer to the source data we are copying
471 * out.
472 *
473 * The given root is for the tree we are copying into, and path is a scratch
474 * path for use in this function (it should be released on entry and will be
475 * released on exit).
476 *
477 * If the key is already in the destination tree the existing item is
478 * overwritten. If the existing item isn't big enough, it is extended.
479 * If it is too large, it is truncated.
480 *
481 * If the key isn't in the destination yet, a new item is inserted.
482 */
overwrite_item(struct walk_control * wc)483 static int overwrite_item(struct walk_control *wc)
484 {
485 struct btrfs_trans_handle *trans = wc->trans;
486 struct btrfs_root *root = wc->root;
487 int ret;
488 u32 item_size;
489 u64 saved_i_size = 0;
490 int save_old_i_size = 0;
491 unsigned long src_ptr;
492 unsigned long dst_ptr;
493 struct extent_buffer *dst_eb;
494 int dst_slot;
495 const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
496
497 /*
498 * This is only used during log replay, so the root is always from a
499 * fs/subvolume tree. In case we ever need to support a log root, then
500 * we'll have to clone the leaf in the path, release the path and use
501 * the leaf before writing into the log tree. See the comments at
502 * copy_items() for more details.
503 */
504 ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
505
506 item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
507 src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
508
509 /* Look for the key in the destination tree. */
510 ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
511 if (ret < 0) {
512 btrfs_abort_log_replay(wc, ret,
513 "failed to search subvolume tree for key (%llu %u %llu) root %llu",
514 wc->log_key.objectid, wc->log_key.type,
515 wc->log_key.offset, btrfs_root_id(root));
516 return ret;
517 }
518
519 dst_eb = wc->subvol_path->nodes[0];
520 dst_slot = wc->subvol_path->slots[0];
521
522 if (ret == 0) {
523 char *src_copy;
524 const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
525
526 if (dst_size != item_size)
527 goto insert;
528
529 if (item_size == 0) {
530 btrfs_release_path(wc->subvol_path);
531 return 0;
532 }
533 src_copy = kmalloc(item_size, GFP_NOFS);
534 if (!src_copy) {
535 btrfs_abort_log_replay(wc, -ENOMEM,
536 "failed to allocate memory for log leaf item");
537 return -ENOMEM;
538 }
539
540 read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
541 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
542 ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
543
544 kfree(src_copy);
545 /*
546 * they have the same contents, just return, this saves
547 * us from cowing blocks in the destination tree and doing
548 * extra writes that may not have been done by a previous
549 * sync
550 */
551 if (ret == 0) {
552 btrfs_release_path(wc->subvol_path);
553 return 0;
554 }
555
556 /*
557 * We need to load the old nbytes into the inode so when we
558 * replay the extents we've logged we get the right nbytes.
559 */
560 if (is_inode_item) {
561 struct btrfs_inode_item *item;
562 u64 nbytes;
563 u32 mode;
564
565 item = btrfs_item_ptr(dst_eb, dst_slot,
566 struct btrfs_inode_item);
567 nbytes = btrfs_inode_nbytes(dst_eb, item);
568 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
569 struct btrfs_inode_item);
570 btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
571
572 /*
573 * If this is a directory we need to reset the i_size to
574 * 0 so that we can set it up properly when replaying
575 * the rest of the items in this log.
576 */
577 mode = btrfs_inode_mode(wc->log_leaf, item);
578 if (S_ISDIR(mode))
579 btrfs_set_inode_size(wc->log_leaf, item, 0);
580 }
581 } else if (is_inode_item) {
582 struct btrfs_inode_item *item;
583 u32 mode;
584
585 /*
586 * New inode, set nbytes to 0 so that the nbytes comes out
587 * properly when we replay the extents.
588 */
589 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
590 btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
591
592 /*
593 * If this is a directory we need to reset the i_size to 0 so
594 * that we can set it up properly when replaying the rest of
595 * the items in this log.
596 */
597 mode = btrfs_inode_mode(wc->log_leaf, item);
598 if (S_ISDIR(mode))
599 btrfs_set_inode_size(wc->log_leaf, item, 0);
600 }
601 insert:
602 btrfs_release_path(wc->subvol_path);
603 /* try to insert the key into the destination tree */
604 wc->subvol_path->skip_release_on_error = 1;
605 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
606 wc->subvol_path->skip_release_on_error = 0;
607
608 dst_eb = wc->subvol_path->nodes[0];
609 dst_slot = wc->subvol_path->slots[0];
610
611 /* make sure any existing item is the correct size */
612 if (ret == -EEXIST || ret == -EOVERFLOW) {
613 const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
614
615 if (found_size > item_size)
616 btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
617 else if (found_size < item_size)
618 btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
619 } else if (ret) {
620 btrfs_abort_log_replay(wc, ret,
621 "failed to insert item for key (%llu %u %llu)",
622 wc->log_key.objectid, wc->log_key.type,
623 wc->log_key.offset);
624 return ret;
625 }
626 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
627
628 /* don't overwrite an existing inode if the generation number
629 * was logged as zero. This is done when the tree logging code
630 * is just logging an inode to make sure it exists after recovery.
631 *
632 * Also, don't overwrite i_size on directories during replay.
633 * log replay inserts and removes directory items based on the
634 * state of the tree found in the subvolume, and i_size is modified
635 * as it goes
636 */
637 if (is_inode_item && ret == -EEXIST) {
638 struct btrfs_inode_item *src_item;
639 struct btrfs_inode_item *dst_item;
640
641 src_item = (struct btrfs_inode_item *)src_ptr;
642 dst_item = (struct btrfs_inode_item *)dst_ptr;
643
644 if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
645 const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
646
647 /*
648 * For regular files an ino_size == 0 is used only when
649 * logging that an inode exists, as part of a directory
650 * fsync, and the inode wasn't fsynced before. In this
651 * case don't set the size of the inode in the fs/subvol
652 * tree, otherwise we would be throwing valid data away.
653 */
654 if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
655 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
656 ino_size != 0)
657 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
658 goto no_copy;
659 }
660
661 if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
662 S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
663 save_old_i_size = 1;
664 saved_i_size = btrfs_inode_size(dst_eb, dst_item);
665 }
666 }
667
668 copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
669
670 if (save_old_i_size) {
671 struct btrfs_inode_item *dst_item;
672
673 dst_item = (struct btrfs_inode_item *)dst_ptr;
674 btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
675 }
676
677 /* make sure the generation is filled in */
678 if (is_inode_item) {
679 struct btrfs_inode_item *dst_item;
680
681 dst_item = (struct btrfs_inode_item *)dst_ptr;
682 if (btrfs_inode_generation(dst_eb, dst_item) == 0)
683 btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
684 }
685 no_copy:
686 btrfs_release_path(wc->subvol_path);
687 return 0;
688 }
689
read_alloc_one_name(struct extent_buffer * eb,void * start,int len,struct fscrypt_str * name)690 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
691 struct fscrypt_str *name)
692 {
693 char *buf;
694
695 buf = kmalloc(len, GFP_NOFS);
696 if (!buf)
697 return -ENOMEM;
698
699 read_extent_buffer(eb, buf, (unsigned long)start, len);
700 name->name = buf;
701 name->len = len;
702 return 0;
703 }
704
705 /* replays a single extent in 'eb' at 'slot' with 'key' into the
706 * subvolume 'root'. path is released on entry and should be released
707 * on exit.
708 *
709 * extents in the log tree have not been allocated out of the extent
710 * tree yet. So, this completes the allocation, taking a reference
711 * as required if the extent already exists or creating a new extent
712 * if it isn't in the extent allocation tree yet.
713 *
714 * The extent is inserted into the file, dropping any existing extents
715 * from the file that overlap the new one.
716 */
replay_one_extent(struct walk_control * wc)717 static noinline int replay_one_extent(struct walk_control *wc)
718 {
719 struct btrfs_trans_handle *trans = wc->trans;
720 struct btrfs_root *root = wc->root;
721 struct btrfs_drop_extents_args drop_args = { 0 };
722 struct btrfs_fs_info *fs_info = root->fs_info;
723 int found_type;
724 u64 extent_end;
725 const u64 start = wc->log_key.offset;
726 u64 nbytes = 0;
727 u64 csum_start;
728 u64 csum_end;
729 LIST_HEAD(ordered_sums);
730 u64 offset;
731 unsigned long dest_offset;
732 struct btrfs_key ins;
733 struct btrfs_file_extent_item *item;
734 struct btrfs_inode *inode = NULL;
735 int ret = 0;
736
737 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
738 found_type = btrfs_file_extent_type(wc->log_leaf, item);
739
740 if (found_type == BTRFS_FILE_EXTENT_REG ||
741 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
742 extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
743 /* Holes don't take up space. */
744 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
745 nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
746 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
747 nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
748 extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
749 } else {
750 btrfs_abort_log_replay(wc, -EUCLEAN,
751 "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
752 found_type, btrfs_root_id(root),
753 wc->log_key.objectid, wc->log_key.offset);
754 return -EUCLEAN;
755 }
756
757 inode = btrfs_iget_logging(wc->log_key.objectid, root);
758 if (IS_ERR(inode)) {
759 ret = PTR_ERR(inode);
760 btrfs_abort_log_replay(wc, ret,
761 "failed to get inode %llu for root %llu",
762 wc->log_key.objectid, btrfs_root_id(root));
763 return ret;
764 }
765
766 /*
767 * first check to see if we already have this extent in the
768 * file. This must be done before the btrfs_drop_extents run
769 * so we don't try to drop this extent.
770 */
771 ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
772 btrfs_ino(inode), start, 0);
773
774 if (ret == 0 &&
775 (found_type == BTRFS_FILE_EXTENT_REG ||
776 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
777 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
778 struct btrfs_file_extent_item existing;
779 unsigned long ptr;
780
781 ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
782 read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
783
784 /*
785 * we already have a pointer to this exact extent,
786 * we don't have to do anything
787 */
788 if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
789 sizeof(existing)) == 0) {
790 btrfs_release_path(wc->subvol_path);
791 goto out;
792 }
793 }
794 btrfs_release_path(wc->subvol_path);
795
796 /* drop any overlapping extents */
797 drop_args.start = start;
798 drop_args.end = extent_end;
799 drop_args.drop_cache = true;
800 drop_args.path = wc->subvol_path;
801 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
802 if (ret) {
803 btrfs_abort_log_replay(wc, ret,
804 "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
805 wc->log_key.objectid, start, extent_end,
806 btrfs_root_id(root));
807 goto out;
808 }
809
810 if (found_type == BTRFS_FILE_EXTENT_INLINE) {
811 /* inline extents are easy, we just overwrite them */
812 ret = overwrite_item(wc);
813 if (ret)
814 goto out;
815 goto update_inode;
816 }
817
818 /*
819 * If not an inline extent, it can only be a regular or prealloc one.
820 * We have checked that above and returned -EUCLEAN if not.
821 */
822
823 /* A hole and NO_HOLES feature enabled, nothing else to do. */
824 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
825 btrfs_fs_incompat(fs_info, NO_HOLES))
826 goto update_inode;
827
828 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
829 &wc->log_key, sizeof(*item));
830 if (ret) {
831 btrfs_abort_log_replay(wc, ret,
832 "failed to insert item with key (%llu %u %llu) root %llu",
833 wc->log_key.objectid, wc->log_key.type,
834 wc->log_key.offset, btrfs_root_id(root));
835 goto out;
836 }
837 dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
838 wc->subvol_path->slots[0]);
839 copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
840 (unsigned long)item, sizeof(*item));
841
842 /*
843 * We have an explicit hole and NO_HOLES is not enabled. We have added
844 * the hole file extent item to the subvolume tree, so we don't have
845 * anything else to do other than update the file extent item range and
846 * update the inode item.
847 */
848 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
849 btrfs_release_path(wc->subvol_path);
850 goto update_inode;
851 }
852
853 ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
854 ins.type = BTRFS_EXTENT_ITEM_KEY;
855 ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
856 offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
857
858 /*
859 * Manually record dirty extent, as here we did a shallow file extent
860 * item copy and skip normal backref update, but modifying extent tree
861 * all by ourselves. So need to manually record dirty extent for qgroup,
862 * as the owner of the file extent changed from log tree (doesn't affect
863 * qgroup) to fs/file tree (affects qgroup).
864 */
865 ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
866 if (ret < 0) {
867 btrfs_abort_log_replay(wc, ret,
868 "failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
869 ins.objectid, ins.offset,
870 wc->log_key.objectid, btrfs_root_id(root));
871 goto out;
872 }
873
874 /*
875 * Is this extent already allocated in the extent tree?
876 * If so, just add a reference.
877 */
878 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
879 if (ret < 0) {
880 btrfs_abort_log_replay(wc, ret,
881 "failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
882 ins.objectid, ins.offset,
883 wc->log_key.objectid, btrfs_root_id(root));
884 goto out;
885 } else if (ret == 0) {
886 struct btrfs_ref ref = {
887 .action = BTRFS_ADD_DELAYED_REF,
888 .bytenr = ins.objectid,
889 .num_bytes = ins.offset,
890 .owning_root = btrfs_root_id(root),
891 .ref_root = btrfs_root_id(root),
892 };
893
894 btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
895 ret = btrfs_inc_extent_ref(trans, &ref);
896 if (ret) {
897 btrfs_abort_log_replay(wc, ret,
898 "failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
899 ins.objectid, ins.offset,
900 wc->log_key.objectid,
901 btrfs_root_id(root));
902 goto out;
903 }
904 } else {
905 /* Insert the extent pointer in the extent tree. */
906 ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
907 wc->log_key.objectid, offset, &ins);
908 if (ret) {
909 btrfs_abort_log_replay(wc, ret,
910 "failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
911 ins.objectid, ins.offset, offset,
912 wc->log_key.objectid, btrfs_root_id(root));
913 goto out;
914 }
915 }
916
917 btrfs_release_path(wc->subvol_path);
918
919 if (btrfs_file_extent_compression(wc->log_leaf, item)) {
920 csum_start = ins.objectid;
921 csum_end = csum_start + ins.offset;
922 } else {
923 csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
924 csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
925 }
926
927 ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
928 &ordered_sums, false);
929 if (ret < 0) {
930 btrfs_abort_log_replay(wc, ret,
931 "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
932 csum_start, csum_end, wc->log_key.objectid,
933 btrfs_root_id(root));
934 goto out;
935 }
936 ret = 0;
937 /*
938 * Now delete all existing cums in the csum root that cover our range.
939 * We do this because we can have an extent that is completely
940 * referenced by one file extent item and partially referenced by
941 * another file extent item (like after using the clone or extent_same
942 * ioctls). In this case if we end up doing the replay of the one that
943 * partially references the extent first, and we do not do the csum
944 * deletion below, we can get 2 csum items in the csum tree that overlap
945 * each other. For example, imagine our log has the two following file
946 * extent items:
947 *
948 * key (257 EXTENT_DATA 409600)
949 * extent data disk byte 12845056 nr 102400
950 * extent data offset 20480 nr 20480 ram 102400
951 *
952 * key (257 EXTENT_DATA 819200)
953 * extent data disk byte 12845056 nr 102400
954 * extent data offset 0 nr 102400 ram 102400
955 *
956 * Where the second one fully references the 100K extent that starts at
957 * disk byte 12845056, and the log tree has a single csum item that
958 * covers the entire range of the extent:
959 *
960 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
961 *
962 * After the first file extent item is replayed, the csum tree gets the
963 * following csum item:
964 *
965 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
966 *
967 * Which covers the 20K sub-range starting at offset 20K of our extent.
968 * Now when we replay the second file extent item, if we do not delete
969 * existing csum items that cover any of its blocks, we end up getting
970 * two csum items in our csum tree that overlap each other:
971 *
972 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
973 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
974 *
975 * Which is a problem, because after this anyone trying to lookup for
976 * the checksum of any block of our extent starting at an offset of 40K
977 * or higher, will end up looking at the second csum item only, which
978 * does not contain the checksum for any block starting at offset 40K or
979 * higher of our extent.
980 */
981 while (!list_empty(&ordered_sums)) {
982 struct btrfs_ordered_sum *sums;
983 struct btrfs_root *csum_root;
984
985 sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
986 csum_root = btrfs_csum_root(fs_info, sums->logical);
987 if (!ret) {
988 ret = btrfs_del_csums(trans, csum_root, sums->logical,
989 sums->len);
990 if (ret)
991 btrfs_abort_log_replay(wc, ret,
992 "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
993 sums->logical,
994 sums->logical + sums->len,
995 wc->log_key.objectid,
996 btrfs_root_id(root));
997 }
998 if (!ret) {
999 ret = btrfs_csum_file_blocks(trans, csum_root, sums);
1000 if (ret)
1001 btrfs_abort_log_replay(wc, ret,
1002 "failed to add csums for range [%llu, %llu) inode %llu root %llu",
1003 sums->logical,
1004 sums->logical + sums->len,
1005 wc->log_key.objectid,
1006 btrfs_root_id(root));
1007 }
1008 list_del(&sums->list);
1009 kfree(sums);
1010 }
1011 if (ret)
1012 goto out;
1013
1014 update_inode:
1015 ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
1016 if (ret) {
1017 btrfs_abort_log_replay(wc, ret,
1018 "failed to set file extent range [%llu, %llu) inode %llu root %llu",
1019 start, extent_end, wc->log_key.objectid,
1020 btrfs_root_id(root));
1021 goto out;
1022 }
1023
1024 btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
1025 ret = btrfs_update_inode(trans, inode);
1026 if (ret)
1027 btrfs_abort_log_replay(wc, ret,
1028 "failed to update inode %llu root %llu",
1029 wc->log_key.objectid, btrfs_root_id(root));
1030 out:
1031 iput(&inode->vfs_inode);
1032 return ret;
1033 }
1034
unlink_inode_for_log_replay(struct walk_control * wc,struct btrfs_inode * dir,struct btrfs_inode * inode,const struct fscrypt_str * name)1035 static int unlink_inode_for_log_replay(struct walk_control *wc,
1036 struct btrfs_inode *dir,
1037 struct btrfs_inode *inode,
1038 const struct fscrypt_str *name)
1039 {
1040 struct btrfs_trans_handle *trans = wc->trans;
1041 int ret;
1042
1043 ret = btrfs_unlink_inode(trans, dir, inode, name);
1044 if (ret) {
1045 btrfs_abort_log_replay(wc, ret,
1046 "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
1047 btrfs_ino(inode), btrfs_ino(dir), name->len,
1048 name->name, btrfs_root_id(inode->root));
1049 return ret;
1050 }
1051 /*
1052 * Whenever we need to check if a name exists or not, we check the
1053 * fs/subvolume tree. So after an unlink we must run delayed items, so
1054 * that future checks for a name during log replay see that the name
1055 * does not exists anymore.
1056 */
1057 ret = btrfs_run_delayed_items(trans);
1058 if (ret)
1059 btrfs_abort_log_replay(wc, ret,
1060 "failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
1061 btrfs_ino(inode), btrfs_ino(dir), name->len,
1062 name->name, btrfs_root_id(inode->root));
1063
1064 return ret;
1065 }
1066
1067 /*
1068 * when cleaning up conflicts between the directory names in the
1069 * subvolume, directory names in the log and directory names in the
1070 * inode back references, we may have to unlink inodes from directories.
1071 *
1072 * This is a helper function to do the unlink of a specific directory
1073 * item
1074 */
drop_one_dir_item(struct walk_control * wc,struct btrfs_inode * dir,struct btrfs_dir_item * di)1075 static noinline int drop_one_dir_item(struct walk_control *wc,
1076 struct btrfs_inode *dir,
1077 struct btrfs_dir_item *di)
1078 {
1079 struct btrfs_root *root = dir->root;
1080 struct btrfs_inode *inode;
1081 struct fscrypt_str name;
1082 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1083 struct btrfs_key location;
1084 int ret;
1085
1086 btrfs_dir_item_key_to_cpu(leaf, di, &location);
1087 ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
1088 if (ret) {
1089 btrfs_abort_log_replay(wc, ret,
1090 "failed to allocate name for dir %llu root %llu",
1091 btrfs_ino(dir), btrfs_root_id(root));
1092 return ret;
1093 }
1094
1095 btrfs_release_path(wc->subvol_path);
1096
1097 inode = btrfs_iget_logging(location.objectid, root);
1098 if (IS_ERR(inode)) {
1099 ret = PTR_ERR(inode);
1100 btrfs_abort_log_replay(wc, ret,
1101 "failed to open inode %llu parent dir %llu name %.*s root %llu",
1102 location.objectid, btrfs_ino(dir),
1103 name.len, name.name, btrfs_root_id(root));
1104 inode = NULL;
1105 goto out;
1106 }
1107
1108 ret = link_to_fixup_dir(wc, location.objectid);
1109 if (ret)
1110 goto out;
1111
1112 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
1113 out:
1114 kfree(name.name);
1115 if (inode)
1116 iput(&inode->vfs_inode);
1117 return ret;
1118 }
1119
1120 /*
1121 * See if a given name and sequence number found in an inode back reference are
1122 * already in a directory and correctly point to this inode.
1123 *
1124 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
1125 * exists.
1126 */
inode_in_dir(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,u64 objectid,u64 index,struct fscrypt_str * name)1127 static noinline int inode_in_dir(struct btrfs_root *root,
1128 struct btrfs_path *path,
1129 u64 dirid, u64 objectid, u64 index,
1130 struct fscrypt_str *name)
1131 {
1132 struct btrfs_dir_item *di;
1133 struct btrfs_key location;
1134 int ret = 0;
1135
1136 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
1137 index, name, 0);
1138 if (IS_ERR(di)) {
1139 ret = PTR_ERR(di);
1140 goto out;
1141 } else if (di) {
1142 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1143 if (location.objectid != objectid)
1144 goto out;
1145 } else {
1146 goto out;
1147 }
1148
1149 btrfs_release_path(path);
1150 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
1151 if (IS_ERR(di)) {
1152 ret = PTR_ERR(di);
1153 goto out;
1154 } else if (di) {
1155 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1156 if (location.objectid == objectid)
1157 ret = 1;
1158 }
1159 out:
1160 btrfs_release_path(path);
1161 return ret;
1162 }
1163
1164 /*
1165 * helper function to check a log tree for a named back reference in
1166 * an inode. This is used to decide if a back reference that is
1167 * found in the subvolume conflicts with what we find in the log.
1168 *
1169 * inode backreferences may have multiple refs in a single item,
1170 * during replay we process one reference at a time, and we don't
1171 * want to delete valid links to a file from the subvolume if that
1172 * link is also in the log.
1173 */
backref_in_log(struct btrfs_root * log,struct btrfs_key * key,u64 ref_objectid,const struct fscrypt_str * name)1174 static noinline int backref_in_log(struct btrfs_root *log,
1175 struct btrfs_key *key,
1176 u64 ref_objectid,
1177 const struct fscrypt_str *name)
1178 {
1179 BTRFS_PATH_AUTO_FREE(path);
1180 int ret;
1181
1182 path = btrfs_alloc_path();
1183 if (!path)
1184 return -ENOMEM;
1185
1186 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1187 if (ret < 0)
1188 return ret;
1189 if (ret == 1)
1190 return 0;
1191
1192 if (key->type == BTRFS_INODE_EXTREF_KEY)
1193 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1194 path->slots[0],
1195 ref_objectid, name);
1196 else
1197 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1198 path->slots[0], name);
1199 return ret;
1200 }
1201
unlink_refs_not_in_log(struct walk_control * wc,struct btrfs_key * search_key,struct btrfs_inode * dir,struct btrfs_inode * inode)1202 static int unlink_refs_not_in_log(struct walk_control *wc,
1203 struct btrfs_key *search_key,
1204 struct btrfs_inode *dir,
1205 struct btrfs_inode *inode)
1206 {
1207 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1208 unsigned long ptr;
1209 unsigned long ptr_end;
1210
1211 /*
1212 * Check all the names in this back reference to see if they are in the
1213 * log. If so, we allow them to stay otherwise they must be unlinked as
1214 * a conflict.
1215 */
1216 ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
1217 ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
1218 while (ptr < ptr_end) {
1219 struct fscrypt_str victim_name;
1220 struct btrfs_inode_ref *victim_ref;
1221 int ret;
1222
1223 victim_ref = (struct btrfs_inode_ref *)ptr;
1224 ret = read_alloc_one_name(leaf, (victim_ref + 1),
1225 btrfs_inode_ref_name_len(leaf, victim_ref),
1226 &victim_name);
1227 if (ret) {
1228 btrfs_abort_log_replay(wc, ret,
1229 "failed to allocate name for inode %llu parent dir %llu root %llu",
1230 btrfs_ino(inode), btrfs_ino(dir),
1231 btrfs_root_id(inode->root));
1232 return ret;
1233 }
1234
1235 ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
1236 if (ret) {
1237 if (ret < 0) {
1238 btrfs_abort_log_replay(wc, ret,
1239 "failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1240 btrfs_ino(inode), btrfs_ino(dir),
1241 victim_name.len, victim_name.name,
1242 btrfs_root_id(inode->root));
1243 kfree(victim_name.name);
1244 return ret;
1245 }
1246 kfree(victim_name.name);
1247 ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1248 continue;
1249 }
1250
1251 inc_nlink(&inode->vfs_inode);
1252 btrfs_release_path(wc->subvol_path);
1253
1254 ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
1255 kfree(victim_name.name);
1256 if (ret)
1257 return ret;
1258 return -EAGAIN;
1259 }
1260
1261 return 0;
1262 }
1263
unlink_extrefs_not_in_log(struct walk_control * wc,struct btrfs_key * search_key,struct btrfs_inode * dir,struct btrfs_inode * inode)1264 static int unlink_extrefs_not_in_log(struct walk_control *wc,
1265 struct btrfs_key *search_key,
1266 struct btrfs_inode *dir,
1267 struct btrfs_inode *inode)
1268 {
1269 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1270 const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
1271 const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
1272 u32 cur_offset = 0;
1273
1274 while (cur_offset < item_size) {
1275 struct btrfs_root *log_root = wc->log;
1276 struct btrfs_inode_extref *extref;
1277 struct fscrypt_str victim_name;
1278 int ret;
1279
1280 extref = (struct btrfs_inode_extref *)(base + cur_offset);
1281 victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
1282
1283 if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
1284 goto next;
1285
1286 ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
1287 &victim_name);
1288 if (ret) {
1289 btrfs_abort_log_replay(wc, ret,
1290 "failed to allocate name for inode %llu parent dir %llu root %llu",
1291 btrfs_ino(inode), btrfs_ino(dir),
1292 btrfs_root_id(inode->root));
1293 return ret;
1294 }
1295
1296 search_key->objectid = btrfs_ino(inode);
1297 search_key->type = BTRFS_INODE_EXTREF_KEY;
1298 search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
1299 victim_name.name,
1300 victim_name.len);
1301 ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
1302 if (ret) {
1303 if (ret < 0) {
1304 btrfs_abort_log_replay(wc, ret,
1305 "failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1306 btrfs_ino(inode), btrfs_ino(dir),
1307 victim_name.len, victim_name.name,
1308 btrfs_root_id(inode->root));
1309 kfree(victim_name.name);
1310 return ret;
1311 }
1312 kfree(victim_name.name);
1313 next:
1314 cur_offset += victim_name.len + sizeof(*extref);
1315 continue;
1316 }
1317
1318 inc_nlink(&inode->vfs_inode);
1319 btrfs_release_path(wc->subvol_path);
1320
1321 ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
1322 kfree(victim_name.name);
1323 if (ret)
1324 return ret;
1325 return -EAGAIN;
1326 }
1327
1328 return 0;
1329 }
1330
__add_inode_ref(struct walk_control * wc,struct btrfs_inode * dir,struct btrfs_inode * inode,u64 ref_index,struct fscrypt_str * name)1331 static inline int __add_inode_ref(struct walk_control *wc,
1332 struct btrfs_inode *dir,
1333 struct btrfs_inode *inode,
1334 u64 ref_index, struct fscrypt_str *name)
1335 {
1336 int ret;
1337 struct btrfs_trans_handle *trans = wc->trans;
1338 struct btrfs_root *root = wc->root;
1339 struct btrfs_dir_item *di;
1340 struct btrfs_key search_key;
1341 struct btrfs_inode_extref *extref;
1342
1343 again:
1344 /* Search old style refs */
1345 search_key.objectid = btrfs_ino(inode);
1346 search_key.type = BTRFS_INODE_REF_KEY;
1347 search_key.offset = btrfs_ino(dir);
1348 ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
1349 if (ret < 0) {
1350 btrfs_abort_log_replay(wc, ret,
1351 "failed to search subvolume tree for key (%llu %u %llu) root %llu",
1352 search_key.objectid, search_key.type,
1353 search_key.offset, btrfs_root_id(root));
1354 return ret;
1355 } else if (ret == 0) {
1356 /*
1357 * Are we trying to overwrite a back ref for the root directory?
1358 * If so, we're done.
1359 */
1360 if (search_key.objectid == search_key.offset)
1361 return 1;
1362
1363 ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
1364 if (ret == -EAGAIN)
1365 goto again;
1366 else if (ret)
1367 return ret;
1368 }
1369 btrfs_release_path(wc->subvol_path);
1370
1371 /* Same search but for extended refs */
1372 extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
1373 btrfs_ino(inode), btrfs_ino(dir));
1374 if (IS_ERR(extref)) {
1375 return PTR_ERR(extref);
1376 } else if (extref) {
1377 ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
1378 if (ret == -EAGAIN)
1379 goto again;
1380 else if (ret)
1381 return ret;
1382 }
1383 btrfs_release_path(wc->subvol_path);
1384
1385 /* look for a conflicting sequence number */
1386 di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
1387 ref_index, name, 0);
1388 if (IS_ERR(di)) {
1389 ret = PTR_ERR(di);
1390 btrfs_abort_log_replay(wc, ret,
1391 "failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
1392 btrfs_ino(dir), ref_index, name->len,
1393 name->name, btrfs_root_id(root));
1394 return ret;
1395 } else if (di) {
1396 ret = drop_one_dir_item(wc, dir, di);
1397 if (ret)
1398 return ret;
1399 }
1400 btrfs_release_path(wc->subvol_path);
1401
1402 /* look for a conflicting name */
1403 di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
1404 if (IS_ERR(di)) {
1405 ret = PTR_ERR(di);
1406 btrfs_abort_log_replay(wc, ret,
1407 "failed to lookup dir item for dir %llu name %.*s root %llu",
1408 btrfs_ino(dir), name->len, name->name,
1409 btrfs_root_id(root));
1410 return ret;
1411 } else if (di) {
1412 ret = drop_one_dir_item(wc, dir, di);
1413 if (ret)
1414 return ret;
1415 }
1416 btrfs_release_path(wc->subvol_path);
1417
1418 return 0;
1419 }
1420
extref_get_fields(struct extent_buffer * eb,unsigned long ref_ptr,struct fscrypt_str * name,u64 * index,u64 * parent_objectid)1421 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1422 struct fscrypt_str *name, u64 *index,
1423 u64 *parent_objectid)
1424 {
1425 struct btrfs_inode_extref *extref;
1426 int ret;
1427
1428 extref = (struct btrfs_inode_extref *)ref_ptr;
1429
1430 ret = read_alloc_one_name(eb, &extref->name,
1431 btrfs_inode_extref_name_len(eb, extref), name);
1432 if (ret)
1433 return ret;
1434
1435 if (index)
1436 *index = btrfs_inode_extref_index(eb, extref);
1437 if (parent_objectid)
1438 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1439
1440 return 0;
1441 }
1442
ref_get_fields(struct extent_buffer * eb,unsigned long ref_ptr,struct fscrypt_str * name,u64 * index)1443 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1444 struct fscrypt_str *name, u64 *index)
1445 {
1446 struct btrfs_inode_ref *ref;
1447 int ret;
1448
1449 ref = (struct btrfs_inode_ref *)ref_ptr;
1450
1451 ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1452 name);
1453 if (ret)
1454 return ret;
1455
1456 if (index)
1457 *index = btrfs_inode_ref_index(eb, ref);
1458
1459 return 0;
1460 }
1461
1462 /*
1463 * Take an inode reference item from the log tree and iterate all names from the
1464 * inode reference item in the subvolume tree with the same key (if it exists).
1465 * For any name that is not in the inode reference item from the log tree, do a
1466 * proper unlink of that name (that is, remove its entry from the inode
1467 * reference item and both dir index keys).
1468 */
unlink_old_inode_refs(struct walk_control * wc,struct btrfs_inode * inode)1469 static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
1470 {
1471 struct btrfs_root *root = wc->root;
1472 int ret;
1473 unsigned long ref_ptr;
1474 unsigned long ref_end;
1475 struct extent_buffer *eb;
1476
1477 again:
1478 btrfs_release_path(wc->subvol_path);
1479 ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
1480 if (ret > 0) {
1481 ret = 0;
1482 goto out;
1483 }
1484 if (ret < 0) {
1485 btrfs_abort_log_replay(wc, ret,
1486 "failed to search subvolume tree for key (%llu %u %llu) root %llu",
1487 wc->log_key.objectid, wc->log_key.type,
1488 wc->log_key.offset, btrfs_root_id(root));
1489 goto out;
1490 }
1491
1492 eb = wc->subvol_path->nodes[0];
1493 ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
1494 ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
1495 while (ref_ptr < ref_end) {
1496 struct fscrypt_str name;
1497 u64 parent_id;
1498
1499 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
1500 ret = extref_get_fields(eb, ref_ptr, &name,
1501 NULL, &parent_id);
1502 if (ret) {
1503 btrfs_abort_log_replay(wc, ret,
1504 "failed to get extref details for inode %llu root %llu",
1505 btrfs_ino(inode),
1506 btrfs_root_id(root));
1507 goto out;
1508 }
1509 } else {
1510 parent_id = wc->log_key.offset;
1511 ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1512 if (ret) {
1513 btrfs_abort_log_replay(wc, ret,
1514 "failed to get ref details for inode %llu parent_id %llu root %llu",
1515 btrfs_ino(inode), parent_id,
1516 btrfs_root_id(root));
1517 goto out;
1518 }
1519 }
1520
1521 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1522 ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
1523 parent_id, &name);
1524 else
1525 ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
1526 &name);
1527
1528 if (!ret) {
1529 struct btrfs_inode *dir;
1530
1531 btrfs_release_path(wc->subvol_path);
1532 dir = btrfs_iget_logging(parent_id, root);
1533 if (IS_ERR(dir)) {
1534 ret = PTR_ERR(dir);
1535 kfree(name.name);
1536 btrfs_abort_log_replay(wc, ret,
1537 "failed to lookup dir inode %llu root %llu",
1538 parent_id, btrfs_root_id(root));
1539 goto out;
1540 }
1541 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
1542 kfree(name.name);
1543 iput(&dir->vfs_inode);
1544 if (ret)
1545 goto out;
1546 goto again;
1547 }
1548
1549 kfree(name.name);
1550 ref_ptr += name.len;
1551 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1552 ref_ptr += sizeof(struct btrfs_inode_extref);
1553 else
1554 ref_ptr += sizeof(struct btrfs_inode_ref);
1555 }
1556 ret = 0;
1557 out:
1558 btrfs_release_path(wc->subvol_path);
1559 return ret;
1560 }
1561
1562 /*
1563 * Replay one inode back reference item found in the log tree.
1564 * Path is for temporary use by this function (it should be released on return).
1565 */
add_inode_ref(struct walk_control * wc)1566 static noinline int add_inode_ref(struct walk_control *wc)
1567 {
1568 struct btrfs_trans_handle *trans = wc->trans;
1569 struct btrfs_root *root = wc->root;
1570 struct btrfs_inode *dir = NULL;
1571 struct btrfs_inode *inode = NULL;
1572 unsigned long ref_ptr;
1573 unsigned long ref_end;
1574 struct fscrypt_str name = { 0 };
1575 int ret;
1576 const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
1577 u64 parent_objectid;
1578 u64 inode_objectid;
1579 u64 ref_index = 0;
1580 int ref_struct_size;
1581
1582 ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
1583 ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
1584
1585 if (is_extref_item) {
1586 struct btrfs_inode_extref *r;
1587
1588 ref_struct_size = sizeof(struct btrfs_inode_extref);
1589 r = (struct btrfs_inode_extref *)ref_ptr;
1590 parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
1591 } else {
1592 ref_struct_size = sizeof(struct btrfs_inode_ref);
1593 parent_objectid = wc->log_key.offset;
1594 }
1595 inode_objectid = wc->log_key.objectid;
1596
1597 /*
1598 * it is possible that we didn't log all the parent directories
1599 * for a given inode. If we don't find the dir, just don't
1600 * copy the back ref in. The link count fixup code will take
1601 * care of the rest
1602 */
1603 dir = btrfs_iget_logging(parent_objectid, root);
1604 if (IS_ERR(dir)) {
1605 ret = PTR_ERR(dir);
1606 if (ret == -ENOENT)
1607 ret = 0;
1608 else
1609 btrfs_abort_log_replay(wc, ret,
1610 "failed to lookup dir inode %llu root %llu",
1611 parent_objectid, btrfs_root_id(root));
1612 dir = NULL;
1613 goto out;
1614 }
1615
1616 inode = btrfs_iget_logging(inode_objectid, root);
1617 if (IS_ERR(inode)) {
1618 ret = PTR_ERR(inode);
1619 btrfs_abort_log_replay(wc, ret,
1620 "failed to lookup inode %llu root %llu",
1621 inode_objectid, btrfs_root_id(root));
1622 inode = NULL;
1623 goto out;
1624 }
1625
1626 while (ref_ptr < ref_end) {
1627 if (is_extref_item) {
1628 ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
1629 &ref_index, &parent_objectid);
1630 if (ret) {
1631 btrfs_abort_log_replay(wc, ret,
1632 "failed to get extref details for inode %llu root %llu",
1633 btrfs_ino(inode),
1634 btrfs_root_id(root));
1635 goto out;
1636 }
1637 /*
1638 * parent object can change from one array
1639 * item to another.
1640 */
1641 if (!dir) {
1642 dir = btrfs_iget_logging(parent_objectid, root);
1643 if (IS_ERR(dir)) {
1644 ret = PTR_ERR(dir);
1645 dir = NULL;
1646 /*
1647 * A new parent dir may have not been
1648 * logged and not exist in the subvolume
1649 * tree, see the comment above before
1650 * the loop when getting the first
1651 * parent dir.
1652 */
1653 if (ret == -ENOENT) {
1654 /*
1655 * The next extref may refer to
1656 * another parent dir that
1657 * exists, so continue.
1658 */
1659 ret = 0;
1660 goto next;
1661 } else {
1662 btrfs_abort_log_replay(wc, ret,
1663 "failed to lookup dir inode %llu root %llu",
1664 parent_objectid,
1665 btrfs_root_id(root));
1666 }
1667 goto out;
1668 }
1669 }
1670 } else {
1671 ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
1672 if (ret) {
1673 btrfs_abort_log_replay(wc, ret,
1674 "failed to get ref details for inode %llu parent_objectid %llu root %llu",
1675 btrfs_ino(inode),
1676 parent_objectid,
1677 btrfs_root_id(root));
1678 goto out;
1679 }
1680 }
1681
1682 ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
1683 btrfs_ino(inode), ref_index, &name);
1684 if (ret < 0) {
1685 btrfs_abort_log_replay(wc, ret,
1686 "failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
1687 btrfs_ino(inode), btrfs_ino(dir),
1688 ref_index, name.len, name.name,
1689 btrfs_root_id(root));
1690 goto out;
1691 } else if (ret == 0) {
1692 /*
1693 * look for a conflicting back reference in the
1694 * metadata. if we find one we have to unlink that name
1695 * of the file before we add our new link. Later on, we
1696 * overwrite any existing back reference, and we don't
1697 * want to create dangling pointers in the directory.
1698 */
1699 ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
1700 if (ret) {
1701 if (ret == 1)
1702 ret = 0;
1703 goto out;
1704 }
1705
1706 /* insert our name */
1707 ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
1708 if (ret) {
1709 btrfs_abort_log_replay(wc, ret,
1710 "failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
1711 btrfs_ino(inode),
1712 btrfs_ino(dir), ref_index,
1713 name.len, name.name,
1714 btrfs_root_id(root));
1715 goto out;
1716 }
1717
1718 ret = btrfs_update_inode(trans, inode);
1719 if (ret) {
1720 btrfs_abort_log_replay(wc, ret,
1721 "failed to update inode %llu root %llu",
1722 btrfs_ino(inode),
1723 btrfs_root_id(root));
1724 goto out;
1725 }
1726 }
1727 /* Else, ret == 1, we already have a perfect match, we're done. */
1728
1729 next:
1730 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1731 kfree(name.name);
1732 name.name = NULL;
1733 if (is_extref_item && dir) {
1734 iput(&dir->vfs_inode);
1735 dir = NULL;
1736 }
1737 }
1738
1739 /*
1740 * Before we overwrite the inode reference item in the subvolume tree
1741 * with the item from the log tree, we must unlink all names from the
1742 * parent directory that are in the subvolume's tree inode reference
1743 * item, otherwise we end up with an inconsistent subvolume tree where
1744 * dir index entries exist for a name but there is no inode reference
1745 * item with the same name.
1746 */
1747 ret = unlink_old_inode_refs(wc, inode);
1748 if (ret)
1749 goto out;
1750
1751 /* finally write the back reference in the inode */
1752 ret = overwrite_item(wc);
1753 out:
1754 btrfs_release_path(wc->subvol_path);
1755 kfree(name.name);
1756 if (dir)
1757 iput(&dir->vfs_inode);
1758 if (inode)
1759 iput(&inode->vfs_inode);
1760 return ret;
1761 }
1762
count_inode_extrefs(struct btrfs_inode * inode,struct btrfs_path * path)1763 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1764 {
1765 int ret = 0;
1766 int name_len;
1767 unsigned int nlink = 0;
1768 u32 item_size;
1769 u32 cur_offset = 0;
1770 u64 inode_objectid = btrfs_ino(inode);
1771 u64 offset = 0;
1772 unsigned long ptr;
1773 struct btrfs_inode_extref *extref;
1774 struct extent_buffer *leaf;
1775
1776 while (1) {
1777 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1778 path, &extref, &offset);
1779 if (ret)
1780 break;
1781
1782 leaf = path->nodes[0];
1783 item_size = btrfs_item_size(leaf, path->slots[0]);
1784 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1785 cur_offset = 0;
1786
1787 while (cur_offset < item_size) {
1788 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1789 name_len = btrfs_inode_extref_name_len(leaf, extref);
1790
1791 nlink++;
1792
1793 cur_offset += name_len + sizeof(*extref);
1794 }
1795
1796 offset++;
1797 btrfs_release_path(path);
1798 }
1799 btrfs_release_path(path);
1800
1801 if (ret < 0 && ret != -ENOENT)
1802 return ret;
1803 return nlink;
1804 }
1805
count_inode_refs(struct btrfs_inode * inode,struct btrfs_path * path)1806 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1807 {
1808 int ret;
1809 struct btrfs_key key;
1810 unsigned int nlink = 0;
1811 unsigned long ptr;
1812 unsigned long ptr_end;
1813 int name_len;
1814 u64 ino = btrfs_ino(inode);
1815
1816 key.objectid = ino;
1817 key.type = BTRFS_INODE_REF_KEY;
1818 key.offset = (u64)-1;
1819
1820 while (1) {
1821 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1822 if (ret < 0)
1823 break;
1824 if (ret > 0) {
1825 if (path->slots[0] == 0)
1826 break;
1827 path->slots[0]--;
1828 }
1829 process_slot:
1830 btrfs_item_key_to_cpu(path->nodes[0], &key,
1831 path->slots[0]);
1832 if (key.objectid != ino ||
1833 key.type != BTRFS_INODE_REF_KEY)
1834 break;
1835 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1836 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1837 path->slots[0]);
1838 while (ptr < ptr_end) {
1839 struct btrfs_inode_ref *ref;
1840
1841 ref = (struct btrfs_inode_ref *)ptr;
1842 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1843 ref);
1844 ptr = (unsigned long)(ref + 1) + name_len;
1845 nlink++;
1846 }
1847
1848 if (key.offset == 0)
1849 break;
1850 if (path->slots[0] > 0) {
1851 path->slots[0]--;
1852 goto process_slot;
1853 }
1854 key.offset--;
1855 btrfs_release_path(path);
1856 }
1857 btrfs_release_path(path);
1858
1859 return nlink;
1860 }
1861
1862 /*
1863 * There are a few corners where the link count of the file can't
1864 * be properly maintained during replay. So, instead of adding
1865 * lots of complexity to the log code, we just scan the backrefs
1866 * for any file that has been through replay.
1867 *
1868 * The scan will update the link count on the inode to reflect the
1869 * number of back refs found. If it goes down to zero, the iput
1870 * will free the inode.
1871 */
fixup_inode_link_count(struct walk_control * wc,struct btrfs_inode * inode)1872 static noinline int fixup_inode_link_count(struct walk_control *wc,
1873 struct btrfs_inode *inode)
1874 {
1875 struct btrfs_trans_handle *trans = wc->trans;
1876 struct btrfs_root *root = inode->root;
1877 int ret;
1878 u64 nlink = 0;
1879 const u64 ino = btrfs_ino(inode);
1880
1881 ret = count_inode_refs(inode, wc->subvol_path);
1882 if (ret < 0)
1883 goto out;
1884
1885 nlink = ret;
1886
1887 ret = count_inode_extrefs(inode, wc->subvol_path);
1888 if (ret < 0)
1889 goto out;
1890
1891 nlink += ret;
1892
1893 ret = 0;
1894
1895 if (nlink != inode->vfs_inode.i_nlink) {
1896 set_nlink(&inode->vfs_inode, nlink);
1897 ret = btrfs_update_inode(trans, inode);
1898 if (ret)
1899 goto out;
1900 }
1901 if (S_ISDIR(inode->vfs_inode.i_mode))
1902 inode->index_cnt = (u64)-1;
1903
1904 if (inode->vfs_inode.i_nlink == 0) {
1905 if (S_ISDIR(inode->vfs_inode.i_mode)) {
1906 ret = replay_dir_deletes(wc, ino, true);
1907 if (ret)
1908 goto out;
1909 }
1910 ret = btrfs_insert_orphan_item(trans, root, ino);
1911 if (ret == -EEXIST)
1912 ret = 0;
1913 }
1914
1915 out:
1916 btrfs_release_path(wc->subvol_path);
1917 return ret;
1918 }
1919
fixup_inode_link_counts(struct walk_control * wc)1920 static noinline int fixup_inode_link_counts(struct walk_control *wc)
1921 {
1922 int ret;
1923 struct btrfs_key key;
1924
1925 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1926 key.type = BTRFS_ORPHAN_ITEM_KEY;
1927 key.offset = (u64)-1;
1928 while (1) {
1929 struct btrfs_trans_handle *trans = wc->trans;
1930 struct btrfs_root *root = wc->root;
1931 struct btrfs_inode *inode;
1932
1933 ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
1934 if (ret < 0)
1935 break;
1936
1937 if (ret == 1) {
1938 ret = 0;
1939 if (wc->subvol_path->slots[0] == 0)
1940 break;
1941 wc->subvol_path->slots[0]--;
1942 }
1943
1944 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
1945 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1946 key.type != BTRFS_ORPHAN_ITEM_KEY)
1947 break;
1948
1949 ret = btrfs_del_item(trans, root, wc->subvol_path);
1950 if (ret)
1951 break;
1952
1953 btrfs_release_path(wc->subvol_path);
1954 inode = btrfs_iget_logging(key.offset, root);
1955 if (IS_ERR(inode)) {
1956 ret = PTR_ERR(inode);
1957 break;
1958 }
1959
1960 ret = fixup_inode_link_count(wc, inode);
1961 iput(&inode->vfs_inode);
1962 if (ret)
1963 break;
1964
1965 /*
1966 * fixup on a directory may create new entries,
1967 * make sure we always look for the highest possible
1968 * offset
1969 */
1970 key.offset = (u64)-1;
1971 }
1972 btrfs_release_path(wc->subvol_path);
1973 return ret;
1974 }
1975
1976
1977 /*
1978 * record a given inode in the fixup dir so we can check its link
1979 * count when replay is done. The link count is incremented here
1980 * so the inode won't go away until we check it
1981 */
link_to_fixup_dir(struct walk_control * wc,u64 objectid)1982 static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
1983 {
1984 struct btrfs_trans_handle *trans = wc->trans;
1985 struct btrfs_root *root = wc->root;
1986 struct btrfs_key key;
1987 int ret = 0;
1988 struct btrfs_inode *inode;
1989 struct inode *vfs_inode;
1990
1991 inode = btrfs_iget_logging(objectid, root);
1992 if (IS_ERR(inode)) {
1993 ret = PTR_ERR(inode);
1994 btrfs_abort_log_replay(wc, ret,
1995 "failed to lookup inode %llu root %llu",
1996 objectid, btrfs_root_id(root));
1997 return ret;
1998 }
1999
2000 vfs_inode = &inode->vfs_inode;
2001 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
2002 key.type = BTRFS_ORPHAN_ITEM_KEY;
2003 key.offset = objectid;
2004
2005 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
2006
2007 btrfs_release_path(wc->subvol_path);
2008 if (ret == 0) {
2009 if (!vfs_inode->i_nlink)
2010 set_nlink(vfs_inode, 1);
2011 else
2012 inc_nlink(vfs_inode);
2013 ret = btrfs_update_inode(trans, inode);
2014 if (ret)
2015 btrfs_abort_log_replay(wc, ret,
2016 "failed to update inode %llu root %llu",
2017 objectid, btrfs_root_id(root));
2018 } else if (ret == -EEXIST) {
2019 ret = 0;
2020 } else {
2021 btrfs_abort_log_replay(wc, ret,
2022 "failed to insert fixup item for inode %llu root %llu",
2023 objectid, btrfs_root_id(root));
2024 }
2025 iput(vfs_inode);
2026
2027 return ret;
2028 }
2029
2030 /*
2031 * when replaying the log for a directory, we only insert names
2032 * for inodes that actually exist. This means an fsync on a directory
2033 * does not implicitly fsync all the new files in it
2034 */
insert_one_name(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 dirid,u64 index,const struct fscrypt_str * name,struct btrfs_key * location)2035 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *root,
2037 u64 dirid, u64 index,
2038 const struct fscrypt_str *name,
2039 struct btrfs_key *location)
2040 {
2041 struct btrfs_inode *inode;
2042 struct btrfs_inode *dir;
2043 int ret;
2044
2045 inode = btrfs_iget_logging(location->objectid, root);
2046 if (IS_ERR(inode))
2047 return PTR_ERR(inode);
2048
2049 dir = btrfs_iget_logging(dirid, root);
2050 if (IS_ERR(dir)) {
2051 iput(&inode->vfs_inode);
2052 return PTR_ERR(dir);
2053 }
2054
2055 ret = btrfs_add_link(trans, dir, inode, name, 1, index);
2056
2057 /* FIXME, put inode into FIXUP list */
2058
2059 iput(&inode->vfs_inode);
2060 iput(&dir->vfs_inode);
2061 return ret;
2062 }
2063
delete_conflicting_dir_entry(struct walk_control * wc,struct btrfs_inode * dir,struct btrfs_dir_item * dst_di,const struct btrfs_key * log_key,u8 log_flags,bool exists)2064 static int delete_conflicting_dir_entry(struct walk_control *wc,
2065 struct btrfs_inode *dir,
2066 struct btrfs_dir_item *dst_di,
2067 const struct btrfs_key *log_key,
2068 u8 log_flags,
2069 bool exists)
2070 {
2071 struct btrfs_key found_key;
2072
2073 btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
2074 /* The existing dentry points to the same inode, don't delete it. */
2075 if (found_key.objectid == log_key->objectid &&
2076 found_key.type == log_key->type &&
2077 found_key.offset == log_key->offset &&
2078 btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
2079 return 1;
2080
2081 /*
2082 * Don't drop the conflicting directory entry if the inode for the new
2083 * entry doesn't exist.
2084 */
2085 if (!exists)
2086 return 0;
2087
2088 return drop_one_dir_item(wc, dir, dst_di);
2089 }
2090
2091 /*
2092 * take a single entry in a log directory item and replay it into
2093 * the subvolume.
2094 *
2095 * if a conflicting item exists in the subdirectory already,
2096 * the inode it points to is unlinked and put into the link count
2097 * fix up tree.
2098 *
2099 * If a name from the log points to a file or directory that does
2100 * not exist in the FS, it is skipped. fsyncs on directories
2101 * do not force down inodes inside that directory, just changes to the
2102 * names or unlinks in a directory.
2103 *
2104 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
2105 * non-existing inode) and 1 if the name was replayed.
2106 */
replay_one_name(struct walk_control * wc,struct btrfs_dir_item * di)2107 static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
2108 {
2109 struct btrfs_trans_handle *trans = wc->trans;
2110 struct btrfs_root *root = wc->root;
2111 struct fscrypt_str name = { 0 };
2112 struct btrfs_dir_item *dir_dst_di;
2113 struct btrfs_dir_item *index_dst_di;
2114 bool dir_dst_matches = false;
2115 bool index_dst_matches = false;
2116 struct btrfs_key log_key;
2117 struct btrfs_key search_key;
2118 struct btrfs_inode *dir;
2119 u8 log_flags;
2120 bool exists;
2121 int ret;
2122 bool update_size = true;
2123 bool name_added = false;
2124
2125 dir = btrfs_iget_logging(wc->log_key.objectid, root);
2126 if (IS_ERR(dir)) {
2127 ret = PTR_ERR(dir);
2128 btrfs_abort_log_replay(wc, ret,
2129 "failed to lookup dir inode %llu root %llu",
2130 wc->log_key.objectid, btrfs_root_id(root));
2131 return ret;
2132 }
2133
2134 ret = read_alloc_one_name(wc->log_leaf, di + 1,
2135 btrfs_dir_name_len(wc->log_leaf, di), &name);
2136 if (ret) {
2137 btrfs_abort_log_replay(wc, ret,
2138 "failed to allocate name for dir %llu root %llu",
2139 btrfs_ino(dir), btrfs_root_id(root));
2140 goto out;
2141 }
2142
2143 log_flags = btrfs_dir_flags(wc->log_leaf, di);
2144 btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
2145 ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
2146 btrfs_release_path(wc->subvol_path);
2147 if (ret < 0) {
2148 btrfs_abort_log_replay(wc, ret,
2149 "failed to lookup inode %llu root %llu",
2150 log_key.objectid, btrfs_root_id(root));
2151 goto out;
2152 }
2153 exists = (ret == 0);
2154 ret = 0;
2155
2156 dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
2157 wc->log_key.objectid, &name, 1);
2158 if (IS_ERR(dir_dst_di)) {
2159 ret = PTR_ERR(dir_dst_di);
2160 btrfs_abort_log_replay(wc, ret,
2161 "failed to lookup dir item for dir %llu name %.*s root %llu",
2162 wc->log_key.objectid, name.len, name.name,
2163 btrfs_root_id(root));
2164 goto out;
2165 } else if (dir_dst_di) {
2166 ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
2167 &log_key, log_flags, exists);
2168 if (ret < 0) {
2169 btrfs_abort_log_replay(wc, ret,
2170 "failed to delete conflicting entry for dir %llu name %.*s root %llu",
2171 btrfs_ino(dir), name.len, name.name,
2172 btrfs_root_id(root));
2173 goto out;
2174 }
2175 dir_dst_matches = (ret == 1);
2176 }
2177
2178 btrfs_release_path(wc->subvol_path);
2179
2180 index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
2181 wc->log_key.objectid,
2182 wc->log_key.offset, &name, 1);
2183 if (IS_ERR(index_dst_di)) {
2184 ret = PTR_ERR(index_dst_di);
2185 btrfs_abort_log_replay(wc, ret,
2186 "failed to lookup dir index item for dir %llu name %.*s root %llu",
2187 wc->log_key.objectid, name.len, name.name,
2188 btrfs_root_id(root));
2189 goto out;
2190 } else if (index_dst_di) {
2191 ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
2192 &log_key, log_flags, exists);
2193 if (ret < 0) {
2194 btrfs_abort_log_replay(wc, ret,
2195 "failed to delete conflicting entry for dir %llu name %.*s root %llu",
2196 btrfs_ino(dir), name.len, name.name,
2197 btrfs_root_id(root));
2198 goto out;
2199 }
2200 index_dst_matches = (ret == 1);
2201 }
2202
2203 btrfs_release_path(wc->subvol_path);
2204
2205 if (dir_dst_matches && index_dst_matches) {
2206 ret = 0;
2207 update_size = false;
2208 goto out;
2209 }
2210
2211 /*
2212 * Check if the inode reference exists in the log for the given name,
2213 * inode and parent inode
2214 */
2215 search_key.objectid = log_key.objectid;
2216 search_key.type = BTRFS_INODE_REF_KEY;
2217 search_key.offset = wc->log_key.objectid;
2218 ret = backref_in_log(root->log_root, &search_key, 0, &name);
2219 if (ret < 0) {
2220 btrfs_abort_log_replay(wc, ret,
2221 "failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
2222 search_key.objectid, btrfs_ino(dir),
2223 name.len, name.name, btrfs_root_id(root));
2224 goto out;
2225 } else if (ret) {
2226 /* The dentry will be added later. */
2227 ret = 0;
2228 update_size = false;
2229 goto out;
2230 }
2231
2232 search_key.objectid = log_key.objectid;
2233 search_key.type = BTRFS_INODE_EXTREF_KEY;
2234 search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
2235 ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
2236 if (ret < 0) {
2237 btrfs_abort_log_replay(wc, ret,
2238 "failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
2239 search_key.objectid, btrfs_ino(dir),
2240 name.len, name.name, btrfs_root_id(root));
2241 goto out;
2242 } else if (ret) {
2243 /* The dentry will be added later. */
2244 ret = 0;
2245 update_size = false;
2246 goto out;
2247 }
2248 ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
2249 &name, &log_key);
2250 if (ret && ret != -ENOENT && ret != -EEXIST) {
2251 btrfs_abort_log_replay(wc, ret,
2252 "failed to insert name %.*s for inode %llu dir %llu root %llu",
2253 name.len, name.name, log_key.objectid,
2254 btrfs_ino(dir), btrfs_root_id(root));
2255 goto out;
2256 }
2257 if (!ret)
2258 name_added = true;
2259 update_size = false;
2260 ret = 0;
2261
2262 out:
2263 if (!ret && update_size) {
2264 btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
2265 ret = btrfs_update_inode(trans, dir);
2266 if (ret)
2267 btrfs_abort_log_replay(wc, ret,
2268 "failed to update dir inode %llu root %llu",
2269 btrfs_ino(dir), btrfs_root_id(root));
2270 }
2271 kfree(name.name);
2272 iput(&dir->vfs_inode);
2273 if (!ret && name_added)
2274 ret = 1;
2275 return ret;
2276 }
2277
2278 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
replay_one_dir_item(struct walk_control * wc)2279 static noinline int replay_one_dir_item(struct walk_control *wc)
2280 {
2281 int ret;
2282 struct btrfs_dir_item *di;
2283
2284 /* We only log dir index keys, which only contain a single dir item. */
2285 ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY);
2286
2287 di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
2288 ret = replay_one_name(wc, di);
2289 if (ret < 0)
2290 return ret;
2291
2292 /*
2293 * If this entry refers to a non-directory (directories can not have a
2294 * link count > 1) and it was added in the transaction that was not
2295 * committed, make sure we fixup the link count of the inode the entry
2296 * points to. Otherwise something like the following would result in a
2297 * directory pointing to an inode with a wrong link that does not account
2298 * for this dir entry:
2299 *
2300 * mkdir testdir
2301 * touch testdir/foo
2302 * touch testdir/bar
2303 * sync
2304 *
2305 * ln testdir/bar testdir/bar_link
2306 * ln testdir/foo testdir/foo_link
2307 * xfs_io -c "fsync" testdir/bar
2308 *
2309 * <power failure>
2310 *
2311 * mount fs, log replay happens
2312 *
2313 * File foo would remain with a link count of 1 when it has two entries
2314 * pointing to it in the directory testdir. This would make it impossible
2315 * to ever delete the parent directory has it would result in stale
2316 * dentries that can never be deleted.
2317 */
2318 if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
2319 struct btrfs_key di_key;
2320
2321 btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
2322 ret = link_to_fixup_dir(wc, di_key.objectid);
2323 }
2324
2325 return ret;
2326 }
2327
2328 /*
2329 * directory replay has two parts. There are the standard directory
2330 * items in the log copied from the subvolume, and range items
2331 * created in the log while the subvolume was logged.
2332 *
2333 * The range items tell us which parts of the key space the log
2334 * is authoritative for. During replay, if a key in the subvolume
2335 * directory is in a logged range item, but not actually in the log
2336 * that means it was deleted from the directory before the fsync
2337 * and should be removed.
2338 */
find_dir_range(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,u64 * start_ret,u64 * end_ret)2339 static noinline int find_dir_range(struct btrfs_root *root,
2340 struct btrfs_path *path,
2341 u64 dirid,
2342 u64 *start_ret, u64 *end_ret)
2343 {
2344 struct btrfs_key key;
2345 u64 found_end;
2346 struct btrfs_dir_log_item *item;
2347 int ret;
2348 int nritems;
2349
2350 if (*start_ret == (u64)-1)
2351 return 1;
2352
2353 key.objectid = dirid;
2354 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2355 key.offset = *start_ret;
2356
2357 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2358 if (ret < 0)
2359 goto out;
2360 if (ret > 0) {
2361 if (path->slots[0] == 0)
2362 goto out;
2363 path->slots[0]--;
2364 }
2365 if (ret != 0)
2366 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2367
2368 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2369 ret = 1;
2370 goto next;
2371 }
2372 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2373 struct btrfs_dir_log_item);
2374 found_end = btrfs_dir_log_end(path->nodes[0], item);
2375
2376 if (*start_ret >= key.offset && *start_ret <= found_end) {
2377 ret = 0;
2378 *start_ret = key.offset;
2379 *end_ret = found_end;
2380 goto out;
2381 }
2382 ret = 1;
2383 next:
2384 /* check the next slot in the tree to see if it is a valid item */
2385 nritems = btrfs_header_nritems(path->nodes[0]);
2386 path->slots[0]++;
2387 if (path->slots[0] >= nritems) {
2388 ret = btrfs_next_leaf(root, path);
2389 if (ret)
2390 goto out;
2391 }
2392
2393 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2394
2395 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2396 ret = 1;
2397 goto out;
2398 }
2399 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2400 struct btrfs_dir_log_item);
2401 found_end = btrfs_dir_log_end(path->nodes[0], item);
2402 *start_ret = key.offset;
2403 *end_ret = found_end;
2404 ret = 0;
2405 out:
2406 btrfs_release_path(path);
2407 return ret;
2408 }
2409
2410 /*
2411 * this looks for a given directory item in the log. If the directory
2412 * item is not in the log, the item is removed and the inode it points
2413 * to is unlinked
2414 */
check_item_in_log(struct walk_control * wc,struct btrfs_path * log_path,struct btrfs_inode * dir,struct btrfs_key * dir_key,bool force_remove)2415 static noinline int check_item_in_log(struct walk_control *wc,
2416 struct btrfs_path *log_path,
2417 struct btrfs_inode *dir,
2418 struct btrfs_key *dir_key,
2419 bool force_remove)
2420 {
2421 struct btrfs_trans_handle *trans = wc->trans;
2422 struct btrfs_root *root = dir->root;
2423 int ret;
2424 struct extent_buffer *eb;
2425 int slot;
2426 struct btrfs_dir_item *di;
2427 struct fscrypt_str name = { 0 };
2428 struct btrfs_inode *inode = NULL;
2429 struct btrfs_key location;
2430
2431 /*
2432 * Currently we only log dir index keys. Even if we replay a log created
2433 * by an older kernel that logged both dir index and dir item keys, all
2434 * we need to do is process the dir index keys, we (and our caller) can
2435 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2436 */
2437 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2438
2439 eb = wc->subvol_path->nodes[0];
2440 slot = wc->subvol_path->slots[0];
2441 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2442 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2443 if (ret) {
2444 btrfs_abort_log_replay(wc, ret,
2445 "failed to allocate name for dir %llu index %llu root %llu",
2446 btrfs_ino(dir), dir_key->offset,
2447 btrfs_root_id(root));
2448 goto out;
2449 }
2450
2451 if (!force_remove) {
2452 struct btrfs_dir_item *log_di;
2453
2454 log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
2455 dir_key->objectid,
2456 dir_key->offset, &name, 0);
2457 if (IS_ERR(log_di)) {
2458 ret = PTR_ERR(log_di);
2459 btrfs_abort_log_replay(wc, ret,
2460 "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
2461 btrfs_ino(dir), dir_key->offset,
2462 name.len, name.name,
2463 btrfs_root_id(root));
2464 goto out;
2465 } else if (log_di) {
2466 /* The dentry exists in the log, we have nothing to do. */
2467 ret = 0;
2468 goto out;
2469 }
2470 }
2471
2472 btrfs_dir_item_key_to_cpu(eb, di, &location);
2473 btrfs_release_path(wc->subvol_path);
2474 btrfs_release_path(log_path);
2475 inode = btrfs_iget_logging(location.objectid, root);
2476 if (IS_ERR(inode)) {
2477 ret = PTR_ERR(inode);
2478 inode = NULL;
2479 btrfs_abort_log_replay(wc, ret,
2480 "failed to lookup inode %llu root %llu",
2481 location.objectid, btrfs_root_id(root));
2482 goto out;
2483 }
2484
2485 ret = link_to_fixup_dir(wc, location.objectid);
2486 if (ret)
2487 goto out;
2488
2489 inc_nlink(&inode->vfs_inode);
2490 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
2491 /*
2492 * Unlike dir item keys, dir index keys can only have one name (entry) in
2493 * them, as there are no key collisions since each key has a unique offset
2494 * (an index number), so we're done.
2495 */
2496 out:
2497 btrfs_release_path(wc->subvol_path);
2498 btrfs_release_path(log_path);
2499 kfree(name.name);
2500 if (inode)
2501 iput(&inode->vfs_inode);
2502 return ret;
2503 }
2504
replay_xattr_deletes(struct walk_control * wc)2505 static int replay_xattr_deletes(struct walk_control *wc)
2506 {
2507 struct btrfs_trans_handle *trans = wc->trans;
2508 struct btrfs_root *root = wc->root;
2509 struct btrfs_root *log = wc->log;
2510 struct btrfs_key search_key;
2511 BTRFS_PATH_AUTO_FREE(log_path);
2512 const u64 ino = wc->log_key.objectid;
2513 int nritems;
2514 int ret;
2515
2516 log_path = btrfs_alloc_path();
2517 if (!log_path) {
2518 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2519 return -ENOMEM;
2520 }
2521
2522 search_key.objectid = ino;
2523 search_key.type = BTRFS_XATTR_ITEM_KEY;
2524 search_key.offset = 0;
2525 again:
2526 ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
2527 if (ret < 0) {
2528 btrfs_abort_log_replay(wc, ret,
2529 "failed to search xattrs for inode %llu root %llu",
2530 ino, btrfs_root_id(root));
2531 goto out;
2532 }
2533 process_leaf:
2534 nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
2535 for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
2536 struct btrfs_key key;
2537 struct btrfs_dir_item *di;
2538 struct btrfs_dir_item *log_di;
2539 u32 total_size;
2540 u32 cur;
2541
2542 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
2543 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2544 ret = 0;
2545 goto out;
2546 }
2547
2548 di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
2549 total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
2550 cur = 0;
2551 while (cur < total_size) {
2552 u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
2553 u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
2554 u32 this_len = sizeof(*di) + name_len + data_len;
2555 char *name;
2556
2557 name = kmalloc(name_len, GFP_NOFS);
2558 if (!name) {
2559 ret = -ENOMEM;
2560 btrfs_abort_log_replay(wc, ret,
2561 "failed to allocate memory for name of length %u",
2562 name_len);
2563 goto out;
2564 }
2565 read_extent_buffer(wc->subvol_path->nodes[0], name,
2566 (unsigned long)(di + 1), name_len);
2567
2568 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2569 name, name_len, 0);
2570 btrfs_release_path(log_path);
2571 if (!log_di) {
2572 /* Doesn't exist in log tree, so delete it. */
2573 btrfs_release_path(wc->subvol_path);
2574 di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
2575 name, name_len, -1);
2576 if (IS_ERR(di)) {
2577 ret = PTR_ERR(di);
2578 btrfs_abort_log_replay(wc, ret,
2579 "failed to lookup xattr with name %.*s for inode %llu root %llu",
2580 name_len, name, ino,
2581 btrfs_root_id(root));
2582 kfree(name);
2583 goto out;
2584 }
2585 ASSERT(di);
2586 ret = btrfs_delete_one_dir_name(trans, root,
2587 wc->subvol_path, di);
2588 if (ret) {
2589 btrfs_abort_log_replay(wc, ret,
2590 "failed to delete xattr with name %.*s for inode %llu root %llu",
2591 name_len, name, ino,
2592 btrfs_root_id(root));
2593 kfree(name);
2594 goto out;
2595 }
2596 btrfs_release_path(wc->subvol_path);
2597 kfree(name);
2598 search_key = key;
2599 goto again;
2600 }
2601 if (IS_ERR(log_di)) {
2602 ret = PTR_ERR(log_di);
2603 btrfs_abort_log_replay(wc, ret,
2604 "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
2605 name_len, name, ino,
2606 btrfs_root_id(root));
2607 kfree(name);
2608 goto out;
2609 }
2610 kfree(name);
2611 cur += this_len;
2612 di = (struct btrfs_dir_item *)((char *)di + this_len);
2613 }
2614 }
2615 ret = btrfs_next_leaf(root, wc->subvol_path);
2616 if (ret > 0)
2617 ret = 0;
2618 else if (ret == 0)
2619 goto process_leaf;
2620 else
2621 btrfs_abort_log_replay(wc, ret,
2622 "failed to get next leaf in subvolume root %llu",
2623 btrfs_root_id(root));
2624 out:
2625 btrfs_release_path(wc->subvol_path);
2626 return ret;
2627 }
2628
2629
2630 /*
2631 * deletion replay happens before we copy any new directory items
2632 * out of the log or out of backreferences from inodes. It
2633 * scans the log to find ranges of keys that log is authoritative for,
2634 * and then scans the directory to find items in those ranges that are
2635 * not present in the log.
2636 *
2637 * Anything we don't find in the log is unlinked and removed from the
2638 * directory.
2639 */
replay_dir_deletes(struct walk_control * wc,u64 dirid,bool del_all)2640 static noinline int replay_dir_deletes(struct walk_control *wc,
2641 u64 dirid, bool del_all)
2642 {
2643 struct btrfs_root *root = wc->root;
2644 struct btrfs_root *log = (del_all ? NULL : wc->log);
2645 u64 range_start;
2646 u64 range_end;
2647 int ret = 0;
2648 struct btrfs_key dir_key;
2649 struct btrfs_key found_key;
2650 struct btrfs_path *log_path;
2651 struct btrfs_inode *dir;
2652
2653 dir_key.objectid = dirid;
2654 dir_key.type = BTRFS_DIR_INDEX_KEY;
2655 log_path = btrfs_alloc_path();
2656 if (!log_path) {
2657 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2658 return -ENOMEM;
2659 }
2660
2661 dir = btrfs_iget_logging(dirid, root);
2662 /*
2663 * It isn't an error if the inode isn't there, that can happen because
2664 * we replay the deletes before we copy in the inode item from the log.
2665 */
2666 if (IS_ERR(dir)) {
2667 btrfs_free_path(log_path);
2668 ret = PTR_ERR(dir);
2669 if (ret == -ENOENT)
2670 ret = 0;
2671 else
2672 btrfs_abort_log_replay(wc, ret,
2673 "failed to lookup dir inode %llu root %llu",
2674 dirid, btrfs_root_id(root));
2675 return ret;
2676 }
2677
2678 range_start = 0;
2679 range_end = 0;
2680 while (1) {
2681 if (del_all)
2682 range_end = (u64)-1;
2683 else {
2684 ret = find_dir_range(log, wc->subvol_path, dirid,
2685 &range_start, &range_end);
2686 if (ret < 0) {
2687 btrfs_abort_log_replay(wc, ret,
2688 "failed to find range for dir %llu in log tree root %llu",
2689 dirid, btrfs_root_id(root));
2690 goto out;
2691 } else if (ret > 0) {
2692 break;
2693 }
2694 }
2695
2696 dir_key.offset = range_start;
2697 while (1) {
2698 int nritems;
2699 ret = btrfs_search_slot(NULL, root, &dir_key,
2700 wc->subvol_path, 0, 0);
2701 if (ret < 0) {
2702 btrfs_abort_log_replay(wc, ret,
2703 "failed to search root %llu for key (%llu %u %llu)",
2704 btrfs_root_id(root),
2705 dir_key.objectid, dir_key.type,
2706 dir_key.offset);
2707 goto out;
2708 }
2709
2710 nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
2711 if (wc->subvol_path->slots[0] >= nritems) {
2712 ret = btrfs_next_leaf(root, wc->subvol_path);
2713 if (ret == 1) {
2714 break;
2715 } else if (ret < 0) {
2716 btrfs_abort_log_replay(wc, ret,
2717 "failed to get next leaf in subvolume root %llu",
2718 btrfs_root_id(root));
2719 goto out;
2720 }
2721 }
2722 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
2723 wc->subvol_path->slots[0]);
2724 if (found_key.objectid != dirid ||
2725 found_key.type != dir_key.type) {
2726 ret = 0;
2727 goto out;
2728 }
2729
2730 if (found_key.offset > range_end)
2731 break;
2732
2733 ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
2734 if (ret)
2735 goto out;
2736 if (found_key.offset == (u64)-1)
2737 break;
2738 dir_key.offset = found_key.offset + 1;
2739 }
2740 btrfs_release_path(wc->subvol_path);
2741 if (range_end == (u64)-1)
2742 break;
2743 range_start = range_end + 1;
2744 }
2745 ret = 0;
2746 out:
2747 btrfs_release_path(wc->subvol_path);
2748 btrfs_free_path(log_path);
2749 iput(&dir->vfs_inode);
2750 return ret;
2751 }
2752
2753 /*
2754 * the process_func used to replay items from the log tree. This
2755 * gets called in two different stages. The first stage just looks
2756 * for inodes and makes sure they are all copied into the subvolume.
2757 *
2758 * The second stage copies all the other item types from the log into
2759 * the subvolume. The two stage approach is slower, but gets rid of
2760 * lots of complexity around inodes referencing other inodes that exist
2761 * only in the log (references come from either directory items or inode
2762 * back refs).
2763 */
replay_one_buffer(struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)2764 static int replay_one_buffer(struct extent_buffer *eb,
2765 struct walk_control *wc, u64 gen, int level)
2766 {
2767 int nritems;
2768 struct btrfs_tree_parent_check check = {
2769 .transid = gen,
2770 .level = level
2771 };
2772 struct btrfs_root *root = wc->root;
2773 struct btrfs_trans_handle *trans = wc->trans;
2774 int ret;
2775
2776 if (level != 0)
2777 return 0;
2778
2779 /*
2780 * Set to NULL since it was not yet read and in case we abort log replay
2781 * on error, we have no valid log tree leaf to dump.
2782 */
2783 wc->log_leaf = NULL;
2784 ret = btrfs_read_extent_buffer(eb, &check);
2785 if (ret) {
2786 btrfs_abort_log_replay(wc, ret,
2787 "failed to read log tree leaf %llu for root %llu",
2788 eb->start, btrfs_root_id(root));
2789 return ret;
2790 }
2791
2792 ASSERT(wc->subvol_path == NULL);
2793 wc->subvol_path = btrfs_alloc_path();
2794 if (!wc->subvol_path) {
2795 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2796 return -ENOMEM;
2797 }
2798
2799 wc->log_leaf = eb;
2800
2801 nritems = btrfs_header_nritems(eb);
2802 for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
2803 struct btrfs_inode_item *inode_item;
2804
2805 btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
2806
2807 if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
2808 inode_item = btrfs_item_ptr(eb, wc->log_slot,
2809 struct btrfs_inode_item);
2810 /*
2811 * An inode with no links is either:
2812 *
2813 * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
2814 * got linked before the fsync, skip it, as replaying
2815 * it is pointless since it would be deleted later.
2816 * We skip logging tmpfiles, but it's always possible
2817 * we are replaying a log created with a kernel that
2818 * used to log tmpfiles;
2819 *
2820 * 2) A non-tmpfile which got its last link deleted
2821 * while holding an open fd on it and later got
2822 * fsynced through that fd. We always log the
2823 * parent inodes when inode->last_unlink_trans is
2824 * set to the current transaction, so ignore all the
2825 * inode items for this inode. We will delete the
2826 * inode when processing the parent directory with
2827 * replay_dir_deletes().
2828 */
2829 if (btrfs_inode_nlink(eb, inode_item) == 0) {
2830 wc->ignore_cur_inode = true;
2831 continue;
2832 } else {
2833 wc->ignore_cur_inode = false;
2834 }
2835 }
2836
2837 /* Inode keys are done during the first stage. */
2838 if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
2839 wc->stage == LOG_WALK_REPLAY_INODES) {
2840 u32 mode;
2841
2842 ret = replay_xattr_deletes(wc);
2843 if (ret)
2844 break;
2845 mode = btrfs_inode_mode(eb, inode_item);
2846 if (S_ISDIR(mode)) {
2847 ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
2848 if (ret)
2849 break;
2850 }
2851 ret = overwrite_item(wc);
2852 if (ret)
2853 break;
2854
2855 /*
2856 * Before replaying extents, truncate the inode to its
2857 * size. We need to do it now and not after log replay
2858 * because before an fsync we can have prealloc extents
2859 * added beyond the inode's i_size. If we did it after,
2860 * through orphan cleanup for example, we would drop
2861 * those prealloc extents just after replaying them.
2862 */
2863 if (S_ISREG(mode)) {
2864 struct btrfs_drop_extents_args drop_args = { 0 };
2865 struct btrfs_inode *inode;
2866 u64 from;
2867
2868 inode = btrfs_iget_logging(wc->log_key.objectid, root);
2869 if (IS_ERR(inode)) {
2870 ret = PTR_ERR(inode);
2871 btrfs_abort_log_replay(wc, ret,
2872 "failed to lookup inode %llu root %llu",
2873 wc->log_key.objectid,
2874 btrfs_root_id(root));
2875 break;
2876 }
2877 from = ALIGN(i_size_read(&inode->vfs_inode),
2878 root->fs_info->sectorsize);
2879 drop_args.start = from;
2880 drop_args.end = (u64)-1;
2881 drop_args.drop_cache = true;
2882 drop_args.path = wc->subvol_path;
2883 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2884 if (ret) {
2885 btrfs_abort_log_replay(wc, ret,
2886 "failed to drop extents for inode %llu root %llu offset %llu",
2887 btrfs_ino(inode),
2888 btrfs_root_id(root),
2889 from);
2890 } else {
2891 inode_sub_bytes(&inode->vfs_inode,
2892 drop_args.bytes_found);
2893 /* Update the inode's nbytes. */
2894 ret = btrfs_update_inode(trans, inode);
2895 if (ret)
2896 btrfs_abort_log_replay(wc, ret,
2897 "failed to update inode %llu root %llu",
2898 btrfs_ino(inode),
2899 btrfs_root_id(root));
2900 }
2901 iput(&inode->vfs_inode);
2902 if (ret)
2903 break;
2904 }
2905
2906 ret = link_to_fixup_dir(wc, wc->log_key.objectid);
2907 if (ret)
2908 break;
2909 }
2910
2911 if (wc->ignore_cur_inode)
2912 continue;
2913
2914 if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
2915 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2916 ret = replay_one_dir_item(wc);
2917 if (ret)
2918 break;
2919 }
2920
2921 if (wc->stage < LOG_WALK_REPLAY_ALL)
2922 continue;
2923
2924 /* these keys are simply copied */
2925 if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
2926 ret = overwrite_item(wc);
2927 if (ret)
2928 break;
2929 } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
2930 wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
2931 ret = add_inode_ref(wc);
2932 if (ret)
2933 break;
2934 } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
2935 ret = replay_one_extent(wc);
2936 if (ret)
2937 break;
2938 }
2939 /*
2940 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2941 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2942 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2943 * older kernel with such keys, ignore them.
2944 */
2945 }
2946 btrfs_free_path(wc->subvol_path);
2947 wc->subvol_path = NULL;
2948 return ret;
2949 }
2950
clean_log_buffer(struct btrfs_trans_handle * trans,struct extent_buffer * eb)2951 static int clean_log_buffer(struct btrfs_trans_handle *trans,
2952 struct extent_buffer *eb)
2953 {
2954 struct btrfs_fs_info *fs_info = eb->fs_info;
2955 struct btrfs_block_group *bg;
2956
2957 btrfs_tree_lock(eb);
2958 btrfs_clear_buffer_dirty(trans, eb);
2959 wait_on_extent_buffer_writeback(eb);
2960 btrfs_tree_unlock(eb);
2961
2962 if (trans) {
2963 int ret;
2964
2965 ret = btrfs_pin_reserved_extent(trans, eb);
2966 if (ret)
2967 btrfs_abort_transaction(trans, ret);
2968 return ret;
2969 }
2970
2971 bg = btrfs_lookup_block_group(fs_info, eb->start);
2972 if (!bg) {
2973 btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
2974 btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
2975 return -ENOENT;
2976 }
2977
2978 spin_lock(&bg->space_info->lock);
2979 spin_lock(&bg->lock);
2980 bg->reserved -= fs_info->nodesize;
2981 bg->space_info->bytes_reserved -= fs_info->nodesize;
2982 spin_unlock(&bg->lock);
2983 spin_unlock(&bg->space_info->lock);
2984
2985 btrfs_put_block_group(bg);
2986
2987 return 0;
2988 }
2989
walk_down_log_tree(struct btrfs_path * path,int * level,struct walk_control * wc)2990 static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
2991 struct walk_control *wc)
2992 {
2993 struct btrfs_trans_handle *trans = wc->trans;
2994 struct btrfs_fs_info *fs_info = wc->log->fs_info;
2995 u64 bytenr;
2996 u64 ptr_gen;
2997 struct extent_buffer *next;
2998 struct extent_buffer *cur;
2999 int ret = 0;
3000
3001 while (*level > 0) {
3002 struct btrfs_tree_parent_check check = { 0 };
3003
3004 cur = path->nodes[*level];
3005
3006 WARN_ON(btrfs_header_level(cur) != *level);
3007
3008 if (path->slots[*level] >=
3009 btrfs_header_nritems(cur))
3010 break;
3011
3012 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3014 check.transid = ptr_gen;
3015 check.level = *level - 1;
3016 check.has_first_key = true;
3017 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
3018
3019 next = btrfs_find_create_tree_block(fs_info, bytenr,
3020 btrfs_header_owner(cur),
3021 *level - 1);
3022 if (IS_ERR(next)) {
3023 ret = PTR_ERR(next);
3024 if (trans)
3025 btrfs_abort_transaction(trans, ret);
3026 else
3027 btrfs_handle_fs_error(fs_info, ret, NULL);
3028 return ret;
3029 }
3030
3031 if (*level == 1) {
3032 ret = wc->process_func(next, wc, ptr_gen, *level - 1);
3033 if (ret) {
3034 free_extent_buffer(next);
3035 return ret;
3036 }
3037
3038 path->slots[*level]++;
3039 if (wc->free) {
3040 ret = btrfs_read_extent_buffer(next, &check);
3041 if (ret) {
3042 free_extent_buffer(next);
3043 if (trans)
3044 btrfs_abort_transaction(trans, ret);
3045 else
3046 btrfs_handle_fs_error(fs_info, ret, NULL);
3047 return ret;
3048 }
3049
3050 ret = clean_log_buffer(trans, next);
3051 if (ret) {
3052 free_extent_buffer(next);
3053 return ret;
3054 }
3055 }
3056 free_extent_buffer(next);
3057 continue;
3058 }
3059 ret = btrfs_read_extent_buffer(next, &check);
3060 if (ret) {
3061 free_extent_buffer(next);
3062 if (trans)
3063 btrfs_abort_transaction(trans, ret);
3064 else
3065 btrfs_handle_fs_error(fs_info, ret, NULL);
3066 return ret;
3067 }
3068
3069 if (path->nodes[*level-1])
3070 free_extent_buffer(path->nodes[*level-1]);
3071 path->nodes[*level-1] = next;
3072 *level = btrfs_header_level(next);
3073 path->slots[*level] = 0;
3074 cond_resched();
3075 }
3076 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
3077
3078 cond_resched();
3079 return 0;
3080 }
3081
walk_up_log_tree(struct btrfs_path * path,int * level,struct walk_control * wc)3082 static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
3083 struct walk_control *wc)
3084 {
3085 int i;
3086 int slot;
3087 int ret;
3088
3089 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
3090 slot = path->slots[i];
3091 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
3092 path->slots[i]++;
3093 *level = i;
3094 WARN_ON(*level == 0);
3095 return 0;
3096 } else {
3097 ret = wc->process_func(path->nodes[*level], wc,
3098 btrfs_header_generation(path->nodes[*level]),
3099 *level);
3100 if (ret)
3101 return ret;
3102
3103 if (wc->free) {
3104 ret = clean_log_buffer(wc->trans, path->nodes[*level]);
3105 if (ret)
3106 return ret;
3107 }
3108 free_extent_buffer(path->nodes[*level]);
3109 path->nodes[*level] = NULL;
3110 *level = i + 1;
3111 }
3112 }
3113 return 1;
3114 }
3115
3116 /*
3117 * drop the reference count on the tree rooted at 'snap'. This traverses
3118 * the tree freeing any blocks that have a ref count of zero after being
3119 * decremented.
3120 */
walk_log_tree(struct walk_control * wc)3121 static int walk_log_tree(struct walk_control *wc)
3122 {
3123 struct btrfs_root *log = wc->log;
3124 int ret = 0;
3125 int wret;
3126 int level;
3127 BTRFS_PATH_AUTO_FREE(path);
3128 int orig_level;
3129
3130 path = btrfs_alloc_path();
3131 if (!path)
3132 return -ENOMEM;
3133
3134 level = btrfs_header_level(log->node);
3135 orig_level = level;
3136 path->nodes[level] = log->node;
3137 refcount_inc(&log->node->refs);
3138 path->slots[level] = 0;
3139
3140 while (1) {
3141 wret = walk_down_log_tree(path, &level, wc);
3142 if (wret > 0)
3143 break;
3144 if (wret < 0)
3145 return wret;
3146
3147 wret = walk_up_log_tree(path, &level, wc);
3148 if (wret > 0)
3149 break;
3150 if (wret < 0)
3151 return wret;
3152 }
3153
3154 /* was the root node processed? if not, catch it here */
3155 if (path->nodes[orig_level]) {
3156 ret = wc->process_func(path->nodes[orig_level], wc,
3157 btrfs_header_generation(path->nodes[orig_level]),
3158 orig_level);
3159 if (ret)
3160 return ret;
3161 if (wc->free)
3162 ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
3163 }
3164
3165 return ret;
3166 }
3167
3168 /*
3169 * helper function to update the item for a given subvolumes log root
3170 * in the tree of log roots
3171 */
update_log_root(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_root_item * root_item)3172 static int update_log_root(struct btrfs_trans_handle *trans,
3173 struct btrfs_root *log,
3174 struct btrfs_root_item *root_item)
3175 {
3176 struct btrfs_fs_info *fs_info = log->fs_info;
3177 int ret;
3178
3179 if (log->log_transid == 1) {
3180 /* insert root item on the first sync */
3181 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
3182 &log->root_key, root_item);
3183 } else {
3184 ret = btrfs_update_root(trans, fs_info->log_root_tree,
3185 &log->root_key, root_item);
3186 }
3187 return ret;
3188 }
3189
wait_log_commit(struct btrfs_root * root,int transid)3190 static void wait_log_commit(struct btrfs_root *root, int transid)
3191 {
3192 DEFINE_WAIT(wait);
3193 int index = transid % 2;
3194
3195 /*
3196 * we only allow two pending log transactions at a time,
3197 * so we know that if ours is more than 2 older than the
3198 * current transaction, we're done
3199 */
3200 for (;;) {
3201 prepare_to_wait(&root->log_commit_wait[index],
3202 &wait, TASK_UNINTERRUPTIBLE);
3203
3204 if (!(root->log_transid_committed < transid &&
3205 atomic_read(&root->log_commit[index])))
3206 break;
3207
3208 mutex_unlock(&root->log_mutex);
3209 schedule();
3210 mutex_lock(&root->log_mutex);
3211 }
3212 finish_wait(&root->log_commit_wait[index], &wait);
3213 }
3214
wait_for_writer(struct btrfs_root * root)3215 static void wait_for_writer(struct btrfs_root *root)
3216 {
3217 DEFINE_WAIT(wait);
3218
3219 for (;;) {
3220 prepare_to_wait(&root->log_writer_wait, &wait,
3221 TASK_UNINTERRUPTIBLE);
3222 if (!atomic_read(&root->log_writers))
3223 break;
3224
3225 mutex_unlock(&root->log_mutex);
3226 schedule();
3227 mutex_lock(&root->log_mutex);
3228 }
3229 finish_wait(&root->log_writer_wait, &wait);
3230 }
3231
btrfs_init_log_ctx(struct btrfs_log_ctx * ctx,struct btrfs_inode * inode)3232 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
3233 {
3234 ctx->log_ret = 0;
3235 ctx->log_transid = 0;
3236 ctx->log_new_dentries = false;
3237 ctx->logging_new_name = false;
3238 ctx->logging_new_delayed_dentries = false;
3239 ctx->logged_before = false;
3240 ctx->inode = inode;
3241 INIT_LIST_HEAD(&ctx->list);
3242 INIT_LIST_HEAD(&ctx->ordered_extents);
3243 INIT_LIST_HEAD(&ctx->conflict_inodes);
3244 ctx->num_conflict_inodes = 0;
3245 ctx->logging_conflict_inodes = false;
3246 ctx->scratch_eb = NULL;
3247 }
3248
btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx * ctx)3249 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
3250 {
3251 struct btrfs_inode *inode = ctx->inode;
3252
3253 if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3254 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
3255 return;
3256
3257 /*
3258 * Don't care about allocation failure. This is just for optimization,
3259 * if we fail to allocate here, we will try again later if needed.
3260 */
3261 ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
3262 }
3263
btrfs_release_log_ctx_extents(struct btrfs_log_ctx * ctx)3264 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
3265 {
3266 struct btrfs_ordered_extent *ordered;
3267 struct btrfs_ordered_extent *tmp;
3268
3269 btrfs_assert_inode_locked(ctx->inode);
3270
3271 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
3272 list_del_init(&ordered->log_list);
3273 btrfs_put_ordered_extent(ordered);
3274 }
3275 }
3276
3277
btrfs_remove_log_ctx(struct btrfs_root * root,struct btrfs_log_ctx * ctx)3278 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3279 struct btrfs_log_ctx *ctx)
3280 {
3281 mutex_lock(&root->log_mutex);
3282 list_del_init(&ctx->list);
3283 mutex_unlock(&root->log_mutex);
3284 }
3285
3286 /*
3287 * Invoked in log mutex context, or be sure there is no other task which
3288 * can access the list.
3289 */
btrfs_remove_all_log_ctxs(struct btrfs_root * root,int index,int error)3290 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3291 int index, int error)
3292 {
3293 struct btrfs_log_ctx *ctx;
3294 struct btrfs_log_ctx *safe;
3295
3296 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3297 list_del_init(&ctx->list);
3298 ctx->log_ret = error;
3299 }
3300 }
3301
3302 /*
3303 * Sends a given tree log down to the disk and updates the super blocks to
3304 * record it. When this call is done, you know that any inodes previously
3305 * logged are safely on disk only if it returns 0.
3306 *
3307 * Any other return value means you need to call btrfs_commit_transaction.
3308 * Some of the edge cases for fsyncing directories that have had unlinks
3309 * or renames done in the past mean that sometimes the only safe
3310 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3311 * that has happened.
3312 */
btrfs_sync_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)3313 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3314 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3315 {
3316 int index1;
3317 int index2;
3318 int mark;
3319 int ret;
3320 struct btrfs_fs_info *fs_info = root->fs_info;
3321 struct btrfs_root *log = root->log_root;
3322 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3323 struct btrfs_root_item new_root_item;
3324 int log_transid = 0;
3325 struct btrfs_log_ctx root_log_ctx;
3326 struct blk_plug plug;
3327 u64 log_root_start;
3328 u64 log_root_level;
3329
3330 mutex_lock(&root->log_mutex);
3331 log_transid = ctx->log_transid;
3332 if (root->log_transid_committed >= log_transid) {
3333 mutex_unlock(&root->log_mutex);
3334 return ctx->log_ret;
3335 }
3336
3337 index1 = log_transid % 2;
3338 if (atomic_read(&root->log_commit[index1])) {
3339 wait_log_commit(root, log_transid);
3340 mutex_unlock(&root->log_mutex);
3341 return ctx->log_ret;
3342 }
3343 ASSERT(log_transid == root->log_transid);
3344 atomic_set(&root->log_commit[index1], 1);
3345
3346 /* wait for previous tree log sync to complete */
3347 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3348 wait_log_commit(root, log_transid - 1);
3349
3350 while (1) {
3351 int batch = atomic_read(&root->log_batch);
3352 /* when we're on an ssd, just kick the log commit out */
3353 if (!btrfs_test_opt(fs_info, SSD) &&
3354 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3355 mutex_unlock(&root->log_mutex);
3356 schedule_timeout_uninterruptible(1);
3357 mutex_lock(&root->log_mutex);
3358 }
3359 wait_for_writer(root);
3360 if (batch == atomic_read(&root->log_batch))
3361 break;
3362 }
3363
3364 /* bail out if we need to do a full commit */
3365 if (btrfs_need_log_full_commit(trans)) {
3366 ret = BTRFS_LOG_FORCE_COMMIT;
3367 mutex_unlock(&root->log_mutex);
3368 goto out;
3369 }
3370
3371 if (log_transid % 2 == 0)
3372 mark = EXTENT_DIRTY_LOG1;
3373 else
3374 mark = EXTENT_DIRTY_LOG2;
3375
3376 /* we start IO on all the marked extents here, but we don't actually
3377 * wait for them until later.
3378 */
3379 blk_start_plug(&plug);
3380 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3381 /*
3382 * -EAGAIN happens when someone, e.g., a concurrent transaction
3383 * commit, writes a dirty extent in this tree-log commit. This
3384 * concurrent write will create a hole writing out the extents,
3385 * and we cannot proceed on a zoned filesystem, requiring
3386 * sequential writing. While we can bail out to a full commit
3387 * here, but we can continue hoping the concurrent writing fills
3388 * the hole.
3389 */
3390 if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3391 ret = 0;
3392 if (ret) {
3393 blk_finish_plug(&plug);
3394 btrfs_set_log_full_commit(trans);
3395 mutex_unlock(&root->log_mutex);
3396 goto out;
3397 }
3398
3399 /*
3400 * We _must_ update under the root->log_mutex in order to make sure we
3401 * have a consistent view of the log root we are trying to commit at
3402 * this moment.
3403 *
3404 * We _must_ copy this into a local copy, because we are not holding the
3405 * log_root_tree->log_mutex yet. This is important because when we
3406 * commit the log_root_tree we must have a consistent view of the
3407 * log_root_tree when we update the super block to point at the
3408 * log_root_tree bytenr. If we update the log_root_tree here we'll race
3409 * with the commit and possibly point at the new block which we may not
3410 * have written out.
3411 */
3412 btrfs_set_root_node(&log->root_item, log->node);
3413 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3414
3415 btrfs_set_root_log_transid(root, root->log_transid + 1);
3416 log->log_transid = root->log_transid;
3417 root->log_start_pid = 0;
3418 /*
3419 * IO has been started, blocks of the log tree have WRITTEN flag set
3420 * in their headers. new modifications of the log will be written to
3421 * new positions. so it's safe to allow log writers to go in.
3422 */
3423 mutex_unlock(&root->log_mutex);
3424
3425 if (btrfs_is_zoned(fs_info)) {
3426 mutex_lock(&fs_info->tree_root->log_mutex);
3427 if (!log_root_tree->node) {
3428 ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3429 if (ret) {
3430 mutex_unlock(&fs_info->tree_root->log_mutex);
3431 blk_finish_plug(&plug);
3432 goto out;
3433 }
3434 }
3435 mutex_unlock(&fs_info->tree_root->log_mutex);
3436 }
3437
3438 btrfs_init_log_ctx(&root_log_ctx, NULL);
3439
3440 mutex_lock(&log_root_tree->log_mutex);
3441
3442 index2 = log_root_tree->log_transid % 2;
3443 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3444 root_log_ctx.log_transid = log_root_tree->log_transid;
3445
3446 /*
3447 * Now we are safe to update the log_root_tree because we're under the
3448 * log_mutex, and we're a current writer so we're holding the commit
3449 * open until we drop the log_mutex.
3450 */
3451 ret = update_log_root(trans, log, &new_root_item);
3452 if (ret) {
3453 list_del_init(&root_log_ctx.list);
3454 blk_finish_plug(&plug);
3455 btrfs_set_log_full_commit(trans);
3456 if (ret != -ENOSPC)
3457 btrfs_err(fs_info,
3458 "failed to update log for root %llu ret %d",
3459 btrfs_root_id(root), ret);
3460 btrfs_wait_tree_log_extents(log, mark);
3461 mutex_unlock(&log_root_tree->log_mutex);
3462 goto out;
3463 }
3464
3465 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3466 blk_finish_plug(&plug);
3467 list_del_init(&root_log_ctx.list);
3468 mutex_unlock(&log_root_tree->log_mutex);
3469 ret = root_log_ctx.log_ret;
3470 goto out;
3471 }
3472
3473 if (atomic_read(&log_root_tree->log_commit[index2])) {
3474 blk_finish_plug(&plug);
3475 ret = btrfs_wait_tree_log_extents(log, mark);
3476 wait_log_commit(log_root_tree,
3477 root_log_ctx.log_transid);
3478 mutex_unlock(&log_root_tree->log_mutex);
3479 if (!ret)
3480 ret = root_log_ctx.log_ret;
3481 goto out;
3482 }
3483 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3484 atomic_set(&log_root_tree->log_commit[index2], 1);
3485
3486 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3487 wait_log_commit(log_root_tree,
3488 root_log_ctx.log_transid - 1);
3489 }
3490
3491 /*
3492 * now that we've moved on to the tree of log tree roots,
3493 * check the full commit flag again
3494 */
3495 if (btrfs_need_log_full_commit(trans)) {
3496 blk_finish_plug(&plug);
3497 btrfs_wait_tree_log_extents(log, mark);
3498 mutex_unlock(&log_root_tree->log_mutex);
3499 ret = BTRFS_LOG_FORCE_COMMIT;
3500 goto out_wake_log_root;
3501 }
3502
3503 ret = btrfs_write_marked_extents(fs_info,
3504 &log_root_tree->dirty_log_pages,
3505 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3506 blk_finish_plug(&plug);
3507 /*
3508 * As described above, -EAGAIN indicates a hole in the extents. We
3509 * cannot wait for these write outs since the waiting cause a
3510 * deadlock. Bail out to the full commit instead.
3511 */
3512 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3513 btrfs_set_log_full_commit(trans);
3514 btrfs_wait_tree_log_extents(log, mark);
3515 mutex_unlock(&log_root_tree->log_mutex);
3516 goto out_wake_log_root;
3517 } else if (ret) {
3518 btrfs_set_log_full_commit(trans);
3519 mutex_unlock(&log_root_tree->log_mutex);
3520 goto out_wake_log_root;
3521 }
3522 ret = btrfs_wait_tree_log_extents(log, mark);
3523 if (!ret)
3524 ret = btrfs_wait_tree_log_extents(log_root_tree,
3525 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3526 if (ret) {
3527 btrfs_set_log_full_commit(trans);
3528 mutex_unlock(&log_root_tree->log_mutex);
3529 goto out_wake_log_root;
3530 }
3531
3532 log_root_start = log_root_tree->node->start;
3533 log_root_level = btrfs_header_level(log_root_tree->node);
3534 log_root_tree->log_transid++;
3535 mutex_unlock(&log_root_tree->log_mutex);
3536
3537 /*
3538 * Here we are guaranteed that nobody is going to write the superblock
3539 * for the current transaction before us and that neither we do write
3540 * our superblock before the previous transaction finishes its commit
3541 * and writes its superblock, because:
3542 *
3543 * 1) We are holding a handle on the current transaction, so no body
3544 * can commit it until we release the handle;
3545 *
3546 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3547 * if the previous transaction is still committing, and hasn't yet
3548 * written its superblock, we wait for it to do it, because a
3549 * transaction commit acquires the tree_log_mutex when the commit
3550 * begins and releases it only after writing its superblock.
3551 */
3552 mutex_lock(&fs_info->tree_log_mutex);
3553
3554 /*
3555 * The previous transaction writeout phase could have failed, and thus
3556 * marked the fs in an error state. We must not commit here, as we
3557 * could have updated our generation in the super_for_commit and
3558 * writing the super here would result in transid mismatches. If there
3559 * is an error here just bail.
3560 */
3561 if (BTRFS_FS_ERROR(fs_info)) {
3562 ret = -EIO;
3563 btrfs_set_log_full_commit(trans);
3564 btrfs_abort_transaction(trans, ret);
3565 mutex_unlock(&fs_info->tree_log_mutex);
3566 goto out_wake_log_root;
3567 }
3568
3569 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3570 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3571 ret = write_all_supers(fs_info, 1);
3572 mutex_unlock(&fs_info->tree_log_mutex);
3573 if (unlikely(ret)) {
3574 btrfs_set_log_full_commit(trans);
3575 btrfs_abort_transaction(trans, ret);
3576 goto out_wake_log_root;
3577 }
3578
3579 /*
3580 * We know there can only be one task here, since we have not yet set
3581 * root->log_commit[index1] to 0 and any task attempting to sync the
3582 * log must wait for the previous log transaction to commit if it's
3583 * still in progress or wait for the current log transaction commit if
3584 * someone else already started it. We use <= and not < because the
3585 * first log transaction has an ID of 0.
3586 */
3587 ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
3588 btrfs_set_root_last_log_commit(root, log_transid);
3589
3590 out_wake_log_root:
3591 mutex_lock(&log_root_tree->log_mutex);
3592 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3593
3594 log_root_tree->log_transid_committed++;
3595 atomic_set(&log_root_tree->log_commit[index2], 0);
3596 mutex_unlock(&log_root_tree->log_mutex);
3597
3598 /*
3599 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3600 * all the updates above are seen by the woken threads. It might not be
3601 * necessary, but proving that seems to be hard.
3602 */
3603 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3604 out:
3605 mutex_lock(&root->log_mutex);
3606 btrfs_remove_all_log_ctxs(root, index1, ret);
3607 root->log_transid_committed++;
3608 atomic_set(&root->log_commit[index1], 0);
3609 mutex_unlock(&root->log_mutex);
3610
3611 /*
3612 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3613 * all the updates above are seen by the woken threads. It might not be
3614 * necessary, but proving that seems to be hard.
3615 */
3616 cond_wake_up(&root->log_commit_wait[index1]);
3617 return ret;
3618 }
3619
free_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * log)3620 static void free_log_tree(struct btrfs_trans_handle *trans,
3621 struct btrfs_root *log)
3622 {
3623 int ret;
3624 struct walk_control wc = {
3625 .free = true,
3626 .process_func = process_one_buffer,
3627 .log = log,
3628 .trans = trans,
3629 };
3630
3631 if (log->node) {
3632 ret = walk_log_tree(&wc);
3633 if (ret) {
3634 /*
3635 * We weren't able to traverse the entire log tree, the
3636 * typical scenario is getting an -EIO when reading an
3637 * extent buffer of the tree, due to a previous writeback
3638 * failure of it.
3639 */
3640 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3641 &log->fs_info->fs_state);
3642
3643 /*
3644 * Some extent buffers of the log tree may still be dirty
3645 * and not yet written back to storage, because we may
3646 * have updates to a log tree without syncing a log tree,
3647 * such as during rename and link operations. So flush
3648 * them out and wait for their writeback to complete, so
3649 * that we properly cleanup their state and pages.
3650 */
3651 btrfs_write_marked_extents(log->fs_info,
3652 &log->dirty_log_pages,
3653 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3654 btrfs_wait_tree_log_extents(log,
3655 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3656
3657 if (trans)
3658 btrfs_abort_transaction(trans, ret);
3659 else
3660 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3661 }
3662 }
3663
3664 btrfs_extent_io_tree_release(&log->dirty_log_pages);
3665 btrfs_extent_io_tree_release(&log->log_csum_range);
3666
3667 btrfs_put_root(log);
3668 }
3669
3670 /*
3671 * free all the extents used by the tree log. This should be called
3672 * at commit time of the full transaction
3673 */
btrfs_free_log(struct btrfs_trans_handle * trans,struct btrfs_root * root)3674 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3675 {
3676 if (root->log_root) {
3677 free_log_tree(trans, root->log_root);
3678 root->log_root = NULL;
3679 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3680 }
3681 return 0;
3682 }
3683
btrfs_free_log_root_tree(struct btrfs_trans_handle * trans,struct btrfs_fs_info * fs_info)3684 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3685 struct btrfs_fs_info *fs_info)
3686 {
3687 if (fs_info->log_root_tree) {
3688 free_log_tree(trans, fs_info->log_root_tree);
3689 fs_info->log_root_tree = NULL;
3690 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3691 }
3692 return 0;
3693 }
3694
mark_inode_as_not_logged(const struct btrfs_trans_handle * trans,struct btrfs_inode * inode)3695 static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
3696 struct btrfs_inode *inode)
3697 {
3698 bool ret = false;
3699
3700 /*
3701 * Do this only if ->logged_trans is still 0 to prevent races with
3702 * concurrent logging as we may see the inode not logged when
3703 * inode_logged() is called but it gets logged after inode_logged() did
3704 * not find it in the log tree and we end up setting ->logged_trans to a
3705 * value less than trans->transid after the concurrent logging task has
3706 * set it to trans->transid. As a consequence, subsequent rename, unlink
3707 * and link operations may end up not logging new names and removing old
3708 * names from the log.
3709 */
3710 spin_lock(&inode->lock);
3711 if (inode->logged_trans == 0)
3712 inode->logged_trans = trans->transid - 1;
3713 else if (inode->logged_trans == trans->transid)
3714 ret = true;
3715 spin_unlock(&inode->lock);
3716
3717 return ret;
3718 }
3719
3720 /*
3721 * Check if an inode was logged in the current transaction. This correctly deals
3722 * with the case where the inode was logged but has a logged_trans of 0, which
3723 * happens if the inode is evicted and loaded again, as logged_trans is an in
3724 * memory only field (not persisted).
3725 *
3726 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3727 * and < 0 on error.
3728 */
inode_logged(const struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path_in)3729 static int inode_logged(const struct btrfs_trans_handle *trans,
3730 struct btrfs_inode *inode,
3731 struct btrfs_path *path_in)
3732 {
3733 struct btrfs_path *path = path_in;
3734 struct btrfs_key key;
3735 int ret;
3736
3737 /*
3738 * Quick lockless call, since once ->logged_trans is set to the current
3739 * transaction, we never set it to a lower value anywhere else.
3740 */
3741 if (data_race(inode->logged_trans) == trans->transid)
3742 return 1;
3743
3744 /*
3745 * If logged_trans is not 0 and not trans->transid, then we know the
3746 * inode was not logged in this transaction, so we can return false
3747 * right away. We take the lock to avoid a race caused by load/store
3748 * tearing with a concurrent btrfs_log_inode() call or a concurrent task
3749 * in this function further below - an update to trans->transid can be
3750 * teared into two 32 bits updates for example, in which case we could
3751 * see a positive value that is not trans->transid and assume the inode
3752 * was not logged when it was.
3753 */
3754 spin_lock(&inode->lock);
3755 if (inode->logged_trans == trans->transid) {
3756 spin_unlock(&inode->lock);
3757 return 1;
3758 } else if (inode->logged_trans > 0) {
3759 spin_unlock(&inode->lock);
3760 return 0;
3761 }
3762 spin_unlock(&inode->lock);
3763
3764 /*
3765 * If no log tree was created for this root in this transaction, then
3766 * the inode can not have been logged in this transaction. In that case
3767 * set logged_trans to anything greater than 0 and less than the current
3768 * transaction's ID, to avoid the search below in a future call in case
3769 * a log tree gets created after this.
3770 */
3771 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
3772 return mark_inode_as_not_logged(trans, inode);
3773
3774 /*
3775 * We have a log tree and the inode's logged_trans is 0. We can't tell
3776 * for sure if the inode was logged before in this transaction by looking
3777 * only at logged_trans. We could be pessimistic and assume it was, but
3778 * that can lead to unnecessarily logging an inode during rename and link
3779 * operations, and then further updating the log in followup rename and
3780 * link operations, specially if it's a directory, which adds latency
3781 * visible to applications doing a series of rename or link operations.
3782 *
3783 * A logged_trans of 0 here can mean several things:
3784 *
3785 * 1) The inode was never logged since the filesystem was mounted, and may
3786 * or may have not been evicted and loaded again;
3787 *
3788 * 2) The inode was logged in a previous transaction, then evicted and
3789 * then loaded again;
3790 *
3791 * 3) The inode was logged in the current transaction, then evicted and
3792 * then loaded again.
3793 *
3794 * For cases 1) and 2) we don't want to return true, but we need to detect
3795 * case 3) and return true. So we do a search in the log root for the inode
3796 * item.
3797 */
3798 key.objectid = btrfs_ino(inode);
3799 key.type = BTRFS_INODE_ITEM_KEY;
3800 key.offset = 0;
3801
3802 if (!path) {
3803 path = btrfs_alloc_path();
3804 if (!path)
3805 return -ENOMEM;
3806 }
3807
3808 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3809
3810 if (path_in)
3811 btrfs_release_path(path);
3812 else
3813 btrfs_free_path(path);
3814
3815 /*
3816 * Logging an inode always results in logging its inode item. So if we
3817 * did not find the item we know the inode was not logged for sure.
3818 */
3819 if (ret < 0) {
3820 return ret;
3821 } else if (ret > 0) {
3822 /*
3823 * Set logged_trans to a value greater than 0 and less then the
3824 * current transaction to avoid doing the search in future calls.
3825 */
3826 return mark_inode_as_not_logged(trans, inode);
3827 }
3828
3829 /*
3830 * The inode was previously logged and then evicted, set logged_trans to
3831 * the current transaction's ID, to avoid future tree searches as long as
3832 * the inode is not evicted again.
3833 */
3834 spin_lock(&inode->lock);
3835 inode->logged_trans = trans->transid;
3836 spin_unlock(&inode->lock);
3837
3838 return 1;
3839 }
3840
3841 /*
3842 * Delete a directory entry from the log if it exists.
3843 *
3844 * Returns < 0 on error
3845 * 1 if the entry does not exists
3846 * 0 if the entry existed and was successfully deleted
3847 */
del_logged_dentry(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,u64 dir_ino,const struct fscrypt_str * name,u64 index)3848 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3849 struct btrfs_root *log,
3850 struct btrfs_path *path,
3851 u64 dir_ino,
3852 const struct fscrypt_str *name,
3853 u64 index)
3854 {
3855 struct btrfs_dir_item *di;
3856
3857 /*
3858 * We only log dir index items of a directory, so we don't need to look
3859 * for dir item keys.
3860 */
3861 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3862 index, name, -1);
3863 if (IS_ERR(di))
3864 return PTR_ERR(di);
3865 else if (!di)
3866 return 1;
3867
3868 /*
3869 * We do not need to update the size field of the directory's
3870 * inode item because on log replay we update the field to reflect
3871 * all existing entries in the directory (see overwrite_item()).
3872 */
3873 return btrfs_del_item(trans, log, path);
3874 }
3875
3876 /*
3877 * If both a file and directory are logged, and unlinks or renames are
3878 * mixed in, we have a few interesting corners:
3879 *
3880 * create file X in dir Y
3881 * link file X to X.link in dir Y
3882 * fsync file X
3883 * unlink file X but leave X.link
3884 * fsync dir Y
3885 *
3886 * After a crash we would expect only X.link to exist. But file X
3887 * didn't get fsync'd again so the log has back refs for X and X.link.
3888 *
3889 * We solve this by removing directory entries and inode backrefs from the
3890 * log when a file that was logged in the current transaction is
3891 * unlinked. Any later fsync will include the updated log entries, and
3892 * we'll be able to reconstruct the proper directory items from backrefs.
3893 *
3894 * This optimizations allows us to avoid relogging the entire inode
3895 * or the entire directory.
3896 */
btrfs_del_dir_entries_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const struct fscrypt_str * name,struct btrfs_inode * dir,u64 index)3897 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root,
3899 const struct fscrypt_str *name,
3900 struct btrfs_inode *dir, u64 index)
3901 {
3902 BTRFS_PATH_AUTO_FREE(path);
3903 int ret;
3904
3905 ret = inode_logged(trans, dir, NULL);
3906 if (ret == 0)
3907 return;
3908 if (ret < 0) {
3909 btrfs_set_log_full_commit(trans);
3910 return;
3911 }
3912
3913 path = btrfs_alloc_path();
3914 if (!path) {
3915 btrfs_set_log_full_commit(trans);
3916 return;
3917 }
3918
3919 ret = join_running_log_trans(root);
3920 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3921 if (WARN_ON(ret))
3922 return;
3923
3924 mutex_lock(&dir->log_mutex);
3925
3926 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3927 name, index);
3928 mutex_unlock(&dir->log_mutex);
3929 if (ret < 0)
3930 btrfs_set_log_full_commit(trans);
3931 btrfs_end_log_trans(root);
3932 }
3933
3934 /* see comments for btrfs_del_dir_entries_in_log */
btrfs_del_inode_ref_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const struct fscrypt_str * name,struct btrfs_inode * inode,u64 dirid)3935 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3936 struct btrfs_root *root,
3937 const struct fscrypt_str *name,
3938 struct btrfs_inode *inode, u64 dirid)
3939 {
3940 struct btrfs_root *log;
3941 int ret;
3942
3943 ret = inode_logged(trans, inode, NULL);
3944 if (ret == 0)
3945 return;
3946 else if (ret < 0) {
3947 btrfs_set_log_full_commit(trans);
3948 return;
3949 }
3950
3951 ret = join_running_log_trans(root);
3952 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3953 if (WARN_ON(ret))
3954 return;
3955 log = root->log_root;
3956 mutex_lock(&inode->log_mutex);
3957
3958 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL);
3959 mutex_unlock(&inode->log_mutex);
3960 if (ret < 0 && ret != -ENOENT)
3961 btrfs_set_log_full_commit(trans);
3962 btrfs_end_log_trans(root);
3963 }
3964
3965 /*
3966 * creates a range item in the log for 'dirid'. first_offset and
3967 * last_offset tell us which parts of the key space the log should
3968 * be considered authoritative for.
3969 */
insert_dir_log_key(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,u64 dirid,u64 first_offset,u64 last_offset)3970 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3971 struct btrfs_root *log,
3972 struct btrfs_path *path,
3973 u64 dirid,
3974 u64 first_offset, u64 last_offset)
3975 {
3976 int ret;
3977 struct btrfs_key key;
3978 struct btrfs_dir_log_item *item;
3979
3980 key.objectid = dirid;
3981 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3982 key.offset = first_offset;
3983 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3984 /*
3985 * -EEXIST is fine and can happen sporadically when we are logging a
3986 * directory and have concurrent insertions in the subvolume's tree for
3987 * items from other inodes and that result in pushing off some dir items
3988 * from one leaf to another in order to accommodate for the new items.
3989 * This results in logging the same dir index range key.
3990 */
3991 if (ret && ret != -EEXIST)
3992 return ret;
3993
3994 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3995 struct btrfs_dir_log_item);
3996 if (ret == -EEXIST) {
3997 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3998
3999 /*
4000 * btrfs_del_dir_entries_in_log() might have been called during
4001 * an unlink between the initial insertion of this key and the
4002 * current update, or we might be logging a single entry deletion
4003 * during a rename, so set the new last_offset to the max value.
4004 */
4005 last_offset = max(last_offset, curr_end);
4006 }
4007 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
4008 btrfs_release_path(path);
4009 return 0;
4010 }
4011
flush_dir_items_batch(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct extent_buffer * src,struct btrfs_path * dst_path,int start_slot,int count)4012 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
4013 struct btrfs_inode *inode,
4014 struct extent_buffer *src,
4015 struct btrfs_path *dst_path,
4016 int start_slot,
4017 int count)
4018 {
4019 struct btrfs_root *log = inode->root->log_root;
4020 char *ins_data = NULL;
4021 struct btrfs_item_batch batch;
4022 struct extent_buffer *dst;
4023 unsigned long src_offset;
4024 unsigned long dst_offset;
4025 u64 last_index;
4026 struct btrfs_key key;
4027 u32 item_size;
4028 int ret;
4029 int i;
4030
4031 ASSERT(count > 0);
4032 batch.nr = count;
4033
4034 if (count == 1) {
4035 btrfs_item_key_to_cpu(src, &key, start_slot);
4036 item_size = btrfs_item_size(src, start_slot);
4037 batch.keys = &key;
4038 batch.data_sizes = &item_size;
4039 batch.total_data_size = item_size;
4040 } else {
4041 struct btrfs_key *ins_keys;
4042 u32 *ins_sizes;
4043
4044 ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
4045 if (!ins_data)
4046 return -ENOMEM;
4047
4048 ins_sizes = (u32 *)ins_data;
4049 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
4050 batch.keys = ins_keys;
4051 batch.data_sizes = ins_sizes;
4052 batch.total_data_size = 0;
4053
4054 for (i = 0; i < count; i++) {
4055 const int slot = start_slot + i;
4056
4057 btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
4058 ins_sizes[i] = btrfs_item_size(src, slot);
4059 batch.total_data_size += ins_sizes[i];
4060 }
4061 }
4062
4063 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4064 if (ret)
4065 goto out;
4066
4067 dst = dst_path->nodes[0];
4068 /*
4069 * Copy all the items in bulk, in a single copy operation. Item data is
4070 * organized such that it's placed at the end of a leaf and from right
4071 * to left. For example, the data for the second item ends at an offset
4072 * that matches the offset where the data for the first item starts, the
4073 * data for the third item ends at an offset that matches the offset
4074 * where the data of the second items starts, and so on.
4075 * Therefore our source and destination start offsets for copy match the
4076 * offsets of the last items (highest slots).
4077 */
4078 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
4079 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
4080 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
4081 btrfs_release_path(dst_path);
4082
4083 last_index = batch.keys[count - 1].offset;
4084 ASSERT(last_index > inode->last_dir_index_offset);
4085
4086 /*
4087 * If for some unexpected reason the last item's index is not greater
4088 * than the last index we logged, warn and force a transaction commit.
4089 */
4090 if (WARN_ON(last_index <= inode->last_dir_index_offset))
4091 ret = BTRFS_LOG_FORCE_COMMIT;
4092 else
4093 inode->last_dir_index_offset = last_index;
4094
4095 if (btrfs_get_first_dir_index_to_log(inode) == 0)
4096 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
4097 out:
4098 kfree(ins_data);
4099
4100 return ret;
4101 }
4102
clone_leaf(struct btrfs_path * path,struct btrfs_log_ctx * ctx)4103 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
4104 {
4105 const int slot = path->slots[0];
4106
4107 if (ctx->scratch_eb) {
4108 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
4109 } else {
4110 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
4111 if (!ctx->scratch_eb)
4112 return -ENOMEM;
4113 }
4114
4115 btrfs_release_path(path);
4116 path->nodes[0] = ctx->scratch_eb;
4117 path->slots[0] = slot;
4118 /*
4119 * Add extra ref to scratch eb so that it is not freed when callers
4120 * release the path, so we can reuse it later if needed.
4121 */
4122 refcount_inc(&ctx->scratch_eb->refs);
4123
4124 return 0;
4125 }
4126
process_dir_items_leaf(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx,u64 * last_old_dentry_offset)4127 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
4128 struct btrfs_inode *inode,
4129 struct btrfs_path *path,
4130 struct btrfs_path *dst_path,
4131 struct btrfs_log_ctx *ctx,
4132 u64 *last_old_dentry_offset)
4133 {
4134 struct btrfs_root *log = inode->root->log_root;
4135 struct extent_buffer *src;
4136 const int nritems = btrfs_header_nritems(path->nodes[0]);
4137 const u64 ino = btrfs_ino(inode);
4138 bool last_found = false;
4139 int batch_start = 0;
4140 int batch_size = 0;
4141 int ret;
4142
4143 /*
4144 * We need to clone the leaf, release the read lock on it, and use the
4145 * clone before modifying the log tree. See the comment at copy_items()
4146 * about why we need to do this.
4147 */
4148 ret = clone_leaf(path, ctx);
4149 if (ret < 0)
4150 return ret;
4151
4152 src = path->nodes[0];
4153
4154 for (int i = path->slots[0]; i < nritems; i++) {
4155 struct btrfs_dir_item *di;
4156 struct btrfs_key key;
4157 int ret;
4158
4159 btrfs_item_key_to_cpu(src, &key, i);
4160
4161 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
4162 last_found = true;
4163 break;
4164 }
4165
4166 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
4167
4168 /*
4169 * Skip ranges of items that consist only of dir item keys created
4170 * in past transactions. However if we find a gap, we must log a
4171 * dir index range item for that gap, so that index keys in that
4172 * gap are deleted during log replay.
4173 */
4174 if (btrfs_dir_transid(src, di) < trans->transid) {
4175 if (key.offset > *last_old_dentry_offset + 1) {
4176 ret = insert_dir_log_key(trans, log, dst_path,
4177 ino, *last_old_dentry_offset + 1,
4178 key.offset - 1);
4179 if (ret < 0)
4180 return ret;
4181 }
4182
4183 *last_old_dentry_offset = key.offset;
4184 continue;
4185 }
4186
4187 /* If we logged this dir index item before, we can skip it. */
4188 if (key.offset <= inode->last_dir_index_offset)
4189 continue;
4190
4191 /*
4192 * We must make sure that when we log a directory entry, the
4193 * corresponding inode, after log replay, has a matching link
4194 * count. For example:
4195 *
4196 * touch foo
4197 * mkdir mydir
4198 * sync
4199 * ln foo mydir/bar
4200 * xfs_io -c "fsync" mydir
4201 * <crash>
4202 * <mount fs and log replay>
4203 *
4204 * Would result in a fsync log that when replayed, our file inode
4205 * would have a link count of 1, but we get two directory entries
4206 * pointing to the same inode. After removing one of the names,
4207 * it would not be possible to remove the other name, which
4208 * resulted always in stale file handle errors, and would not be
4209 * possible to rmdir the parent directory, since its i_size could
4210 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
4211 * resulting in -ENOTEMPTY errors.
4212 */
4213 if (!ctx->log_new_dentries) {
4214 struct btrfs_key di_key;
4215
4216 btrfs_dir_item_key_to_cpu(src, di, &di_key);
4217 if (di_key.type != BTRFS_ROOT_ITEM_KEY)
4218 ctx->log_new_dentries = true;
4219 }
4220
4221 if (batch_size == 0)
4222 batch_start = i;
4223 batch_size++;
4224 }
4225
4226 if (batch_size > 0) {
4227 int ret;
4228
4229 ret = flush_dir_items_batch(trans, inode, src, dst_path,
4230 batch_start, batch_size);
4231 if (ret < 0)
4232 return ret;
4233 }
4234
4235 return last_found ? 1 : 0;
4236 }
4237
4238 /*
4239 * log all the items included in the current transaction for a given
4240 * directory. This also creates the range items in the log tree required
4241 * to replay anything deleted before the fsync
4242 */
log_dir_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx,u64 min_offset,u64 * last_offset_ret)4243 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
4244 struct btrfs_inode *inode,
4245 struct btrfs_path *path,
4246 struct btrfs_path *dst_path,
4247 struct btrfs_log_ctx *ctx,
4248 u64 min_offset, u64 *last_offset_ret)
4249 {
4250 struct btrfs_key min_key;
4251 struct btrfs_root *root = inode->root;
4252 struct btrfs_root *log = root->log_root;
4253 int ret;
4254 u64 last_old_dentry_offset = min_offset - 1;
4255 u64 last_offset = (u64)-1;
4256 u64 ino = btrfs_ino(inode);
4257
4258 min_key.objectid = ino;
4259 min_key.type = BTRFS_DIR_INDEX_KEY;
4260 min_key.offset = min_offset;
4261
4262 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
4263
4264 /*
4265 * we didn't find anything from this transaction, see if there
4266 * is anything at all
4267 */
4268 if (ret != 0 || min_key.objectid != ino ||
4269 min_key.type != BTRFS_DIR_INDEX_KEY) {
4270 min_key.objectid = ino;
4271 min_key.type = BTRFS_DIR_INDEX_KEY;
4272 min_key.offset = (u64)-1;
4273 btrfs_release_path(path);
4274 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4275 if (ret < 0) {
4276 btrfs_release_path(path);
4277 return ret;
4278 }
4279 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4280
4281 /* if ret == 0 there are items for this type,
4282 * create a range to tell us the last key of this type.
4283 * otherwise, there are no items in this directory after
4284 * *min_offset, and we create a range to indicate that.
4285 */
4286 if (ret == 0) {
4287 struct btrfs_key tmp;
4288
4289 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
4290 path->slots[0]);
4291 if (tmp.type == BTRFS_DIR_INDEX_KEY)
4292 last_old_dentry_offset = tmp.offset;
4293 } else if (ret > 0) {
4294 ret = 0;
4295 }
4296
4297 goto done;
4298 }
4299
4300 /* go backward to find any previous key */
4301 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4302 if (ret == 0) {
4303 struct btrfs_key tmp;
4304
4305 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
4306 /*
4307 * The dir index key before the first one we found that needs to
4308 * be logged might be in a previous leaf, and there might be a
4309 * gap between these keys, meaning that we had deletions that
4310 * happened. So the key range item we log (key type
4311 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
4312 * previous key's offset plus 1, so that those deletes are replayed.
4313 */
4314 if (tmp.type == BTRFS_DIR_INDEX_KEY)
4315 last_old_dentry_offset = tmp.offset;
4316 } else if (ret < 0) {
4317 goto done;
4318 }
4319
4320 btrfs_release_path(path);
4321
4322 /*
4323 * Find the first key from this transaction again or the one we were at
4324 * in the loop below in case we had to reschedule. We may be logging the
4325 * directory without holding its VFS lock, which happen when logging new
4326 * dentries (through log_new_dir_dentries()) or in some cases when we
4327 * need to log the parent directory of an inode. This means a dir index
4328 * key might be deleted from the inode's root, and therefore we may not
4329 * find it anymore. If we can't find it, just move to the next key. We
4330 * can not bail out and ignore, because if we do that we will simply
4331 * not log dir index keys that come after the one that was just deleted
4332 * and we can end up logging a dir index range that ends at (u64)-1
4333 * (@last_offset is initialized to that), resulting in removing dir
4334 * entries we should not remove at log replay time.
4335 */
4336 search:
4337 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4338 if (ret > 0) {
4339 ret = btrfs_next_item(root, path);
4340 if (ret > 0) {
4341 /* There are no more keys in the inode's root. */
4342 ret = 0;
4343 goto done;
4344 }
4345 }
4346 if (ret < 0)
4347 goto done;
4348
4349 /*
4350 * we have a block from this transaction, log every item in it
4351 * from our directory
4352 */
4353 while (1) {
4354 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
4355 &last_old_dentry_offset);
4356 if (ret != 0) {
4357 if (ret > 0)
4358 ret = 0;
4359 goto done;
4360 }
4361 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
4362
4363 /*
4364 * look ahead to the next item and see if it is also
4365 * from this directory and from this transaction
4366 */
4367 ret = btrfs_next_leaf(root, path);
4368 if (ret) {
4369 if (ret == 1) {
4370 last_offset = (u64)-1;
4371 ret = 0;
4372 }
4373 goto done;
4374 }
4375 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
4376 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
4377 last_offset = (u64)-1;
4378 goto done;
4379 }
4380 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
4381 /*
4382 * The next leaf was not changed in the current transaction
4383 * and has at least one dir index key.
4384 * We check for the next key because there might have been
4385 * one or more deletions between the last key we logged and
4386 * that next key. So the key range item we log (key type
4387 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
4388 * offset minus 1, so that those deletes are replayed.
4389 */
4390 last_offset = min_key.offset - 1;
4391 goto done;
4392 }
4393 if (need_resched()) {
4394 btrfs_release_path(path);
4395 cond_resched();
4396 goto search;
4397 }
4398 }
4399 done:
4400 btrfs_release_path(path);
4401 btrfs_release_path(dst_path);
4402
4403 if (ret == 0) {
4404 *last_offset_ret = last_offset;
4405 /*
4406 * In case the leaf was changed in the current transaction but
4407 * all its dir items are from a past transaction, the last item
4408 * in the leaf is a dir item and there's no gap between that last
4409 * dir item and the first one on the next leaf (which did not
4410 * change in the current transaction), then we don't need to log
4411 * a range, last_old_dentry_offset is == to last_offset.
4412 */
4413 ASSERT(last_old_dentry_offset <= last_offset);
4414 if (last_old_dentry_offset < last_offset)
4415 ret = insert_dir_log_key(trans, log, path, ino,
4416 last_old_dentry_offset + 1,
4417 last_offset);
4418 }
4419
4420 return ret;
4421 }
4422
4423 /*
4424 * If the inode was logged before and it was evicted, then its
4425 * last_dir_index_offset is 0, so we don't know the value of the last index
4426 * key offset. If that's the case, search for it and update the inode. This
4427 * is to avoid lookups in the log tree every time we try to insert a dir index
4428 * key from a leaf changed in the current transaction, and to allow us to always
4429 * do batch insertions of dir index keys.
4430 */
update_last_dir_index_offset(struct btrfs_inode * inode,struct btrfs_path * path,const struct btrfs_log_ctx * ctx)4431 static int update_last_dir_index_offset(struct btrfs_inode *inode,
4432 struct btrfs_path *path,
4433 const struct btrfs_log_ctx *ctx)
4434 {
4435 const u64 ino = btrfs_ino(inode);
4436 struct btrfs_key key;
4437 int ret;
4438
4439 lockdep_assert_held(&inode->log_mutex);
4440
4441 if (inode->last_dir_index_offset != 0)
4442 return 0;
4443
4444 if (!ctx->logged_before) {
4445 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4446 return 0;
4447 }
4448
4449 key.objectid = ino;
4450 key.type = BTRFS_DIR_INDEX_KEY;
4451 key.offset = (u64)-1;
4452
4453 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4454 /*
4455 * An error happened or we actually have an index key with an offset
4456 * value of (u64)-1. Bail out, we're done.
4457 */
4458 if (ret <= 0)
4459 goto out;
4460
4461 ret = 0;
4462 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4463
4464 /*
4465 * No dir index items, bail out and leave last_dir_index_offset with
4466 * the value right before the first valid index value.
4467 */
4468 if (path->slots[0] == 0)
4469 goto out;
4470
4471 /*
4472 * btrfs_search_slot() left us at one slot beyond the slot with the last
4473 * index key, or beyond the last key of the directory that is not an
4474 * index key. If we have an index key before, set last_dir_index_offset
4475 * to its offset value, otherwise leave it with a value right before the
4476 * first valid index value, as it means we have an empty directory.
4477 */
4478 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4479 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4480 inode->last_dir_index_offset = key.offset;
4481
4482 out:
4483 btrfs_release_path(path);
4484
4485 return ret;
4486 }
4487
4488 /*
4489 * logging directories is very similar to logging inodes, We find all the items
4490 * from the current transaction and write them to the log.
4491 *
4492 * The recovery code scans the directory in the subvolume, and if it finds a
4493 * key in the range logged that is not present in the log tree, then it means
4494 * that dir entry was unlinked during the transaction.
4495 *
4496 * In order for that scan to work, we must include one key smaller than
4497 * the smallest logged by this transaction and one key larger than the largest
4498 * key logged by this transaction.
4499 */
log_directory_changes(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx)4500 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4501 struct btrfs_inode *inode,
4502 struct btrfs_path *path,
4503 struct btrfs_path *dst_path,
4504 struct btrfs_log_ctx *ctx)
4505 {
4506 u64 min_key;
4507 u64 max_key;
4508 int ret;
4509
4510 ret = update_last_dir_index_offset(inode, path, ctx);
4511 if (ret)
4512 return ret;
4513
4514 min_key = BTRFS_DIR_START_INDEX;
4515 max_key = 0;
4516
4517 while (1) {
4518 ret = log_dir_items(trans, inode, path, dst_path,
4519 ctx, min_key, &max_key);
4520 if (ret)
4521 return ret;
4522 if (max_key == (u64)-1)
4523 break;
4524 min_key = max_key + 1;
4525 }
4526
4527 return 0;
4528 }
4529
4530 /*
4531 * a helper function to drop items from the log before we relog an
4532 * inode. max_key_type indicates the highest item type to remove.
4533 * This cannot be run for file data extents because it does not
4534 * free the extents they point to.
4535 */
drop_inode_items(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_inode * inode,int max_key_type)4536 static int drop_inode_items(struct btrfs_trans_handle *trans,
4537 struct btrfs_root *log,
4538 struct btrfs_path *path,
4539 struct btrfs_inode *inode,
4540 int max_key_type)
4541 {
4542 int ret;
4543 struct btrfs_key key;
4544 struct btrfs_key found_key;
4545 int start_slot;
4546
4547 key.objectid = btrfs_ino(inode);
4548 key.type = max_key_type;
4549 key.offset = (u64)-1;
4550
4551 while (1) {
4552 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4553 if (ret < 0) {
4554 break;
4555 } else if (ret > 0) {
4556 if (path->slots[0] == 0)
4557 break;
4558 path->slots[0]--;
4559 }
4560
4561 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4562 path->slots[0]);
4563
4564 if (found_key.objectid != key.objectid)
4565 break;
4566
4567 found_key.offset = 0;
4568 found_key.type = 0;
4569 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4570 if (ret < 0)
4571 break;
4572
4573 ret = btrfs_del_items(trans, log, path, start_slot,
4574 path->slots[0] - start_slot + 1);
4575 /*
4576 * If start slot isn't 0 then we don't need to re-search, we've
4577 * found the last guy with the objectid in this tree.
4578 */
4579 if (ret || start_slot != 0)
4580 break;
4581 btrfs_release_path(path);
4582 }
4583 btrfs_release_path(path);
4584 if (ret > 0)
4585 ret = 0;
4586 return ret;
4587 }
4588
truncate_inode_items(struct btrfs_trans_handle * trans,struct btrfs_root * log_root,struct btrfs_inode * inode,u64 new_size,u32 min_type)4589 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4590 struct btrfs_root *log_root,
4591 struct btrfs_inode *inode,
4592 u64 new_size, u32 min_type)
4593 {
4594 struct btrfs_truncate_control control = {
4595 .new_size = new_size,
4596 .ino = btrfs_ino(inode),
4597 .min_type = min_type,
4598 .skip_ref_updates = true,
4599 };
4600
4601 return btrfs_truncate_inode_items(trans, log_root, &control);
4602 }
4603
fill_inode_item(struct btrfs_trans_handle * trans,struct extent_buffer * leaf,struct btrfs_inode_item * item,struct inode * inode,bool log_inode_only,u64 logged_isize)4604 static void fill_inode_item(struct btrfs_trans_handle *trans,
4605 struct extent_buffer *leaf,
4606 struct btrfs_inode_item *item,
4607 struct inode *inode, bool log_inode_only,
4608 u64 logged_isize)
4609 {
4610 u64 flags;
4611
4612 if (log_inode_only) {
4613 /* set the generation to zero so the recover code
4614 * can tell the difference between an logging
4615 * just to say 'this inode exists' and a logging
4616 * to say 'update this inode with these values'
4617 */
4618 btrfs_set_inode_generation(leaf, item, 0);
4619 btrfs_set_inode_size(leaf, item, logged_isize);
4620 } else {
4621 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
4622 btrfs_set_inode_size(leaf, item, inode->i_size);
4623 }
4624
4625 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
4626 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
4627 btrfs_set_inode_mode(leaf, item, inode->i_mode);
4628 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
4629
4630 btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
4631 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
4632
4633 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
4634 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
4635
4636 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
4637 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
4638
4639 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
4640 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4641
4642 /*
4643 * We do not need to set the nbytes field, in fact during a fast fsync
4644 * its value may not even be correct, since a fast fsync does not wait
4645 * for ordered extent completion, which is where we update nbytes, it
4646 * only waits for writeback to complete. During log replay as we find
4647 * file extent items and replay them, we adjust the nbytes field of the
4648 * inode item in subvolume tree as needed (see overwrite_item()).
4649 */
4650
4651 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
4652 btrfs_set_inode_transid(leaf, item, trans->transid);
4653 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
4654 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4655 BTRFS_I(inode)->ro_flags);
4656 btrfs_set_inode_flags(leaf, item, flags);
4657 btrfs_set_inode_block_group(leaf, item, 0);
4658 }
4659
log_inode_item(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_inode * inode,bool inode_item_dropped)4660 static int log_inode_item(struct btrfs_trans_handle *trans,
4661 struct btrfs_root *log, struct btrfs_path *path,
4662 struct btrfs_inode *inode, bool inode_item_dropped)
4663 {
4664 struct btrfs_inode_item *inode_item;
4665 struct btrfs_key key;
4666 int ret;
4667
4668 btrfs_get_inode_key(inode, &key);
4669 /*
4670 * If we are doing a fast fsync and the inode was logged before in the
4671 * current transaction, then we know the inode was previously logged and
4672 * it exists in the log tree. For performance reasons, in this case use
4673 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4674 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4675 * contention in case there are concurrent fsyncs for other inodes of the
4676 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4677 * already exists can also result in unnecessarily splitting a leaf.
4678 */
4679 if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4680 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
4681 ASSERT(ret <= 0);
4682 if (ret > 0)
4683 ret = -ENOENT;
4684 } else {
4685 /*
4686 * This means it is the first fsync in the current transaction,
4687 * so the inode item is not in the log and we need to insert it.
4688 * We can never get -EEXIST because we are only called for a fast
4689 * fsync and in case an inode eviction happens after the inode was
4690 * logged before in the current transaction, when we load again
4691 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4692 * flags and set ->logged_trans to 0.
4693 */
4694 ret = btrfs_insert_empty_item(trans, log, path, &key,
4695 sizeof(*inode_item));
4696 ASSERT(ret != -EEXIST);
4697 }
4698 if (ret)
4699 return ret;
4700 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4701 struct btrfs_inode_item);
4702 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4703 false, 0);
4704 btrfs_release_path(path);
4705 return 0;
4706 }
4707
log_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,struct btrfs_ordered_sum * sums)4708 static int log_csums(struct btrfs_trans_handle *trans,
4709 struct btrfs_inode *inode,
4710 struct btrfs_root *log_root,
4711 struct btrfs_ordered_sum *sums)
4712 {
4713 const u64 lock_end = sums->logical + sums->len - 1;
4714 struct extent_state *cached_state = NULL;
4715 int ret;
4716
4717 /*
4718 * If this inode was not used for reflink operations in the current
4719 * transaction with new extents, then do the fast path, no need to
4720 * worry about logging checksum items with overlapping ranges.
4721 */
4722 if (inode->last_reflink_trans < trans->transid)
4723 return btrfs_csum_file_blocks(trans, log_root, sums);
4724
4725 /*
4726 * Serialize logging for checksums. This is to avoid racing with the
4727 * same checksum being logged by another task that is logging another
4728 * file which happens to refer to the same extent as well. Such races
4729 * can leave checksum items in the log with overlapping ranges.
4730 */
4731 ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4732 &cached_state);
4733 if (ret)
4734 return ret;
4735 /*
4736 * Due to extent cloning, we might have logged a csum item that covers a
4737 * subrange of a cloned extent, and later we can end up logging a csum
4738 * item for a larger subrange of the same extent or the entire range.
4739 * This would leave csum items in the log tree that cover the same range
4740 * and break the searches for checksums in the log tree, resulting in
4741 * some checksums missing in the fs/subvolume tree. So just delete (or
4742 * trim and adjust) any existing csum items in the log for this range.
4743 */
4744 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4745 if (!ret)
4746 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4747
4748 btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4749 &cached_state);
4750
4751 return ret;
4752 }
4753
copy_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * dst_path,struct btrfs_path * src_path,int start_slot,int nr,int inode_only,u64 logged_isize,struct btrfs_log_ctx * ctx)4754 static noinline int copy_items(struct btrfs_trans_handle *trans,
4755 struct btrfs_inode *inode,
4756 struct btrfs_path *dst_path,
4757 struct btrfs_path *src_path,
4758 int start_slot, int nr, int inode_only,
4759 u64 logged_isize, struct btrfs_log_ctx *ctx)
4760 {
4761 struct btrfs_root *log = inode->root->log_root;
4762 struct btrfs_file_extent_item *extent;
4763 struct extent_buffer *src;
4764 int ret;
4765 struct btrfs_key *ins_keys;
4766 u32 *ins_sizes;
4767 struct btrfs_item_batch batch;
4768 char *ins_data;
4769 int dst_index;
4770 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4771 const u64 i_size = i_size_read(&inode->vfs_inode);
4772
4773 /*
4774 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4775 * use the clone. This is because otherwise we would be changing the log
4776 * tree, to insert items from the subvolume tree or insert csum items,
4777 * while holding a read lock on a leaf from the subvolume tree, which
4778 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4779 *
4780 * 1) Modifying the log tree triggers an extent buffer allocation while
4781 * holding a write lock on a parent extent buffer from the log tree.
4782 * Allocating the pages for an extent buffer, or the extent buffer
4783 * struct, can trigger inode eviction and finally the inode eviction
4784 * will trigger a release/remove of a delayed node, which requires
4785 * taking the delayed node's mutex;
4786 *
4787 * 2) Allocating a metadata extent for a log tree can trigger the async
4788 * reclaim thread and make us wait for it to release enough space and
4789 * unblock our reservation ticket. The reclaim thread can start
4790 * flushing delayed items, and that in turn results in the need to
4791 * lock delayed node mutexes and in the need to write lock extent
4792 * buffers of a subvolume tree - all this while holding a write lock
4793 * on the parent extent buffer in the log tree.
4794 *
4795 * So one task in scenario 1) running in parallel with another task in
4796 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4797 * node mutex while having a read lock on a leaf from the subvolume,
4798 * while the other is holding the delayed node's mutex and wants to
4799 * write lock the same subvolume leaf for flushing delayed items.
4800 */
4801 ret = clone_leaf(src_path, ctx);
4802 if (ret < 0)
4803 return ret;
4804
4805 src = src_path->nodes[0];
4806
4807 ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
4808 if (!ins_data)
4809 return -ENOMEM;
4810
4811 ins_sizes = (u32 *)ins_data;
4812 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4813 batch.keys = ins_keys;
4814 batch.data_sizes = ins_sizes;
4815 batch.total_data_size = 0;
4816 batch.nr = 0;
4817
4818 dst_index = 0;
4819 for (int i = 0; i < nr; i++) {
4820 const int src_slot = start_slot + i;
4821 struct btrfs_root *csum_root;
4822 struct btrfs_ordered_sum *sums;
4823 struct btrfs_ordered_sum *sums_next;
4824 LIST_HEAD(ordered_sums);
4825 u64 disk_bytenr;
4826 u64 disk_num_bytes;
4827 u64 extent_offset;
4828 u64 extent_num_bytes;
4829 bool is_old_extent;
4830
4831 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4832
4833 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4834 goto add_to_batch;
4835
4836 extent = btrfs_item_ptr(src, src_slot,
4837 struct btrfs_file_extent_item);
4838
4839 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4840 trans->transid);
4841
4842 /*
4843 * Don't copy extents from past generations. That would make us
4844 * log a lot more metadata for common cases like doing only a
4845 * few random writes into a file and then fsync it for the first
4846 * time or after the full sync flag is set on the inode. We can
4847 * get leaves full of extent items, most of which are from past
4848 * generations, so we can skip them - as long as the inode has
4849 * not been the target of a reflink operation in this transaction,
4850 * as in that case it might have had file extent items with old
4851 * generations copied into it. We also must always log prealloc
4852 * extents that start at or beyond eof, otherwise we would lose
4853 * them on log replay.
4854 */
4855 if (is_old_extent &&
4856 ins_keys[dst_index].offset < i_size &&
4857 inode->last_reflink_trans < trans->transid)
4858 continue;
4859
4860 if (skip_csum)
4861 goto add_to_batch;
4862
4863 /* Only regular extents have checksums. */
4864 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4865 goto add_to_batch;
4866
4867 /*
4868 * If it's an extent created in a past transaction, then its
4869 * checksums are already accessible from the committed csum tree,
4870 * no need to log them.
4871 */
4872 if (is_old_extent)
4873 goto add_to_batch;
4874
4875 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4876 /* If it's an explicit hole, there are no checksums. */
4877 if (disk_bytenr == 0)
4878 goto add_to_batch;
4879
4880 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4881
4882 if (btrfs_file_extent_compression(src, extent)) {
4883 extent_offset = 0;
4884 extent_num_bytes = disk_num_bytes;
4885 } else {
4886 extent_offset = btrfs_file_extent_offset(src, extent);
4887 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4888 }
4889
4890 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4891 disk_bytenr += extent_offset;
4892 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4893 disk_bytenr + extent_num_bytes - 1,
4894 &ordered_sums, false);
4895 if (ret < 0)
4896 goto out;
4897 ret = 0;
4898
4899 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4900 if (!ret)
4901 ret = log_csums(trans, inode, log, sums);
4902 list_del(&sums->list);
4903 kfree(sums);
4904 }
4905 if (ret)
4906 goto out;
4907
4908 add_to_batch:
4909 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4910 batch.total_data_size += ins_sizes[dst_index];
4911 batch.nr++;
4912 dst_index++;
4913 }
4914
4915 /*
4916 * We have a leaf full of old extent items that don't need to be logged,
4917 * so we don't need to do anything.
4918 */
4919 if (batch.nr == 0)
4920 goto out;
4921
4922 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4923 if (ret)
4924 goto out;
4925
4926 dst_index = 0;
4927 for (int i = 0; i < nr; i++) {
4928 const int src_slot = start_slot + i;
4929 const int dst_slot = dst_path->slots[0] + dst_index;
4930 struct btrfs_key key;
4931 unsigned long src_offset;
4932 unsigned long dst_offset;
4933
4934 /*
4935 * We're done, all the remaining items in the source leaf
4936 * correspond to old file extent items.
4937 */
4938 if (dst_index >= batch.nr)
4939 break;
4940
4941 btrfs_item_key_to_cpu(src, &key, src_slot);
4942
4943 if (key.type != BTRFS_EXTENT_DATA_KEY)
4944 goto copy_item;
4945
4946 extent = btrfs_item_ptr(src, src_slot,
4947 struct btrfs_file_extent_item);
4948
4949 /* See the comment in the previous loop, same logic. */
4950 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4951 key.offset < i_size &&
4952 inode->last_reflink_trans < trans->transid)
4953 continue;
4954
4955 copy_item:
4956 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4957 src_offset = btrfs_item_ptr_offset(src, src_slot);
4958
4959 if (key.type == BTRFS_INODE_ITEM_KEY) {
4960 struct btrfs_inode_item *inode_item;
4961
4962 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4963 struct btrfs_inode_item);
4964 fill_inode_item(trans, dst_path->nodes[0], inode_item,
4965 &inode->vfs_inode,
4966 inode_only == LOG_INODE_EXISTS,
4967 logged_isize);
4968 } else {
4969 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4970 src_offset, ins_sizes[dst_index]);
4971 }
4972
4973 dst_index++;
4974 }
4975
4976 btrfs_release_path(dst_path);
4977 out:
4978 kfree(ins_data);
4979
4980 return ret;
4981 }
4982
extent_cmp(void * priv,const struct list_head * a,const struct list_head * b)4983 static int extent_cmp(void *priv, const struct list_head *a,
4984 const struct list_head *b)
4985 {
4986 const struct extent_map *em1, *em2;
4987
4988 em1 = list_entry(a, struct extent_map, list);
4989 em2 = list_entry(b, struct extent_map, list);
4990
4991 if (em1->start < em2->start)
4992 return -1;
4993 else if (em1->start > em2->start)
4994 return 1;
4995 return 0;
4996 }
4997
log_extent_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,const struct extent_map * em,struct btrfs_log_ctx * ctx)4998 static int log_extent_csums(struct btrfs_trans_handle *trans,
4999 struct btrfs_inode *inode,
5000 struct btrfs_root *log_root,
5001 const struct extent_map *em,
5002 struct btrfs_log_ctx *ctx)
5003 {
5004 struct btrfs_ordered_extent *ordered;
5005 struct btrfs_root *csum_root;
5006 u64 block_start;
5007 u64 csum_offset;
5008 u64 csum_len;
5009 u64 mod_start = em->start;
5010 u64 mod_len = em->len;
5011 LIST_HEAD(ordered_sums);
5012 int ret = 0;
5013
5014 if (inode->flags & BTRFS_INODE_NODATASUM ||
5015 (em->flags & EXTENT_FLAG_PREALLOC) ||
5016 em->disk_bytenr == EXTENT_MAP_HOLE)
5017 return 0;
5018
5019 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
5020 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
5021 const u64 mod_end = mod_start + mod_len;
5022 struct btrfs_ordered_sum *sums;
5023
5024 if (mod_len == 0)
5025 break;
5026
5027 if (ordered_end <= mod_start)
5028 continue;
5029 if (mod_end <= ordered->file_offset)
5030 break;
5031
5032 /*
5033 * We are going to copy all the csums on this ordered extent, so
5034 * go ahead and adjust mod_start and mod_len in case this ordered
5035 * extent has already been logged.
5036 */
5037 if (ordered->file_offset > mod_start) {
5038 if (ordered_end >= mod_end)
5039 mod_len = ordered->file_offset - mod_start;
5040 /*
5041 * If we have this case
5042 *
5043 * |--------- logged extent ---------|
5044 * |----- ordered extent ----|
5045 *
5046 * Just don't mess with mod_start and mod_len, we'll
5047 * just end up logging more csums than we need and it
5048 * will be ok.
5049 */
5050 } else {
5051 if (ordered_end < mod_end) {
5052 mod_len = mod_end - ordered_end;
5053 mod_start = ordered_end;
5054 } else {
5055 mod_len = 0;
5056 }
5057 }
5058
5059 /*
5060 * To keep us from looping for the above case of an ordered
5061 * extent that falls inside of the logged extent.
5062 */
5063 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
5064 continue;
5065
5066 list_for_each_entry(sums, &ordered->list, list) {
5067 ret = log_csums(trans, inode, log_root, sums);
5068 if (ret)
5069 return ret;
5070 }
5071 }
5072
5073 /* We're done, found all csums in the ordered extents. */
5074 if (mod_len == 0)
5075 return 0;
5076
5077 /* If we're compressed we have to save the entire range of csums. */
5078 if (btrfs_extent_map_is_compressed(em)) {
5079 csum_offset = 0;
5080 csum_len = em->disk_num_bytes;
5081 } else {
5082 csum_offset = mod_start - em->start;
5083 csum_len = mod_len;
5084 }
5085
5086 /* block start is already adjusted for the file extent offset. */
5087 block_start = btrfs_extent_map_block_start(em);
5088 csum_root = btrfs_csum_root(trans->fs_info, block_start);
5089 ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
5090 block_start + csum_offset + csum_len - 1,
5091 &ordered_sums, false);
5092 if (ret < 0)
5093 return ret;
5094 ret = 0;
5095
5096 while (!list_empty(&ordered_sums)) {
5097 struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
5098 struct btrfs_ordered_sum,
5099 list);
5100 if (!ret)
5101 ret = log_csums(trans, inode, log_root, sums);
5102 list_del(&sums->list);
5103 kfree(sums);
5104 }
5105
5106 return ret;
5107 }
5108
log_one_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,const struct extent_map * em,struct btrfs_path * path,struct btrfs_log_ctx * ctx)5109 static int log_one_extent(struct btrfs_trans_handle *trans,
5110 struct btrfs_inode *inode,
5111 const struct extent_map *em,
5112 struct btrfs_path *path,
5113 struct btrfs_log_ctx *ctx)
5114 {
5115 struct btrfs_drop_extents_args drop_args = { 0 };
5116 struct btrfs_root *log = inode->root->log_root;
5117 struct btrfs_file_extent_item fi = { 0 };
5118 struct extent_buffer *leaf;
5119 struct btrfs_key key;
5120 enum btrfs_compression_type compress_type;
5121 u64 extent_offset = em->offset;
5122 u64 block_start = btrfs_extent_map_block_start(em);
5123 u64 block_len;
5124 int ret;
5125
5126 btrfs_set_stack_file_extent_generation(&fi, trans->transid);
5127 if (em->flags & EXTENT_FLAG_PREALLOC)
5128 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
5129 else
5130 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
5131
5132 block_len = em->disk_num_bytes;
5133 compress_type = btrfs_extent_map_compression(em);
5134 if (compress_type != BTRFS_COMPRESS_NONE) {
5135 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
5136 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
5137 } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
5138 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
5139 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
5140 }
5141
5142 btrfs_set_stack_file_extent_offset(&fi, extent_offset);
5143 btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
5144 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
5145 btrfs_set_stack_file_extent_compression(&fi, compress_type);
5146
5147 ret = log_extent_csums(trans, inode, log, em, ctx);
5148 if (ret)
5149 return ret;
5150
5151 /*
5152 * If this is the first time we are logging the inode in the current
5153 * transaction, we can avoid btrfs_drop_extents(), which is expensive
5154 * because it does a deletion search, which always acquires write locks
5155 * for extent buffers at levels 2, 1 and 0. This not only wastes time
5156 * but also adds significant contention in a log tree, since log trees
5157 * are small, with a root at level 2 or 3 at most, due to their short
5158 * life span.
5159 */
5160 if (ctx->logged_before) {
5161 drop_args.path = path;
5162 drop_args.start = em->start;
5163 drop_args.end = em->start + em->len;
5164 drop_args.replace_extent = true;
5165 drop_args.extent_item_size = sizeof(fi);
5166 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
5167 if (ret)
5168 return ret;
5169 }
5170
5171 if (!drop_args.extent_inserted) {
5172 key.objectid = btrfs_ino(inode);
5173 key.type = BTRFS_EXTENT_DATA_KEY;
5174 key.offset = em->start;
5175
5176 ret = btrfs_insert_empty_item(trans, log, path, &key,
5177 sizeof(fi));
5178 if (ret)
5179 return ret;
5180 }
5181 leaf = path->nodes[0];
5182 write_extent_buffer(leaf, &fi,
5183 btrfs_item_ptr_offset(leaf, path->slots[0]),
5184 sizeof(fi));
5185
5186 btrfs_release_path(path);
5187
5188 return ret;
5189 }
5190
5191 /*
5192 * Log all prealloc extents beyond the inode's i_size to make sure we do not
5193 * lose them after doing a full/fast fsync and replaying the log. We scan the
5194 * subvolume's root instead of iterating the inode's extent map tree because
5195 * otherwise we can log incorrect extent items based on extent map conversion.
5196 * That can happen due to the fact that extent maps are merged when they
5197 * are not in the extent map tree's list of modified extents.
5198 */
btrfs_log_prealloc_extents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_log_ctx * ctx)5199 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
5200 struct btrfs_inode *inode,
5201 struct btrfs_path *path,
5202 struct btrfs_log_ctx *ctx)
5203 {
5204 struct btrfs_root *root = inode->root;
5205 struct btrfs_key key;
5206 const u64 i_size = i_size_read(&inode->vfs_inode);
5207 const u64 ino = btrfs_ino(inode);
5208 BTRFS_PATH_AUTO_FREE(dst_path);
5209 bool dropped_extents = false;
5210 u64 truncate_offset = i_size;
5211 struct extent_buffer *leaf;
5212 int slot;
5213 int ins_nr = 0;
5214 int start_slot = 0;
5215 int ret;
5216
5217 if (!(inode->flags & BTRFS_INODE_PREALLOC))
5218 return 0;
5219
5220 key.objectid = ino;
5221 key.type = BTRFS_EXTENT_DATA_KEY;
5222 key.offset = i_size;
5223 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5224 if (ret < 0)
5225 goto out;
5226
5227 /*
5228 * We must check if there is a prealloc extent that starts before the
5229 * i_size and crosses the i_size boundary. This is to ensure later we
5230 * truncate down to the end of that extent and not to the i_size, as
5231 * otherwise we end up losing part of the prealloc extent after a log
5232 * replay and with an implicit hole if there is another prealloc extent
5233 * that starts at an offset beyond i_size.
5234 */
5235 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
5236 if (ret < 0)
5237 goto out;
5238
5239 if (ret == 0) {
5240 struct btrfs_file_extent_item *ei;
5241
5242 leaf = path->nodes[0];
5243 slot = path->slots[0];
5244 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5245
5246 if (btrfs_file_extent_type(leaf, ei) ==
5247 BTRFS_FILE_EXTENT_PREALLOC) {
5248 u64 extent_end;
5249
5250 btrfs_item_key_to_cpu(leaf, &key, slot);
5251 extent_end = key.offset +
5252 btrfs_file_extent_num_bytes(leaf, ei);
5253
5254 if (extent_end > i_size)
5255 truncate_offset = extent_end;
5256 }
5257 } else {
5258 ret = 0;
5259 }
5260
5261 while (true) {
5262 leaf = path->nodes[0];
5263 slot = path->slots[0];
5264
5265 if (slot >= btrfs_header_nritems(leaf)) {
5266 if (ins_nr > 0) {
5267 ret = copy_items(trans, inode, dst_path, path,
5268 start_slot, ins_nr, 1, 0, ctx);
5269 if (ret < 0)
5270 goto out;
5271 ins_nr = 0;
5272 }
5273 ret = btrfs_next_leaf(root, path);
5274 if (ret < 0)
5275 goto out;
5276 if (ret > 0) {
5277 ret = 0;
5278 break;
5279 }
5280 continue;
5281 }
5282
5283 btrfs_item_key_to_cpu(leaf, &key, slot);
5284 if (key.objectid > ino)
5285 break;
5286 if (WARN_ON_ONCE(key.objectid < ino) ||
5287 key.type < BTRFS_EXTENT_DATA_KEY ||
5288 key.offset < i_size) {
5289 path->slots[0]++;
5290 continue;
5291 }
5292 /*
5293 * Avoid overlapping items in the log tree. The first time we
5294 * get here, get rid of everything from a past fsync. After
5295 * that, if the current extent starts before the end of the last
5296 * extent we copied, truncate the last one. This can happen if
5297 * an ordered extent completion modifies the subvolume tree
5298 * while btrfs_next_leaf() has the tree unlocked.
5299 */
5300 if (!dropped_extents || key.offset < truncate_offset) {
5301 ret = truncate_inode_items(trans, root->log_root, inode,
5302 min(key.offset, truncate_offset),
5303 BTRFS_EXTENT_DATA_KEY);
5304 if (ret)
5305 goto out;
5306 dropped_extents = true;
5307 }
5308 truncate_offset = btrfs_file_extent_end(path);
5309 if (ins_nr == 0)
5310 start_slot = slot;
5311 ins_nr++;
5312 path->slots[0]++;
5313 if (!dst_path) {
5314 dst_path = btrfs_alloc_path();
5315 if (!dst_path) {
5316 ret = -ENOMEM;
5317 goto out;
5318 }
5319 }
5320 }
5321 if (ins_nr > 0)
5322 ret = copy_items(trans, inode, dst_path, path,
5323 start_slot, ins_nr, 1, 0, ctx);
5324 out:
5325 btrfs_release_path(path);
5326 return ret;
5327 }
5328
btrfs_log_changed_extents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_log_ctx * ctx)5329 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
5330 struct btrfs_inode *inode,
5331 struct btrfs_path *path,
5332 struct btrfs_log_ctx *ctx)
5333 {
5334 struct btrfs_ordered_extent *ordered;
5335 struct btrfs_ordered_extent *tmp;
5336 struct extent_map *em, *n;
5337 LIST_HEAD(extents);
5338 struct extent_map_tree *tree = &inode->extent_tree;
5339 int ret = 0;
5340 int num = 0;
5341
5342 write_lock(&tree->lock);
5343
5344 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
5345 list_del_init(&em->list);
5346 /*
5347 * Just an arbitrary number, this can be really CPU intensive
5348 * once we start getting a lot of extents, and really once we
5349 * have a bunch of extents we just want to commit since it will
5350 * be faster.
5351 */
5352 if (++num > 32768) {
5353 list_del_init(&tree->modified_extents);
5354 ret = -EFBIG;
5355 goto process;
5356 }
5357
5358 if (em->generation < trans->transid)
5359 continue;
5360
5361 /* We log prealloc extents beyond eof later. */
5362 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
5363 em->start >= i_size_read(&inode->vfs_inode))
5364 continue;
5365
5366 /* Need a ref to keep it from getting evicted from cache */
5367 refcount_inc(&em->refs);
5368 em->flags |= EXTENT_FLAG_LOGGING;
5369 list_add_tail(&em->list, &extents);
5370 num++;
5371 }
5372
5373 list_sort(NULL, &extents, extent_cmp);
5374 process:
5375 while (!list_empty(&extents)) {
5376 em = list_first_entry(&extents, struct extent_map, list);
5377
5378 list_del_init(&em->list);
5379
5380 /*
5381 * If we had an error we just need to delete everybody from our
5382 * private list.
5383 */
5384 if (ret) {
5385 btrfs_clear_em_logging(inode, em);
5386 btrfs_free_extent_map(em);
5387 continue;
5388 }
5389
5390 write_unlock(&tree->lock);
5391
5392 ret = log_one_extent(trans, inode, em, path, ctx);
5393 write_lock(&tree->lock);
5394 btrfs_clear_em_logging(inode, em);
5395 btrfs_free_extent_map(em);
5396 }
5397 WARN_ON(!list_empty(&extents));
5398 write_unlock(&tree->lock);
5399
5400 if (!ret)
5401 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
5402 if (ret)
5403 return ret;
5404
5405 /*
5406 * We have logged all extents successfully, now make sure the commit of
5407 * the current transaction waits for the ordered extents to complete
5408 * before it commits and wipes out the log trees, otherwise we would
5409 * lose data if an ordered extents completes after the transaction
5410 * commits and a power failure happens after the transaction commit.
5411 */
5412 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5413 list_del_init(&ordered->log_list);
5414 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
5415
5416 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5417 spin_lock_irq(&inode->ordered_tree_lock);
5418 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5419 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
5420 atomic_inc(&trans->transaction->pending_ordered);
5421 }
5422 spin_unlock_irq(&inode->ordered_tree_lock);
5423 }
5424 btrfs_put_ordered_extent(ordered);
5425 }
5426
5427 return 0;
5428 }
5429
logged_inode_size(struct btrfs_root * log,struct btrfs_inode * inode,struct btrfs_path * path,u64 * size_ret)5430 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5431 struct btrfs_path *path, u64 *size_ret)
5432 {
5433 struct btrfs_key key;
5434 int ret;
5435
5436 key.objectid = btrfs_ino(inode);
5437 key.type = BTRFS_INODE_ITEM_KEY;
5438 key.offset = 0;
5439
5440 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5441 if (ret < 0) {
5442 return ret;
5443 } else if (ret > 0) {
5444 *size_ret = 0;
5445 } else {
5446 struct btrfs_inode_item *item;
5447
5448 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5449 struct btrfs_inode_item);
5450 *size_ret = btrfs_inode_size(path->nodes[0], item);
5451 /*
5452 * If the in-memory inode's i_size is smaller then the inode
5453 * size stored in the btree, return the inode's i_size, so
5454 * that we get a correct inode size after replaying the log
5455 * when before a power failure we had a shrinking truncate
5456 * followed by addition of a new name (rename / new hard link).
5457 * Otherwise return the inode size from the btree, to avoid
5458 * data loss when replaying a log due to previously doing a
5459 * write that expands the inode's size and logging a new name
5460 * immediately after.
5461 */
5462 if (*size_ret > inode->vfs_inode.i_size)
5463 *size_ret = inode->vfs_inode.i_size;
5464 }
5465
5466 btrfs_release_path(path);
5467 return 0;
5468 }
5469
5470 /*
5471 * At the moment we always log all xattrs. This is to figure out at log replay
5472 * time which xattrs must have their deletion replayed. If a xattr is missing
5473 * in the log tree and exists in the fs/subvol tree, we delete it. This is
5474 * because if a xattr is deleted, the inode is fsynced and a power failure
5475 * happens, causing the log to be replayed the next time the fs is mounted,
5476 * we want the xattr to not exist anymore (same behaviour as other filesystems
5477 * with a journal, ext3/4, xfs, f2fs, etc).
5478 */
btrfs_log_all_xattrs(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx)5479 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5480 struct btrfs_inode *inode,
5481 struct btrfs_path *path,
5482 struct btrfs_path *dst_path,
5483 struct btrfs_log_ctx *ctx)
5484 {
5485 struct btrfs_root *root = inode->root;
5486 int ret;
5487 struct btrfs_key key;
5488 const u64 ino = btrfs_ino(inode);
5489 int ins_nr = 0;
5490 int start_slot = 0;
5491 bool found_xattrs = false;
5492
5493 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5494 return 0;
5495
5496 key.objectid = ino;
5497 key.type = BTRFS_XATTR_ITEM_KEY;
5498 key.offset = 0;
5499
5500 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5501 if (ret < 0)
5502 return ret;
5503
5504 while (true) {
5505 int slot = path->slots[0];
5506 struct extent_buffer *leaf = path->nodes[0];
5507 int nritems = btrfs_header_nritems(leaf);
5508
5509 if (slot >= nritems) {
5510 if (ins_nr > 0) {
5511 ret = copy_items(trans, inode, dst_path, path,
5512 start_slot, ins_nr, 1, 0, ctx);
5513 if (ret < 0)
5514 return ret;
5515 ins_nr = 0;
5516 }
5517 ret = btrfs_next_leaf(root, path);
5518 if (ret < 0)
5519 return ret;
5520 else if (ret > 0)
5521 break;
5522 continue;
5523 }
5524
5525 btrfs_item_key_to_cpu(leaf, &key, slot);
5526 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5527 break;
5528
5529 if (ins_nr == 0)
5530 start_slot = slot;
5531 ins_nr++;
5532 path->slots[0]++;
5533 found_xattrs = true;
5534 cond_resched();
5535 }
5536 if (ins_nr > 0) {
5537 ret = copy_items(trans, inode, dst_path, path,
5538 start_slot, ins_nr, 1, 0, ctx);
5539 if (ret < 0)
5540 return ret;
5541 }
5542
5543 if (!found_xattrs)
5544 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5545
5546 return 0;
5547 }
5548
5549 /*
5550 * When using the NO_HOLES feature if we punched a hole that causes the
5551 * deletion of entire leafs or all the extent items of the first leaf (the one
5552 * that contains the inode item and references) we may end up not processing
5553 * any extents, because there are no leafs with a generation matching the
5554 * current transaction that have extent items for our inode. So we need to find
5555 * if any holes exist and then log them. We also need to log holes after any
5556 * truncate operation that changes the inode's size.
5557 */
btrfs_log_holes(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path)5558 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5559 struct btrfs_inode *inode,
5560 struct btrfs_path *path)
5561 {
5562 struct btrfs_root *root = inode->root;
5563 struct btrfs_fs_info *fs_info = root->fs_info;
5564 struct btrfs_key key;
5565 const u64 ino = btrfs_ino(inode);
5566 const u64 i_size = i_size_read(&inode->vfs_inode);
5567 u64 prev_extent_end = 0;
5568 int ret;
5569
5570 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5571 return 0;
5572
5573 key.objectid = ino;
5574 key.type = BTRFS_EXTENT_DATA_KEY;
5575 key.offset = 0;
5576
5577 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5578 if (ret < 0)
5579 return ret;
5580
5581 while (true) {
5582 struct extent_buffer *leaf = path->nodes[0];
5583
5584 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5585 ret = btrfs_next_leaf(root, path);
5586 if (ret < 0)
5587 return ret;
5588 if (ret > 0) {
5589 ret = 0;
5590 break;
5591 }
5592 leaf = path->nodes[0];
5593 }
5594
5595 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5596 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5597 break;
5598
5599 /* We have a hole, log it. */
5600 if (prev_extent_end < key.offset) {
5601 const u64 hole_len = key.offset - prev_extent_end;
5602
5603 /*
5604 * Release the path to avoid deadlocks with other code
5605 * paths that search the root while holding locks on
5606 * leafs from the log root.
5607 */
5608 btrfs_release_path(path);
5609 ret = btrfs_insert_hole_extent(trans, root->log_root,
5610 ino, prev_extent_end,
5611 hole_len);
5612 if (ret < 0)
5613 return ret;
5614
5615 /*
5616 * Search for the same key again in the root. Since it's
5617 * an extent item and we are holding the inode lock, the
5618 * key must still exist. If it doesn't just emit warning
5619 * and return an error to fall back to a transaction
5620 * commit.
5621 */
5622 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5623 if (ret < 0)
5624 return ret;
5625 if (WARN_ON(ret > 0))
5626 return -ENOENT;
5627 leaf = path->nodes[0];
5628 }
5629
5630 prev_extent_end = btrfs_file_extent_end(path);
5631 path->slots[0]++;
5632 cond_resched();
5633 }
5634
5635 if (prev_extent_end < i_size) {
5636 u64 hole_len;
5637
5638 btrfs_release_path(path);
5639 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5640 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5641 prev_extent_end, hole_len);
5642 if (ret < 0)
5643 return ret;
5644 }
5645
5646 return 0;
5647 }
5648
5649 /*
5650 * When we are logging a new inode X, check if it doesn't have a reference that
5651 * matches the reference from some other inode Y created in a past transaction
5652 * and that was renamed in the current transaction. If we don't do this, then at
5653 * log replay time we can lose inode Y (and all its files if it's a directory):
5654 *
5655 * mkdir /mnt/x
5656 * echo "hello world" > /mnt/x/foobar
5657 * sync
5658 * mv /mnt/x /mnt/y
5659 * mkdir /mnt/x # or touch /mnt/x
5660 * xfs_io -c fsync /mnt/x
5661 * <power fail>
5662 * mount fs, trigger log replay
5663 *
5664 * After the log replay procedure, we would lose the first directory and all its
5665 * files (file foobar).
5666 * For the case where inode Y is not a directory we simply end up losing it:
5667 *
5668 * echo "123" > /mnt/foo
5669 * sync
5670 * mv /mnt/foo /mnt/bar
5671 * echo "abc" > /mnt/foo
5672 * xfs_io -c fsync /mnt/foo
5673 * <power fail>
5674 *
5675 * We also need this for cases where a snapshot entry is replaced by some other
5676 * entry (file or directory) otherwise we end up with an unreplayable log due to
5677 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5678 * if it were a regular entry:
5679 *
5680 * mkdir /mnt/x
5681 * btrfs subvolume snapshot /mnt /mnt/x/snap
5682 * btrfs subvolume delete /mnt/x/snap
5683 * rmdir /mnt/x
5684 * mkdir /mnt/x
5685 * fsync /mnt/x or fsync some new file inside it
5686 * <power fail>
5687 *
5688 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5689 * the same transaction.
5690 */
btrfs_check_ref_name_override(struct extent_buffer * eb,const int slot,const struct btrfs_key * key,struct btrfs_inode * inode,u64 * other_ino,u64 * other_parent)5691 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5692 const int slot,
5693 const struct btrfs_key *key,
5694 struct btrfs_inode *inode,
5695 u64 *other_ino, u64 *other_parent)
5696 {
5697 int ret;
5698 BTRFS_PATH_AUTO_FREE(search_path);
5699 char *name = NULL;
5700 u32 name_len = 0;
5701 u32 item_size = btrfs_item_size(eb, slot);
5702 u32 cur_offset = 0;
5703 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5704
5705 search_path = btrfs_alloc_path();
5706 if (!search_path)
5707 return -ENOMEM;
5708 search_path->search_commit_root = 1;
5709 search_path->skip_locking = 1;
5710
5711 while (cur_offset < item_size) {
5712 u64 parent;
5713 u32 this_name_len;
5714 u32 this_len;
5715 unsigned long name_ptr;
5716 struct btrfs_dir_item *di;
5717 struct fscrypt_str name_str;
5718
5719 if (key->type == BTRFS_INODE_REF_KEY) {
5720 struct btrfs_inode_ref *iref;
5721
5722 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5723 parent = key->offset;
5724 this_name_len = btrfs_inode_ref_name_len(eb, iref);
5725 name_ptr = (unsigned long)(iref + 1);
5726 this_len = sizeof(*iref) + this_name_len;
5727 } else {
5728 struct btrfs_inode_extref *extref;
5729
5730 extref = (struct btrfs_inode_extref *)(ptr +
5731 cur_offset);
5732 parent = btrfs_inode_extref_parent(eb, extref);
5733 this_name_len = btrfs_inode_extref_name_len(eb, extref);
5734 name_ptr = (unsigned long)&extref->name;
5735 this_len = sizeof(*extref) + this_name_len;
5736 }
5737
5738 if (this_name_len > name_len) {
5739 char *new_name;
5740
5741 new_name = krealloc(name, this_name_len, GFP_NOFS);
5742 if (!new_name) {
5743 ret = -ENOMEM;
5744 goto out;
5745 }
5746 name_len = this_name_len;
5747 name = new_name;
5748 }
5749
5750 read_extent_buffer(eb, name, name_ptr, this_name_len);
5751
5752 name_str.name = name;
5753 name_str.len = this_name_len;
5754 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5755 parent, &name_str, 0);
5756 if (di && !IS_ERR(di)) {
5757 struct btrfs_key di_key;
5758
5759 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5760 di, &di_key);
5761 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5762 if (di_key.objectid != key->objectid) {
5763 ret = 1;
5764 *other_ino = di_key.objectid;
5765 *other_parent = parent;
5766 } else {
5767 ret = 0;
5768 }
5769 } else {
5770 ret = -EAGAIN;
5771 }
5772 goto out;
5773 } else if (IS_ERR(di)) {
5774 ret = PTR_ERR(di);
5775 goto out;
5776 }
5777 btrfs_release_path(search_path);
5778
5779 cur_offset += this_len;
5780 }
5781 ret = 0;
5782 out:
5783 kfree(name);
5784 return ret;
5785 }
5786
5787 /*
5788 * Check if we need to log an inode. This is used in contexts where while
5789 * logging an inode we need to log another inode (either that it exists or in
5790 * full mode). This is used instead of btrfs_inode_in_log() because the later
5791 * requires the inode to be in the log and have the log transaction committed,
5792 * while here we do not care if the log transaction was already committed - our
5793 * caller will commit the log later - and we want to avoid logging an inode
5794 * multiple times when multiple tasks have joined the same log transaction.
5795 */
need_log_inode(const struct btrfs_trans_handle * trans,struct btrfs_inode * inode)5796 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5797 struct btrfs_inode *inode)
5798 {
5799 /*
5800 * If a directory was not modified, no dentries added or removed, we can
5801 * and should avoid logging it.
5802 */
5803 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5804 return false;
5805
5806 /*
5807 * If this inode does not have new/updated/deleted xattrs since the last
5808 * time it was logged and is flagged as logged in the current transaction,
5809 * we can skip logging it. As for new/deleted names, those are updated in
5810 * the log by link/unlink/rename operations.
5811 * In case the inode was logged and then evicted and reloaded, its
5812 * logged_trans will be 0, in which case we have to fully log it since
5813 * logged_trans is a transient field, not persisted.
5814 */
5815 if (inode_logged(trans, inode, NULL) == 1 &&
5816 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5817 return false;
5818
5819 return true;
5820 }
5821
5822 struct btrfs_dir_list {
5823 u64 ino;
5824 struct list_head list;
5825 };
5826
5827 /*
5828 * Log the inodes of the new dentries of a directory.
5829 * See process_dir_items_leaf() for details about why it is needed.
5830 * This is a recursive operation - if an existing dentry corresponds to a
5831 * directory, that directory's new entries are logged too (same behaviour as
5832 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5833 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5834 * complains about the following circular lock dependency / possible deadlock:
5835 *
5836 * CPU0 CPU1
5837 * ---- ----
5838 * lock(&type->i_mutex_dir_key#3/2);
5839 * lock(sb_internal#2);
5840 * lock(&type->i_mutex_dir_key#3/2);
5841 * lock(&sb->s_type->i_mutex_key#14);
5842 *
5843 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5844 * sb_start_intwrite() in btrfs_start_transaction().
5845 * Not acquiring the VFS lock of the inodes is still safe because:
5846 *
5847 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5848 * that while logging the inode new references (names) are added or removed
5849 * from the inode, leaving the logged inode item with a link count that does
5850 * not match the number of logged inode reference items. This is fine because
5851 * at log replay time we compute the real number of links and correct the
5852 * link count in the inode item (see replay_one_buffer() and
5853 * link_to_fixup_dir());
5854 *
5855 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5856 * while logging the inode's items new index items (key type
5857 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5858 * has a size that doesn't match the sum of the lengths of all the logged
5859 * names - this is ok, not a problem, because at log replay time we set the
5860 * directory's i_size to the correct value (see replay_one_name() and
5861 * overwrite_item()).
5862 */
log_new_dir_dentries(struct btrfs_trans_handle * trans,struct btrfs_inode * start_inode,struct btrfs_log_ctx * ctx)5863 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5864 struct btrfs_inode *start_inode,
5865 struct btrfs_log_ctx *ctx)
5866 {
5867 struct btrfs_root *root = start_inode->root;
5868 struct btrfs_path *path;
5869 LIST_HEAD(dir_list);
5870 struct btrfs_dir_list *dir_elem;
5871 u64 ino = btrfs_ino(start_inode);
5872 struct btrfs_inode *curr_inode = start_inode;
5873 int ret = 0;
5874
5875 /*
5876 * If we are logging a new name, as part of a link or rename operation,
5877 * don't bother logging new dentries, as we just want to log the names
5878 * of an inode and that any new parents exist.
5879 */
5880 if (ctx->logging_new_name)
5881 return 0;
5882
5883 path = btrfs_alloc_path();
5884 if (!path)
5885 return -ENOMEM;
5886
5887 /* Pairs with btrfs_add_delayed_iput below. */
5888 ihold(&curr_inode->vfs_inode);
5889
5890 while (true) {
5891 struct btrfs_key key;
5892 struct btrfs_key found_key;
5893 u64 next_index;
5894 bool continue_curr_inode = true;
5895 int iter_ret;
5896
5897 key.objectid = ino;
5898 key.type = BTRFS_DIR_INDEX_KEY;
5899 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5900 next_index = key.offset;
5901 again:
5902 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5903 struct extent_buffer *leaf = path->nodes[0];
5904 struct btrfs_dir_item *di;
5905 struct btrfs_key di_key;
5906 struct btrfs_inode *di_inode;
5907 int log_mode = LOG_INODE_EXISTS;
5908 int type;
5909
5910 if (found_key.objectid != ino ||
5911 found_key.type != BTRFS_DIR_INDEX_KEY) {
5912 continue_curr_inode = false;
5913 break;
5914 }
5915
5916 next_index = found_key.offset + 1;
5917
5918 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5919 type = btrfs_dir_ftype(leaf, di);
5920 if (btrfs_dir_transid(leaf, di) < trans->transid)
5921 continue;
5922 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5923 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5924 continue;
5925
5926 btrfs_release_path(path);
5927 di_inode = btrfs_iget_logging(di_key.objectid, root);
5928 if (IS_ERR(di_inode)) {
5929 ret = PTR_ERR(di_inode);
5930 goto out;
5931 }
5932
5933 if (!need_log_inode(trans, di_inode)) {
5934 btrfs_add_delayed_iput(di_inode);
5935 break;
5936 }
5937
5938 ctx->log_new_dentries = false;
5939 if (type == BTRFS_FT_DIR)
5940 log_mode = LOG_INODE_ALL;
5941 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
5942 btrfs_add_delayed_iput(di_inode);
5943 if (ret)
5944 goto out;
5945 if (ctx->log_new_dentries) {
5946 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5947 if (!dir_elem) {
5948 ret = -ENOMEM;
5949 goto out;
5950 }
5951 dir_elem->ino = di_key.objectid;
5952 list_add_tail(&dir_elem->list, &dir_list);
5953 }
5954 break;
5955 }
5956
5957 btrfs_release_path(path);
5958
5959 if (iter_ret < 0) {
5960 ret = iter_ret;
5961 goto out;
5962 } else if (iter_ret > 0) {
5963 continue_curr_inode = false;
5964 } else {
5965 key = found_key;
5966 }
5967
5968 if (continue_curr_inode && key.offset < (u64)-1) {
5969 key.offset++;
5970 goto again;
5971 }
5972
5973 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5974
5975 if (list_empty(&dir_list))
5976 break;
5977
5978 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5979 ino = dir_elem->ino;
5980 list_del(&dir_elem->list);
5981 kfree(dir_elem);
5982
5983 btrfs_add_delayed_iput(curr_inode);
5984
5985 curr_inode = btrfs_iget_logging(ino, root);
5986 if (IS_ERR(curr_inode)) {
5987 ret = PTR_ERR(curr_inode);
5988 curr_inode = NULL;
5989 break;
5990 }
5991 }
5992 out:
5993 btrfs_free_path(path);
5994 if (curr_inode)
5995 btrfs_add_delayed_iput(curr_inode);
5996
5997 if (ret) {
5998 struct btrfs_dir_list *next;
5999
6000 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
6001 kfree(dir_elem);
6002 }
6003
6004 return ret;
6005 }
6006
6007 struct btrfs_ino_list {
6008 u64 ino;
6009 u64 parent;
6010 struct list_head list;
6011 };
6012
free_conflicting_inodes(struct btrfs_log_ctx * ctx)6013 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
6014 {
6015 struct btrfs_ino_list *curr;
6016 struct btrfs_ino_list *next;
6017
6018 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
6019 list_del(&curr->list);
6020 kfree(curr);
6021 }
6022 }
6023
conflicting_inode_is_dir(struct btrfs_root * root,u64 ino,struct btrfs_path * path)6024 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
6025 struct btrfs_path *path)
6026 {
6027 struct btrfs_key key;
6028 int ret;
6029
6030 key.objectid = ino;
6031 key.type = BTRFS_INODE_ITEM_KEY;
6032 key.offset = 0;
6033
6034 path->search_commit_root = 1;
6035 path->skip_locking = 1;
6036
6037 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6038 if (WARN_ON_ONCE(ret > 0)) {
6039 /*
6040 * We have previously found the inode through the commit root
6041 * so this should not happen. If it does, just error out and
6042 * fallback to a transaction commit.
6043 */
6044 ret = -ENOENT;
6045 } else if (ret == 0) {
6046 struct btrfs_inode_item *item;
6047
6048 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6049 struct btrfs_inode_item);
6050 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
6051 ret = 1;
6052 }
6053
6054 btrfs_release_path(path);
6055 path->search_commit_root = 0;
6056 path->skip_locking = 0;
6057
6058 return ret;
6059 }
6060
add_conflicting_inode(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,u64 ino,u64 parent,struct btrfs_log_ctx * ctx)6061 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
6062 struct btrfs_root *root,
6063 struct btrfs_path *path,
6064 u64 ino, u64 parent,
6065 struct btrfs_log_ctx *ctx)
6066 {
6067 struct btrfs_ino_list *ino_elem;
6068 struct btrfs_inode *inode;
6069
6070 /*
6071 * It's rare to have a lot of conflicting inodes, in practice it is not
6072 * common to have more than 1 or 2. We don't want to collect too many,
6073 * as we could end up logging too many inodes (even if only in
6074 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
6075 * commits.
6076 */
6077 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
6078 return BTRFS_LOG_FORCE_COMMIT;
6079
6080 inode = btrfs_iget_logging(ino, root);
6081 /*
6082 * If the other inode that had a conflicting dir entry was deleted in
6083 * the current transaction then we either:
6084 *
6085 * 1) Log the parent directory (later after adding it to the list) if
6086 * the inode is a directory. This is because it may be a deleted
6087 * subvolume/snapshot or it may be a regular directory that had
6088 * deleted subvolumes/snapshots (or subdirectories that had them),
6089 * and at the moment we can't deal with dropping subvolumes/snapshots
6090 * during log replay. So we just log the parent, which will result in
6091 * a fallback to a transaction commit if we are dealing with those
6092 * cases (last_unlink_trans will match the current transaction);
6093 *
6094 * 2) Do nothing if it's not a directory. During log replay we simply
6095 * unlink the conflicting dentry from the parent directory and then
6096 * add the dentry for our inode. Like this we can avoid logging the
6097 * parent directory (and maybe fallback to a transaction commit in
6098 * case it has a last_unlink_trans == trans->transid, due to moving
6099 * some inode from it to some other directory).
6100 */
6101 if (IS_ERR(inode)) {
6102 int ret = PTR_ERR(inode);
6103
6104 if (ret != -ENOENT)
6105 return ret;
6106
6107 ret = conflicting_inode_is_dir(root, ino, path);
6108 /* Not a directory or we got an error. */
6109 if (ret <= 0)
6110 return ret;
6111
6112 /* Conflicting inode is a directory, so we'll log its parent. */
6113 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6114 if (!ino_elem)
6115 return -ENOMEM;
6116 ino_elem->ino = ino;
6117 ino_elem->parent = parent;
6118 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
6119 ctx->num_conflict_inodes++;
6120
6121 return 0;
6122 }
6123
6124 /*
6125 * If the inode was already logged skip it - otherwise we can hit an
6126 * infinite loop. Example:
6127 *
6128 * From the commit root (previous transaction) we have the following
6129 * inodes:
6130 *
6131 * inode 257 a directory
6132 * inode 258 with references "zz" and "zz_link" on inode 257
6133 * inode 259 with reference "a" on inode 257
6134 *
6135 * And in the current (uncommitted) transaction we have:
6136 *
6137 * inode 257 a directory, unchanged
6138 * inode 258 with references "a" and "a2" on inode 257
6139 * inode 259 with reference "zz_link" on inode 257
6140 * inode 261 with reference "zz" on inode 257
6141 *
6142 * When logging inode 261 the following infinite loop could
6143 * happen if we don't skip already logged inodes:
6144 *
6145 * - we detect inode 258 as a conflicting inode, with inode 261
6146 * on reference "zz", and log it;
6147 *
6148 * - we detect inode 259 as a conflicting inode, with inode 258
6149 * on reference "a", and log it;
6150 *
6151 * - we detect inode 258 as a conflicting inode, with inode 259
6152 * on reference "zz_link", and log it - again! After this we
6153 * repeat the above steps forever.
6154 *
6155 * Here we can use need_log_inode() because we only need to log the
6156 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
6157 * so that the log ends up with the new name and without the old name.
6158 */
6159 if (!need_log_inode(trans, inode)) {
6160 btrfs_add_delayed_iput(inode);
6161 return 0;
6162 }
6163
6164 btrfs_add_delayed_iput(inode);
6165
6166 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6167 if (!ino_elem)
6168 return -ENOMEM;
6169 ino_elem->ino = ino;
6170 ino_elem->parent = parent;
6171 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
6172 ctx->num_conflict_inodes++;
6173
6174 return 0;
6175 }
6176
log_conflicting_inodes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)6177 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
6178 struct btrfs_root *root,
6179 struct btrfs_log_ctx *ctx)
6180 {
6181 int ret = 0;
6182
6183 /*
6184 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
6185 * otherwise we could have unbounded recursion of btrfs_log_inode()
6186 * calls. This check guarantees we can have only 1 level of recursion.
6187 */
6188 if (ctx->logging_conflict_inodes)
6189 return 0;
6190
6191 ctx->logging_conflict_inodes = true;
6192
6193 /*
6194 * New conflicting inodes may be found and added to the list while we
6195 * are logging a conflicting inode, so keep iterating while the list is
6196 * not empty.
6197 */
6198 while (!list_empty(&ctx->conflict_inodes)) {
6199 struct btrfs_ino_list *curr;
6200 struct btrfs_inode *inode;
6201 u64 ino;
6202 u64 parent;
6203
6204 curr = list_first_entry(&ctx->conflict_inodes,
6205 struct btrfs_ino_list, list);
6206 ino = curr->ino;
6207 parent = curr->parent;
6208 list_del(&curr->list);
6209 kfree(curr);
6210
6211 inode = btrfs_iget_logging(ino, root);
6212 /*
6213 * If the other inode that had a conflicting dir entry was
6214 * deleted in the current transaction, we need to log its parent
6215 * directory. See the comment at add_conflicting_inode().
6216 */
6217 if (IS_ERR(inode)) {
6218 ret = PTR_ERR(inode);
6219 if (ret != -ENOENT)
6220 break;
6221
6222 inode = btrfs_iget_logging(parent, root);
6223 if (IS_ERR(inode)) {
6224 ret = PTR_ERR(inode);
6225 break;
6226 }
6227
6228 /*
6229 * Always log the directory, we cannot make this
6230 * conditional on need_log_inode() because the directory
6231 * might have been logged in LOG_INODE_EXISTS mode or
6232 * the dir index of the conflicting inode is not in a
6233 * dir index key range logged for the directory. So we
6234 * must make sure the deletion is recorded.
6235 */
6236 ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
6237 btrfs_add_delayed_iput(inode);
6238 if (ret)
6239 break;
6240 continue;
6241 }
6242
6243 /*
6244 * Here we can use need_log_inode() because we only need to log
6245 * the inode in LOG_INODE_EXISTS mode and rename operations
6246 * update the log, so that the log ends up with the new name and
6247 * without the old name.
6248 *
6249 * We did this check at add_conflicting_inode(), but here we do
6250 * it again because if some other task logged the inode after
6251 * that, we can avoid doing it again.
6252 */
6253 if (!need_log_inode(trans, inode)) {
6254 btrfs_add_delayed_iput(inode);
6255 continue;
6256 }
6257
6258 /*
6259 * We are safe logging the other inode without acquiring its
6260 * lock as long as we log with the LOG_INODE_EXISTS mode. We
6261 * are safe against concurrent renames of the other inode as
6262 * well because during a rename we pin the log and update the
6263 * log with the new name before we unpin it.
6264 */
6265 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
6266 btrfs_add_delayed_iput(inode);
6267 if (ret)
6268 break;
6269 }
6270
6271 ctx->logging_conflict_inodes = false;
6272 if (ret)
6273 free_conflicting_inodes(ctx);
6274
6275 return ret;
6276 }
6277
copy_inode_items_to_log(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_key * min_key,const struct btrfs_key * max_key,struct btrfs_path * path,struct btrfs_path * dst_path,const u64 logged_isize,const int inode_only,struct btrfs_log_ctx * ctx,bool * need_log_inode_item)6278 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
6279 struct btrfs_inode *inode,
6280 struct btrfs_key *min_key,
6281 const struct btrfs_key *max_key,
6282 struct btrfs_path *path,
6283 struct btrfs_path *dst_path,
6284 const u64 logged_isize,
6285 const int inode_only,
6286 struct btrfs_log_ctx *ctx,
6287 bool *need_log_inode_item)
6288 {
6289 const u64 i_size = i_size_read(&inode->vfs_inode);
6290 struct btrfs_root *root = inode->root;
6291 int ins_start_slot = 0;
6292 int ins_nr = 0;
6293 int ret;
6294
6295 while (1) {
6296 ret = btrfs_search_forward(root, min_key, path, trans->transid);
6297 if (ret < 0)
6298 return ret;
6299 if (ret > 0) {
6300 ret = 0;
6301 break;
6302 }
6303 again:
6304 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
6305 if (min_key->objectid != max_key->objectid)
6306 break;
6307 if (min_key->type > max_key->type)
6308 break;
6309
6310 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
6311 *need_log_inode_item = false;
6312 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
6313 min_key->offset >= i_size) {
6314 /*
6315 * Extents at and beyond eof are logged with
6316 * btrfs_log_prealloc_extents().
6317 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
6318 * and no keys greater than that, so bail out.
6319 */
6320 break;
6321 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
6322 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
6323 (inode->generation == trans->transid ||
6324 ctx->logging_conflict_inodes)) {
6325 u64 other_ino = 0;
6326 u64 other_parent = 0;
6327
6328 ret = btrfs_check_ref_name_override(path->nodes[0],
6329 path->slots[0], min_key, inode,
6330 &other_ino, &other_parent);
6331 if (ret < 0) {
6332 return ret;
6333 } else if (ret > 0 &&
6334 other_ino != btrfs_ino(ctx->inode)) {
6335 if (ins_nr > 0) {
6336 ins_nr++;
6337 } else {
6338 ins_nr = 1;
6339 ins_start_slot = path->slots[0];
6340 }
6341 ret = copy_items(trans, inode, dst_path, path,
6342 ins_start_slot, ins_nr,
6343 inode_only, logged_isize, ctx);
6344 if (ret < 0)
6345 return ret;
6346 ins_nr = 0;
6347
6348 btrfs_release_path(path);
6349 ret = add_conflicting_inode(trans, root, path,
6350 other_ino,
6351 other_parent, ctx);
6352 if (ret)
6353 return ret;
6354 goto next_key;
6355 }
6356 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
6357 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
6358 if (ins_nr == 0)
6359 goto next_slot;
6360 ret = copy_items(trans, inode, dst_path, path,
6361 ins_start_slot,
6362 ins_nr, inode_only, logged_isize, ctx);
6363 if (ret < 0)
6364 return ret;
6365 ins_nr = 0;
6366 goto next_slot;
6367 }
6368
6369 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
6370 ins_nr++;
6371 goto next_slot;
6372 } else if (!ins_nr) {
6373 ins_start_slot = path->slots[0];
6374 ins_nr = 1;
6375 goto next_slot;
6376 }
6377
6378 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6379 ins_nr, inode_only, logged_isize, ctx);
6380 if (ret < 0)
6381 return ret;
6382 ins_nr = 1;
6383 ins_start_slot = path->slots[0];
6384 next_slot:
6385 path->slots[0]++;
6386 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
6387 btrfs_item_key_to_cpu(path->nodes[0], min_key,
6388 path->slots[0]);
6389 goto again;
6390 }
6391 if (ins_nr) {
6392 ret = copy_items(trans, inode, dst_path, path,
6393 ins_start_slot, ins_nr, inode_only,
6394 logged_isize, ctx);
6395 if (ret < 0)
6396 return ret;
6397 ins_nr = 0;
6398 }
6399 btrfs_release_path(path);
6400 next_key:
6401 if (min_key->offset < (u64)-1) {
6402 min_key->offset++;
6403 } else if (min_key->type < max_key->type) {
6404 min_key->type++;
6405 min_key->offset = 0;
6406 } else {
6407 break;
6408 }
6409
6410 /*
6411 * We may process many leaves full of items for our inode, so
6412 * avoid monopolizing a cpu for too long by rescheduling while
6413 * not holding locks on any tree.
6414 */
6415 cond_resched();
6416 }
6417 if (ins_nr) {
6418 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6419 ins_nr, inode_only, logged_isize, ctx);
6420 if (ret)
6421 return ret;
6422 }
6423
6424 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6425 /*
6426 * Release the path because otherwise we might attempt to double
6427 * lock the same leaf with btrfs_log_prealloc_extents() below.
6428 */
6429 btrfs_release_path(path);
6430 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6431 }
6432
6433 return ret;
6434 }
6435
insert_delayed_items_batch(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,const struct btrfs_item_batch * batch,const struct btrfs_delayed_item * first_item)6436 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6437 struct btrfs_root *log,
6438 struct btrfs_path *path,
6439 const struct btrfs_item_batch *batch,
6440 const struct btrfs_delayed_item *first_item)
6441 {
6442 const struct btrfs_delayed_item *curr = first_item;
6443 int ret;
6444
6445 ret = btrfs_insert_empty_items(trans, log, path, batch);
6446 if (ret)
6447 return ret;
6448
6449 for (int i = 0; i < batch->nr; i++) {
6450 char *data_ptr;
6451
6452 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
6453 write_extent_buffer(path->nodes[0], &curr->data,
6454 (unsigned long)data_ptr, curr->data_len);
6455 curr = list_next_entry(curr, log_list);
6456 path->slots[0]++;
6457 }
6458
6459 btrfs_release_path(path);
6460
6461 return 0;
6462 }
6463
log_delayed_insertion_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_ins_list,struct btrfs_log_ctx * ctx)6464 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6465 struct btrfs_inode *inode,
6466 struct btrfs_path *path,
6467 const struct list_head *delayed_ins_list,
6468 struct btrfs_log_ctx *ctx)
6469 {
6470 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6471 const int max_batch_size = 195;
6472 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6473 const u64 ino = btrfs_ino(inode);
6474 struct btrfs_root *log = inode->root->log_root;
6475 struct btrfs_item_batch batch = {
6476 .nr = 0,
6477 .total_data_size = 0,
6478 };
6479 const struct btrfs_delayed_item *first = NULL;
6480 const struct btrfs_delayed_item *curr;
6481 char *ins_data;
6482 struct btrfs_key *ins_keys;
6483 u32 *ins_sizes;
6484 u64 curr_batch_size = 0;
6485 int batch_idx = 0;
6486 int ret;
6487
6488 /* We are adding dir index items to the log tree. */
6489 lockdep_assert_held(&inode->log_mutex);
6490
6491 /*
6492 * We collect delayed items before copying index keys from the subvolume
6493 * to the log tree. However just after we collected them, they may have
6494 * been flushed (all of them or just some of them), and therefore we
6495 * could have copied them from the subvolume tree to the log tree.
6496 * So find the first delayed item that was not yet logged (they are
6497 * sorted by index number).
6498 */
6499 list_for_each_entry(curr, delayed_ins_list, log_list) {
6500 if (curr->index > inode->last_dir_index_offset) {
6501 first = curr;
6502 break;
6503 }
6504 }
6505
6506 /* Empty list or all delayed items were already logged. */
6507 if (!first)
6508 return 0;
6509
6510 ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
6511 if (!ins_data)
6512 return -ENOMEM;
6513 ins_sizes = (u32 *)ins_data;
6514 batch.data_sizes = ins_sizes;
6515 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6516 batch.keys = ins_keys;
6517
6518 curr = first;
6519 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6520 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6521
6522 if (curr_batch_size + curr_size > leaf_data_size ||
6523 batch.nr == max_batch_size) {
6524 ret = insert_delayed_items_batch(trans, log, path,
6525 &batch, first);
6526 if (ret)
6527 goto out;
6528 batch_idx = 0;
6529 batch.nr = 0;
6530 batch.total_data_size = 0;
6531 curr_batch_size = 0;
6532 first = curr;
6533 }
6534
6535 ins_sizes[batch_idx] = curr->data_len;
6536 ins_keys[batch_idx].objectid = ino;
6537 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6538 ins_keys[batch_idx].offset = curr->index;
6539 curr_batch_size += curr_size;
6540 batch.total_data_size += curr->data_len;
6541 batch.nr++;
6542 batch_idx++;
6543 curr = list_next_entry(curr, log_list);
6544 }
6545
6546 ASSERT(batch.nr >= 1);
6547 ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6548
6549 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6550 log_list);
6551 inode->last_dir_index_offset = curr->index;
6552 out:
6553 kfree(ins_data);
6554
6555 return ret;
6556 }
6557
log_delayed_deletions_full(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6558 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6559 struct btrfs_inode *inode,
6560 struct btrfs_path *path,
6561 const struct list_head *delayed_del_list,
6562 struct btrfs_log_ctx *ctx)
6563 {
6564 const u64 ino = btrfs_ino(inode);
6565 const struct btrfs_delayed_item *curr;
6566
6567 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6568 log_list);
6569
6570 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6571 u64 first_dir_index = curr->index;
6572 u64 last_dir_index;
6573 const struct btrfs_delayed_item *next;
6574 int ret;
6575
6576 /*
6577 * Find a range of consecutive dir index items to delete. Like
6578 * this we log a single dir range item spanning several contiguous
6579 * dir items instead of logging one range item per dir index item.
6580 */
6581 next = list_next_entry(curr, log_list);
6582 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6583 if (next->index != curr->index + 1)
6584 break;
6585 curr = next;
6586 next = list_next_entry(next, log_list);
6587 }
6588
6589 last_dir_index = curr->index;
6590 ASSERT(last_dir_index >= first_dir_index);
6591
6592 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6593 ino, first_dir_index, last_dir_index);
6594 if (ret)
6595 return ret;
6596 curr = list_next_entry(curr, log_list);
6597 }
6598
6599 return 0;
6600 }
6601
batch_delete_dir_index_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,const struct btrfs_delayed_item * first,const struct btrfs_delayed_item ** last_ret)6602 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6603 struct btrfs_inode *inode,
6604 struct btrfs_path *path,
6605 const struct list_head *delayed_del_list,
6606 const struct btrfs_delayed_item *first,
6607 const struct btrfs_delayed_item **last_ret)
6608 {
6609 const struct btrfs_delayed_item *next;
6610 struct extent_buffer *leaf = path->nodes[0];
6611 const int last_slot = btrfs_header_nritems(leaf) - 1;
6612 int slot = path->slots[0] + 1;
6613 const u64 ino = btrfs_ino(inode);
6614
6615 next = list_next_entry(first, log_list);
6616
6617 while (slot < last_slot &&
6618 !list_entry_is_head(next, delayed_del_list, log_list)) {
6619 struct btrfs_key key;
6620
6621 btrfs_item_key_to_cpu(leaf, &key, slot);
6622 if (key.objectid != ino ||
6623 key.type != BTRFS_DIR_INDEX_KEY ||
6624 key.offset != next->index)
6625 break;
6626
6627 slot++;
6628 *last_ret = next;
6629 next = list_next_entry(next, log_list);
6630 }
6631
6632 return btrfs_del_items(trans, inode->root->log_root, path,
6633 path->slots[0], slot - path->slots[0]);
6634 }
6635
log_delayed_deletions_incremental(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6636 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6637 struct btrfs_inode *inode,
6638 struct btrfs_path *path,
6639 const struct list_head *delayed_del_list,
6640 struct btrfs_log_ctx *ctx)
6641 {
6642 struct btrfs_root *log = inode->root->log_root;
6643 const struct btrfs_delayed_item *curr;
6644 u64 last_range_start = 0;
6645 u64 last_range_end = 0;
6646 struct btrfs_key key;
6647
6648 key.objectid = btrfs_ino(inode);
6649 key.type = BTRFS_DIR_INDEX_KEY;
6650 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6651 log_list);
6652
6653 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6654 const struct btrfs_delayed_item *last = curr;
6655 u64 first_dir_index = curr->index;
6656 u64 last_dir_index;
6657 bool deleted_items = false;
6658 int ret;
6659
6660 key.offset = curr->index;
6661 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6662 if (ret < 0) {
6663 return ret;
6664 } else if (ret == 0) {
6665 ret = batch_delete_dir_index_items(trans, inode, path,
6666 delayed_del_list, curr,
6667 &last);
6668 if (ret)
6669 return ret;
6670 deleted_items = true;
6671 }
6672
6673 btrfs_release_path(path);
6674
6675 /*
6676 * If we deleted items from the leaf, it means we have a range
6677 * item logging their range, so no need to add one or update an
6678 * existing one. Otherwise we have to log a dir range item.
6679 */
6680 if (deleted_items)
6681 goto next_batch;
6682
6683 last_dir_index = last->index;
6684 ASSERT(last_dir_index >= first_dir_index);
6685 /*
6686 * If this range starts right after where the previous one ends,
6687 * then we want to reuse the previous range item and change its
6688 * end offset to the end of this range. This is just to minimize
6689 * leaf space usage, by avoiding adding a new range item.
6690 */
6691 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6692 first_dir_index = last_range_start;
6693
6694 ret = insert_dir_log_key(trans, log, path, key.objectid,
6695 first_dir_index, last_dir_index);
6696 if (ret)
6697 return ret;
6698
6699 last_range_start = first_dir_index;
6700 last_range_end = last_dir_index;
6701 next_batch:
6702 curr = list_next_entry(last, log_list);
6703 }
6704
6705 return 0;
6706 }
6707
log_delayed_deletion_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,const struct list_head * delayed_del_list,struct btrfs_log_ctx * ctx)6708 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6709 struct btrfs_inode *inode,
6710 struct btrfs_path *path,
6711 const struct list_head *delayed_del_list,
6712 struct btrfs_log_ctx *ctx)
6713 {
6714 /*
6715 * We are deleting dir index items from the log tree or adding range
6716 * items to it.
6717 */
6718 lockdep_assert_held(&inode->log_mutex);
6719
6720 if (list_empty(delayed_del_list))
6721 return 0;
6722
6723 if (ctx->logged_before)
6724 return log_delayed_deletions_incremental(trans, inode, path,
6725 delayed_del_list, ctx);
6726
6727 return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6728 ctx);
6729 }
6730
6731 /*
6732 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6733 * items instead of the subvolume tree.
6734 */
log_new_delayed_dentries(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,const struct list_head * delayed_ins_list,struct btrfs_log_ctx * ctx)6735 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6736 struct btrfs_inode *inode,
6737 const struct list_head *delayed_ins_list,
6738 struct btrfs_log_ctx *ctx)
6739 {
6740 const bool orig_log_new_dentries = ctx->log_new_dentries;
6741 struct btrfs_delayed_item *item;
6742 int ret = 0;
6743
6744 /*
6745 * No need for the log mutex, plus to avoid potential deadlocks or
6746 * lockdep annotations due to nesting of delayed inode mutexes and log
6747 * mutexes.
6748 */
6749 lockdep_assert_not_held(&inode->log_mutex);
6750
6751 ASSERT(!ctx->logging_new_delayed_dentries);
6752 ctx->logging_new_delayed_dentries = true;
6753
6754 list_for_each_entry(item, delayed_ins_list, log_list) {
6755 struct btrfs_dir_item *dir_item;
6756 struct btrfs_inode *di_inode;
6757 struct btrfs_key key;
6758 int log_mode = LOG_INODE_EXISTS;
6759
6760 dir_item = (struct btrfs_dir_item *)item->data;
6761 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6762
6763 if (key.type == BTRFS_ROOT_ITEM_KEY)
6764 continue;
6765
6766 di_inode = btrfs_iget_logging(key.objectid, inode->root);
6767 if (IS_ERR(di_inode)) {
6768 ret = PTR_ERR(di_inode);
6769 break;
6770 }
6771
6772 if (!need_log_inode(trans, di_inode)) {
6773 btrfs_add_delayed_iput(di_inode);
6774 continue;
6775 }
6776
6777 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6778 log_mode = LOG_INODE_ALL;
6779
6780 ctx->log_new_dentries = false;
6781 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
6782
6783 if (!ret && ctx->log_new_dentries)
6784 ret = log_new_dir_dentries(trans, di_inode, ctx);
6785
6786 btrfs_add_delayed_iput(di_inode);
6787
6788 if (ret)
6789 break;
6790 }
6791
6792 ctx->log_new_dentries = orig_log_new_dentries;
6793 ctx->logging_new_delayed_dentries = false;
6794
6795 return ret;
6796 }
6797
6798 /* log a single inode in the tree log.
6799 * At least one parent directory for this inode must exist in the tree
6800 * or be logged already.
6801 *
6802 * Any items from this inode changed by the current transaction are copied
6803 * to the log tree. An extra reference is taken on any extents in this
6804 * file, allowing us to avoid a whole pile of corner cases around logging
6805 * blocks that have been removed from the tree.
6806 *
6807 * See LOG_INODE_ALL and related defines for a description of what inode_only
6808 * does.
6809 *
6810 * This handles both files and directories.
6811 */
btrfs_log_inode(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,int inode_only,struct btrfs_log_ctx * ctx)6812 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6813 struct btrfs_inode *inode,
6814 int inode_only,
6815 struct btrfs_log_ctx *ctx)
6816 {
6817 struct btrfs_path *path;
6818 struct btrfs_path *dst_path;
6819 struct btrfs_key min_key;
6820 struct btrfs_key max_key;
6821 struct btrfs_root *log = inode->root->log_root;
6822 int ret;
6823 bool fast_search = false;
6824 u64 ino = btrfs_ino(inode);
6825 struct extent_map_tree *em_tree = &inode->extent_tree;
6826 u64 logged_isize = 0;
6827 bool need_log_inode_item = true;
6828 bool xattrs_logged = false;
6829 bool inode_item_dropped = true;
6830 bool full_dir_logging = false;
6831 LIST_HEAD(delayed_ins_list);
6832 LIST_HEAD(delayed_del_list);
6833
6834 path = btrfs_alloc_path();
6835 if (!path)
6836 return -ENOMEM;
6837 dst_path = btrfs_alloc_path();
6838 if (!dst_path) {
6839 btrfs_free_path(path);
6840 return -ENOMEM;
6841 }
6842
6843 min_key.objectid = ino;
6844 min_key.type = BTRFS_INODE_ITEM_KEY;
6845 min_key.offset = 0;
6846
6847 max_key.objectid = ino;
6848
6849
6850 /* today the code can only do partial logging of directories */
6851 if (S_ISDIR(inode->vfs_inode.i_mode) ||
6852 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6853 &inode->runtime_flags) &&
6854 inode_only >= LOG_INODE_EXISTS))
6855 max_key.type = BTRFS_XATTR_ITEM_KEY;
6856 else
6857 max_key.type = (u8)-1;
6858 max_key.offset = (u64)-1;
6859
6860 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6861 full_dir_logging = true;
6862
6863 /*
6864 * If we are logging a directory while we are logging dentries of the
6865 * delayed items of some other inode, then we need to flush the delayed
6866 * items of this directory and not log the delayed items directly. This
6867 * is to prevent more than one level of recursion into btrfs_log_inode()
6868 * by having something like this:
6869 *
6870 * $ mkdir -p a/b/c/d/e/f/g/h/...
6871 * $ xfs_io -c "fsync" a
6872 *
6873 * Where all directories in the path did not exist before and are
6874 * created in the current transaction.
6875 * So in such a case we directly log the delayed items of the main
6876 * directory ("a") without flushing them first, while for each of its
6877 * subdirectories we flush their delayed items before logging them.
6878 * This prevents a potential unbounded recursion like this:
6879 *
6880 * btrfs_log_inode()
6881 * log_new_delayed_dentries()
6882 * btrfs_log_inode()
6883 * log_new_delayed_dentries()
6884 * btrfs_log_inode()
6885 * log_new_delayed_dentries()
6886 * (...)
6887 *
6888 * We have thresholds for the maximum number of delayed items to have in
6889 * memory, and once they are hit, the items are flushed asynchronously.
6890 * However the limit is quite high, so lets prevent deep levels of
6891 * recursion to happen by limiting the maximum depth to be 1.
6892 */
6893 if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6894 ret = btrfs_commit_inode_delayed_items(trans, inode);
6895 if (ret)
6896 goto out;
6897 }
6898
6899 mutex_lock(&inode->log_mutex);
6900
6901 /*
6902 * For symlinks, we must always log their content, which is stored in an
6903 * inline extent, otherwise we could end up with an empty symlink after
6904 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6905 * one attempts to create an empty symlink).
6906 * We don't need to worry about flushing delalloc, because when we create
6907 * the inline extent when the symlink is created (we never have delalloc
6908 * for symlinks).
6909 */
6910 if (S_ISLNK(inode->vfs_inode.i_mode))
6911 inode_only = LOG_INODE_ALL;
6912
6913 /*
6914 * Before logging the inode item, cache the value returned by
6915 * inode_logged(), because after that we have the need to figure out if
6916 * the inode was previously logged in this transaction.
6917 */
6918 ret = inode_logged(trans, inode, path);
6919 if (ret < 0)
6920 goto out_unlock;
6921 ctx->logged_before = (ret == 1);
6922 ret = 0;
6923
6924 /*
6925 * This is for cases where logging a directory could result in losing a
6926 * a file after replaying the log. For example, if we move a file from a
6927 * directory A to a directory B, then fsync directory A, we have no way
6928 * to known the file was moved from A to B, so logging just A would
6929 * result in losing the file after a log replay.
6930 */
6931 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6932 ret = BTRFS_LOG_FORCE_COMMIT;
6933 goto out_unlock;
6934 }
6935
6936 /*
6937 * a brute force approach to making sure we get the most uptodate
6938 * copies of everything.
6939 */
6940 if (S_ISDIR(inode->vfs_inode.i_mode)) {
6941 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6942 if (ctx->logged_before)
6943 ret = drop_inode_items(trans, log, path, inode,
6944 BTRFS_XATTR_ITEM_KEY);
6945 } else {
6946 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6947 /*
6948 * Make sure the new inode item we write to the log has
6949 * the same isize as the current one (if it exists).
6950 * This is necessary to prevent data loss after log
6951 * replay, and also to prevent doing a wrong expanding
6952 * truncate - for e.g. create file, write 4K into offset
6953 * 0, fsync, write 4K into offset 4096, add hard link,
6954 * fsync some other file (to sync log), power fail - if
6955 * we use the inode's current i_size, after log replay
6956 * we get a 8Kb file, with the last 4Kb extent as a hole
6957 * (zeroes), as if an expanding truncate happened,
6958 * instead of getting a file of 4Kb only.
6959 */
6960 ret = logged_inode_size(log, inode, path, &logged_isize);
6961 if (ret)
6962 goto out_unlock;
6963 }
6964 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6965 &inode->runtime_flags)) {
6966 if (inode_only == LOG_INODE_EXISTS) {
6967 max_key.type = BTRFS_XATTR_ITEM_KEY;
6968 if (ctx->logged_before)
6969 ret = drop_inode_items(trans, log, path,
6970 inode, max_key.type);
6971 } else {
6972 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6973 &inode->runtime_flags);
6974 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6975 &inode->runtime_flags);
6976 if (ctx->logged_before)
6977 ret = truncate_inode_items(trans, log,
6978 inode, 0, 0);
6979 }
6980 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6981 &inode->runtime_flags) ||
6982 inode_only == LOG_INODE_EXISTS) {
6983 if (inode_only == LOG_INODE_ALL)
6984 fast_search = true;
6985 max_key.type = BTRFS_XATTR_ITEM_KEY;
6986 if (ctx->logged_before)
6987 ret = drop_inode_items(trans, log, path, inode,
6988 max_key.type);
6989 } else {
6990 if (inode_only == LOG_INODE_ALL)
6991 fast_search = true;
6992 inode_item_dropped = false;
6993 goto log_extents;
6994 }
6995
6996 }
6997 if (ret)
6998 goto out_unlock;
6999
7000 /*
7001 * If we are logging a directory in full mode, collect the delayed items
7002 * before iterating the subvolume tree, so that we don't miss any new
7003 * dir index items in case they get flushed while or right after we are
7004 * iterating the subvolume tree.
7005 */
7006 if (full_dir_logging && !ctx->logging_new_delayed_dentries)
7007 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
7008 &delayed_del_list);
7009
7010 /*
7011 * If we are fsyncing a file with 0 hard links, then commit the delayed
7012 * inode because the last inode ref (or extref) item may still be in the
7013 * subvolume tree and if we log it the file will still exist after a log
7014 * replay. So commit the delayed inode to delete that last ref and we
7015 * skip logging it.
7016 */
7017 if (inode->vfs_inode.i_nlink == 0) {
7018 ret = btrfs_commit_inode_delayed_inode(inode);
7019 if (ret)
7020 goto out_unlock;
7021 }
7022
7023 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
7024 path, dst_path, logged_isize,
7025 inode_only, ctx,
7026 &need_log_inode_item);
7027 if (ret)
7028 goto out_unlock;
7029
7030 btrfs_release_path(path);
7031 btrfs_release_path(dst_path);
7032 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7033 if (ret)
7034 goto out_unlock;
7035 xattrs_logged = true;
7036 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
7037 btrfs_release_path(path);
7038 btrfs_release_path(dst_path);
7039 ret = btrfs_log_holes(trans, inode, path);
7040 if (ret)
7041 goto out_unlock;
7042 }
7043 log_extents:
7044 btrfs_release_path(path);
7045 btrfs_release_path(dst_path);
7046 if (need_log_inode_item) {
7047 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
7048 if (ret)
7049 goto out_unlock;
7050 /*
7051 * If we are doing a fast fsync and the inode was logged before
7052 * in this transaction, we don't need to log the xattrs because
7053 * they were logged before. If xattrs were added, changed or
7054 * deleted since the last time we logged the inode, then we have
7055 * already logged them because the inode had the runtime flag
7056 * BTRFS_INODE_COPY_EVERYTHING set.
7057 */
7058 if (!xattrs_logged && inode->logged_trans < trans->transid) {
7059 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7060 if (ret)
7061 goto out_unlock;
7062 btrfs_release_path(path);
7063 }
7064 }
7065 if (fast_search) {
7066 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
7067 if (ret)
7068 goto out_unlock;
7069 } else if (inode_only == LOG_INODE_ALL) {
7070 struct extent_map *em, *n;
7071
7072 write_lock(&em_tree->lock);
7073 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
7074 list_del_init(&em->list);
7075 write_unlock(&em_tree->lock);
7076 }
7077
7078 if (full_dir_logging) {
7079 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
7080 if (ret)
7081 goto out_unlock;
7082 ret = log_delayed_insertion_items(trans, inode, path,
7083 &delayed_ins_list, ctx);
7084 if (ret)
7085 goto out_unlock;
7086 ret = log_delayed_deletion_items(trans, inode, path,
7087 &delayed_del_list, ctx);
7088 if (ret)
7089 goto out_unlock;
7090 }
7091
7092 spin_lock(&inode->lock);
7093 inode->logged_trans = trans->transid;
7094 /*
7095 * Don't update last_log_commit if we logged that an inode exists.
7096 * We do this for three reasons:
7097 *
7098 * 1) We might have had buffered writes to this inode that were
7099 * flushed and had their ordered extents completed in this
7100 * transaction, but we did not previously log the inode with
7101 * LOG_INODE_ALL. Later the inode was evicted and after that
7102 * it was loaded again and this LOG_INODE_EXISTS log operation
7103 * happened. We must make sure that if an explicit fsync against
7104 * the inode is performed later, it logs the new extents, an
7105 * updated inode item, etc, and syncs the log. The same logic
7106 * applies to direct IO writes instead of buffered writes.
7107 *
7108 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
7109 * is logged with an i_size of 0 or whatever value was logged
7110 * before. If later the i_size of the inode is increased by a
7111 * truncate operation, the log is synced through an fsync of
7112 * some other inode and then finally an explicit fsync against
7113 * this inode is made, we must make sure this fsync logs the
7114 * inode with the new i_size, the hole between old i_size and
7115 * the new i_size, and syncs the log.
7116 *
7117 * 3) If we are logging that an ancestor inode exists as part of
7118 * logging a new name from a link or rename operation, don't update
7119 * its last_log_commit - otherwise if an explicit fsync is made
7120 * against an ancestor, the fsync considers the inode in the log
7121 * and doesn't sync the log, resulting in the ancestor missing after
7122 * a power failure unless the log was synced as part of an fsync
7123 * against any other unrelated inode.
7124 */
7125 if (inode_only != LOG_INODE_EXISTS)
7126 inode->last_log_commit = inode->last_sub_trans;
7127 spin_unlock(&inode->lock);
7128
7129 /*
7130 * Reset the last_reflink_trans so that the next fsync does not need to
7131 * go through the slower path when logging extents and their checksums.
7132 */
7133 if (inode_only == LOG_INODE_ALL)
7134 inode->last_reflink_trans = 0;
7135
7136 out_unlock:
7137 mutex_unlock(&inode->log_mutex);
7138 out:
7139 btrfs_free_path(path);
7140 btrfs_free_path(dst_path);
7141
7142 if (ret)
7143 free_conflicting_inodes(ctx);
7144 else
7145 ret = log_conflicting_inodes(trans, inode->root, ctx);
7146
7147 if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
7148 if (!ret)
7149 ret = log_new_delayed_dentries(trans, inode,
7150 &delayed_ins_list, ctx);
7151
7152 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
7153 &delayed_del_list);
7154 }
7155
7156 return ret;
7157 }
7158
btrfs_log_all_parents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_log_ctx * ctx)7159 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
7160 struct btrfs_inode *inode,
7161 struct btrfs_log_ctx *ctx)
7162 {
7163 int ret;
7164 BTRFS_PATH_AUTO_FREE(path);
7165 struct btrfs_key key;
7166 struct btrfs_root *root = inode->root;
7167 const u64 ino = btrfs_ino(inode);
7168
7169 path = btrfs_alloc_path();
7170 if (!path)
7171 return -ENOMEM;
7172 path->skip_locking = 1;
7173 path->search_commit_root = 1;
7174
7175 key.objectid = ino;
7176 key.type = BTRFS_INODE_REF_KEY;
7177 key.offset = 0;
7178 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7179 if (ret < 0)
7180 return ret;
7181
7182 while (true) {
7183 struct extent_buffer *leaf = path->nodes[0];
7184 int slot = path->slots[0];
7185 u32 cur_offset = 0;
7186 u32 item_size;
7187 unsigned long ptr;
7188
7189 if (slot >= btrfs_header_nritems(leaf)) {
7190 ret = btrfs_next_leaf(root, path);
7191 if (ret < 0)
7192 return ret;
7193 if (ret > 0)
7194 break;
7195 continue;
7196 }
7197
7198 btrfs_item_key_to_cpu(leaf, &key, slot);
7199 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
7200 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
7201 break;
7202
7203 item_size = btrfs_item_size(leaf, slot);
7204 ptr = btrfs_item_ptr_offset(leaf, slot);
7205 while (cur_offset < item_size) {
7206 struct btrfs_key inode_key;
7207 struct btrfs_inode *dir_inode;
7208
7209 inode_key.type = BTRFS_INODE_ITEM_KEY;
7210 inode_key.offset = 0;
7211
7212 if (key.type == BTRFS_INODE_EXTREF_KEY) {
7213 struct btrfs_inode_extref *extref;
7214
7215 extref = (struct btrfs_inode_extref *)
7216 (ptr + cur_offset);
7217 inode_key.objectid = btrfs_inode_extref_parent(
7218 leaf, extref);
7219 cur_offset += sizeof(*extref);
7220 cur_offset += btrfs_inode_extref_name_len(leaf,
7221 extref);
7222 } else {
7223 inode_key.objectid = key.offset;
7224 cur_offset = item_size;
7225 }
7226
7227 dir_inode = btrfs_iget_logging(inode_key.objectid, root);
7228 /*
7229 * If the parent inode was deleted, return an error to
7230 * fallback to a transaction commit. This is to prevent
7231 * getting an inode that was moved from one parent A to
7232 * a parent B, got its former parent A deleted and then
7233 * it got fsync'ed, from existing at both parents after
7234 * a log replay (and the old parent still existing).
7235 * Example:
7236 *
7237 * mkdir /mnt/A
7238 * mkdir /mnt/B
7239 * touch /mnt/B/bar
7240 * sync
7241 * mv /mnt/B/bar /mnt/A/bar
7242 * mv -T /mnt/A /mnt/B
7243 * fsync /mnt/B/bar
7244 * <power fail>
7245 *
7246 * If we ignore the old parent B which got deleted,
7247 * after a log replay we would have file bar linked
7248 * at both parents and the old parent B would still
7249 * exist.
7250 */
7251 if (IS_ERR(dir_inode))
7252 return PTR_ERR(dir_inode);
7253
7254 if (!need_log_inode(trans, dir_inode)) {
7255 btrfs_add_delayed_iput(dir_inode);
7256 continue;
7257 }
7258
7259 ctx->log_new_dentries = false;
7260 ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
7261 if (!ret && ctx->log_new_dentries)
7262 ret = log_new_dir_dentries(trans, dir_inode, ctx);
7263 btrfs_add_delayed_iput(dir_inode);
7264 if (ret)
7265 return ret;
7266 }
7267 path->slots[0]++;
7268 }
7269 return 0;
7270 }
7271
log_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_log_ctx * ctx)7272 static int log_new_ancestors(struct btrfs_trans_handle *trans,
7273 struct btrfs_root *root,
7274 struct btrfs_path *path,
7275 struct btrfs_log_ctx *ctx)
7276 {
7277 struct btrfs_key found_key;
7278
7279 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
7280
7281 while (true) {
7282 struct extent_buffer *leaf;
7283 int slot;
7284 struct btrfs_key search_key;
7285 struct btrfs_inode *inode;
7286 u64 ino;
7287 int ret = 0;
7288
7289 btrfs_release_path(path);
7290
7291 ino = found_key.offset;
7292
7293 search_key.objectid = found_key.offset;
7294 search_key.type = BTRFS_INODE_ITEM_KEY;
7295 search_key.offset = 0;
7296 inode = btrfs_iget_logging(ino, root);
7297 if (IS_ERR(inode))
7298 return PTR_ERR(inode);
7299
7300 if (inode->generation >= trans->transid &&
7301 need_log_inode(trans, inode))
7302 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
7303 btrfs_add_delayed_iput(inode);
7304 if (ret)
7305 return ret;
7306
7307 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
7308 break;
7309
7310 search_key.type = BTRFS_INODE_REF_KEY;
7311 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
7312 if (ret < 0)
7313 return ret;
7314
7315 leaf = path->nodes[0];
7316 slot = path->slots[0];
7317 if (slot >= btrfs_header_nritems(leaf)) {
7318 ret = btrfs_next_leaf(root, path);
7319 if (ret < 0)
7320 return ret;
7321 else if (ret > 0)
7322 return -ENOENT;
7323 leaf = path->nodes[0];
7324 slot = path->slots[0];
7325 }
7326
7327 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7328 if (found_key.objectid != search_key.objectid ||
7329 found_key.type != BTRFS_INODE_REF_KEY)
7330 return -ENOENT;
7331 }
7332 return 0;
7333 }
7334
log_new_ancestors_fast(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)7335 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
7336 struct btrfs_inode *inode,
7337 struct dentry *parent,
7338 struct btrfs_log_ctx *ctx)
7339 {
7340 struct btrfs_root *root = inode->root;
7341 struct dentry *old_parent = NULL;
7342 struct super_block *sb = inode->vfs_inode.i_sb;
7343 int ret = 0;
7344
7345 while (true) {
7346 if (!parent || d_really_is_negative(parent) ||
7347 sb != parent->d_sb)
7348 break;
7349
7350 inode = BTRFS_I(d_inode(parent));
7351 if (root != inode->root)
7352 break;
7353
7354 if (inode->generation >= trans->transid &&
7355 need_log_inode(trans, inode)) {
7356 ret = btrfs_log_inode(trans, inode,
7357 LOG_INODE_EXISTS, ctx);
7358 if (ret)
7359 break;
7360 }
7361 if (IS_ROOT(parent))
7362 break;
7363
7364 parent = dget_parent(parent);
7365 dput(old_parent);
7366 old_parent = parent;
7367 }
7368 dput(old_parent);
7369
7370 return ret;
7371 }
7372
log_all_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)7373 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
7374 struct btrfs_inode *inode,
7375 struct dentry *parent,
7376 struct btrfs_log_ctx *ctx)
7377 {
7378 struct btrfs_root *root = inode->root;
7379 const u64 ino = btrfs_ino(inode);
7380 BTRFS_PATH_AUTO_FREE(path);
7381 struct btrfs_key search_key;
7382 int ret;
7383
7384 /*
7385 * For a single hard link case, go through a fast path that does not
7386 * need to iterate the fs/subvolume tree.
7387 */
7388 if (inode->vfs_inode.i_nlink < 2)
7389 return log_new_ancestors_fast(trans, inode, parent, ctx);
7390
7391 path = btrfs_alloc_path();
7392 if (!path)
7393 return -ENOMEM;
7394
7395 search_key.objectid = ino;
7396 search_key.type = BTRFS_INODE_REF_KEY;
7397 search_key.offset = 0;
7398 again:
7399 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
7400 if (ret < 0)
7401 return ret;
7402 if (ret == 0)
7403 path->slots[0]++;
7404
7405 while (true) {
7406 struct extent_buffer *leaf = path->nodes[0];
7407 int slot = path->slots[0];
7408 struct btrfs_key found_key;
7409
7410 if (slot >= btrfs_header_nritems(leaf)) {
7411 ret = btrfs_next_leaf(root, path);
7412 if (ret < 0)
7413 return ret;
7414 if (ret > 0)
7415 break;
7416 continue;
7417 }
7418
7419 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7420 if (found_key.objectid != ino ||
7421 found_key.type > BTRFS_INODE_EXTREF_KEY)
7422 break;
7423
7424 /*
7425 * Don't deal with extended references because they are rare
7426 * cases and too complex to deal with (we would need to keep
7427 * track of which subitem we are processing for each item in
7428 * this loop, etc). So just return some error to fallback to
7429 * a transaction commit.
7430 */
7431 if (found_key.type == BTRFS_INODE_EXTREF_KEY)
7432 return -EMLINK;
7433
7434 /*
7435 * Logging ancestors needs to do more searches on the fs/subvol
7436 * tree, so it releases the path as needed to avoid deadlocks.
7437 * Keep track of the last inode ref key and resume from that key
7438 * after logging all new ancestors for the current hard link.
7439 */
7440 memcpy(&search_key, &found_key, sizeof(search_key));
7441
7442 ret = log_new_ancestors(trans, root, path, ctx);
7443 if (ret)
7444 return ret;
7445 btrfs_release_path(path);
7446 goto again;
7447 }
7448 return 0;
7449 }
7450
7451 /*
7452 * helper function around btrfs_log_inode to make sure newly created
7453 * parent directories also end up in the log. A minimal inode and backref
7454 * only logging is done of any parent directories that are older than
7455 * the last committed transaction
7456 */
btrfs_log_inode_parent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,int inode_only,struct btrfs_log_ctx * ctx)7457 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7458 struct btrfs_inode *inode,
7459 struct dentry *parent,
7460 int inode_only,
7461 struct btrfs_log_ctx *ctx)
7462 {
7463 struct btrfs_root *root = inode->root;
7464 struct btrfs_fs_info *fs_info = root->fs_info;
7465 int ret = 0;
7466 bool log_dentries;
7467
7468 if (btrfs_test_opt(fs_info, NOTREELOG))
7469 return BTRFS_LOG_FORCE_COMMIT;
7470
7471 if (btrfs_root_refs(&root->root_item) == 0)
7472 return BTRFS_LOG_FORCE_COMMIT;
7473
7474 /*
7475 * If we're logging an inode from a subvolume created in the current
7476 * transaction we must force a commit since the root is not persisted.
7477 */
7478 if (btrfs_root_generation(&root->root_item) == trans->transid)
7479 return BTRFS_LOG_FORCE_COMMIT;
7480
7481 /* Skip already logged inodes and without new extents. */
7482 if (btrfs_inode_in_log(inode, trans->transid) &&
7483 list_empty(&ctx->ordered_extents))
7484 return BTRFS_NO_LOG_SYNC;
7485
7486 ret = start_log_trans(trans, root, ctx);
7487 if (ret)
7488 return ret;
7489
7490 ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7491 if (ret)
7492 goto end_trans;
7493
7494 /*
7495 * for regular files, if its inode is already on disk, we don't
7496 * have to worry about the parents at all. This is because
7497 * we can use the last_unlink_trans field to record renames
7498 * and other fun in this file.
7499 */
7500 if (S_ISREG(inode->vfs_inode.i_mode) &&
7501 inode->generation < trans->transid &&
7502 inode->last_unlink_trans < trans->transid) {
7503 ret = 0;
7504 goto end_trans;
7505 }
7506
7507 /*
7508 * Track if we need to log dentries because ctx->log_new_dentries can
7509 * be modified in the call chains below.
7510 */
7511 log_dentries = ctx->log_new_dentries;
7512
7513 /*
7514 * On unlink we must make sure all our current and old parent directory
7515 * inodes are fully logged. This is to prevent leaving dangling
7516 * directory index entries in directories that were our parents but are
7517 * not anymore. Not doing this results in old parent directory being
7518 * impossible to delete after log replay (rmdir will always fail with
7519 * error -ENOTEMPTY).
7520 *
7521 * Example 1:
7522 *
7523 * mkdir testdir
7524 * touch testdir/foo
7525 * ln testdir/foo testdir/bar
7526 * sync
7527 * unlink testdir/bar
7528 * xfs_io -c fsync testdir/foo
7529 * <power failure>
7530 * mount fs, triggers log replay
7531 *
7532 * If we don't log the parent directory (testdir), after log replay the
7533 * directory still has an entry pointing to the file inode using the bar
7534 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7535 * the file inode has a link count of 1.
7536 *
7537 * Example 2:
7538 *
7539 * mkdir testdir
7540 * touch foo
7541 * ln foo testdir/foo2
7542 * ln foo testdir/foo3
7543 * sync
7544 * unlink testdir/foo3
7545 * xfs_io -c fsync foo
7546 * <power failure>
7547 * mount fs, triggers log replay
7548 *
7549 * Similar as the first example, after log replay the parent directory
7550 * testdir still has an entry pointing to the inode file with name foo3
7551 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7552 * and has a link count of 2.
7553 */
7554 if (inode->last_unlink_trans >= trans->transid) {
7555 ret = btrfs_log_all_parents(trans, inode, ctx);
7556 if (ret)
7557 goto end_trans;
7558 }
7559
7560 ret = log_all_new_ancestors(trans, inode, parent, ctx);
7561 if (ret)
7562 goto end_trans;
7563
7564 if (log_dentries)
7565 ret = log_new_dir_dentries(trans, inode, ctx);
7566 end_trans:
7567 if (ret < 0) {
7568 btrfs_set_log_full_commit(trans);
7569 ret = BTRFS_LOG_FORCE_COMMIT;
7570 }
7571
7572 if (ret)
7573 btrfs_remove_log_ctx(root, ctx);
7574 btrfs_end_log_trans(root);
7575
7576 return ret;
7577 }
7578
7579 /*
7580 * it is not safe to log dentry if the chunk root has added new
7581 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
7582 * If this returns 1, you must commit the transaction to safely get your
7583 * data on disk.
7584 */
btrfs_log_dentry_safe(struct btrfs_trans_handle * trans,struct dentry * dentry,struct btrfs_log_ctx * ctx)7585 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7586 struct dentry *dentry,
7587 struct btrfs_log_ctx *ctx)
7588 {
7589 struct dentry *parent = dget_parent(dentry);
7590 int ret;
7591
7592 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7593 LOG_INODE_ALL, ctx);
7594 dput(parent);
7595
7596 return ret;
7597 }
7598
7599 /*
7600 * should be called during mount to recover any replay any log trees
7601 * from the FS
7602 */
btrfs_recover_log_trees(struct btrfs_root * log_root_tree)7603 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7604 {
7605 int ret;
7606 struct btrfs_path *path;
7607 struct btrfs_trans_handle *trans;
7608 struct btrfs_key key;
7609 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7610 struct walk_control wc = {
7611 .process_func = process_one_buffer,
7612 .stage = LOG_WALK_PIN_ONLY,
7613 };
7614
7615 path = btrfs_alloc_path();
7616 if (!path)
7617 return -ENOMEM;
7618
7619 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7620
7621 trans = btrfs_start_transaction(fs_info->tree_root, 0);
7622 if (IS_ERR(trans)) {
7623 ret = PTR_ERR(trans);
7624 goto error;
7625 }
7626
7627 wc.trans = trans;
7628 wc.pin = true;
7629 wc.log = log_root_tree;
7630
7631 ret = walk_log_tree(&wc);
7632 wc.log = NULL;
7633 if (unlikely(ret)) {
7634 btrfs_abort_transaction(trans, ret);
7635 goto error;
7636 }
7637
7638 again:
7639 key.objectid = BTRFS_TREE_LOG_OBJECTID;
7640 key.type = BTRFS_ROOT_ITEM_KEY;
7641 key.offset = (u64)-1;
7642
7643 while (1) {
7644 struct btrfs_key found_key;
7645
7646 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7647
7648 if (unlikely(ret < 0)) {
7649 btrfs_abort_transaction(trans, ret);
7650 goto error;
7651 }
7652 if (ret > 0) {
7653 if (path->slots[0] == 0)
7654 break;
7655 path->slots[0]--;
7656 }
7657 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7658 path->slots[0]);
7659 btrfs_release_path(path);
7660 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7661 break;
7662
7663 wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
7664 if (IS_ERR(wc.log)) {
7665 ret = PTR_ERR(wc.log);
7666 wc.log = NULL;
7667 btrfs_abort_transaction(trans, ret);
7668 goto error;
7669 }
7670
7671 wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
7672 if (IS_ERR(wc.root)) {
7673 ret = PTR_ERR(wc.root);
7674 wc.root = NULL;
7675 if (unlikely(ret != -ENOENT)) {
7676 btrfs_abort_transaction(trans, ret);
7677 goto error;
7678 }
7679
7680 /*
7681 * We didn't find the subvol, likely because it was
7682 * deleted. This is ok, simply skip this log and go to
7683 * the next one.
7684 *
7685 * We need to exclude the root because we can't have
7686 * other log replays overwriting this log as we'll read
7687 * it back in a few more times. This will keep our
7688 * block from being modified, and we'll just bail for
7689 * each subsequent pass.
7690 */
7691 ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
7692 if (unlikely(ret)) {
7693 btrfs_abort_transaction(trans, ret);
7694 goto error;
7695 }
7696 goto next;
7697 }
7698
7699 wc.root->log_root = wc.log;
7700 ret = btrfs_record_root_in_trans(trans, wc.root);
7701 if (unlikely(ret)) {
7702 btrfs_abort_transaction(trans, ret);
7703 goto next;
7704 }
7705
7706 ret = walk_log_tree(&wc);
7707 if (unlikely(ret)) {
7708 btrfs_abort_transaction(trans, ret);
7709 goto next;
7710 }
7711
7712 if (wc.stage == LOG_WALK_REPLAY_ALL) {
7713 struct btrfs_root *root = wc.root;
7714
7715 wc.subvol_path = path;
7716 ret = fixup_inode_link_counts(&wc);
7717 wc.subvol_path = NULL;
7718 if (unlikely(ret)) {
7719 btrfs_abort_transaction(trans, ret);
7720 goto next;
7721 }
7722 /*
7723 * We have just replayed everything, and the highest
7724 * objectid of fs roots probably has changed in case
7725 * some inode_item's got replayed.
7726 *
7727 * root->objectid_mutex is not acquired as log replay
7728 * could only happen during mount.
7729 */
7730 ret = btrfs_init_root_free_objectid(root);
7731 if (unlikely(ret)) {
7732 btrfs_abort_transaction(trans, ret);
7733 goto next;
7734 }
7735 }
7736 next:
7737 if (wc.root) {
7738 wc.root->log_root = NULL;
7739 btrfs_put_root(wc.root);
7740 }
7741 btrfs_put_root(wc.log);
7742 wc.log = NULL;
7743
7744 if (ret)
7745 goto error;
7746 if (found_key.offset == 0)
7747 break;
7748 key.offset = found_key.offset - 1;
7749 }
7750 btrfs_release_path(path);
7751
7752 /* step one is to pin it all, step two is to replay just inodes */
7753 if (wc.pin) {
7754 wc.pin = false;
7755 wc.process_func = replay_one_buffer;
7756 wc.stage = LOG_WALK_REPLAY_INODES;
7757 goto again;
7758 }
7759 /* step three is to replay everything */
7760 if (wc.stage < LOG_WALK_REPLAY_ALL) {
7761 wc.stage++;
7762 goto again;
7763 }
7764
7765 btrfs_free_path(path);
7766
7767 /* step 4: commit the transaction, which also unpins the blocks */
7768 ret = btrfs_commit_transaction(trans);
7769 if (ret)
7770 return ret;
7771
7772 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7773
7774 return 0;
7775 error:
7776 if (wc.trans)
7777 btrfs_end_transaction(wc.trans);
7778 btrfs_put_root(wc.log);
7779 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7780 btrfs_free_path(path);
7781 return ret;
7782 }
7783
7784 /*
7785 * there are some corner cases where we want to force a full
7786 * commit instead of allowing a directory to be logged.
7787 *
7788 * They revolve around files there were unlinked from the directory, and
7789 * this function updates the parent directory so that a full commit is
7790 * properly done if it is fsync'd later after the unlinks are done.
7791 *
7792 * Must be called before the unlink operations (updates to the subvolume tree,
7793 * inodes, etc) are done.
7794 */
btrfs_record_unlink_dir(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,bool for_rename)7795 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7796 struct btrfs_inode *dir, struct btrfs_inode *inode,
7797 bool for_rename)
7798 {
7799 /*
7800 * when we're logging a file, if it hasn't been renamed
7801 * or unlinked, and its inode is fully committed on disk,
7802 * we don't have to worry about walking up the directory chain
7803 * to log its parents.
7804 *
7805 * So, we use the last_unlink_trans field to put this transid
7806 * into the file. When the file is logged we check it and
7807 * don't log the parents if the file is fully on disk.
7808 */
7809 mutex_lock(&inode->log_mutex);
7810 inode->last_unlink_trans = trans->transid;
7811 mutex_unlock(&inode->log_mutex);
7812
7813 if (!for_rename)
7814 return;
7815
7816 /*
7817 * If this directory was already logged, any new names will be logged
7818 * with btrfs_log_new_name() and old names will be deleted from the log
7819 * tree with btrfs_del_dir_entries_in_log() or with
7820 * btrfs_del_inode_ref_in_log().
7821 */
7822 if (inode_logged(trans, dir, NULL) == 1)
7823 return;
7824
7825 /*
7826 * If the inode we're about to unlink was logged before, the log will be
7827 * properly updated with the new name with btrfs_log_new_name() and the
7828 * old name removed with btrfs_del_dir_entries_in_log() or with
7829 * btrfs_del_inode_ref_in_log().
7830 */
7831 if (inode_logged(trans, inode, NULL) == 1)
7832 return;
7833
7834 /*
7835 * when renaming files across directories, if the directory
7836 * there we're unlinking from gets fsync'd later on, there's
7837 * no way to find the destination directory later and fsync it
7838 * properly. So, we have to be conservative and force commits
7839 * so the new name gets discovered.
7840 */
7841 mutex_lock(&dir->log_mutex);
7842 dir->last_unlink_trans = trans->transid;
7843 mutex_unlock(&dir->log_mutex);
7844 }
7845
7846 /*
7847 * Make sure that if someone attempts to fsync the parent directory of a deleted
7848 * snapshot, it ends up triggering a transaction commit. This is to guarantee
7849 * that after replaying the log tree of the parent directory's root we will not
7850 * see the snapshot anymore and at log replay time we will not see any log tree
7851 * corresponding to the deleted snapshot's root, which could lead to replaying
7852 * it after replaying the log tree of the parent directory (which would replay
7853 * the snapshot delete operation).
7854 *
7855 * Must be called before the actual snapshot destroy operation (updates to the
7856 * parent root and tree of tree roots trees, etc) are done.
7857 */
btrfs_record_snapshot_destroy(struct btrfs_trans_handle * trans,struct btrfs_inode * dir)7858 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7859 struct btrfs_inode *dir)
7860 {
7861 mutex_lock(&dir->log_mutex);
7862 dir->last_unlink_trans = trans->transid;
7863 mutex_unlock(&dir->log_mutex);
7864 }
7865
7866 /*
7867 * Call this when creating a subvolume in a directory.
7868 * Because we don't commit a transaction when creating a subvolume, we can't
7869 * allow the directory pointing to the subvolume to be logged with an entry that
7870 * points to an unpersisted root if we are still in the transaction used to
7871 * create the subvolume, so make any attempt to log the directory to result in a
7872 * full log sync.
7873 * Also we don't need to worry with renames, since btrfs_rename() marks the log
7874 * for full commit when renaming a subvolume.
7875 *
7876 * Must be called before creating the subvolume entry in its parent directory.
7877 */
btrfs_record_new_subvolume(const struct btrfs_trans_handle * trans,struct btrfs_inode * dir)7878 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7879 struct btrfs_inode *dir)
7880 {
7881 mutex_lock(&dir->log_mutex);
7882 dir->last_unlink_trans = trans->transid;
7883 mutex_unlock(&dir->log_mutex);
7884 }
7885
7886 /*
7887 * Update the log after adding a new name for an inode.
7888 *
7889 * @trans: Transaction handle.
7890 * @old_dentry: The dentry associated with the old name and the old
7891 * parent directory.
7892 * @old_dir: The inode of the previous parent directory for the case
7893 * of a rename. For a link operation, it must be NULL.
7894 * @old_dir_index: The index number associated with the old name, meaningful
7895 * only for rename operations (when @old_dir is not NULL).
7896 * Ignored for link operations.
7897 * @parent: The dentry associated with the directory under which the
7898 * new name is located.
7899 *
7900 * Call this after adding a new name for an inode, as a result of a link or
7901 * rename operation, and it will properly update the log to reflect the new name.
7902 */
btrfs_log_new_name(struct btrfs_trans_handle * trans,struct dentry * old_dentry,struct btrfs_inode * old_dir,u64 old_dir_index,struct dentry * parent)7903 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7904 struct dentry *old_dentry, struct btrfs_inode *old_dir,
7905 u64 old_dir_index, struct dentry *parent)
7906 {
7907 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7908 struct btrfs_root *root = inode->root;
7909 struct btrfs_log_ctx ctx;
7910 bool log_pinned = false;
7911 int ret;
7912
7913 btrfs_init_log_ctx(&ctx, inode);
7914 ctx.logging_new_name = true;
7915
7916 /*
7917 * this will force the logging code to walk the dentry chain
7918 * up for the file
7919 */
7920 if (!S_ISDIR(inode->vfs_inode.i_mode))
7921 inode->last_unlink_trans = trans->transid;
7922
7923 /*
7924 * if this inode hasn't been logged and directory we're renaming it
7925 * from hasn't been logged, we don't need to log it
7926 */
7927 ret = inode_logged(trans, inode, NULL);
7928 if (ret < 0) {
7929 goto out;
7930 } else if (ret == 0) {
7931 if (!old_dir)
7932 return;
7933 /*
7934 * If the inode was not logged and we are doing a rename (old_dir is not
7935 * NULL), check if old_dir was logged - if it was not we can return and
7936 * do nothing.
7937 */
7938 ret = inode_logged(trans, old_dir, NULL);
7939 if (ret < 0)
7940 goto out;
7941 else if (ret == 0)
7942 return;
7943 }
7944 ret = 0;
7945
7946 /*
7947 * Now that we know we need to update the log, allocate the scratch eb
7948 * for the context before joining a log transaction below, as this can
7949 * take time and therefore we could delay log commits from other tasks.
7950 */
7951 btrfs_init_log_ctx_scratch_eb(&ctx);
7952
7953 /*
7954 * If we are doing a rename (old_dir is not NULL) from a directory that
7955 * was previously logged, make sure that on log replay we get the old
7956 * dir entry deleted. This is needed because we will also log the new
7957 * name of the renamed inode, so we need to make sure that after log
7958 * replay we don't end up with both the new and old dir entries existing.
7959 */
7960 if (old_dir && old_dir->logged_trans == trans->transid) {
7961 struct btrfs_root *log = old_dir->root->log_root;
7962 struct btrfs_path *path;
7963 struct fscrypt_name fname;
7964
7965 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7966
7967 ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7968 &old_dentry->d_name, 0, &fname);
7969 if (ret)
7970 goto out;
7971
7972 path = btrfs_alloc_path();
7973 if (!path) {
7974 ret = -ENOMEM;
7975 fscrypt_free_filename(&fname);
7976 goto out;
7977 }
7978
7979 /*
7980 * We have two inodes to update in the log, the old directory and
7981 * the inode that got renamed, so we must pin the log to prevent
7982 * anyone from syncing the log until we have updated both inodes
7983 * in the log.
7984 */
7985 ret = join_running_log_trans(root);
7986 /*
7987 * At least one of the inodes was logged before, so this should
7988 * not fail, but if it does, it's not serious, just bail out and
7989 * mark the log for a full commit.
7990 */
7991 if (WARN_ON_ONCE(ret < 0)) {
7992 btrfs_free_path(path);
7993 fscrypt_free_filename(&fname);
7994 goto out;
7995 }
7996
7997 log_pinned = true;
7998
7999 /*
8000 * Other concurrent task might be logging the old directory,
8001 * as it can be triggered when logging other inode that had or
8002 * still has a dentry in the old directory. We lock the old
8003 * directory's log_mutex to ensure the deletion of the old
8004 * name is persisted, because during directory logging we
8005 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
8006 * the old name's dir index item is in the delayed items, so
8007 * it could be missed by an in progress directory logging.
8008 */
8009 mutex_lock(&old_dir->log_mutex);
8010 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
8011 &fname.disk_name, old_dir_index);
8012 if (ret > 0) {
8013 /*
8014 * The dentry does not exist in the log, so record its
8015 * deletion.
8016 */
8017 btrfs_release_path(path);
8018 ret = insert_dir_log_key(trans, log, path,
8019 btrfs_ino(old_dir),
8020 old_dir_index, old_dir_index);
8021 }
8022 mutex_unlock(&old_dir->log_mutex);
8023
8024 btrfs_free_path(path);
8025 fscrypt_free_filename(&fname);
8026 if (ret < 0)
8027 goto out;
8028 }
8029
8030 /*
8031 * We don't care about the return value. If we fail to log the new name
8032 * then we know the next attempt to sync the log will fallback to a full
8033 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
8034 * we don't need to worry about getting a log committed that has an
8035 * inconsistent state after a rename operation.
8036 */
8037 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
8038 ASSERT(list_empty(&ctx.conflict_inodes));
8039 out:
8040 /*
8041 * If an error happened mark the log for a full commit because it's not
8042 * consistent and up to date or we couldn't find out if one of the
8043 * inodes was logged before in this transaction. Do it before unpinning
8044 * the log, to avoid any races with someone else trying to commit it.
8045 */
8046 if (ret < 0)
8047 btrfs_set_log_full_commit(trans);
8048 if (log_pinned)
8049 btrfs_end_log_trans(root);
8050 free_extent_buffer(ctx.scratch_eb);
8051 }
8052
8053