1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2008 Oracle. All rights reserved.
4 */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "misc.h"
12 #include "ctree.h"
13 #include "tree-log.h"
14 #include "disk-io.h"
15 #include "locking.h"
16 #include "backref.h"
17 #include "compression.h"
18 #include "qgroup.h"
19 #include "block-group.h"
20 #include "space-info.h"
21 #include "inode-item.h"
22 #include "fs.h"
23 #include "accessors.h"
24 #include "extent-tree.h"
25 #include "root-tree.h"
26 #include "dir-item.h"
27 #include "file-item.h"
28 #include "file.h"
29 #include "orphan.h"
30 #include "print-tree.h"
31 #include "tree-checker.h"
32 #include "delayed-inode.h"
33
34 #define MAX_CONFLICT_INODES 10
35
36 /* magic values for the inode_only field in btrfs_log_inode:
37 *
38 * LOG_INODE_ALL means to log everything
39 * LOG_INODE_EXISTS means to log just enough to recreate the inode
40 * during log replay
41 */
42 enum {
43 LOG_INODE_ALL,
44 LOG_INODE_EXISTS,
45 };
46
47 /*
48 * directory trouble cases
49 *
50 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
51 * log, we must force a full commit before doing an fsync of the directory
52 * where the unlink was done.
53 * ---> record transid of last unlink/rename per directory
54 *
55 * mkdir foo/some_dir
56 * normal commit
57 * rename foo/some_dir foo2/some_dir
58 * mkdir foo/some_dir
59 * fsync foo/some_dir/some_file
60 *
61 * The fsync above will unlink the original some_dir without recording
62 * it in its new location (foo2). After a crash, some_dir will be gone
63 * unless the fsync of some_file forces a full commit
64 *
65 * 2) we must log any new names for any file or dir that is in the fsync
66 * log. ---> check inode while renaming/linking.
67 *
68 * 2a) we must log any new names for any file or dir during rename
69 * when the directory they are being removed from was logged.
70 * ---> check inode and old parent dir during rename
71 *
72 * 2a is actually the more important variant. Without the extra logging,
73 * a crash might unlink the old name without recreating the new one
74 *
75 * 3) after a crash, we must go through any directories with a link count
76 * of zero and redo the rm -rf
77 *
78 * mkdir f1/foo
79 * normal commit
80 * rm -rf f1/foo
81 * fsync(f1)
82 *
83 * The directory f1 was fully removed from the FS, but fsync was never
84 * called on f1, only its parent dir. After a crash the rm -rf must
85 * be replayed. This must be able to recurse down the entire
86 * directory tree. The inode link count fixup code takes care of the
87 * ugly details.
88 */
89
90 /*
91 * stages for the tree walking. The first
92 * stage (0) is to only pin down the blocks we find.
93 * The second stage (1) is to make sure that all the inodes
94 * we find in the log are created in the subvolume.
95 *
96 * The last stage is to deal with directories and links and extents
97 * and all the other fun semantics
98 */
99 enum {
100 LOG_WALK_PIN_ONLY,
101 LOG_WALK_REPLAY_INODES,
102 LOG_WALK_REPLAY_DIR_INDEX,
103 LOG_WALK_REPLAY_ALL,
104 };
105
106 /*
107 * The walk control struct is used to pass state down the chain when processing
108 * the log tree. The stage field tells us which part of the log tree processing
109 * we are currently doing.
110 */
111 struct walk_control {
112 /*
113 * Signal that we are freeing the metadata extents of a log tree.
114 * This is used at transaction commit time while freeing a log tree.
115 */
116 bool free;
117
118 /*
119 * Signal that we are pinning the metadata extents of a log tree and the
120 * data extents its leaves point to (if using mixed block groups).
121 * This happens in the first stage of log replay to ensure that during
122 * replay, while we are modifying subvolume trees, we don't overwrite
123 * the metadata extents of log trees.
124 */
125 bool pin;
126
127 /* What stage of the replay code we're currently in. */
128 int stage;
129
130 /*
131 * Ignore any items from the inode currently being processed. Needs
132 * to be set every time we find a BTRFS_INODE_ITEM_KEY.
133 */
134 bool ignore_cur_inode;
135
136 /*
137 * The root we are currently replaying to. This is NULL for the replay
138 * stage LOG_WALK_PIN_ONLY.
139 */
140 struct btrfs_root *root;
141
142 /* The log tree we are currently processing (not NULL for any stage). */
143 struct btrfs_root *log;
144
145 /* The transaction handle used for replaying all log trees. */
146 struct btrfs_trans_handle *trans;
147
148 /*
149 * The function that gets used to process blocks we find in the tree.
150 * Note the extent_buffer might not be up to date when it is passed in,
151 * and it must be checked or read if you need the data inside it.
152 */
153 int (*process_func)(struct extent_buffer *eb,
154 struct walk_control *wc, u64 gen, int level);
155
156 /*
157 * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
158 * and by the replay_one_buffer() callback.
159 */
160
161 /* The current log leaf being processed. */
162 struct extent_buffer *log_leaf;
163 /* The key being processed of the current log leaf. */
164 struct btrfs_key log_key;
165 /* The slot being processed of the current log leaf. */
166 int log_slot;
167
168 /* A path used for searches and modifications to subvolume trees. */
169 struct btrfs_path *subvol_path;
170 };
171
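/*
 * Abort the current transaction during log replay and dump the subvolume and
 * log tree leaves being processed, so that failures leave useful state behind.
 * Only the first call does the dumping (tracked with the
 * BTRFS_FS_STATE_LOG_REPLAY_ABORTED bit); use it through the
 * btrfs_abort_log_replay() macro defined below.
 */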
172 static void do_abort_log_replay(struct walk_control *wc, const char *function,
173 unsigned int line, int error, const char *fmt, ...)
174 {
175 struct btrfs_fs_info *fs_info = wc->trans->fs_info;
176 struct va_format vaf;
177 va_list args;
178
179 /*
180 * Do nothing if we already aborted, to avoid dumping leaves again which
181 * can be verbose. Furthermore, only the first call is useful since it
182 * is where we have a problem. Note that we do not use the flag
183 * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
184 * are outside of tree-log.c that can abort transactions (such as
185 * btrfs_add_link() for example), so if that happens we still want to
186 * dump all log replay specific information below.
187 */
188 if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
189 return;
190
191 btrfs_abort_transaction(wc->trans, error);
192
193 if (wc->subvol_path->nodes[0]) {
194 btrfs_crit(fs_info,
195 "subvolume (root %llu) leaf currently being processed:",
196 btrfs_root_id(wc->root));
197 btrfs_print_leaf(wc->subvol_path->nodes[0]);
198 }
199
200 if (wc->log_leaf) {
201 btrfs_crit(fs_info,
202 "log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):",
203 btrfs_root_id(wc->root), wc->log_slot,
204 BTRFS_KEY_FMT_VALUE(&wc->log_key));
205 btrfs_print_leaf(wc->log_leaf);
206 }
207
208 va_start(args, fmt);
209 vaf.fmt = fmt;
210 vaf.va = &args;
211
212 btrfs_crit(fs_info,
213 "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
214 function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
215
216 va_end(args);
217 }
218
219 /*
220 * Use this for aborting a transaction during log replay while we are down the
221 * call chain of replay_one_buffer(), so that we get a lot more useful
222 * information for debugging issues when compared to a plain call to
223 * btrfs_abort_transaction().
224 */
225 #define btrfs_abort_log_replay(wc, error, fmt, args...) \
226 do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
227
228 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
229 struct btrfs_inode *inode,
230 int inode_only,
231 struct btrfs_log_ctx *ctx);
232 static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
233 static noinline int replay_dir_deletes(struct walk_control *wc,
234 u64 dirid, bool del_all);
235 static void wait_log_commit(struct btrfs_root *root, int transid);
236
237 /*
238 * tree logging is a special write ahead log used to make sure that
239 * fsyncs and O_SYNCs can happen without doing full tree commits.
240 *
241 * Full tree commits are expensive because they require commonly
242 * modified blocks to be recowed, creating many dirty pages in the
243 * extent tree and a 4x-6x higher write load than ext3.
244 *
245 * Instead of doing a tree commit on every fsync, we use the
246 * key ranges and transaction ids to find items for a given file or directory
247 * that have changed in this transaction. Those items are copied into
248 * a special tree (one per subvolume root), that tree is written to disk
249 * and then the fsync is considered complete.
250 *
251 * After a crash, items are copied out of the log-tree back into the
252 * subvolume tree. Any file data extents found are recorded in the extent
253 * allocation tree, and the log-tree freed.
254 *
255 * The log tree is read three times: once to pin down all the extents it is
256 * using in ram, once to create all the inodes logged in the tree
257 * and once to do all the other items.
258 */
259
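/*
 * Look up an inode in a subvolume root for logging or log replay, forcing
 * NOFS allocation context so the lookup cannot recurse into a transaction
 * commit.
 */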
260 static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
261 {
262 unsigned int nofs_flag;
263 struct btrfs_inode *inode;
264
265 /* Only meant to be called for subvolume roots and not for log roots. */
266 ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root));
267
268 /*
269 * We're holding a transaction handle whether we are logging or
270 * replaying a log tree, so we must make sure NOFS semantics apply
271 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
272 * to allocate an inode, which can recurse back into the filesystem and
273 * attempt a transaction commit, resulting in a deadlock.
274 */
275 nofs_flag = memalloc_nofs_save();
276 inode = btrfs_iget(objectid, root);
277 memalloc_nofs_restore(nofs_flag);
278
279 return inode;
280 }
281
282 /*
283 * start a sub transaction and setup the log tree
284 * this increments the log tree writer count to make the people
285 * syncing the tree wait for us to finish
286 */
287 static int start_log_trans(struct btrfs_trans_handle *trans,
288 struct btrfs_root *root,
289 struct btrfs_log_ctx *ctx)
290 {
291 struct btrfs_fs_info *fs_info = root->fs_info;
292 struct btrfs_root *tree_root = fs_info->tree_root;
293 const bool zoned = btrfs_is_zoned(fs_info);
294 int ret = 0;
295 bool created = false;
296
297 /*
298 * First check if the log root tree was already created. If not, create
299 * it before locking the root's log_mutex, just to keep lockdep happy.
300 */
301 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
302 mutex_lock(&tree_root->log_mutex);
303 if (!fs_info->log_root_tree) {
304 ret = btrfs_init_log_root_tree(trans, fs_info);
305 if (!ret) {
306 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
307 created = true;
308 }
309 }
310 mutex_unlock(&tree_root->log_mutex);
311 if (ret)
312 return ret;
313 }
314
315 mutex_lock(&root->log_mutex);
316
317 again:
318 if (root->log_root) {
319 int index = (root->log_transid + 1) % 2;
320
321 if (btrfs_need_log_full_commit(trans)) {
322 ret = BTRFS_LOG_FORCE_COMMIT;
323 goto out;
324 }
325
326 if (zoned && atomic_read(&root->log_commit[index])) {
327 wait_log_commit(root, root->log_transid - 1);
328 goto again;
329 }
330
331 if (!root->log_start_pid) {
332 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
333 root->log_start_pid = current->pid;
334 } else if (root->log_start_pid != current->pid) {
335 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
336 }
337 } else {
338 /*
339 * This means fs_info->log_root_tree was already created
340 * for some other FS trees. Do the full commit not to mix
341 * nodes from multiple log transactions to do sequential
342 * writing.
343 */
344 if (zoned && !created) {
345 ret = BTRFS_LOG_FORCE_COMMIT;
346 goto out;
347 }
348
349 ret = btrfs_add_log_tree(trans, root);
350 if (ret)
351 goto out;
352
353 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
354 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
355 root->log_start_pid = current->pid;
356 }
357
358 atomic_inc(&root->log_writers);
359 if (!ctx->logging_new_name) {
360 int index = root->log_transid % 2;
361 list_add_tail(&ctx->list, &root->log_ctxs[index]);
362 ctx->log_transid = root->log_transid;
363 }
364
365 out:
366 mutex_unlock(&root->log_mutex);
367 return ret;
368 }
369
370 /*
371 * returns 0 if there was a log transaction running and we were able
372 * to join, or returns -ENOENT if there were no transactions
373 * in progress
374 */
375 static int join_running_log_trans(struct btrfs_root *root)
376 {
377 const bool zoned = btrfs_is_zoned(root->fs_info);
378 int ret = -ENOENT;
379
380 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
381 return ret;
382
383 mutex_lock(&root->log_mutex);
384 again:
385 if (root->log_root) {
386 int index = (root->log_transid + 1) % 2;
387
388 ret = 0;
389 if (zoned && atomic_read(&root->log_commit[index])) {
390 wait_log_commit(root, root->log_transid - 1);
391 goto again;
392 }
393 atomic_inc(&root->log_writers);
394 }
395 mutex_unlock(&root->log_mutex);
396 return ret;
397 }
398
399 /*
400 * This either makes the current running log transaction wait
401 * until you call btrfs_end_log_trans() or it makes any future
402 * log transactions wait until you call btrfs_end_log_trans()
403 */
404 void btrfs_pin_log_trans(struct btrfs_root *root)
405 {
406 atomic_inc(&root->log_writers);
407 }
408
409 /*
410 * indicate we're done making changes to the log tree
411 * and wake up anyone waiting to do a sync
412 */
413 void btrfs_end_log_trans(struct btrfs_root *root)
414 {
415 if (atomic_dec_and_test(&root->log_writers)) {
416 /* atomic_dec_and_test implies a barrier */
417 cond_wake_up_nomb(&root->log_writer_wait);
418 }
419 }
420
421 /*
422 * process_func used to pin down extents, write them or wait on them
423 */
424 static int process_one_buffer(struct extent_buffer *eb,
425 struct walk_control *wc, u64 gen, int level)
426 {
427 struct btrfs_root *log = wc->log;
428 struct btrfs_trans_handle *trans = wc->trans;
429 struct btrfs_fs_info *fs_info = log->fs_info;
430 int ret = 0;
431
432 /*
433 * If this fs is mixed then we need to be able to process the leaves to
434 * pin down any logged extents, so we have to read the block.
435 */
436 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
437 struct btrfs_tree_parent_check check = {
438 .level = level,
439 .transid = gen
440 };
441
442 ret = btrfs_read_extent_buffer(eb, &check);
443 if (unlikely(ret)) {
444 if (trans)
445 btrfs_abort_transaction(trans, ret);
446 else
447 btrfs_handle_fs_error(fs_info, ret, NULL);
448 return ret;
449 }
450 }
451
452 if (wc->pin) {
453 ASSERT(trans != NULL);
454 ret = btrfs_pin_extent_for_log_replay(trans, eb);
455 if (unlikely(ret)) {
456 btrfs_abort_transaction(trans, ret);
457 return ret;
458 }
459
460 if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
461 ret = btrfs_exclude_logged_extents(eb);
462 if (ret)
463 btrfs_abort_transaction(trans, ret);
464 }
465 }
466 return ret;
467 }
468
469 /*
470 * Item overwrite used by log replay. The given log tree leaf, slot and key
471 * from the walk_control structure all refer to the source data we are copying
472 * out.
473 *
474 * The given root is for the tree we are copying into, and path is a scratch
475 * path for use in this function (it should be released on entry and will be
476 * released on exit).
477 *
478 * If the key is already in the destination tree the existing item is
479 * overwritten. If the existing item isn't big enough, it is extended.
480 * If it is too large, it is truncated.
481 *
482 * If the key isn't in the destination yet, a new item is inserted.
483 */
484 static int overwrite_item(struct walk_control *wc)
485 {
486 struct btrfs_trans_handle *trans = wc->trans;
487 struct btrfs_root *root = wc->root;
488 int ret;
489 u32 item_size;
490 u64 saved_i_size = 0;
491 int save_old_i_size = 0;
492 unsigned long src_ptr;
493 unsigned long dst_ptr;
494 struct extent_buffer *dst_eb;
495 int dst_slot;
496 const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
497
498 /*
499 * This is only used during log replay, so the root is always from a
500 * fs/subvolume tree. In case we ever need to support a log root, then
501 * we'll have to clone the leaf in the path, release the path and use
502 * the leaf before writing into the log tree. See the comments at
503 * copy_items() for more details.
504 */
505 ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root));
506
507 item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
508 src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
509
510 /* Look for the key in the destination tree. */
511 ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
512 if (ret < 0) {
513 btrfs_abort_log_replay(wc, ret,
514 "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
515 BTRFS_KEY_FMT_VALUE(&wc->log_key),
516 btrfs_root_id(root));
517 return ret;
518 }
519
520 dst_eb = wc->subvol_path->nodes[0];
521 dst_slot = wc->subvol_path->slots[0];
522
523 if (ret == 0) {
524 char *src_copy;
525 const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
526
527 if (dst_size != item_size)
528 goto insert;
529
530 if (item_size == 0) {
531 btrfs_release_path(wc->subvol_path);
532 return 0;
533 }
534 src_copy = kmalloc(item_size, GFP_NOFS);
535 if (!src_copy) {
536 btrfs_abort_log_replay(wc, -ENOMEM,
537 "failed to allocate memory for log leaf item");
538 return -ENOMEM;
539 }
540
541 read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
542 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
543 ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
544
545 kfree(src_copy);
546 /*
547 * they have the same contents, just return, this saves
548 * us from cowing blocks in the destination tree and doing
549 * extra writes that may not have been done by a previous
550 * sync
551 */
552 if (ret == 0) {
553 btrfs_release_path(wc->subvol_path);
554 return 0;
555 }
556
557 /*
558 * We need to load the old nbytes into the inode so when we
559 * replay the extents we've logged we get the right nbytes.
560 */
561 if (is_inode_item) {
562 struct btrfs_inode_item *item;
563 u64 nbytes;
564 u32 mode;
565
566 item = btrfs_item_ptr(dst_eb, dst_slot,
567 struct btrfs_inode_item);
568 nbytes = btrfs_inode_nbytes(dst_eb, item);
569 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
570 struct btrfs_inode_item);
571 btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
572
573 /*
574 * If this is a directory we need to reset the i_size to
575 * 0 so that we can set it up properly when replaying
576 * the rest of the items in this log.
577 */
578 mode = btrfs_inode_mode(wc->log_leaf, item);
579 if (S_ISDIR(mode))
580 btrfs_set_inode_size(wc->log_leaf, item, 0);
581 }
582 } else if (is_inode_item) {
583 struct btrfs_inode_item *item;
584 u32 mode;
585
586 /*
587 * New inode, set nbytes to 0 so that the nbytes comes out
588 * properly when we replay the extents.
589 */
590 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
591 btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
592
593 /*
594 * If this is a directory we need to reset the i_size to 0 so
595 * that we can set it up properly when replaying the rest of
596 * the items in this log.
597 */
598 mode = btrfs_inode_mode(wc->log_leaf, item);
599 if (S_ISDIR(mode))
600 btrfs_set_inode_size(wc->log_leaf, item, 0);
601 }
602 insert:
603 btrfs_release_path(wc->subvol_path);
604 /* try to insert the key into the destination tree */
605 wc->subvol_path->skip_release_on_error = true;
606 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
607 wc->subvol_path->skip_release_on_error = false;
608
609 dst_eb = wc->subvol_path->nodes[0];
610 dst_slot = wc->subvol_path->slots[0];
611
612 /* make sure any existing item is the correct size */
613 if (ret == -EEXIST || ret == -EOVERFLOW) {
614 const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
615
616 if (found_size > item_size)
617 btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
618 else if (found_size < item_size)
619 btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
620 } else if (ret) {
621 btrfs_abort_log_replay(wc, ret,
622 "failed to insert item for key " BTRFS_KEY_FMT,
623 BTRFS_KEY_FMT_VALUE(&wc->log_key));
624 return ret;
625 }
626 dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
627
628 /* don't overwrite an existing inode if the generation number
629 * was logged as zero. This is done when the tree logging code
630 * is just logging an inode to make sure it exists after recovery.
631 *
632 * Also, don't overwrite i_size on directories during replay.
633 * log replay inserts and removes directory items based on the
634 * state of the tree found in the subvolume, and i_size is modified
635 * as it goes
636 */
637 if (is_inode_item && ret == -EEXIST) {
638 struct btrfs_inode_item *src_item;
639 struct btrfs_inode_item *dst_item;
640
641 src_item = (struct btrfs_inode_item *)src_ptr;
642 dst_item = (struct btrfs_inode_item *)dst_ptr;
643
644 if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
645 const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
646
647 /*
648 * For regular files an ino_size == 0 is used only when
649 * logging that an inode exists, as part of a directory
650 * fsync, and the inode wasn't fsynced before. In this
651 * case don't set the size of the inode in the fs/subvol
652 * tree, otherwise we would be throwing valid data away.
653 */
654 if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
655 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
656 ino_size != 0)
657 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
658 goto no_copy;
659 }
660
661 if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
662 S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
663 save_old_i_size = 1;
664 saved_i_size = btrfs_inode_size(dst_eb, dst_item);
665 }
666 }
667
668 copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
669
670 if (save_old_i_size) {
671 struct btrfs_inode_item *dst_item;
672
673 dst_item = (struct btrfs_inode_item *)dst_ptr;
674 btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
675 }
676
677 /* make sure the generation is filled in */
678 if (is_inode_item) {
679 struct btrfs_inode_item *dst_item;
680
681 dst_item = (struct btrfs_inode_item *)dst_ptr;
682 if (btrfs_inode_generation(dst_eb, dst_item) == 0)
683 btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
684 }
685 no_copy:
686 btrfs_release_path(wc->subvol_path);
687 return 0;
688 }
689
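/*
 * Copy a name of the given length out of an extent buffer into a newly
 * allocated buffer referenced by @name. The caller must free name->name.
 */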
690 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
691 struct fscrypt_str *name)
692 {
693 char *buf;
694
695 buf = kmalloc(len, GFP_NOFS);
696 if (!buf)
697 return -ENOMEM;
698
699 read_extent_buffer(eb, buf, (unsigned long)start, len);
700 name->name = buf;
701 name->len = len;
702 return 0;
703 }
704
705 /* Replays a single extent item from the log tree (the leaf, slot and key in
706 * the walk_control structure) into the subvolume tree wc->root. The scratch
707 * path wc->subvol_path is released on entry and should be released on exit.
708 *
709 * extents in the log tree have not been allocated out of the extent
710 * tree yet. So, this completes the allocation, taking a reference
711 * as required if the extent already exists or creating a new extent
712 * if it isn't in the extent allocation tree yet.
713 *
714 * The extent is inserted into the file, dropping any existing extents
715 * from the file that overlap the new one.
716 */
717 static noinline int replay_one_extent(struct walk_control *wc)
718 {
719 struct btrfs_trans_handle *trans = wc->trans;
720 struct btrfs_root *root = wc->root;
721 struct btrfs_drop_extents_args drop_args = { 0 };
722 struct btrfs_fs_info *fs_info = root->fs_info;
723 int found_type;
724 u64 extent_end;
725 const u64 start = wc->log_key.offset;
726 u64 nbytes = 0;
727 u64 csum_start;
728 u64 csum_end;
729 LIST_HEAD(ordered_sums);
730 u64 offset;
731 unsigned long dest_offset;
732 struct btrfs_key ins;
733 struct btrfs_file_extent_item *item;
734 struct btrfs_inode *inode = NULL;
735 int ret = 0;
736
737 item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
738 found_type = btrfs_file_extent_type(wc->log_leaf, item);
739
740 if (found_type == BTRFS_FILE_EXTENT_REG ||
741 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
742 extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
743 /* Holes don't take up space. */
744 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
745 nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
746 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
747 nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
748 extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
749 } else {
750 btrfs_abort_log_replay(wc, -EUCLEAN,
751 "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
752 found_type, btrfs_root_id(root),
753 wc->log_key.objectid, wc->log_key.offset);
754 return -EUCLEAN;
755 }
756
757 inode = btrfs_iget_logging(wc->log_key.objectid, root);
758 if (IS_ERR(inode)) {
759 ret = PTR_ERR(inode);
760 btrfs_abort_log_replay(wc, ret,
761 "failed to get inode %llu for root %llu",
762 wc->log_key.objectid, btrfs_root_id(root));
763 return ret;
764 }
765
766 /*
767 * first check to see if we already have this extent in the
768 * file. This must be done before the btrfs_drop_extents run
769 * so we don't try to drop this extent.
770 */
771 ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
772 btrfs_ino(inode), start, 0);
773
774 if (ret == 0 &&
775 (found_type == BTRFS_FILE_EXTENT_REG ||
776 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
777 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
778 struct btrfs_file_extent_item existing;
779 unsigned long ptr;
780
781 ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
782 read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
783
784 /*
785 * we already have a pointer to this exact extent,
786 * we don't have to do anything
787 */
788 if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
789 sizeof(existing)) == 0) {
790 btrfs_release_path(wc->subvol_path);
791 goto out;
792 }
793 }
794 btrfs_release_path(wc->subvol_path);
795
796 /* drop any overlapping extents */
797 drop_args.start = start;
798 drop_args.end = extent_end;
799 drop_args.drop_cache = true;
800 drop_args.path = wc->subvol_path;
801 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
802 if (ret) {
803 btrfs_abort_log_replay(wc, ret,
804 "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
805 wc->log_key.objectid, start, extent_end,
806 btrfs_root_id(root));
807 goto out;
808 }
809
810 if (found_type == BTRFS_FILE_EXTENT_INLINE) {
811 /* inline extents are easy, we just overwrite them */
812 ret = overwrite_item(wc);
813 if (ret)
814 goto out;
815 goto update_inode;
816 }
817
818 /*
819 * If not an inline extent, it can only be a regular or prealloc one.
820 * We have checked that above and returned -EUCLEAN if not.
821 */
822
823 /* A hole and NO_HOLES feature enabled, nothing else to do. */
824 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
825 btrfs_fs_incompat(fs_info, NO_HOLES))
826 goto update_inode;
827
828 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
829 &wc->log_key, sizeof(*item));
830 if (ret) {
831 btrfs_abort_log_replay(wc, ret,
832 "failed to insert item with key " BTRFS_KEY_FMT " root %llu",
833 BTRFS_KEY_FMT_VALUE(&wc->log_key),
834 btrfs_root_id(root));
835 goto out;
836 }
837 dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
838 wc->subvol_path->slots[0]);
839 copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
840 (unsigned long)item, sizeof(*item));
841
842 /*
843 * We have an explicit hole and NO_HOLES is not enabled. We have added
844 * the hole file extent item to the subvolume tree, so we don't have
845 * anything else to do other than update the file extent item range and
846 * update the inode item.
847 */
848 if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
849 btrfs_release_path(wc->subvol_path);
850 goto update_inode;
851 }
852
853 ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
854 ins.type = BTRFS_EXTENT_ITEM_KEY;
855 ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
856 offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
857
858 /*
859 * Manually record dirty extent, as here we did a shallow file extent
860 * item copy and skip normal backref update, but modifying extent tree
861 * all by ourselves. So need to manually record dirty extent for qgroup,
862 * as the owner of the file extent changed from log tree (doesn't affect
863 * qgroup) to fs/file tree (affects qgroup).
864 */
865 ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
866 if (ret < 0) {
867 btrfs_abort_log_replay(wc, ret,
868 "failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
869 ins.objectid, ins.offset,
870 wc->log_key.objectid, btrfs_root_id(root));
871 goto out;
872 }
873
874 /*
875 * Is this extent already allocated in the extent tree?
876 * If so, just add a reference.
877 */
878 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
879 if (ret < 0) {
880 btrfs_abort_log_replay(wc, ret,
881 "failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
882 ins.objectid, ins.offset,
883 wc->log_key.objectid, btrfs_root_id(root));
884 goto out;
885 } else if (ret == 0) {
886 struct btrfs_ref ref = {
887 .action = BTRFS_ADD_DELAYED_REF,
888 .bytenr = ins.objectid,
889 .num_bytes = ins.offset,
890 .owning_root = btrfs_root_id(root),
891 .ref_root = btrfs_root_id(root),
892 };
893
894 btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
895 ret = btrfs_inc_extent_ref(trans, &ref);
896 if (ret) {
897 btrfs_abort_log_replay(wc, ret,
898 "failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
899 ins.objectid, ins.offset,
900 wc->log_key.objectid,
901 btrfs_root_id(root));
902 goto out;
903 }
904 } else {
905 /* Insert the extent pointer in the extent tree. */
906 ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
907 wc->log_key.objectid, offset, &ins);
908 if (ret) {
909 btrfs_abort_log_replay(wc, ret,
910 "failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
911 ins.objectid, ins.offset, offset,
912 wc->log_key.objectid, btrfs_root_id(root));
913 goto out;
914 }
915 }
916
917 btrfs_release_path(wc->subvol_path);
918
919 if (btrfs_file_extent_compression(wc->log_leaf, item)) {
920 csum_start = ins.objectid;
921 csum_end = csum_start + ins.offset;
922 } else {
923 csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
924 csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
925 }
926
927 ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
928 &ordered_sums, false);
929 if (ret < 0) {
930 btrfs_abort_log_replay(wc, ret,
931 "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
932 csum_start, csum_end, wc->log_key.objectid,
933 btrfs_root_id(root));
934 goto out;
935 }
936 ret = 0;
937 /*
938 * Now delete all existing csums in the csum root that cover our range.
939 * We do this because we can have an extent that is completely
940 * referenced by one file extent item and partially referenced by
941 * another file extent item (like after using the clone or extent_same
942 * ioctls). In this case if we end up doing the replay of the one that
943 * partially references the extent first, and we do not do the csum
944 * deletion below, we can get 2 csum items in the csum tree that overlap
945 * each other. For example, imagine our log has the two following file
946 * extent items:
947 *
948 * key (257 EXTENT_DATA 409600)
949 * extent data disk byte 12845056 nr 102400
950 * extent data offset 20480 nr 20480 ram 102400
951 *
952 * key (257 EXTENT_DATA 819200)
953 * extent data disk byte 12845056 nr 102400
954 * extent data offset 0 nr 102400 ram 102400
955 *
956 * Where the second one fully references the 100K extent that starts at
957 * disk byte 12845056, and the log tree has a single csum item that
958 * covers the entire range of the extent:
959 *
960 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
961 *
962 * After the first file extent item is replayed, the csum tree gets the
963 * following csum item:
964 *
965 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
966 *
967 * Which covers the 20K sub-range starting at offset 20K of our extent.
968 * Now when we replay the second file extent item, if we do not delete
969 * existing csum items that cover any of its blocks, we end up getting
970 * two csum items in our csum tree that overlap each other:
971 *
972 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
973 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
974 *
975 * Which is a problem, because after this anyone trying to lookup for
976 * the checksum of any block of our extent starting at an offset of 40K
977 * or higher, will end up looking at the second csum item only, which
978 * does not contain the checksum for any block starting at offset 40K or
979 * higher of our extent.
980 */
981 while (!list_empty(&ordered_sums)) {
982 struct btrfs_ordered_sum *sums;
983 struct btrfs_root *csum_root;
984
985 sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
986 csum_root = btrfs_csum_root(fs_info, sums->logical);
987 if (!ret) {
988 ret = btrfs_del_csums(trans, csum_root, sums->logical,
989 sums->len);
990 if (ret)
991 btrfs_abort_log_replay(wc, ret,
992 "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
993 sums->logical,
994 sums->logical + sums->len,
995 wc->log_key.objectid,
996 btrfs_root_id(root));
997 }
998 if (!ret) {
999 ret = btrfs_csum_file_blocks(trans, csum_root, sums);
1000 if (ret)
1001 btrfs_abort_log_replay(wc, ret,
1002 "failed to add csums for range [%llu, %llu) inode %llu root %llu",
1003 sums->logical,
1004 sums->logical + sums->len,
1005 wc->log_key.objectid,
1006 btrfs_root_id(root));
1007 }
1008 list_del(&sums->list);
1009 kfree(sums);
1010 }
1011 if (ret)
1012 goto out;
1013
1014 update_inode:
1015 ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
1016 if (ret) {
1017 btrfs_abort_log_replay(wc, ret,
1018 "failed to set file extent range [%llu, %llu) inode %llu root %llu",
1019 start, extent_end, wc->log_key.objectid,
1020 btrfs_root_id(root));
1021 goto out;
1022 }
1023
1024 btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
1025 ret = btrfs_update_inode(trans, inode);
1026 if (ret)
1027 btrfs_abort_log_replay(wc, ret,
1028 "failed to update inode %llu root %llu",
1029 wc->log_key.objectid, btrfs_root_id(root));
1030 out:
1031 iput(&inode->vfs_inode);
1032 return ret;
1033 }
1034
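/*
 * Unlink a name from a directory during log replay and run delayed items so
 * that later name lookups in the subvolume tree no longer find it.
 */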
1035 static int unlink_inode_for_log_replay(struct walk_control *wc,
1036 struct btrfs_inode *dir,
1037 struct btrfs_inode *inode,
1038 const struct fscrypt_str *name)
1039 {
1040 struct btrfs_trans_handle *trans = wc->trans;
1041 int ret;
1042
1043 ret = btrfs_unlink_inode(trans, dir, inode, name);
1044 if (ret) {
1045 btrfs_abort_log_replay(wc, ret,
1046 "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
1047 btrfs_ino(inode), btrfs_ino(dir), name->len,
1048 name->name, btrfs_root_id(inode->root));
1049 return ret;
1050 }
1051 /*
1052 * Whenever we need to check if a name exists or not, we check the
1053 * fs/subvolume tree. So after an unlink we must run delayed items, so
1054 * that future checks for a name during log replay see that the name
1055 * does not exist anymore.
1056 */
1057 ret = btrfs_run_delayed_items(trans);
1058 if (ret)
1059 btrfs_abort_log_replay(wc, ret,
1060 "failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
1061 btrfs_ino(inode), btrfs_ino(dir), name->len,
1062 name->name, btrfs_root_id(inode->root));
1063
1064 return ret;
1065 }
1066
1067 /*
1068 * when cleaning up conflicts between the directory names in the
1069 * subvolume, directory names in the log and directory names in the
1070 * inode back references, we may have to unlink inodes from directories.
1071 *
1072 * This is a helper function to do the unlink of a specific directory
1073 * item
1074 */
1075 static noinline int drop_one_dir_item(struct walk_control *wc,
1076 struct btrfs_inode *dir,
1077 struct btrfs_dir_item *di)
1078 {
1079 struct btrfs_root *root = dir->root;
1080 struct btrfs_inode *inode;
1081 struct fscrypt_str name;
1082 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1083 struct btrfs_key location;
1084 int ret;
1085
1086 btrfs_dir_item_key_to_cpu(leaf, di, &location);
1087 ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
1088 if (ret) {
1089 btrfs_abort_log_replay(wc, ret,
1090 "failed to allocate name for dir %llu root %llu",
1091 btrfs_ino(dir), btrfs_root_id(root));
1092 return ret;
1093 }
1094
1095 btrfs_release_path(wc->subvol_path);
1096
1097 inode = btrfs_iget_logging(location.objectid, root);
1098 if (IS_ERR(inode)) {
1099 ret = PTR_ERR(inode);
1100 btrfs_abort_log_replay(wc, ret,
1101 "failed to open inode %llu parent dir %llu name %.*s root %llu",
1102 location.objectid, btrfs_ino(dir),
1103 name.len, name.name, btrfs_root_id(root));
1104 inode = NULL;
1105 goto out;
1106 }
1107
1108 ret = link_to_fixup_dir(wc, location.objectid);
1109 if (ret)
1110 goto out;
1111
1112 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
1113 out:
1114 kfree(name.name);
1115 if (inode)
1116 iput(&inode->vfs_inode);
1117 return ret;
1118 }
1119
1120 /*
1121 * See if a given name and sequence number found in an inode back reference are
1122 * already in a directory and correctly point to this inode.
1123 *
1124 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
1125 * exists.
1126 */
1127 static noinline int inode_in_dir(struct btrfs_root *root,
1128 struct btrfs_path *path,
1129 u64 dirid, u64 objectid, u64 index,
1130 struct fscrypt_str *name)
1131 {
1132 struct btrfs_dir_item *di;
1133 struct btrfs_key location;
1134 int ret = 0;
1135
1136 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
1137 index, name, 0);
1138 if (IS_ERR(di)) {
1139 ret = PTR_ERR(di);
1140 goto out;
1141 } else if (di) {
1142 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1143 if (location.objectid != objectid)
1144 goto out;
1145 } else {
1146 goto out;
1147 }
1148
1149 btrfs_release_path(path);
1150 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
1151 if (IS_ERR(di)) {
1152 ret = PTR_ERR(di);
1153 goto out;
1154 } else if (di) {
1155 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1156 if (location.objectid == objectid)
1157 ret = 1;
1158 }
1159 out:
1160 btrfs_release_path(path);
1161 return ret;
1162 }
1163
1164 /*
1165 * helper function to check a log tree for a named back reference in
1166 * an inode. This is used to decide if a back reference that is
1167 * found in the subvolume conflicts with what we find in the log.
1168 *
1169 * inode backreferences may have multiple refs in a single item,
1170 * during replay we process one reference at a time, and we don't
1171 * want to delete valid links to a file from the subvolume if that
1172 * link is also in the log.
1173 */
1174 static noinline int backref_in_log(struct btrfs_root *log,
1175 struct btrfs_key *key,
1176 u64 ref_objectid,
1177 const struct fscrypt_str *name)
1178 {
1179 BTRFS_PATH_AUTO_FREE(path);
1180 int ret;
1181
1182 path = btrfs_alloc_path();
1183 if (!path)
1184 return -ENOMEM;
1185
1186 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1187 if (ret < 0)
1188 return ret;
1189 if (ret == 1)
1190 return 0;
1191
1192 if (key->type == BTRFS_INODE_EXTREF_KEY)
1193 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1194 path->slots[0],
1195 ref_objectid, name);
1196 else
1197 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1198 path->slots[0], name);
1199 return ret;
1200 }
1201
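/*
 * Walk all names in the inode ref item currently at wc->subvol_path and unlink
 * the first one that is not present in the log tree. Returns -EAGAIN after an
 * unlink so the caller restarts its search, 0 if every name is also in the
 * log, or a negative error.
 */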
1202 static int unlink_refs_not_in_log(struct walk_control *wc,
1203 struct btrfs_key *search_key,
1204 struct btrfs_inode *dir,
1205 struct btrfs_inode *inode)
1206 {
1207 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1208 unsigned long ptr;
1209 unsigned long ptr_end;
1210
1211 /*
1212 * Check all the names in this back reference to see if they are in the
1213 * log. If so, we allow them to stay; otherwise they must be unlinked as
1214 * a conflict.
1215 */
1216 ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
1217 ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
1218 while (ptr < ptr_end) {
1219 struct fscrypt_str victim_name;
1220 struct btrfs_inode_ref *victim_ref;
1221 int ret;
1222
1223 victim_ref = (struct btrfs_inode_ref *)ptr;
1224 ret = read_alloc_one_name(leaf, (victim_ref + 1),
1225 btrfs_inode_ref_name_len(leaf, victim_ref),
1226 &victim_name);
1227 if (ret) {
1228 btrfs_abort_log_replay(wc, ret,
1229 "failed to allocate name for inode %llu parent dir %llu root %llu",
1230 btrfs_ino(inode), btrfs_ino(dir),
1231 btrfs_root_id(inode->root));
1232 return ret;
1233 }
1234
1235 ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
1236 if (ret) {
1237 if (ret < 0) {
1238 btrfs_abort_log_replay(wc, ret,
1239 "failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1240 btrfs_ino(inode), btrfs_ino(dir),
1241 victim_name.len, victim_name.name,
1242 btrfs_root_id(inode->root));
1243 kfree(victim_name.name);
1244 return ret;
1245 }
1246 kfree(victim_name.name);
1247 ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1248 continue;
1249 }
1250
1251 inc_nlink(&inode->vfs_inode);
1252 btrfs_release_path(wc->subvol_path);
1253
1254 ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
1255 kfree(victim_name.name);
1256 if (ret)
1257 return ret;
1258 return -EAGAIN;
1259 }
1260
1261 return 0;
1262 }
1263
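/*
 * Same as unlink_refs_not_in_log(), but for extended ref items: unlink the
 * first name pointing to the given parent directory that is not in the log
 * tree, returning -EAGAIN so the caller restarts its search.
 */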
1264 static int unlink_extrefs_not_in_log(struct walk_control *wc,
1265 struct btrfs_key *search_key,
1266 struct btrfs_inode *dir,
1267 struct btrfs_inode *inode)
1268 {
1269 struct extent_buffer *leaf = wc->subvol_path->nodes[0];
1270 const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
1271 const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
1272 u32 cur_offset = 0;
1273
1274 while (cur_offset < item_size) {
1275 struct btrfs_root *log_root = wc->log;
1276 struct btrfs_inode_extref *extref;
1277 struct fscrypt_str victim_name;
1278 int ret;
1279
1280 extref = (struct btrfs_inode_extref *)(base + cur_offset);
1281 victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
1282
1283 if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
1284 goto next;
1285
1286 ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
1287 &victim_name);
1288 if (ret) {
1289 btrfs_abort_log_replay(wc, ret,
1290 "failed to allocate name for inode %llu parent dir %llu root %llu",
1291 btrfs_ino(inode), btrfs_ino(dir),
1292 btrfs_root_id(inode->root));
1293 return ret;
1294 }
1295
1296 search_key->objectid = btrfs_ino(inode);
1297 search_key->type = BTRFS_INODE_EXTREF_KEY;
1298 search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
1299 victim_name.name,
1300 victim_name.len);
1301 ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
1302 if (ret) {
1303 if (ret < 0) {
1304 btrfs_abort_log_replay(wc, ret,
1305 "failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
1306 btrfs_ino(inode), btrfs_ino(dir),
1307 victim_name.len, victim_name.name,
1308 btrfs_root_id(inode->root));
1309 kfree(victim_name.name);
1310 return ret;
1311 }
1312 kfree(victim_name.name);
1313 next:
1314 cur_offset += victim_name.len + sizeof(*extref);
1315 continue;
1316 }
1317
1318 inc_nlink(&inode->vfs_inode);
1319 btrfs_release_path(wc->subvol_path);
1320
1321 ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
1322 kfree(victim_name.name);
1323 if (ret)
1324 return ret;
1325 return -EAGAIN;
1326 }
1327
1328 return 0;
1329 }
1330
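/*
 * Resolve conflicts before adding a name during log replay: unlink any names
 * for the inode that exist in the subvolume tree but not in the log, and drop
 * any dir item or dir index item that clashes with the name or index we are
 * about to add. Returns 1 if the back reference is for the root directory
 * itself, 0 on success or a negative error.
 */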
1331 static inline int __add_inode_ref(struct walk_control *wc,
1332 struct btrfs_inode *dir,
1333 struct btrfs_inode *inode,
1334 u64 ref_index, struct fscrypt_str *name)
1335 {
1336 int ret;
1337 struct btrfs_trans_handle *trans = wc->trans;
1338 struct btrfs_root *root = wc->root;
1339 struct btrfs_dir_item *di;
1340 struct btrfs_key search_key;
1341 struct btrfs_inode_extref *extref;
1342
1343 again:
1344 /* Search old style refs */
1345 search_key.objectid = btrfs_ino(inode);
1346 search_key.type = BTRFS_INODE_REF_KEY;
1347 search_key.offset = btrfs_ino(dir);
1348 ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
1349 if (ret < 0) {
1350 btrfs_abort_log_replay(wc, ret,
1351 "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
1352 BTRFS_KEY_FMT_VALUE(&search_key),
1353 btrfs_root_id(root));
1354 return ret;
1355 } else if (ret == 0) {
1356 /*
1357 * Are we trying to overwrite a back ref for the root directory?
1358 * If so, we're done.
1359 */
1360 if (search_key.objectid == search_key.offset)
1361 return 1;
1362
1363 ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
1364 if (ret == -EAGAIN)
1365 goto again;
1366 else if (ret)
1367 return ret;
1368 }
1369 btrfs_release_path(wc->subvol_path);
1370
1371 /* Same search but for extended refs */
1372 extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
1373 btrfs_ino(inode), btrfs_ino(dir));
1374 if (IS_ERR(extref)) {
1375 return PTR_ERR(extref);
1376 } else if (extref) {
1377 ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
1378 if (ret == -EAGAIN)
1379 goto again;
1380 else if (ret)
1381 return ret;
1382 }
1383 btrfs_release_path(wc->subvol_path);
1384
1385 /* look for a conflicting sequence number */
1386 di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
1387 ref_index, name, 0);
1388 if (IS_ERR(di)) {
1389 ret = PTR_ERR(di);
1390 btrfs_abort_log_replay(wc, ret,
1391 "failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
1392 btrfs_ino(dir), ref_index, name->len,
1393 name->name, btrfs_root_id(root));
1394 return ret;
1395 } else if (di) {
1396 ret = drop_one_dir_item(wc, dir, di);
1397 if (ret)
1398 return ret;
1399 }
1400 btrfs_release_path(wc->subvol_path);
1401
1402 /* look for a conflicting name */
1403 di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
1404 if (IS_ERR(di)) {
1405 ret = PTR_ERR(di);
1406 btrfs_abort_log_replay(wc, ret,
1407 "failed to lookup dir item for dir %llu name %.*s root %llu",
1408 btrfs_ino(dir), name->len, name->name,
1409 btrfs_root_id(root));
1410 return ret;
1411 } else if (di) {
1412 ret = drop_one_dir_item(wc, dir, di);
1413 if (ret)
1414 return ret;
1415 }
1416 btrfs_release_path(wc->subvol_path);
1417
1418 return 0;
1419 }
1420
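/* Read the name, index and parent objectid out of an extended inode ref. */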
1421 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1422 struct fscrypt_str *name, u64 *index,
1423 u64 *parent_objectid)
1424 {
1425 struct btrfs_inode_extref *extref;
1426 int ret;
1427
1428 extref = (struct btrfs_inode_extref *)ref_ptr;
1429
1430 ret = read_alloc_one_name(eb, &extref->name,
1431 btrfs_inode_extref_name_len(eb, extref), name);
1432 if (ret)
1433 return ret;
1434
1435 if (index)
1436 *index = btrfs_inode_extref_index(eb, extref);
1437 if (parent_objectid)
1438 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1439
1440 return 0;
1441 }
1442
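/* Read the name and index out of a regular inode ref. */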
1443 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1444 struct fscrypt_str *name, u64 *index)
1445 {
1446 struct btrfs_inode_ref *ref;
1447 int ret;
1448
1449 ref = (struct btrfs_inode_ref *)ref_ptr;
1450
1451 ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1452 name);
1453 if (ret)
1454 return ret;
1455
1456 if (index)
1457 *index = btrfs_inode_ref_index(eb, ref);
1458
1459 return 0;
1460 }
1461
1462 /*
1463 * Take an inode reference item from the log tree and iterate all names from the
1464 * inode reference item in the subvolume tree with the same key (if it exists).
1465 * For any name that is not in the inode reference item from the log tree, do a
1466 * proper unlink of that name (that is, remove its entry from the inode
1467 * reference item and both dir index keys).
1468 */
1469 static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
1470 {
1471 struct btrfs_root *root = wc->root;
1472 int ret;
1473 unsigned long ref_ptr;
1474 unsigned long ref_end;
1475 struct extent_buffer *eb;
1476
1477 again:
1478 btrfs_release_path(wc->subvol_path);
1479 ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
1480 if (ret > 0) {
1481 ret = 0;
1482 goto out;
1483 }
1484 if (ret < 0) {
1485 btrfs_abort_log_replay(wc, ret,
1486 "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu",
1487 BTRFS_KEY_FMT_VALUE(&wc->log_key),
1488 btrfs_root_id(root));
1489 goto out;
1490 }
1491
1492 eb = wc->subvol_path->nodes[0];
1493 ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
1494 ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
1495 while (ref_ptr < ref_end) {
1496 struct fscrypt_str name;
1497 u64 parent_id;
1498
1499 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
1500 ret = extref_get_fields(eb, ref_ptr, &name,
1501 NULL, &parent_id);
1502 if (ret) {
1503 btrfs_abort_log_replay(wc, ret,
1504 "failed to get extref details for inode %llu root %llu",
1505 btrfs_ino(inode),
1506 btrfs_root_id(root));
1507 goto out;
1508 }
1509 } else {
1510 parent_id = wc->log_key.offset;
1511 ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1512 if (ret) {
1513 btrfs_abort_log_replay(wc, ret,
1514 "failed to get ref details for inode %llu parent_id %llu root %llu",
1515 btrfs_ino(inode), parent_id,
1516 btrfs_root_id(root));
1517 goto out;
1518 }
1519 }
1520
1521 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1522 ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
1523 parent_id, &name);
1524 else
1525 ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
1526 &name);
1527
1528 if (!ret) {
1529 struct btrfs_inode *dir;
1530
1531 btrfs_release_path(wc->subvol_path);
1532 dir = btrfs_iget_logging(parent_id, root);
1533 if (IS_ERR(dir)) {
1534 ret = PTR_ERR(dir);
1535 kfree(name.name);
1536 btrfs_abort_log_replay(wc, ret,
1537 "failed to lookup dir inode %llu root %llu",
1538 parent_id, btrfs_root_id(root));
1539 goto out;
1540 }
1541 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
1542 kfree(name.name);
1543 iput(&dir->vfs_inode);
1544 if (ret)
1545 goto out;
1546 goto again;
1547 }
1548
1549 kfree(name.name);
1550 ref_ptr += name.len;
1551 if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
1552 ref_ptr += sizeof(struct btrfs_inode_extref);
1553 else
1554 ref_ptr += sizeof(struct btrfs_inode_ref);
1555 }
1556 ret = 0;
1557 out:
1558 btrfs_release_path(wc->subvol_path);
1559 return ret;
1560 }
1561
1562 /*
1563 * Replay one inode back reference item found in the log tree.
1564 * Path is for temporary use by this function (it should be released on return).
1565 */
1566 static noinline int add_inode_ref(struct walk_control *wc)
1567 {
1568 struct btrfs_trans_handle *trans = wc->trans;
1569 struct btrfs_root *root = wc->root;
1570 struct btrfs_inode *dir = NULL;
1571 struct btrfs_inode *inode = NULL;
1572 unsigned long ref_ptr;
1573 unsigned long ref_end;
1574 struct fscrypt_str name = { 0 };
1575 int ret;
1576 const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
1577 u64 parent_objectid;
1578 u64 inode_objectid;
1579 u64 ref_index = 0;
1580 int ref_struct_size;
1581
1582 ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
1583 ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
1584
1585 if (is_extref_item) {
1586 struct btrfs_inode_extref *r;
1587
1588 ref_struct_size = sizeof(struct btrfs_inode_extref);
1589 r = (struct btrfs_inode_extref *)ref_ptr;
1590 parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
1591 } else {
1592 ref_struct_size = sizeof(struct btrfs_inode_ref);
1593 parent_objectid = wc->log_key.offset;
1594 }
1595 inode_objectid = wc->log_key.objectid;
1596
1597 /*
1598 * it is possible that we didn't log all the parent directories
1599 * for a given inode. If we don't find the dir, just don't
1600 * copy the back ref in. The link count fixup code will take
1601 * care of the rest
1602 */
1603 dir = btrfs_iget_logging(parent_objectid, root);
1604 if (IS_ERR(dir)) {
1605 ret = PTR_ERR(dir);
1606 if (ret == -ENOENT)
1607 ret = 0;
1608 else
1609 btrfs_abort_log_replay(wc, ret,
1610 "failed to lookup dir inode %llu root %llu",
1611 parent_objectid, btrfs_root_id(root));
1612 dir = NULL;
1613 goto out;
1614 }
1615
1616 inode = btrfs_iget_logging(inode_objectid, root);
1617 if (IS_ERR(inode)) {
1618 ret = PTR_ERR(inode);
1619 btrfs_abort_log_replay(wc, ret,
1620 "failed to lookup inode %llu root %llu",
1621 inode_objectid, btrfs_root_id(root));
1622 inode = NULL;
1623 goto out;
1624 }
1625
1626 while (ref_ptr < ref_end) {
1627 if (is_extref_item) {
1628 ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
1629 &ref_index, &parent_objectid);
1630 if (ret) {
1631 btrfs_abort_log_replay(wc, ret,
1632 "failed to get extref details for inode %llu root %llu",
1633 btrfs_ino(inode),
1634 btrfs_root_id(root));
1635 goto out;
1636 }
1637 /*
1638 * parent object can change from one array
1639 * item to another.
1640 */
1641 if (!dir) {
1642 dir = btrfs_iget_logging(parent_objectid, root);
1643 if (IS_ERR(dir)) {
1644 ret = PTR_ERR(dir);
1645 dir = NULL;
1646 /*
1647 * A new parent dir may have not been
1648 * logged and not exist in the subvolume
1649 * tree, see the comment above before
1650 * the loop when getting the first
1651 * parent dir.
1652 */
1653 if (ret == -ENOENT) {
1654 /*
1655 * The next extref may refer to
1656 * another parent dir that
1657 * exists, so continue.
1658 */
1659 ret = 0;
1660 goto next;
1661 } else {
1662 btrfs_abort_log_replay(wc, ret,
1663 "failed to lookup dir inode %llu root %llu",
1664 parent_objectid,
1665 btrfs_root_id(root));
1666 }
1667 goto out;
1668 }
1669 }
1670 } else {
1671 ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
1672 if (ret) {
1673 btrfs_abort_log_replay(wc, ret,
1674 "failed to get ref details for inode %llu parent_objectid %llu root %llu",
1675 btrfs_ino(inode),
1676 parent_objectid,
1677 btrfs_root_id(root));
1678 goto out;
1679 }
1680 }
1681
1682 ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
1683 btrfs_ino(inode), ref_index, &name);
1684 if (ret < 0) {
1685 btrfs_abort_log_replay(wc, ret,
1686 "failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
1687 btrfs_ino(inode), btrfs_ino(dir),
1688 ref_index, name.len, name.name,
1689 btrfs_root_id(root));
1690 goto out;
1691 } else if (ret == 0) {
1692 /*
1693 * look for a conflicting back reference in the
1694 * metadata. if we find one we have to unlink that name
1695 * of the file before we add our new link. Later on, we
1696 * overwrite any existing back reference, and we don't
1697 * want to create dangling pointers in the directory.
1698 */
1699 ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
1700 if (ret) {
1701 if (ret == 1)
1702 ret = 0;
1703 goto out;
1704 }
1705
1706 /* insert our name */
1707 ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
1708 if (ret) {
1709 btrfs_abort_log_replay(wc, ret,
1710 "failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
1711 btrfs_ino(inode),
1712 btrfs_ino(dir), ref_index,
1713 name.len, name.name,
1714 btrfs_root_id(root));
1715 goto out;
1716 }
1717
1718 ret = btrfs_update_inode(trans, inode);
1719 if (ret) {
1720 btrfs_abort_log_replay(wc, ret,
1721 "failed to update inode %llu root %llu",
1722 btrfs_ino(inode),
1723 btrfs_root_id(root));
1724 goto out;
1725 }
1726 }
1727 /* Else, ret == 1, we already have a perfect match, we're done. */
1728
1729 next:
1730 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1731 kfree(name.name);
1732 name.name = NULL;
1733 if (is_extref_item && dir) {
1734 iput(&dir->vfs_inode);
1735 dir = NULL;
1736 }
1737 }
1738
1739 /*
1740 * Before we overwrite the inode reference item in the subvolume tree
1741 * with the item from the log tree, we must unlink all names from the
1742 * parent directory that are in the subvolume's tree inode reference
1743 * item, otherwise we end up with an inconsistent subvolume tree where
1744 * dir index entries exist for a name but there is no inode reference
1745 * item with the same name.
1746 */
1747 ret = unlink_old_inode_refs(wc, inode);
1748 if (ret)
1749 goto out;
1750
1751 /* finally write the back reference in the inode */
1752 ret = overwrite_item(wc);
1753 out:
1754 btrfs_release_path(wc->subvol_path);
1755 kfree(name.name);
1756 if (dir)
1757 iput(&dir->vfs_inode);
1758 if (inode)
1759 iput(&inode->vfs_inode);
1760 return ret;
1761 }
1762
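/*
 * Count the number of names stored in BTRFS_INODE_EXTREF_KEY items for
 * @inode. Extended refs are located through btrfs_find_one_extref() and, as
 * with regular refs, a single item can pack several (extref struct + name)
 * entries, each of which accounts for one link.
 */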
1763 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1764 {
1765 int ret = 0;
1766 int name_len;
1767 unsigned int nlink = 0;
1768 u32 item_size;
1769 u32 cur_offset = 0;
1770 u64 inode_objectid = btrfs_ino(inode);
1771 u64 offset = 0;
1772 unsigned long ptr;
1773 struct btrfs_inode_extref *extref;
1774 struct extent_buffer *leaf;
1775
1776 while (1) {
1777 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1778 path, &extref, &offset);
1779 if (ret)
1780 break;
1781
1782 leaf = path->nodes[0];
1783 item_size = btrfs_item_size(leaf, path->slots[0]);
1784 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1785 cur_offset = 0;
1786
1787 while (cur_offset < item_size) {
1788 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1789 name_len = btrfs_inode_extref_name_len(leaf, extref);
1790
1791 nlink++;
1792
1793 cur_offset += name_len + sizeof(*extref);
1794 }
1795
1796 offset++;
1797 btrfs_release_path(path);
1798 }
1799 btrfs_release_path(path);
1800
1801 if (ret < 0 && ret != -ENOENT)
1802 return ret;
1803 return nlink;
1804 }
1805
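/*
 * Count the number of names stored in regular BTRFS_INODE_REF_KEY items for
 * @inode, walking the items from the highest key offset (the parent directory
 * objectid) backwards. Each item can pack several (ref struct + name) entries,
 * one per link from the same parent directory.
 */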
1806 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1807 {
1808 int ret;
1809 struct btrfs_key key;
1810 unsigned int nlink = 0;
1811 unsigned long ptr;
1812 unsigned long ptr_end;
1813 int name_len;
1814 u64 ino = btrfs_ino(inode);
1815
1816 key.objectid = ino;
1817 key.type = BTRFS_INODE_REF_KEY;
1818 key.offset = (u64)-1;
1819
1820 while (1) {
1821 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1822 if (ret < 0)
1823 break;
1824 if (ret > 0) {
1825 if (path->slots[0] == 0)
1826 break;
1827 path->slots[0]--;
1828 }
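/*
 * We searched for the highest possible offset, so walk the leaf backwards
 * (and re-search when we run off its start) until we no longer find
 * BTRFS_INODE_REF_KEY items for this inode.
 */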
1829 process_slot:
1830 btrfs_item_key_to_cpu(path->nodes[0], &key,
1831 path->slots[0]);
1832 if (key.objectid != ino ||
1833 key.type != BTRFS_INODE_REF_KEY)
1834 break;
1835 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1836 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1837 path->slots[0]);
1838 while (ptr < ptr_end) {
1839 struct btrfs_inode_ref *ref;
1840
1841 ref = (struct btrfs_inode_ref *)ptr;
1842 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1843 ref);
1844 ptr = (unsigned long)(ref + 1) + name_len;
1845 nlink++;
1846 }
1847
1848 if (key.offset == 0)
1849 break;
1850 if (path->slots[0] > 0) {
1851 path->slots[0]--;
1852 goto process_slot;
1853 }
1854 key.offset--;
1855 btrfs_release_path(path);
1856 }
1857 btrfs_release_path(path);
1858
1859 return nlink;
1860 }
1861
1862 /*
1863 * There are a few corners where the link count of the file can't
1864 * be properly maintained during replay. So, instead of adding
1865 * lots of complexity to the log code, we just scan the backrefs
1866 * for any file that has been through replay.
1867 *
1868 * The scan will update the link count on the inode to reflect the
1869 * number of back refs found. If it goes down to zero, the iput
1870 * will free the inode.
1871 */
1872 static noinline int fixup_inode_link_count(struct walk_control *wc,
1873 struct btrfs_inode *inode)
1874 {
1875 struct btrfs_trans_handle *trans = wc->trans;
1876 struct btrfs_root *root = inode->root;
1877 int ret;
1878 u64 nlink = 0;
1879 const u64 ino = btrfs_ino(inode);
1880
1881 ret = count_inode_refs(inode, wc->subvol_path);
1882 if (ret < 0)
1883 goto out;
1884
1885 nlink = ret;
1886
1887 ret = count_inode_extrefs(inode, wc->subvol_path);
1888 if (ret < 0)
1889 goto out;
1890
1891 nlink += ret;
1892
1893 ret = 0;
1894
1895 if (nlink != inode->vfs_inode.i_nlink) {
1896 set_nlink(&inode->vfs_inode, nlink);
1897 ret = btrfs_update_inode(trans, inode);
1898 if (ret)
1899 goto out;
1900 }
1901 if (S_ISDIR(inode->vfs_inode.i_mode))
1902 inode->index_cnt = (u64)-1;
1903
1904 if (inode->vfs_inode.i_nlink == 0) {
1905 if (S_ISDIR(inode->vfs_inode.i_mode)) {
1906 ret = replay_dir_deletes(wc, ino, true);
1907 if (ret)
1908 goto out;
1909 }
1910 ret = btrfs_insert_orphan_item(trans, root, ino);
1911 if (ret == -EEXIST)
1912 ret = 0;
1913 }
1914
1915 out:
1916 btrfs_release_path(wc->subvol_path);
1917 return ret;
1918 }
1919
1920 static noinline int fixup_inode_link_counts(struct walk_control *wc)
1921 {
1922 int ret;
1923 struct btrfs_key key;
1924
1925 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1926 key.type = BTRFS_ORPHAN_ITEM_KEY;
1927 key.offset = (u64)-1;
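/*
 * Fixup entries are keyed as (BTRFS_TREE_LOG_FIXUP_OBJECTID,
 * BTRFS_ORPHAN_ITEM_KEY, inode objectid), so searching from the highest
 * possible offset and walking backwards visits every inode that was linked
 * into the fixup dir; each entry is deleted right before the inode's link
 * count is recomputed.
 */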
1928 while (1) {
1929 struct btrfs_trans_handle *trans = wc->trans;
1930 struct btrfs_root *root = wc->root;
1931 struct btrfs_inode *inode;
1932
1933 ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
1934 if (ret < 0)
1935 break;
1936
1937 if (ret == 1) {
1938 ret = 0;
1939 if (wc->subvol_path->slots[0] == 0)
1940 break;
1941 wc->subvol_path->slots[0]--;
1942 }
1943
1944 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
1945 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1946 key.type != BTRFS_ORPHAN_ITEM_KEY)
1947 break;
1948
1949 ret = btrfs_del_item(trans, root, wc->subvol_path);
1950 if (ret)
1951 break;
1952
1953 btrfs_release_path(wc->subvol_path);
1954 inode = btrfs_iget_logging(key.offset, root);
1955 if (IS_ERR(inode)) {
1956 ret = PTR_ERR(inode);
1957 break;
1958 }
1959
1960 ret = fixup_inode_link_count(wc, inode);
1961 iput(&inode->vfs_inode);
1962 if (ret)
1963 break;
1964
1965 /*
1966 * fixup on a directory may create new entries,
1967 * make sure we always look for the highest possible
1968 * offset
1969 */
1970 key.offset = (u64)-1;
1971 }
1972 btrfs_release_path(wc->subvol_path);
1973 return ret;
1974 }
1975
1976
1977 /*
1978 * record a given inode in the fixup dir so we can check its link
1979 * count when replay is done. The link count is incremented here
1980 * so the inode won't go away until we check it
1981 */
1982 static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
1983 {
1984 struct btrfs_trans_handle *trans = wc->trans;
1985 struct btrfs_root *root = wc->root;
1986 struct btrfs_key key;
1987 int ret = 0;
1988 struct btrfs_inode *inode;
1989 struct inode *vfs_inode;
1990
1991 inode = btrfs_iget_logging(objectid, root);
1992 if (IS_ERR(inode)) {
1993 ret = PTR_ERR(inode);
1994 btrfs_abort_log_replay(wc, ret,
1995 "failed to lookup inode %llu root %llu",
1996 objectid, btrfs_root_id(root));
1997 return ret;
1998 }
1999
2000 vfs_inode = &inode->vfs_inode;
2001 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
2002 key.type = BTRFS_ORPHAN_ITEM_KEY;
2003 key.offset = objectid;
2004
2005 ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
2006
2007 btrfs_release_path(wc->subvol_path);
2008 if (ret == 0) {
2009 if (!vfs_inode->i_nlink)
2010 set_nlink(vfs_inode, 1);
2011 else
2012 inc_nlink(vfs_inode);
2013 ret = btrfs_update_inode(trans, inode);
2014 if (ret)
2015 btrfs_abort_log_replay(wc, ret,
2016 "failed to update inode %llu root %llu",
2017 objectid, btrfs_root_id(root));
2018 } else if (ret == -EEXIST) {
2019 ret = 0;
2020 } else {
2021 btrfs_abort_log_replay(wc, ret,
2022 "failed to insert fixup item for inode %llu root %llu",
2023 objectid, btrfs_root_id(root));
2024 }
2025 iput(vfs_inode);
2026
2027 return ret;
2028 }
2029
2030 /*
2031 * when replaying the log for a directory, we only insert names
2032 * for inodes that actually exist. This means an fsync on a directory
2033 * does not implicitly fsync all the new files in it
2034 */
2035 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *root,
2037 u64 dirid, u64 index,
2038 const struct fscrypt_str *name,
2039 struct btrfs_key *location)
2040 {
2041 struct btrfs_inode *inode;
2042 struct btrfs_inode *dir;
2043 int ret;
2044
2045 inode = btrfs_iget_logging(location->objectid, root);
2046 if (IS_ERR(inode))
2047 return PTR_ERR(inode);
2048
2049 dir = btrfs_iget_logging(dirid, root);
2050 if (IS_ERR(dir)) {
2051 iput(&inode->vfs_inode);
2052 return PTR_ERR(dir);
2053 }
2054
2055 ret = btrfs_add_link(trans, dir, inode, name, 1, index);
2056
2057 /* FIXME, put inode into FIXUP list */
2058
2059 iput(&inode->vfs_inode);
2060 iput(&dir->vfs_inode);
2061 return ret;
2062 }
2063
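/*
 * Returns 1 if the existing directory entry already matches the entry from
 * the log (nothing to do), 0 if it is left in place because the inode for the
 * new entry does not exist, and otherwise the result of drop_one_dir_item()
 * (0 on success or a negative errno).
 */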
2064 static int delete_conflicting_dir_entry(struct walk_control *wc,
2065 struct btrfs_inode *dir,
2066 struct btrfs_dir_item *dst_di,
2067 const struct btrfs_key *log_key,
2068 u8 log_flags,
2069 bool exists)
2070 {
2071 struct btrfs_key found_key;
2072
2073 btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
2074 /* The existing dentry points to the same inode, don't delete it. */
2075 if (found_key.objectid == log_key->objectid &&
2076 found_key.type == log_key->type &&
2077 found_key.offset == log_key->offset &&
2078 btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
2079 return 1;
2080
2081 /*
2082 * Don't drop the conflicting directory entry if the inode for the new
2083 * entry doesn't exist.
2084 */
2085 if (!exists)
2086 return 0;
2087
2088 return drop_one_dir_item(wc, dir, dst_di);
2089 }
2090
2091 /*
2092 * take a single entry in a log directory item and replay it into
2093 * the subvolume.
2094 *
2095 * if a conflicting item exists in the subdirectory already,
2096 * the inode it points to is unlinked and put into the link count
2097 * fix up tree.
2098 *
2099 * If a name from the log points to a file or directory that does
2100 * not exist in the FS, it is skipped. fsyncs on directories
2101 * do not force down inodes inside that directory, just changes to the
2102 * names or unlinks in a directory.
2103 *
2104 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
2105 * non-existing inode) and 1 if the name was replayed.
2106 */
2107 static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
2108 {
2109 struct btrfs_trans_handle *trans = wc->trans;
2110 struct btrfs_root *root = wc->root;
2111 struct fscrypt_str name = { 0 };
2112 struct btrfs_dir_item *dir_dst_di;
2113 struct btrfs_dir_item *index_dst_di;
2114 bool dir_dst_matches = false;
2115 bool index_dst_matches = false;
2116 struct btrfs_key log_key;
2117 struct btrfs_key search_key;
2118 struct btrfs_inode *dir;
2119 u8 log_flags;
2120 bool exists;
2121 int ret;
2122 bool update_size = true;
2123 bool name_added = false;
2124
2125 dir = btrfs_iget_logging(wc->log_key.objectid, root);
2126 if (IS_ERR(dir)) {
2127 ret = PTR_ERR(dir);
2128 btrfs_abort_log_replay(wc, ret,
2129 "failed to lookup dir inode %llu root %llu",
2130 wc->log_key.objectid, btrfs_root_id(root));
2131 return ret;
2132 }
2133
2134 ret = read_alloc_one_name(wc->log_leaf, di + 1,
2135 btrfs_dir_name_len(wc->log_leaf, di), &name);
2136 if (ret) {
2137 btrfs_abort_log_replay(wc, ret,
2138 "failed to allocate name for dir %llu root %llu",
2139 btrfs_ino(dir), btrfs_root_id(root));
2140 goto out;
2141 }
2142
2143 log_flags = btrfs_dir_flags(wc->log_leaf, di);
2144 btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
2145 ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
2146 btrfs_release_path(wc->subvol_path);
2147 if (ret < 0) {
2148 btrfs_abort_log_replay(wc, ret,
2149 "failed to lookup inode %llu root %llu",
2150 log_key.objectid, btrfs_root_id(root));
2151 goto out;
2152 }
2153 exists = (ret == 0);
2154 ret = 0;
2155
2156 dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
2157 wc->log_key.objectid, &name, 1);
2158 if (IS_ERR(dir_dst_di)) {
2159 ret = PTR_ERR(dir_dst_di);
2160 btrfs_abort_log_replay(wc, ret,
2161 "failed to lookup dir item for dir %llu name %.*s root %llu",
2162 wc->log_key.objectid, name.len, name.name,
2163 btrfs_root_id(root));
2164 goto out;
2165 } else if (dir_dst_di) {
2166 ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
2167 &log_key, log_flags, exists);
2168 if (ret < 0) {
2169 btrfs_abort_log_replay(wc, ret,
2170 "failed to delete conflicting entry for dir %llu name %.*s root %llu",
2171 btrfs_ino(dir), name.len, name.name,
2172 btrfs_root_id(root));
2173 goto out;
2174 }
2175 dir_dst_matches = (ret == 1);
2176 }
2177
2178 btrfs_release_path(wc->subvol_path);
2179
2180 index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
2181 wc->log_key.objectid,
2182 wc->log_key.offset, &name, 1);
2183 if (IS_ERR(index_dst_di)) {
2184 ret = PTR_ERR(index_dst_di);
2185 btrfs_abort_log_replay(wc, ret,
2186 "failed to lookup dir index item for dir %llu name %.*s root %llu",
2187 wc->log_key.objectid, name.len, name.name,
2188 btrfs_root_id(root));
2189 goto out;
2190 } else if (index_dst_di) {
2191 ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
2192 &log_key, log_flags, exists);
2193 if (ret < 0) {
2194 btrfs_abort_log_replay(wc, ret,
2195 "failed to delete conflicting entry for dir %llu name %.*s root %llu",
2196 btrfs_ino(dir), name.len, name.name,
2197 btrfs_root_id(root));
2198 goto out;
2199 }
2200 index_dst_matches = (ret == 1);
2201 }
2202
2203 btrfs_release_path(wc->subvol_path);
2204
2205 if (dir_dst_matches && index_dst_matches) {
2206 ret = 0;
2207 update_size = false;
2208 goto out;
2209 }
2210
2211 /*
2212 * Check if the inode reference exists in the log for the given name,
2213 * inode and parent inode
2214 */
2215 search_key.objectid = log_key.objectid;
2216 search_key.type = BTRFS_INODE_REF_KEY;
2217 search_key.offset = wc->log_key.objectid;
2218 ret = backref_in_log(root->log_root, &search_key, 0, &name);
2219 if (ret < 0) {
2220 btrfs_abort_log_replay(wc, ret,
2221 "failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
2222 search_key.objectid, btrfs_ino(dir),
2223 name.len, name.name, btrfs_root_id(root));
2224 goto out;
2225 } else if (ret) {
2226 /* The dentry will be added later. */
2227 ret = 0;
2228 update_size = false;
2229 goto out;
2230 }
2231
2232 search_key.objectid = log_key.objectid;
2233 search_key.type = BTRFS_INODE_EXTREF_KEY;
2234 search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
2235 ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
2236 if (ret < 0) {
2237 btrfs_abort_log_replay(wc, ret,
2238 "failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
2239 search_key.objectid, btrfs_ino(dir),
2240 name.len, name.name, btrfs_root_id(root));
2241 goto out;
2242 } else if (ret) {
2243 /* The dentry will be added later. */
2244 ret = 0;
2245 update_size = false;
2246 goto out;
2247 }
2248 ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
2249 &name, &log_key);
2250 if (ret && ret != -ENOENT && ret != -EEXIST) {
2251 btrfs_abort_log_replay(wc, ret,
2252 "failed to insert name %.*s for inode %llu dir %llu root %llu",
2253 name.len, name.name, log_key.objectid,
2254 btrfs_ino(dir), btrfs_root_id(root));
2255 goto out;
2256 }
2257 if (!ret)
2258 name_added = true;
2259 update_size = false;
2260 ret = 0;
2261
2262 out:
2263 if (!ret && update_size) {
2264 btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
2265 ret = btrfs_update_inode(trans, dir);
2266 if (ret)
2267 btrfs_abort_log_replay(wc, ret,
2268 "failed to update dir inode %llu root %llu",
2269 btrfs_ino(dir), btrfs_root_id(root));
2270 }
2271 kfree(name.name);
2272 iput(&dir->vfs_inode);
2273 if (!ret && name_added)
2274 ret = 1;
2275 return ret;
2276 }
2277
2278 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
2279 static noinline int replay_one_dir_item(struct walk_control *wc)
2280 {
2281 int ret;
2282 struct btrfs_dir_item *di;
2283
2284 /* We only log dir index keys, which only contain a single dir item. */
2285 ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY,
2286 "wc->log_key.type=%u", wc->log_key.type);
2287
2288 di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
2289 ret = replay_one_name(wc, di);
2290 if (ret < 0)
2291 return ret;
2292
2293 /*
2294 * If this entry refers to a non-directory (directories cannot have a
2295 * link count > 1) and it was added in the transaction that was not
2296 * committed, make sure we fix up the link count of the inode the entry
2297 * points to. Otherwise something like the following would result in a
2298 * directory pointing to an inode with a wrong link count that does not account
2299 * for this dir entry:
2300 *
2301 * mkdir testdir
2302 * touch testdir/foo
2303 * touch testdir/bar
2304 * sync
2305 *
2306 * ln testdir/bar testdir/bar_link
2307 * ln testdir/foo testdir/foo_link
2308 * xfs_io -c "fsync" testdir/bar
2309 *
2310 * <power failure>
2311 *
2312 * mount fs, log replay happens
2313 *
2314 * File foo would remain with a link count of 1 when it has two entries
2315 * pointing to it in the directory testdir. This would make it impossible
2316 * to ever delete the parent directory as it would result in stale
2317 * dentries that can never be deleted.
2318 */
2319 if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
2320 struct btrfs_key di_key;
2321
2322 btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
2323 ret = link_to_fixup_dir(wc, di_key.objectid);
2324 }
2325
2326 return ret;
2327 }
2328
2329 /*
2330 * directory replay has two parts. There are the standard directory
2331 * items in the log copied from the subvolume, and range items
2332 * created in the log while the subvolume was logged.
2333 *
2334 * The range items tell us which parts of the key space the log
2335 * is authoritative for. During replay, if a key in the subvolume
2336 * directory is in a logged range item, but not actually in the log
2337 * that means it was deleted from the directory before the fsync
2338 * and should be removed.
2339 */
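/*
 * For example: if the log has a dir log item covering index range [2, 5] for
 * a directory, and the subvolume still contains a dir index key with offset 3
 * that has no matching entry in the log, then that entry was deleted before
 * the fsync and replay removes it (see check_item_in_log()).
 */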
2340 static noinline int find_dir_range(struct btrfs_root *root,
2341 struct btrfs_path *path,
2342 u64 dirid,
2343 u64 *start_ret, u64 *end_ret)
2344 {
2345 struct btrfs_key key;
2346 u64 found_end;
2347 struct btrfs_dir_log_item *item;
2348 int ret;
2349 int nritems;
2350
2351 if (*start_ret == (u64)-1)
2352 return 1;
2353
2354 key.objectid = dirid;
2355 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2356 key.offset = *start_ret;
2357
2358 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2359 if (ret < 0)
2360 goto out;
2361 if (ret > 0) {
2362 if (path->slots[0] == 0)
2363 goto out;
2364 path->slots[0]--;
2365 }
2366 if (ret != 0)
2367 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2368
2369 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2370 ret = 1;
2371 goto next;
2372 }
2373 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2374 struct btrfs_dir_log_item);
2375 found_end = btrfs_dir_log_end(path->nodes[0], item);
2376
2377 if (*start_ret >= key.offset && *start_ret <= found_end) {
2378 ret = 0;
2379 *start_ret = key.offset;
2380 *end_ret = found_end;
2381 goto out;
2382 }
2383 ret = 1;
2384 next:
2385 /* check the next slot in the tree to see if it is a valid item */
2386 nritems = btrfs_header_nritems(path->nodes[0]);
2387 path->slots[0]++;
2388 if (path->slots[0] >= nritems) {
2389 ret = btrfs_next_leaf(root, path);
2390 if (ret)
2391 goto out;
2392 }
2393
2394 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2395
2396 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2397 ret = 1;
2398 goto out;
2399 }
2400 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2401 struct btrfs_dir_log_item);
2402 found_end = btrfs_dir_log_end(path->nodes[0], item);
2403 *start_ret = key.offset;
2404 *end_ret = found_end;
2405 ret = 0;
2406 out:
2407 btrfs_release_path(path);
2408 return ret;
2409 }
2410
2411 /*
2412 * this looks for a given directory item in the log. If the directory
2413 * item is not in the log, the item is removed and the inode it points
2414 * to is unlinked
2415 */
2416 static noinline int check_item_in_log(struct walk_control *wc,
2417 struct btrfs_path *log_path,
2418 struct btrfs_inode *dir,
2419 struct btrfs_key *dir_key,
2420 bool force_remove)
2421 {
2422 struct btrfs_trans_handle *trans = wc->trans;
2423 struct btrfs_root *root = dir->root;
2424 int ret;
2425 struct extent_buffer *eb;
2426 int slot;
2427 struct btrfs_dir_item *di;
2428 struct fscrypt_str name = { 0 };
2429 struct btrfs_inode *inode = NULL;
2430 struct btrfs_key location;
2431
2432 /*
2433 * Currently we only log dir index keys. Even if we replay a log created
2434 * by an older kernel that logged both dir index and dir item keys, all
2435 * we need to do is process the dir index keys, we (and our caller) can
2436 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2437 */
2438 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type);
2439
2440 eb = wc->subvol_path->nodes[0];
2441 slot = wc->subvol_path->slots[0];
2442 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2443 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2444 if (ret) {
2445 btrfs_abort_log_replay(wc, ret,
2446 "failed to allocate name for dir %llu index %llu root %llu",
2447 btrfs_ino(dir), dir_key->offset,
2448 btrfs_root_id(root));
2449 goto out;
2450 }
2451
2452 if (!force_remove) {
2453 struct btrfs_dir_item *log_di;
2454
2455 log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
2456 dir_key->objectid,
2457 dir_key->offset, &name, 0);
2458 if (IS_ERR(log_di)) {
2459 ret = PTR_ERR(log_di);
2460 btrfs_abort_log_replay(wc, ret,
2461 "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
2462 btrfs_ino(dir), dir_key->offset,
2463 name.len, name.name,
2464 btrfs_root_id(root));
2465 goto out;
2466 } else if (log_di) {
2467 /* The dentry exists in the log, we have nothing to do. */
2468 ret = 0;
2469 goto out;
2470 }
2471 }
2472
2473 btrfs_dir_item_key_to_cpu(eb, di, &location);
2474 btrfs_release_path(wc->subvol_path);
2475 btrfs_release_path(log_path);
2476 inode = btrfs_iget_logging(location.objectid, root);
2477 if (IS_ERR(inode)) {
2478 ret = PTR_ERR(inode);
2479 inode = NULL;
2480 btrfs_abort_log_replay(wc, ret,
2481 "failed to lookup inode %llu root %llu",
2482 location.objectid, btrfs_root_id(root));
2483 goto out;
2484 }
2485
2486 ret = link_to_fixup_dir(wc, location.objectid);
2487 if (ret)
2488 goto out;
2489
2490 inc_nlink(&inode->vfs_inode);
2491 ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
2492 /*
2493 * Unlike dir item keys, dir index keys can only have one name (entry) in
2494 * them, as there are no key collisions since each key has a unique offset
2495 * (an index number), so we're done.
2496 */
2497 out:
2498 btrfs_release_path(wc->subvol_path);
2499 btrfs_release_path(log_path);
2500 kfree(name.name);
2501 if (inode)
2502 iput(&inode->vfs_inode);
2503 return ret;
2504 }
2505
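/*
 * Delete from the subvolume tree any xattrs of the current inode that are not
 * present in the log tree, so that after replay the inode's xattr set matches
 * what was last fsynced.
 */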
2506 static int replay_xattr_deletes(struct walk_control *wc)
2507 {
2508 struct btrfs_trans_handle *trans = wc->trans;
2509 struct btrfs_root *root = wc->root;
2510 struct btrfs_root *log = wc->log;
2511 struct btrfs_key search_key;
2512 BTRFS_PATH_AUTO_FREE(log_path);
2513 const u64 ino = wc->log_key.objectid;
2514 int nritems;
2515 int ret;
2516
2517 log_path = btrfs_alloc_path();
2518 if (!log_path) {
2519 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2520 return -ENOMEM;
2521 }
2522
2523 search_key.objectid = ino;
2524 search_key.type = BTRFS_XATTR_ITEM_KEY;
2525 search_key.offset = 0;
2526 again:
2527 ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
2528 if (ret < 0) {
2529 btrfs_abort_log_replay(wc, ret,
2530 "failed to search xattrs for inode %llu root %llu",
2531 ino, btrfs_root_id(root));
2532 goto out;
2533 }
2534 process_leaf:
2535 nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
2536 for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
2537 struct btrfs_key key;
2538 struct btrfs_dir_item *di;
2539 struct btrfs_dir_item *log_di;
2540 u32 total_size;
2541 u32 cur;
2542
2543 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
2544 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2545 ret = 0;
2546 goto out;
2547 }
2548
2549 di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
2550 total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
2551 cur = 0;
2552 while (cur < total_size) {
2553 u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
2554 u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
2555 u32 this_len = sizeof(*di) + name_len + data_len;
2556 char *name;
2557
2558 name = kmalloc(name_len, GFP_NOFS);
2559 if (!name) {
2560 ret = -ENOMEM;
2561 btrfs_abort_log_replay(wc, ret,
2562 "failed to allocate memory for name of length %u",
2563 name_len);
2564 goto out;
2565 }
2566 read_extent_buffer(wc->subvol_path->nodes[0], name,
2567 (unsigned long)(di + 1), name_len);
2568
2569 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2570 name, name_len, 0);
2571 btrfs_release_path(log_path);
2572 if (!log_di) {
2573 /* Doesn't exist in log tree, so delete it. */
2574 btrfs_release_path(wc->subvol_path);
2575 di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
2576 name, name_len, -1);
2577 if (IS_ERR(di)) {
2578 ret = PTR_ERR(di);
2579 btrfs_abort_log_replay(wc, ret,
2580 "failed to lookup xattr with name %.*s for inode %llu root %llu",
2581 name_len, name, ino,
2582 btrfs_root_id(root));
2583 kfree(name);
2584 goto out;
2585 }
2586 ASSERT(di);
2587 ret = btrfs_delete_one_dir_name(trans, root,
2588 wc->subvol_path, di);
2589 if (ret) {
2590 btrfs_abort_log_replay(wc, ret,
2591 "failed to delete xattr with name %.*s for inode %llu root %llu",
2592 name_len, name, ino,
2593 btrfs_root_id(root));
2594 kfree(name);
2595 goto out;
2596 }
2597 btrfs_release_path(wc->subvol_path);
2598 kfree(name);
2599 search_key = key;
2600 goto again;
2601 }
2602 if (IS_ERR(log_di)) {
2603 ret = PTR_ERR(log_di);
2604 btrfs_abort_log_replay(wc, ret,
2605 "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
2606 name_len, name, ino,
2607 btrfs_root_id(root));
2608 kfree(name);
2609 goto out;
2610 }
2611 kfree(name);
2612 cur += this_len;
2613 di = (struct btrfs_dir_item *)((char *)di + this_len);
2614 }
2615 }
2616 ret = btrfs_next_leaf(root, wc->subvol_path);
2617 if (ret > 0)
2618 ret = 0;
2619 else if (ret == 0)
2620 goto process_leaf;
2621 else
2622 btrfs_abort_log_replay(wc, ret,
2623 "failed to get next leaf in subvolume root %llu",
2624 btrfs_root_id(root));
2625 out:
2626 btrfs_release_path(wc->subvol_path);
2627 return ret;
2628 }
2629
2630
2631 /*
2632 * deletion replay happens before we copy any new directory items
2633 * out of the log or out of backreferences from inodes. It
2634 * scans the log to find ranges of keys that the log is authoritative for,
2635 * and then scans the directory to find items in those ranges that are
2636 * not present in the log.
2637 *
2638 * Anything we don't find in the log is unlinked and removed from the
2639 * directory.
2640 */
2641 static noinline int replay_dir_deletes(struct walk_control *wc,
2642 u64 dirid, bool del_all)
2643 {
2644 struct btrfs_root *root = wc->root;
2645 struct btrfs_root *log = (del_all ? NULL : wc->log);
2646 u64 range_start;
2647 u64 range_end;
2648 int ret = 0;
2649 struct btrfs_key dir_key;
2650 struct btrfs_key found_key;
2651 BTRFS_PATH_AUTO_FREE(log_path);
2652 struct btrfs_inode *dir;
2653
2654 dir_key.objectid = dirid;
2655 dir_key.type = BTRFS_DIR_INDEX_KEY;
2656 log_path = btrfs_alloc_path();
2657 if (!log_path) {
2658 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2659 return -ENOMEM;
2660 }
2661
2662 dir = btrfs_iget_logging(dirid, root);
2663 /*
2664 * It isn't an error if the inode isn't there, that can happen because
2665 * we replay the deletes before we copy in the inode item from the log.
2666 */
2667 if (IS_ERR(dir)) {
2668 ret = PTR_ERR(dir);
2669 if (ret == -ENOENT)
2670 ret = 0;
2671 else
2672 btrfs_abort_log_replay(wc, ret,
2673 "failed to lookup dir inode %llu root %llu",
2674 dirid, btrfs_root_id(root));
2675 return ret;
2676 }
2677
2678 range_start = 0;
2679 range_end = 0;
2680 while (1) {
2681 if (del_all)
2682 range_end = (u64)-1;
2683 else {
2684 ret = find_dir_range(log, wc->subvol_path, dirid,
2685 &range_start, &range_end);
2686 if (ret < 0) {
2687 btrfs_abort_log_replay(wc, ret,
2688 "failed to find range for dir %llu in log tree root %llu",
2689 dirid, btrfs_root_id(root));
2690 goto out;
2691 } else if (ret > 0) {
2692 break;
2693 }
2694 }
2695
2696 dir_key.offset = range_start;
2697 while (1) {
2698 int nritems;
2699 ret = btrfs_search_slot(NULL, root, &dir_key,
2700 wc->subvol_path, 0, 0);
2701 if (ret < 0) {
2702 btrfs_abort_log_replay(wc, ret,
2703 "failed to search root %llu for key " BTRFS_KEY_FMT,
2704 btrfs_root_id(root),
2705 BTRFS_KEY_FMT_VALUE(&dir_key));
2706 goto out;
2707 }
2708
2709 nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
2710 if (wc->subvol_path->slots[0] >= nritems) {
2711 ret = btrfs_next_leaf(root, wc->subvol_path);
2712 if (ret == 1) {
2713 break;
2714 } else if (ret < 0) {
2715 btrfs_abort_log_replay(wc, ret,
2716 "failed to get next leaf in subvolume root %llu",
2717 btrfs_root_id(root));
2718 goto out;
2719 }
2720 }
2721 btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
2722 wc->subvol_path->slots[0]);
2723 if (found_key.objectid != dirid ||
2724 found_key.type != dir_key.type) {
2725 ret = 0;
2726 goto out;
2727 }
2728
2729 if (found_key.offset > range_end)
2730 break;
2731
2732 ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
2733 if (ret)
2734 goto out;
2735 if (found_key.offset == (u64)-1)
2736 break;
2737 dir_key.offset = found_key.offset + 1;
2738 }
2739 btrfs_release_path(wc->subvol_path);
2740 if (range_end == (u64)-1)
2741 break;
2742 range_start = range_end + 1;
2743 }
2744 ret = 0;
2745 out:
2746 btrfs_release_path(wc->subvol_path);
2747 iput(&dir->vfs_inode);
2748 return ret;
2749 }
2750
2751 /*
2752 * the process_func used to replay items from the log tree. This
2753 * gets called in two different stages. The first stage just looks
2754 * for inodes and makes sure they are all copied into the subvolume.
2755 *
2756 * The second stage copies all the other item types from the log into
2757 * the subvolume. The two stage approach is slower, but gets rid of
2758 * lots of complexity around inodes referencing other inodes that exist
2759 * only in the log (references come from either directory items or inode
2760 * back refs).
2761 */
2762 static int replay_one_buffer(struct extent_buffer *eb,
2763 struct walk_control *wc, u64 gen, int level)
2764 {
2765 int nritems;
2766 struct btrfs_tree_parent_check check = {
2767 .transid = gen,
2768 .level = level
2769 };
2770 struct btrfs_root *root = wc->root;
2771 struct btrfs_trans_handle *trans = wc->trans;
2772 int ret;
2773
2774 if (level != 0)
2775 return 0;
2776
2777 /*
2778 * Set to NULL since it was not yet read and in case we abort log replay
2779 * on error, we have no valid log tree leaf to dump.
2780 */
2781 wc->log_leaf = NULL;
2782 ret = btrfs_read_extent_buffer(eb, &check);
2783 if (ret) {
2784 btrfs_abort_log_replay(wc, ret,
2785 "failed to read log tree leaf %llu for root %llu",
2786 eb->start, btrfs_root_id(root));
2787 return ret;
2788 }
2789
2790 ASSERT(wc->subvol_path == NULL);
2791 wc->subvol_path = btrfs_alloc_path();
2792 if (!wc->subvol_path) {
2793 btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
2794 return -ENOMEM;
2795 }
2796
2797 wc->log_leaf = eb;
2798
2799 nritems = btrfs_header_nritems(eb);
2800 for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
2801 struct btrfs_inode_item *inode_item;
2802
2803 btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
2804
2805 if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
2806 inode_item = btrfs_item_ptr(eb, wc->log_slot,
2807 struct btrfs_inode_item);
2808 /*
2809 * An inode with no links is either:
2810 *
2811 * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
2812 * got linked before the fsync, skip it, as replaying
2813 * it is pointless since it would be deleted later.
2814 * We skip logging tmpfiles, but it's always possible
2815 * we are replaying a log created with a kernel that
2816 * used to log tmpfiles;
2817 *
2818 * 2) A non-tmpfile which got its last link deleted
2819 * while holding an open fd on it and later got
2820 * fsynced through that fd. We always log the
2821 * parent inodes when inode->last_unlink_trans is
2822 * set to the current transaction, so ignore all the
2823 * inode items for this inode. We will delete the
2824 * inode when processing the parent directory with
2825 * replay_dir_deletes().
2826 */
2827 if (btrfs_inode_nlink(eb, inode_item) == 0) {
2828 wc->ignore_cur_inode = true;
2829 continue;
2830 } else {
2831 wc->ignore_cur_inode = false;
2832 }
2833 }
2834
2835 /* Inode keys are done during the first stage. */
2836 if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
2837 wc->stage == LOG_WALK_REPLAY_INODES) {
2838 u32 mode;
2839
2840 ret = replay_xattr_deletes(wc);
2841 if (ret)
2842 break;
2843 mode = btrfs_inode_mode(eb, inode_item);
2844 if (S_ISDIR(mode)) {
2845 ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
2846 if (ret)
2847 break;
2848 }
2849 ret = overwrite_item(wc);
2850 if (ret)
2851 break;
2852
2853 /*
2854 * Before replaying extents, truncate the inode to its
2855 * size. We need to do it now and not after log replay
2856 * because before an fsync we can have prealloc extents
2857 * added beyond the inode's i_size. If we did it after,
2858 * through orphan cleanup for example, we would drop
2859 * those prealloc extents just after replaying them.
2860 */
2861 if (S_ISREG(mode)) {
2862 struct btrfs_drop_extents_args drop_args = { 0 };
2863 struct btrfs_inode *inode;
2864 u64 from;
2865
2866 inode = btrfs_iget_logging(wc->log_key.objectid, root);
2867 if (IS_ERR(inode)) {
2868 ret = PTR_ERR(inode);
2869 btrfs_abort_log_replay(wc, ret,
2870 "failed to lookup inode %llu root %llu",
2871 wc->log_key.objectid,
2872 btrfs_root_id(root));
2873 break;
2874 }
2875 from = ALIGN(i_size_read(&inode->vfs_inode),
2876 root->fs_info->sectorsize);
2877 drop_args.start = from;
2878 drop_args.end = (u64)-1;
2879 drop_args.drop_cache = true;
2880 drop_args.path = wc->subvol_path;
2881 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2882 if (ret) {
2883 btrfs_abort_log_replay(wc, ret,
2884 "failed to drop extents for inode %llu root %llu offset %llu",
2885 btrfs_ino(inode),
2886 btrfs_root_id(root),
2887 from);
2888 } else {
2889 inode_sub_bytes(&inode->vfs_inode,
2890 drop_args.bytes_found);
2891 /* Update the inode's nbytes. */
2892 ret = btrfs_update_inode(trans, inode);
2893 if (ret)
2894 btrfs_abort_log_replay(wc, ret,
2895 "failed to update inode %llu root %llu",
2896 btrfs_ino(inode),
2897 btrfs_root_id(root));
2898 }
2899 iput(&inode->vfs_inode);
2900 if (ret)
2901 break;
2902 }
2903
2904 ret = link_to_fixup_dir(wc, wc->log_key.objectid);
2905 if (ret)
2906 break;
2907 }
2908
2909 if (wc->ignore_cur_inode)
2910 continue;
2911
2912 if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
2913 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2914 ret = replay_one_dir_item(wc);
2915 if (ret)
2916 break;
2917 }
2918
2919 if (wc->stage < LOG_WALK_REPLAY_ALL)
2920 continue;
2921
2922 /* these keys are simply copied */
2923 if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
2924 ret = overwrite_item(wc);
2925 if (ret)
2926 break;
2927 } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
2928 wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
2929 ret = add_inode_ref(wc);
2930 if (ret)
2931 break;
2932 } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
2933 ret = replay_one_extent(wc);
2934 if (ret)
2935 break;
2936 }
2937 /*
2938 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2939 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2940 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2941 * older kernel with such keys, ignore them.
2942 */
2943 }
2944 btrfs_free_path(wc->subvol_path);
2945 wc->subvol_path = NULL;
2946 return ret;
2947 }
2948
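/*
 * Clear the dirty state of a log tree buffer that is being freed. When called
 * with a transaction handle the extent is pinned so its space is not reused
 * until the transaction commits; without one, the reserved space is returned
 * directly to the block group and space_info counters.
 */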
2949 static int clean_log_buffer(struct btrfs_trans_handle *trans,
2950 struct extent_buffer *eb)
2951 {
2952 struct btrfs_fs_info *fs_info = eb->fs_info;
2953 struct btrfs_block_group *bg;
2954
2955 btrfs_tree_lock(eb);
2956 btrfs_clear_buffer_dirty(trans, eb);
2957 wait_on_extent_buffer_writeback(eb);
2958 btrfs_tree_unlock(eb);
2959
2960 if (trans) {
2961 int ret;
2962
2963 ret = btrfs_pin_reserved_extent(trans, eb);
2964 if (ret)
2965 btrfs_abort_transaction(trans, ret);
2966 return ret;
2967 }
2968
2969 bg = btrfs_lookup_block_group(fs_info, eb->start);
2970 if (!bg) {
2971 btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
2972 btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
2973 return -ENOENT;
2974 }
2975
2976 spin_lock(&bg->space_info->lock);
2977 spin_lock(&bg->lock);
2978 bg->reserved -= fs_info->nodesize;
2979 bg->space_info->bytes_reserved -= fs_info->nodesize;
2980 spin_unlock(&bg->lock);
2981 spin_unlock(&bg->space_info->lock);
2982
2983 btrfs_put_block_group(bg);
2984
2985 return 0;
2986 }
2987
2988 static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
2989 struct walk_control *wc)
2990 {
2991 struct btrfs_trans_handle *trans = wc->trans;
2992 struct btrfs_fs_info *fs_info = wc->log->fs_info;
2993 u64 bytenr;
2994 u64 ptr_gen;
2995 struct extent_buffer *next;
2996 struct extent_buffer *cur;
2997 int ret = 0;
2998
2999 while (*level > 0) {
3000 struct btrfs_tree_parent_check check = { 0 };
3001
3002 cur = path->nodes[*level];
3003
3004 WARN_ON(btrfs_header_level(cur) != *level);
3005
3006 if (path->slots[*level] >=
3007 btrfs_header_nritems(cur))
3008 break;
3009
3010 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3011 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3012 check.transid = ptr_gen;
3013 check.level = *level - 1;
3014 check.has_first_key = true;
3015 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
3016
3017 next = btrfs_find_create_tree_block(fs_info, bytenr,
3018 btrfs_header_owner(cur),
3019 *level - 1);
3020 if (IS_ERR(next)) {
3021 ret = PTR_ERR(next);
3022 if (trans)
3023 btrfs_abort_transaction(trans, ret);
3024 else
3025 btrfs_handle_fs_error(fs_info, ret, NULL);
3026 return ret;
3027 }
3028
3029 if (*level == 1) {
3030 ret = wc->process_func(next, wc, ptr_gen, *level - 1);
3031 if (ret) {
3032 free_extent_buffer(next);
3033 return ret;
3034 }
3035
3036 path->slots[*level]++;
3037 if (wc->free) {
3038 ret = btrfs_read_extent_buffer(next, &check);
3039 if (ret) {
3040 free_extent_buffer(next);
3041 if (trans)
3042 btrfs_abort_transaction(trans, ret);
3043 else
3044 btrfs_handle_fs_error(fs_info, ret, NULL);
3045 return ret;
3046 }
3047
3048 ret = clean_log_buffer(trans, next);
3049 if (ret) {
3050 free_extent_buffer(next);
3051 return ret;
3052 }
3053 }
3054 free_extent_buffer(next);
3055 continue;
3056 }
3057 ret = btrfs_read_extent_buffer(next, &check);
3058 if (ret) {
3059 free_extent_buffer(next);
3060 if (trans)
3061 btrfs_abort_transaction(trans, ret);
3062 else
3063 btrfs_handle_fs_error(fs_info, ret, NULL);
3064 return ret;
3065 }
3066
3067 if (path->nodes[*level-1])
3068 free_extent_buffer(path->nodes[*level-1]);
3069 path->nodes[*level-1] = next;
3070 *level = btrfs_header_level(next);
3071 path->slots[*level] = 0;
3072 cond_resched();
3073 }
3074 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
3075
3076 cond_resched();
3077 return 0;
3078 }
3079
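/*
 * Walk back up the log tree after walk_down_log_tree() has exhausted a
 * subtree: process (and, if wc->free is set, clean up) every node whose slots
 * are fully consumed, then step to the next sibling. Returns 0 when there is
 * another subtree to descend into and 1 once the whole tree has been walked.
 */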
3080 static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
3081 struct walk_control *wc)
3082 {
3083 int i;
3084 int slot;
3085 int ret;
3086
3087 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
3088 slot = path->slots[i];
3089 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
3090 path->slots[i]++;
3091 *level = i;
3092 WARN_ON(*level == 0);
3093 return 0;
3094 } else {
3095 ret = wc->process_func(path->nodes[*level], wc,
3096 btrfs_header_generation(path->nodes[*level]),
3097 *level);
3098 if (ret)
3099 return ret;
3100
3101 if (wc->free) {
3102 ret = clean_log_buffer(wc->trans, path->nodes[*level]);
3103 if (ret)
3104 return ret;
3105 }
3106 free_extent_buffer(path->nodes[*level]);
3107 path->nodes[*level] = NULL;
3108 *level = i + 1;
3109 }
3110 }
3111 return 1;
3112 }
3113
3114 /*
3115 * Walk the log tree rooted at wc->log, calling wc->process_func on each
3116 * buffer and, when wc->free is set, cleaning up the tree's blocks as they
3117 * are visited.
3118 */
3119 static int walk_log_tree(struct walk_control *wc)
3120 {
3121 struct btrfs_root *log = wc->log;
3122 int ret = 0;
3123 int wret;
3124 int level;
3125 BTRFS_PATH_AUTO_FREE(path);
3126 int orig_level;
3127
3128 path = btrfs_alloc_path();
3129 if (!path)
3130 return -ENOMEM;
3131
3132 level = btrfs_header_level(log->node);
3133 orig_level = level;
3134 path->nodes[level] = log->node;
3135 refcount_inc(&log->node->refs);
3136 path->slots[level] = 0;
3137
3138 while (1) {
3139 wret = walk_down_log_tree(path, &level, wc);
3140 if (wret > 0)
3141 break;
3142 if (wret < 0)
3143 return wret;
3144
3145 wret = walk_up_log_tree(path, &level, wc);
3146 if (wret > 0)
3147 break;
3148 if (wret < 0)
3149 return wret;
3150 }
3151
3152 /* was the root node processed? if not, catch it here */
3153 if (path->nodes[orig_level]) {
3154 ret = wc->process_func(path->nodes[orig_level], wc,
3155 btrfs_header_generation(path->nodes[orig_level]),
3156 orig_level);
3157 if (ret)
3158 return ret;
3159 if (wc->free)
3160 ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
3161 }
3162
3163 return ret;
3164 }
3165
3166 /*
3167 * helper function to update the item for a given subvolumes log root
3168 * in the tree of log roots
3169 */
3170 static int update_log_root(struct btrfs_trans_handle *trans,
3171 struct btrfs_root *log,
3172 struct btrfs_root_item *root_item)
3173 {
3174 struct btrfs_fs_info *fs_info = log->fs_info;
3175 int ret;
3176
3177 if (log->log_transid == 1) {
3178 /* insert root item on the first sync */
3179 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
3180 &log->root_key, root_item);
3181 } else {
3182 ret = btrfs_update_root(trans, fs_info->log_root_tree,
3183 &log->root_key, root_item);
3184 }
3185 return ret;
3186 }
3187
3188 static void wait_log_commit(struct btrfs_root *root, int transid)
3189 {
3190 DEFINE_WAIT(wait);
3191 int index = transid % 2;
3192
3193 /*
3194 * we only allow two pending log transactions at a time,
3195 * so we know that if ours is more than 2 older than the
3196 * current transaction, we're done
3197 */
3198 for (;;) {
3199 prepare_to_wait(&root->log_commit_wait[index],
3200 &wait, TASK_UNINTERRUPTIBLE);
3201
3202 if (!(root->log_transid_committed < transid &&
3203 atomic_read(&root->log_commit[index])))
3204 break;
3205
3206 mutex_unlock(&root->log_mutex);
3207 schedule();
3208 mutex_lock(&root->log_mutex);
3209 }
3210 finish_wait(&root->log_commit_wait[index], &wait);
3211 }
3212
3213 static void wait_for_writer(struct btrfs_root *root)
3214 {
3215 DEFINE_WAIT(wait);
3216
3217 for (;;) {
3218 prepare_to_wait(&root->log_writer_wait, &wait,
3219 TASK_UNINTERRUPTIBLE);
3220 if (!atomic_read(&root->log_writers))
3221 break;
3222
3223 mutex_unlock(&root->log_mutex);
3224 schedule();
3225 mutex_lock(&root->log_mutex);
3226 }
3227 finish_wait(&root->log_writer_wait, &wait);
3228 }
3229
3230 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
3231 {
3232 ctx->log_ret = 0;
3233 ctx->log_transid = 0;
3234 ctx->log_new_dentries = false;
3235 ctx->logging_new_name = false;
3236 ctx->logging_new_delayed_dentries = false;
3237 ctx->logged_before = false;
3238 ctx->inode = inode;
3239 INIT_LIST_HEAD(&ctx->list);
3240 INIT_LIST_HEAD(&ctx->ordered_extents);
3241 INIT_LIST_HEAD(&ctx->conflict_inodes);
3242 ctx->num_conflict_inodes = 0;
3243 ctx->logging_conflict_inodes = false;
3244 ctx->scratch_eb = NULL;
3245 }
3246
3247 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
3248 {
3249 struct btrfs_inode *inode = ctx->inode;
3250
3251 if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3252 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
3253 return;
3254
3255 /*
3256 * Don't care about allocation failure. This is just for optimization,
3257 * if we fail to allocate here, we will try again later if needed.
3258 */
3259 ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
3260 }
3261
3262 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
3263 {
3264 struct btrfs_ordered_extent *ordered;
3265 struct btrfs_ordered_extent *tmp;
3266
3267 btrfs_assert_inode_locked(ctx->inode);
3268
3269 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
3270 list_del_init(&ordered->log_list);
3271 btrfs_put_ordered_extent(ordered);
3272 }
3273 }
3274
3275
3276 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3277 struct btrfs_log_ctx *ctx)
3278 {
3279 mutex_lock(&root->log_mutex);
3280 list_del_init(&ctx->list);
3281 mutex_unlock(&root->log_mutex);
3282 }
3283
3284 /*
3285 * Must be invoked while holding the log mutex, or when it is certain that
3286 * no other task can access the list.
3287 */
3288 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3289 int index, int error)
3290 {
3291 struct btrfs_log_ctx *ctx;
3292 struct btrfs_log_ctx *safe;
3293
3294 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3295 list_del_init(&ctx->list);
3296 ctx->log_ret = error;
3297 }
3298 }
3299
3300 /*
3301 * Sends a given tree log down to the disk and updates the super blocks to
3302 * record it. When this call is done, you know that any inodes previously
3303 * logged are safely on disk only if it returns 0.
3304 *
3305 * Any other return value means you need to call btrfs_commit_transaction.
3306 * Some of the edge cases for fsyncing directories that have had unlinks
3307 * or renames done in the past mean that sometimes the only safe
3308 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3309 * that has happened.
3310 */
3311 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3312 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3313 {
3314 int index1;
3315 int index2;
3316 int mark;
3317 int ret;
3318 struct btrfs_fs_info *fs_info = root->fs_info;
3319 struct btrfs_root *log = root->log_root;
3320 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3321 struct btrfs_root_item new_root_item;
3322 int log_transid = 0;
3323 struct btrfs_log_ctx root_log_ctx;
3324 struct blk_plug plug;
3325 u64 log_root_start;
3326 u64 log_root_level;
3327
3328 mutex_lock(&root->log_mutex);
3329 log_transid = ctx->log_transid;
3330 if (root->log_transid_committed >= log_transid) {
3331 mutex_unlock(&root->log_mutex);
3332 return ctx->log_ret;
3333 }
3334
3335 index1 = log_transid % 2;
3336 if (atomic_read(&root->log_commit[index1])) {
3337 wait_log_commit(root, log_transid);
3338 mutex_unlock(&root->log_mutex);
3339 return ctx->log_ret;
3340 }
3341 ASSERT(log_transid == root->log_transid,
3342 "log_transid=%d root->log_transid=%d", log_transid, root->log_transid);
3343 atomic_set(&root->log_commit[index1], 1);
3344
3345 /* wait for previous tree log sync to complete */
3346 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3347 wait_log_commit(root, log_transid - 1);
3348
3349 while (1) {
3350 int batch = atomic_read(&root->log_batch);
3351 /* when we're on an ssd, just kick the log commit out */
3352 if (!btrfs_test_opt(fs_info, SSD) &&
3353 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3354 mutex_unlock(&root->log_mutex);
3355 schedule_timeout_uninterruptible(1);
3356 mutex_lock(&root->log_mutex);
3357 }
3358 wait_for_writer(root);
3359 if (batch == atomic_read(&root->log_batch))
3360 break;
3361 }
3362
3363 /* bail out if we need to do a full commit */
3364 if (btrfs_need_log_full_commit(trans)) {
3365 ret = BTRFS_LOG_FORCE_COMMIT;
3366 mutex_unlock(&root->log_mutex);
3367 goto out;
3368 }
3369
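/*
 * Each of the two in-flight log transactions uses its own dirty-pages mark
 * (based on the transid parity), so their writeback can be started and
 * waited for independently.
 */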
3370 if (log_transid % 2 == 0)
3371 mark = EXTENT_DIRTY_LOG1;
3372 else
3373 mark = EXTENT_DIRTY_LOG2;
3374
3375 /* we start IO on all the marked extents here, but we don't actually
3376 * wait for them until later.
3377 */
3378 blk_start_plug(&plug);
3379 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3380 /*
3381 * -EAGAIN happens when someone, e.g., a concurrent transaction
3382 * commit, writes a dirty extent in this tree-log commit. This
3383 * concurrent write will create a hole writing out the extents,
3384 * and we cannot proceed on a zoned filesystem, requiring
3385 * sequential writing. While we could bail out to a full commit
3386 * here, we can instead continue, hoping the concurrent writing
3387 * fills the hole.
3388 */
3389 if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3390 ret = 0;
3391 if (ret) {
3392 blk_finish_plug(&plug);
3393 btrfs_set_log_full_commit(trans);
3394 mutex_unlock(&root->log_mutex);
3395 goto out;
3396 }
3397
3398 /*
3399 * We _must_ update under the root->log_mutex in order to make sure we
3400 * have a consistent view of the log root we are trying to commit at
3401 * this moment.
3402 *
3403 * We _must_ copy this into a local copy, because we are not holding the
3404 * log_root_tree->log_mutex yet. This is important because when we
3405 * commit the log_root_tree we must have a consistent view of the
3406 * log_root_tree when we update the super block to point at the
3407 * log_root_tree bytenr. If we update the log_root_tree here we'll race
3408 * with the commit and possibly point at the new block which we may not
3409 * have written out.
3410 */
3411 btrfs_set_root_node(&log->root_item, log->node);
3412 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3413
3414 btrfs_set_root_log_transid(root, root->log_transid + 1);
3415 log->log_transid = root->log_transid;
3416 root->log_start_pid = 0;
3417 /*
3418 * IO has been started, blocks of the log tree have WRITTEN flag set
3419 * in their headers. New modifications of the log will be written to
3420 * new positions, so it's safe to allow log writers to go in.
3421 */
3422 mutex_unlock(&root->log_mutex);
3423
3424 if (btrfs_is_zoned(fs_info)) {
3425 mutex_lock(&fs_info->tree_root->log_mutex);
3426 if (!log_root_tree->node) {
3427 ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3428 if (ret) {
3429 mutex_unlock(&fs_info->tree_root->log_mutex);
3430 blk_finish_plug(&plug);
3431 goto out;
3432 }
3433 }
3434 mutex_unlock(&fs_info->tree_root->log_mutex);
3435 }
3436
3437 btrfs_init_log_ctx(&root_log_ctx, NULL);
3438
3439 mutex_lock(&log_root_tree->log_mutex);
3440
3441 index2 = log_root_tree->log_transid % 2;
3442 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3443 root_log_ctx.log_transid = log_root_tree->log_transid;
3444
3445 /*
3446 * Now we are safe to update the log_root_tree because we're under the
3447 * log_mutex, and we're a current writer so we're holding the commit
3448 * open until we drop the log_mutex.
3449 */
3450 ret = update_log_root(trans, log, &new_root_item);
3451 if (ret) {
3452 list_del_init(&root_log_ctx.list);
3453 blk_finish_plug(&plug);
3454 btrfs_set_log_full_commit(trans);
3455 if (ret != -ENOSPC)
3456 btrfs_err(fs_info,
3457 "failed to update log for root %llu ret %d",
3458 btrfs_root_id(root), ret);
3459 btrfs_wait_tree_log_extents(log, mark);
3460 mutex_unlock(&log_root_tree->log_mutex);
3461 goto out;
3462 }
3463
3464 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3465 blk_finish_plug(&plug);
3466 list_del_init(&root_log_ctx.list);
3467 mutex_unlock(&log_root_tree->log_mutex);
3468 ret = root_log_ctx.log_ret;
3469 goto out;
3470 }
3471
3472 if (atomic_read(&log_root_tree->log_commit[index2])) {
3473 blk_finish_plug(&plug);
3474 ret = btrfs_wait_tree_log_extents(log, mark);
3475 wait_log_commit(log_root_tree,
3476 root_log_ctx.log_transid);
3477 mutex_unlock(&log_root_tree->log_mutex);
3478 if (!ret)
3479 ret = root_log_ctx.log_ret;
3480 goto out;
3481 }
3482 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid,
3483 "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d",
3484 root_log_ctx.log_transid, log_root_tree->log_transid);
3485 atomic_set(&log_root_tree->log_commit[index2], 1);
3486
3487 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3488 wait_log_commit(log_root_tree,
3489 root_log_ctx.log_transid - 1);
3490 }
3491
3492 /*
3493 * now that we've moved on to the tree of log tree roots,
3494 * check the full commit flag again
3495 */
3496 if (btrfs_need_log_full_commit(trans)) {
3497 blk_finish_plug(&plug);
3498 btrfs_wait_tree_log_extents(log, mark);
3499 mutex_unlock(&log_root_tree->log_mutex);
3500 ret = BTRFS_LOG_FORCE_COMMIT;
3501 goto out_wake_log_root;
3502 }
3503
3504 ret = btrfs_write_marked_extents(fs_info,
3505 &log_root_tree->dirty_log_pages,
3506 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3507 blk_finish_plug(&plug);
3508 /*
3509 * As described above, -EAGAIN indicates a hole in the extents. We
3510 * cannot wait for those writeouts since waiting for them would cause
3511 * a deadlock. Bail out to a full transaction commit instead.
3512 */
3513 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3514 btrfs_set_log_full_commit(trans);
3515 btrfs_wait_tree_log_extents(log, mark);
3516 mutex_unlock(&log_root_tree->log_mutex);
3517 goto out_wake_log_root;
3518 } else if (ret) {
3519 btrfs_set_log_full_commit(trans);
3520 mutex_unlock(&log_root_tree->log_mutex);
3521 goto out_wake_log_root;
3522 }
3523 ret = btrfs_wait_tree_log_extents(log, mark);
3524 if (!ret)
3525 ret = btrfs_wait_tree_log_extents(log_root_tree,
3526 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3527 if (ret) {
3528 btrfs_set_log_full_commit(trans);
3529 mutex_unlock(&log_root_tree->log_mutex);
3530 goto out_wake_log_root;
3531 }
3532
3533 log_root_start = log_root_tree->node->start;
3534 log_root_level = btrfs_header_level(log_root_tree->node);
3535 log_root_tree->log_transid++;
3536 mutex_unlock(&log_root_tree->log_mutex);
3537
3538 /*
3539 * Here we are guaranteed that nobody is going to write the superblock
3540 * for the current transaction before us, and that we will not write
3541 * our superblock before the previous transaction finishes its commit
3542 * and writes its superblock, because:
3543 *
3544 * 1) We are holding a handle on the current transaction, so nobody
3545 * can commit it until we release the handle;
3546 *
3547 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3548 * if the previous transaction is still committing, and hasn't yet
3549 * written its superblock, we wait for it to do it, because a
3550 * transaction commit acquires the tree_log_mutex when the commit
3551 * begins and releases it only after writing its superblock.
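*
* A rough illustration (transaction numbers are made up): if transaction
* N-1 is still committing and already holds tree_log_mutex, the
* mutex_lock() below blocks until N-1 has written its superblock:
*
*   Task A (commit of transaction N-1)   Task B (this log sync, for N)
*     mutex_lock(tree_log_mutex)
*     write superblock for N-1             mutex_lock(tree_log_mutex) blocks
*     mutex_unlock(tree_log_mutex)         acquires it and writes the
*                                          superblock pointing to the new
*                                          log root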
3552 */
3553 mutex_lock(&fs_info->tree_log_mutex);
3554
3555 /*
3556 * The previous transaction writeout phase could have failed, and thus
3557 * marked the fs in an error state. We must not commit here, as we
3558 * could have updated our generation in the super_for_commit and
3559 * writing the super here would result in transid mismatches. If there
3560 * is an error here just bail.
3561 */
3562 if (BTRFS_FS_ERROR(fs_info)) {
3563 ret = -EIO;
3564 btrfs_set_log_full_commit(trans);
3565 btrfs_abort_transaction(trans, ret);
3566 mutex_unlock(&fs_info->tree_log_mutex);
3567 goto out_wake_log_root;
3568 }
3569
3570 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3571 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3572 ret = write_all_supers(fs_info, 1);
3573 mutex_unlock(&fs_info->tree_log_mutex);
3574 if (unlikely(ret)) {
3575 btrfs_set_log_full_commit(trans);
3576 btrfs_abort_transaction(trans, ret);
3577 goto out_wake_log_root;
3578 }
3579
3580 /*
3581 * We know there can only be one task here, since we have not yet set
3582 * root->log_commit[index1] to 0 and any task attempting to sync the
3583 * log must wait for the previous log transaction to commit if it's
3584 * still in progress or wait for the current log transaction commit if
3585 * someone else already started it. We use <= and not < because the
3586 * first log transaction has an ID of 0.
3587 */
3588 ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid,
3589 "last_log_commit(root)=%d log_transid=%d",
3590 btrfs_get_root_last_log_commit(root), log_transid);
3591 btrfs_set_root_last_log_commit(root, log_transid);
3592
3593 out_wake_log_root:
3594 mutex_lock(&log_root_tree->log_mutex);
3595 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3596
3597 log_root_tree->log_transid_committed++;
3598 atomic_set(&log_root_tree->log_commit[index2], 0);
3599 mutex_unlock(&log_root_tree->log_mutex);
3600
3601 /*
3602 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3603 * all the updates above are seen by the woken threads. It might not be
3604 * necessary, but proving that seems to be hard.
3605 */
3606 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3607 out:
3608 mutex_lock(&root->log_mutex);
3609 btrfs_remove_all_log_ctxs(root, index1, ret);
3610 root->log_transid_committed++;
3611 atomic_set(&root->log_commit[index1], 0);
3612 mutex_unlock(&root->log_mutex);
3613
3614 /*
3615 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3616 * all the updates above are seen by the woken threads. It might not be
3617 * necessary, but proving that seems to be hard.
3618 */
3619 cond_wake_up(&root->log_commit_wait[index1]);
3620 return ret;
3621 }
3622
3623 static void free_log_tree(struct btrfs_trans_handle *trans,
3624 struct btrfs_root *log)
3625 {
3626 int ret;
3627 struct walk_control wc = {
3628 .free = true,
3629 .process_func = process_one_buffer,
3630 .log = log,
3631 .trans = trans,
3632 };
3633
3634 if (log->node) {
3635 ret = walk_log_tree(&wc);
3636 if (ret) {
3637 /*
3638 * We weren't able to traverse the entire log tree, the
3639 * typical scenario is getting an -EIO when reading an
3640 * extent buffer of the tree, due to a previous writeback
3641 * failure of it.
3642 */
3643 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3644 &log->fs_info->fs_state);
3645
3646 /*
3647 * Some extent buffers of the log tree may still be dirty
3648 * and not yet written back to storage, because we may
3649 * have updates to a log tree without syncing it,
3650 * such as during rename and link operations. So flush
3651 * them out and wait for their writeback to complete, so
3652 * that we properly cleanup their state and pages.
3653 */
3654 btrfs_write_marked_extents(log->fs_info,
3655 &log->dirty_log_pages,
3656 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3657 btrfs_wait_tree_log_extents(log,
3658 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
3659
3660 if (trans)
3661 btrfs_abort_transaction(trans, ret);
3662 else
3663 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3664 }
3665 }
3666
3667 btrfs_extent_io_tree_release(&log->dirty_log_pages);
3668 btrfs_extent_io_tree_release(&log->log_csum_range);
3669
3670 btrfs_put_root(log);
3671 }
3672
3673 /*
3674 * free all the extents used by the tree log. This should be called
3675 * at commit time of the full transaction
3676 */
3677 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3678 {
3679 if (root->log_root) {
3680 free_log_tree(trans, root->log_root);
3681 root->log_root = NULL;
3682 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3683 }
3684 return 0;
3685 }
3686
3687 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3688 struct btrfs_fs_info *fs_info)
3689 {
3690 if (fs_info->log_root_tree) {
3691 free_log_tree(trans, fs_info->log_root_tree);
3692 fs_info->log_root_tree = NULL;
3693 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3694 }
3695 return 0;
3696 }
3697
3698 static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
3699 struct btrfs_inode *inode)
3700 {
3701 bool ret = false;
3702
3703 /*
3704 * Do this only if ->logged_trans is still 0 to prevent races with
3705 * concurrent logging as we may see the inode not logged when
3706 * inode_logged() is called but it gets logged after inode_logged() did
3707 * not find it in the log tree and we end up setting ->logged_trans to a
3708 * value less than trans->transid after the concurrent logging task has
3709 * set it to trans->transid. As a consequence, subsequent rename, unlink
3710 * and link operations may end up not logging new names and removing old
3711 * names from the log.
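*
* A sketch of the race this avoids (the interleaving is illustrative):
*
*   Task A (inode_logged)                Task B (btrfs_log_inode)
*     sees ->logged_trans == 0
*     searches the log tree, not found
*                                          logs the inode
*                                          sets ->logged_trans = transid
*     without the check below it would
*     overwrite ->logged_trans with
*     transid - 1, hiding the logged inode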
3712 */
3713 spin_lock(&inode->lock);
3714 if (inode->logged_trans == 0)
3715 inode->logged_trans = trans->transid - 1;
3716 else if (inode->logged_trans == trans->transid)
3717 ret = true;
3718 spin_unlock(&inode->lock);
3719
3720 return ret;
3721 }
3722
3723 /*
3724 * Check if an inode was logged in the current transaction. This correctly deals
3725 * with the case where the inode was logged but has a logged_trans of 0, which
3726 * happens if the inode is evicted and loaded again, as logged_trans is an
3727 * in-memory only field (not persisted).
3728 *
3729 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3730 * and < 0 on error.
3731 */
3732 static int inode_logged(const struct btrfs_trans_handle *trans,
3733 struct btrfs_inode *inode,
3734 struct btrfs_path *path_in)
3735 {
3736 struct btrfs_path *path = path_in;
3737 struct btrfs_key key;
3738 int ret;
3739
3740 /*
3741 * Quick lockless call, since once ->logged_trans is set to the current
3742 * transaction, we never set it to a lower value anywhere else.
3743 */
3744 if (data_race(inode->logged_trans) == trans->transid)
3745 return 1;
3746
3747 /*
3748 * If logged_trans is not 0 and not trans->transid, then we know the
3749 * inode was not logged in this transaction, so we can return false
3750 * right away. We take the lock to avoid a race caused by load/store
3751 * tearing with a concurrent btrfs_log_inode() call or a concurrent task
3752 * in this function further below - an update to trans->transid can be
3753 * torn into two 32-bit updates for example, in which case we could
3754 * see a positive value that is not trans->transid and assume the inode
3755 * was not logged when it was.
3756 */
3757 spin_lock(&inode->lock);
3758 if (inode->logged_trans == trans->transid) {
3759 spin_unlock(&inode->lock);
3760 return 1;
3761 } else if (inode->logged_trans > 0) {
3762 spin_unlock(&inode->lock);
3763 return 0;
3764 }
3765 spin_unlock(&inode->lock);
3766
3767 /*
3768 * If no log tree was created for this root in this transaction, then
3769 * the inode can not have been logged in this transaction. In that case
3770 * set logged_trans to anything greater than 0 and less than the current
3771 * transaction's ID, to avoid the search below in a future call in case
3772 * a log tree gets created after this.
3773 */
3774 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
3775 return mark_inode_as_not_logged(trans, inode);
3776
3777 /*
3778 * We have a log tree and the inode's logged_trans is 0. We can't tell
3779 * for sure if the inode was logged before in this transaction by looking
3780 * only at logged_trans. We could be pessimistic and assume it was, but
3781 * that can lead to unnecessarily logging an inode during rename and link
3782 * operations, and then further updating the log in followup rename and
3783 * link operations, specially if it's a directory, which adds latency
3784 * visible to applications doing a series of rename or link operations.
3785 *
3786 * A logged_trans of 0 here can mean several things:
3787 *
3788 * 1) The inode was never logged since the filesystem was mounted, and may
3789 * or may not have been evicted and loaded again;
3790 *
3791 * 2) The inode was logged in a previous transaction, then evicted and
3792 * then loaded again;
3793 *
3794 * 3) The inode was logged in the current transaction, then evicted and
3795 * then loaded again.
3796 *
3797 * For cases 1) and 2) we don't want to return true, but we need to detect
3798 * case 3) and return true. So we do a search in the log root for the inode
3799 * item.
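*
* An example sequence that hits case 3) (commands are illustrative):
*
*   xfs_io -c "fsync" foo   - inode logged, logged_trans = current transid
*   <inode evicted under memory pressure>   - logged_trans reset to 0
*   mv foo bar              - we must detect the inode is in the log so
*                             that the rename updates the logged names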
3800 */
3801 key.objectid = btrfs_ino(inode);
3802 key.type = BTRFS_INODE_ITEM_KEY;
3803 key.offset = 0;
3804
3805 if (!path) {
3806 path = btrfs_alloc_path();
3807 if (!path)
3808 return -ENOMEM;
3809 }
3810
3811 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3812
3813 if (path_in)
3814 btrfs_release_path(path);
3815 else
3816 btrfs_free_path(path);
3817
3818 /*
3819 * Logging an inode always results in logging its inode item. So if we
3820 * did not find the item we know the inode was not logged for sure.
3821 */
3822 if (ret < 0) {
3823 return ret;
3824 } else if (ret > 0) {
3825 /*
3826 * Set logged_trans to a value greater than 0 and less than the
3827 * current transaction to avoid doing the search in future calls.
3828 */
3829 return mark_inode_as_not_logged(trans, inode);
3830 }
3831
3832 /*
3833 * The inode was previously logged and then evicted, set logged_trans to
3834 * the current transaction's ID, to avoid future tree searches as long as
3835 * the inode is not evicted again.
3836 */
3837 spin_lock(&inode->lock);
3838 inode->logged_trans = trans->transid;
3839 spin_unlock(&inode->lock);
3840
3841 return 1;
3842 }
3843
3844 /*
3845 * Delete a directory entry from the log if it exists.
3846 *
3847 * Returns < 0 on error
3848 * 1 if the entry does not exist
3849 * 0 if the entry existed and was successfully deleted
3850 */
3851 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3852 struct btrfs_root *log,
3853 struct btrfs_path *path,
3854 u64 dir_ino,
3855 const struct fscrypt_str *name,
3856 u64 index)
3857 {
3858 struct btrfs_dir_item *di;
3859
3860 /*
3861 * We only log dir index items of a directory, so we don't need to look
3862 * for dir item keys.
3863 */
3864 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3865 index, name, -1);
3866 if (IS_ERR(di))
3867 return PTR_ERR(di);
3868 else if (!di)
3869 return 1;
3870
3871 /*
3872 * We do not need to update the size field of the directory's
3873 * inode item because on log replay we update the field to reflect
3874 * all existing entries in the directory (see overwrite_item()).
3875 */
3876 return btrfs_del_item(trans, log, path);
3877 }
3878
3879 /*
3880 * If both a file and directory are logged, and unlinks or renames are
3881 * mixed in, we have a few interesting corners:
3882 *
3883 * create file X in dir Y
3884 * link file X to X.link in dir Y
3885 * fsync file X
3886 * unlink file X but leave X.link
3887 * fsync dir Y
3888 *
3889 * After a crash we would expect only X.link to exist. But file X
3890 * didn't get fsync'd again so the log has back refs for X and X.link.
3891 *
3892 * We solve this by removing directory entries and inode backrefs from the
3893 * log when a file that was logged in the current transaction is
3894 * unlinked. Any later fsync will include the updated log entries, and
3895 * we'll be able to reconstruct the proper directory items from backrefs.
3896 *
3897 * This optimization allows us to avoid relogging the entire inode
3898 * or the entire directory.
3899 */
3900 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3901 const struct fscrypt_str *name,
3902 struct btrfs_inode *dir, u64 index)
3903 {
3904 struct btrfs_root *root = dir->root;
3905 BTRFS_PATH_AUTO_FREE(path);
3906 int ret;
3907
3908 ret = inode_logged(trans, dir, NULL);
3909 if (ret == 0)
3910 return;
3911 if (ret < 0) {
3912 btrfs_set_log_full_commit(trans);
3913 return;
3914 }
3915
3916 path = btrfs_alloc_path();
3917 if (!path) {
3918 btrfs_set_log_full_commit(trans);
3919 return;
3920 }
3921
3922 ret = join_running_log_trans(root);
3923 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3924 if (WARN_ON(ret))
3925 return;
3926
3927 mutex_lock(&dir->log_mutex);
3928
3929 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3930 name, index);
3931 mutex_unlock(&dir->log_mutex);
3932 if (ret < 0)
3933 btrfs_set_log_full_commit(trans);
3934 btrfs_end_log_trans(root);
3935 }
3936
3937 /* see comments for btrfs_del_dir_entries_in_log */
3938 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3939 const struct fscrypt_str *name,
3940 struct btrfs_inode *inode,
3941 struct btrfs_inode *dir)
3942 {
3943 struct btrfs_root *root = dir->root;
3944 int ret;
3945
3946 ret = inode_logged(trans, inode, NULL);
3947 if (ret == 0)
3948 return;
3949 else if (ret < 0) {
3950 btrfs_set_log_full_commit(trans);
3951 return;
3952 }
3953
3954 ret = join_running_log_trans(root);
3955 ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
3956 if (WARN_ON(ret))
3957 return;
3958 mutex_lock(&inode->log_mutex);
3959
3960 ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode),
3961 btrfs_ino(dir), NULL);
3962 mutex_unlock(&inode->log_mutex);
3963 if (ret < 0 && ret != -ENOENT)
3964 btrfs_set_log_full_commit(trans);
3965 btrfs_end_log_trans(root);
3966 }
3967
3968 /*
3969 * creates a range item in the log for 'dirid'. first_offset and
3970 * last_offset tell us which parts of the key space the log should
3971 * be considered authoritative for.
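*
* For example (offsets are illustrative): an item with key offset 5 and
* dir_log_end 20 tells log replay that the log is authoritative for dir
* index keys in the range [5, 20] - any index in that range found in the
* subvolume tree but missing from the log corresponds to an entry removed
* before the fsync and must be deleted during replay.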
3972 */
3973 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3974 struct btrfs_root *log,
3975 struct btrfs_path *path,
3976 u64 dirid,
3977 u64 first_offset, u64 last_offset)
3978 {
3979 int ret;
3980 struct btrfs_key key;
3981 struct btrfs_dir_log_item *item;
3982
3983 key.objectid = dirid;
3984 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3985 key.offset = first_offset;
3986 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3987 /*
3988 * -EEXIST is fine and can happen sporadically when we are logging a
3989 * directory and have concurrent insertions in the subvolume's tree for
3990 * items from other inodes and that result in pushing off some dir items
3991 * from one leaf to another in order to accommodate the new items.
3992 * This results in logging the same dir index range key.
3993 */
3994 if (ret && ret != -EEXIST)
3995 return ret;
3996
3997 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3998 struct btrfs_dir_log_item);
3999 if (ret == -EEXIST) {
4000 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
4001
4002 /*
4003 * btrfs_del_dir_entries_in_log() might have been called during
4004 * an unlink between the initial insertion of this key and the
4005 * current update, or we might be logging a single entry deletion
4006 * during a rename, so set the new last_offset to the max value.
4007 */
4008 last_offset = max(last_offset, curr_end);
4009 }
4010 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
4011 btrfs_release_path(path);
4012 return 0;
4013 }
4014
4015 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
4016 struct btrfs_inode *inode,
4017 struct extent_buffer *src,
4018 struct btrfs_path *dst_path,
4019 int start_slot,
4020 int count)
4021 {
4022 struct btrfs_root *log = inode->root->log_root;
4023 char AUTO_KFREE(ins_data);
4024 struct btrfs_item_batch batch;
4025 struct extent_buffer *dst;
4026 unsigned long src_offset;
4027 unsigned long dst_offset;
4028 u64 last_index;
4029 struct btrfs_key key;
4030 u32 item_size;
4031 int ret;
4032 int i;
4033
4034 ASSERT(count > 0, "count=%d", count);
4035 batch.nr = count;
4036
4037 if (count == 1) {
4038 btrfs_item_key_to_cpu(src, &key, start_slot);
4039 item_size = btrfs_item_size(src, start_slot);
4040 batch.keys = &key;
4041 batch.data_sizes = &item_size;
4042 batch.total_data_size = item_size;
4043 } else {
4044 struct btrfs_key *ins_keys;
4045 u32 *ins_sizes;
4046
4047 ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
4048 if (!ins_data)
4049 return -ENOMEM;
4050
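/*
* The single allocation holds both arrays for the batch: the first
* 'count' u32 item sizes, immediately followed by 'count'
* struct btrfs_key entries, matching the pointer setup below.
*/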
4051 ins_sizes = (u32 *)ins_data;
4052 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
4053 batch.keys = ins_keys;
4054 batch.data_sizes = ins_sizes;
4055 batch.total_data_size = 0;
4056
4057 for (i = 0; i < count; i++) {
4058 const int slot = start_slot + i;
4059
4060 btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
4061 ins_sizes[i] = btrfs_item_size(src, slot);
4062 batch.total_data_size += ins_sizes[i];
4063 }
4064 }
4065
4066 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4067 if (ret)
4068 return ret;
4069
4070 dst = dst_path->nodes[0];
4071 /*
4072 * Copy all the items in bulk, in a single copy operation. Item data is
4073 * organized such that it's placed at the end of a leaf and from right
4074 * to left. For example, the data for the second item ends at an offset
4075 * that matches the offset where the data for the first item starts, the
4076 * data for the third item ends at an offset that matches the offset
4077 * where the data of the second item starts, and so on.
4078 * Therefore our source and destination start offsets for copy match the
4079 * offsets of the last items (highest slots).
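*
* A rough sketch for a batch of 3 items (offsets are illustrative):
*
*   [item headers ... | free space |item 3 data|item 2 data|item 1 data]
*                                  ^
*                                  data offset of the last item (highest
*                                  slot), where the bulk copy starts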
4080 */
4081 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
4082 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
4083 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
4084 btrfs_release_path(dst_path);
4085
4086 last_index = batch.keys[count - 1].offset;
4087 ASSERT(last_index > inode->last_dir_index_offset,
4088 "last_index=%llu inode->last_dir_index_offset=%llu",
4089 last_index, inode->last_dir_index_offset);
4090
4091 /*
4092 * If for some unexpected reason the last item's index is not greater
4093 * than the last index we logged, warn and force a transaction commit.
4094 */
4095 if (WARN_ON(last_index <= inode->last_dir_index_offset))
4096 ret = BTRFS_LOG_FORCE_COMMIT;
4097 else
4098 inode->last_dir_index_offset = last_index;
4099
4100 if (btrfs_get_first_dir_index_to_log(inode) == 0)
4101 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
4102
4103 return ret;
4104 }
4105
4106 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
4107 {
4108 const int slot = path->slots[0];
4109
4110 if (ctx->scratch_eb) {
4111 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
4112 } else {
4113 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
4114 if (!ctx->scratch_eb)
4115 return -ENOMEM;
4116 }
4117
4118 btrfs_release_path(path);
4119 path->nodes[0] = ctx->scratch_eb;
4120 path->slots[0] = slot;
4121 /*
4122 * Add extra ref to scratch eb so that it is not freed when callers
4123 * release the path, so we can reuse it later if needed.
4124 */
4125 refcount_inc(&ctx->scratch_eb->refs);
4126
4127 return 0;
4128 }
4129
4130 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
4131 struct btrfs_inode *inode,
4132 struct btrfs_path *path,
4133 struct btrfs_path *dst_path,
4134 struct btrfs_log_ctx *ctx,
4135 u64 *last_old_dentry_offset)
4136 {
4137 struct btrfs_root *log = inode->root->log_root;
4138 struct extent_buffer *src;
4139 const int nritems = btrfs_header_nritems(path->nodes[0]);
4140 const u64 ino = btrfs_ino(inode);
4141 bool last_found = false;
4142 int batch_start = 0;
4143 int batch_size = 0;
4144 int ret;
4145
4146 /*
4147 * We need to clone the leaf, release the read lock on it, and use the
4148 * clone before modifying the log tree. See the comment at copy_items()
4149 * about why we need to do this.
4150 */
4151 ret = clone_leaf(path, ctx);
4152 if (ret < 0)
4153 return ret;
4154
4155 src = path->nodes[0];
4156
4157 for (int i = path->slots[0]; i < nritems; i++) {
4158 struct btrfs_dir_item *di;
4159 struct btrfs_key key;
4160
4161 btrfs_item_key_to_cpu(src, &key, i);
4162
4163 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
4164 last_found = true;
4165 break;
4166 }
4167
4168 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
4169
4170 /*
4171 * Skip ranges of items that consist only of dir item keys created
4172 * in past transactions. However if we find a gap, we must log a
4173 * dir index range item for that gap, so that index keys in that
4174 * gap are deleted during log replay.
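*
* For example (offsets are illustrative): if the last old dentry had
* index 4 and the current old dentry has index 10, we log a range item
* covering [5, 9], so entries with those indexes that were deleted
* before this log sync also get deleted at log replay time.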
4175 */
4176 if (btrfs_dir_transid(src, di) < trans->transid) {
4177 if (key.offset > *last_old_dentry_offset + 1) {
4178 ret = insert_dir_log_key(trans, log, dst_path,
4179 ino, *last_old_dentry_offset + 1,
4180 key.offset - 1);
4181 if (ret < 0)
4182 return ret;
4183 }
4184
4185 *last_old_dentry_offset = key.offset;
4186 continue;
4187 }
4188
4189 /* If we logged this dir index item before, we can skip it. */
4190 if (key.offset <= inode->last_dir_index_offset)
4191 continue;
4192
4193 /*
4194 * We must make sure that when we log a directory entry, the
4195 * corresponding inode, after log replay, has a matching link
4196 * count. For example:
4197 *
4198 * touch foo
4199 * mkdir mydir
4200 * sync
4201 * ln foo mydir/bar
4202 * xfs_io -c "fsync" mydir
4203 * <crash>
4204 * <mount fs and log replay>
4205 *
4206 * This would result in an fsync log that, when replayed, leaves our
4207 * file inode with a link count of 1, but with two directory entries
4208 * pointing to the same inode. After removing one of the names,
4209 * it would not be possible to remove the other name, which always
4210 * resulted in stale file handle errors, and it would not be
4211 * possible to rmdir the parent directory, since its i_size could
4212 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
4213 * resulting in -ENOTEMPTY errors.
4214 */
4215 if (!ctx->log_new_dentries) {
4216 struct btrfs_key di_key;
4217
4218 btrfs_dir_item_key_to_cpu(src, di, &di_key);
4219 if (di_key.type != BTRFS_ROOT_ITEM_KEY)
4220 ctx->log_new_dentries = true;
4221 }
4222
4223 if (batch_size == 0)
4224 batch_start = i;
4225 batch_size++;
4226 }
4227
4228 if (batch_size > 0) {
4229 ret = flush_dir_items_batch(trans, inode, src, dst_path,
4230 batch_start, batch_size);
4231 if (ret < 0)
4232 return ret;
4233 }
4234
4235 return last_found ? 1 : 0;
4236 }
4237
4238 /*
4239 * log all the items included in the current transaction for a given
4240 * directory. This also creates the range items in the log tree required
4241 * to replay anything deleted before the fsync
4242 */
4243 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
4244 struct btrfs_inode *inode,
4245 struct btrfs_path *path,
4246 struct btrfs_path *dst_path,
4247 struct btrfs_log_ctx *ctx,
4248 u64 min_offset, u64 *last_offset_ret)
4249 {
4250 struct btrfs_key min_key;
4251 struct btrfs_root *root = inode->root;
4252 struct btrfs_root *log = root->log_root;
4253 int ret;
4254 u64 last_old_dentry_offset = min_offset - 1;
4255 u64 last_offset = (u64)-1;
4256 u64 ino = btrfs_ino(inode);
4257
4258 min_key.objectid = ino;
4259 min_key.type = BTRFS_DIR_INDEX_KEY;
4260 min_key.offset = min_offset;
4261
4262 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
4263
4264 /*
4265 * we didn't find anything from this transaction, see if there
4266 * is anything at all
4267 */
4268 if (ret != 0 || min_key.objectid != ino ||
4269 min_key.type != BTRFS_DIR_INDEX_KEY) {
4270 min_key.objectid = ino;
4271 min_key.type = BTRFS_DIR_INDEX_KEY;
4272 min_key.offset = (u64)-1;
4273 btrfs_release_path(path);
4274 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4275 if (ret < 0) {
4276 btrfs_release_path(path);
4277 return ret;
4278 }
4279 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4280
4281 /* if ret == 0 there are items for this type,
4282 * create a range to tell us the last key of this type.
4283 * otherwise, there are no items in this directory after
4284 * *min_offset, and we create a range to indicate that.
4285 */
4286 if (ret == 0) {
4287 struct btrfs_key tmp;
4288
4289 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
4290 path->slots[0]);
4291 if (tmp.type == BTRFS_DIR_INDEX_KEY)
4292 last_old_dentry_offset = tmp.offset;
4293 } else if (ret > 0) {
4294 ret = 0;
4295 }
4296
4297 goto done;
4298 }
4299
4300 /* go backward to find any previous key */
4301 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
4302 if (ret == 0) {
4303 struct btrfs_key tmp;
4304
4305 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
4306 /*
4307 * The dir index key before the first one we found that needs to
4308 * be logged might be in a previous leaf, and there might be a
4309 * gap between these keys, meaning that we had deletions that
4310 * happened. So the key range item we log (key type
4311 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
4312 * previous key's offset plus 1, so that those deletes are replayed.
4313 */
4314 if (tmp.type == BTRFS_DIR_INDEX_KEY)
4315 last_old_dentry_offset = tmp.offset;
4316 } else if (ret < 0) {
4317 goto done;
4318 }
4319
4320 btrfs_release_path(path);
4321
4322 /*
4323 * Find the first key from this transaction again or the one we were at
4324 * in the loop below in case we had to reschedule. We may be logging the
4325 * directory without holding its VFS lock, which happens when logging new
4326 * dentries (through log_new_dir_dentries()) or in some cases when we
4327 * need to log the parent directory of an inode. This means a dir index
4328 * key might be deleted from the inode's root, and therefore we may not
4329 * find it anymore. If we can't find it, just move to the next key. We
4330 * can not bail out and ignore, because if we do that we will simply
4331 * not log dir index keys that come after the one that was just deleted
4332 * and we can end up logging a dir index range that ends at (u64)-1
4333 * (@last_offset is initialized to that), resulting in removing dir
4334 * entries we should not remove at log replay time.
4335 */
4336 search:
4337 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
4338 if (ret > 0) {
4339 ret = btrfs_next_item(root, path);
4340 if (ret > 0) {
4341 /* There are no more keys in the inode's root. */
4342 ret = 0;
4343 goto done;
4344 }
4345 }
4346 if (ret < 0)
4347 goto done;
4348
4349 /*
4350 * we have a block from this transaction, log every item in it
4351 * from our directory
4352 */
4353 while (1) {
4354 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
4355 &last_old_dentry_offset);
4356 if (ret != 0) {
4357 if (ret > 0)
4358 ret = 0;
4359 goto done;
4360 }
4361 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
4362
4363 /*
4364 * look ahead to the next item and see if it is also
4365 * from this directory and from this transaction
4366 */
4367 ret = btrfs_next_leaf(root, path);
4368 if (ret) {
4369 if (ret == 1) {
4370 last_offset = (u64)-1;
4371 ret = 0;
4372 }
4373 goto done;
4374 }
4375 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
4376 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
4377 last_offset = (u64)-1;
4378 goto done;
4379 }
4380 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
4381 /*
4382 * The next leaf was not changed in the current transaction
4383 * and has at least one dir index key.
4384 * We check for the next key because there might have been
4385 * one or more deletions between the last key we logged and
4386 * that next key. So the key range item we log (key type
4387 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
4388 * offset minus 1, so that those deletes are replayed.
4389 */
4390 last_offset = min_key.offset - 1;
4391 goto done;
4392 }
4393 if (need_resched()) {
4394 btrfs_release_path(path);
4395 cond_resched();
4396 goto search;
4397 }
4398 }
4399 done:
4400 btrfs_release_path(path);
4401 btrfs_release_path(dst_path);
4402
4403 if (ret == 0) {
4404 *last_offset_ret = last_offset;
4405 /*
4406 * In case the leaf was changed in the current transaction but
4407 * all its dir items are from a past transaction, the last item
4408 * in the leaf is a dir item and there's no gap between that last
4409 * dir item and the first one on the next leaf (which did not
4410 * change in the current transaction), then we don't need to log
4411 * a range, since last_old_dentry_offset is equal to last_offset.
4412 */
4413 ASSERT(last_old_dentry_offset <= last_offset,
4414 "last_old_dentry_offset=%llu last_offset=%llu",
4415 last_old_dentry_offset, last_offset);
4416 if (last_old_dentry_offset < last_offset)
4417 ret = insert_dir_log_key(trans, log, path, ino,
4418 last_old_dentry_offset + 1,
4419 last_offset);
4420 }
4421
4422 return ret;
4423 }
4424
4425 /*
4426 * If the inode was logged before and it was evicted, then its
4427 * last_dir_index_offset is 0, so we don't know the value of the last index
4428 * key offset. If that's the case, search for it and update the inode. This
4429 * is to avoid lookups in the log tree every time we try to insert a dir index
4430 * key from a leaf changed in the current transaction, and to allow us to always
4431 * do batch insertions of dir index keys.
4432 */
4433 static int update_last_dir_index_offset(struct btrfs_inode *inode,
4434 struct btrfs_path *path,
4435 const struct btrfs_log_ctx *ctx)
4436 {
4437 const u64 ino = btrfs_ino(inode);
4438 struct btrfs_key key;
4439 int ret;
4440
4441 lockdep_assert_held(&inode->log_mutex);
4442
4443 if (inode->last_dir_index_offset != 0)
4444 return 0;
4445
4446 if (!ctx->logged_before) {
4447 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4448 return 0;
4449 }
4450
4451 key.objectid = ino;
4452 key.type = BTRFS_DIR_INDEX_KEY;
4453 key.offset = (u64)-1;
4454
4455 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4456 /*
4457 * An error happened or we actually have an index key with an offset
4458 * value of (u64)-1. Bail out, we're done.
4459 */
4460 if (ret <= 0)
4461 goto out;
4462
4463 ret = 0;
4464 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4465
4466 /*
4467 * No dir index items, bail out and leave last_dir_index_offset with
4468 * the value right before the first valid index value.
4469 */
4470 if (path->slots[0] == 0)
4471 goto out;
4472
4473 /*
4474 * btrfs_search_slot() left us at one slot beyond the slot with the last
4475 * index key, or beyond the last key of the directory that is not an
4476 * index key. If we have an index key before, set last_dir_index_offset
4477 * to its offset value, otherwise leave it with a value right before the
4478 * first valid index value, as it means we have an empty directory.
4479 */
4480 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4481 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4482 inode->last_dir_index_offset = key.offset;
4483
4484 out:
4485 btrfs_release_path(path);
4486
4487 return ret;
4488 }
4489
4490 /*
4491 * Logging directories is very similar to logging inodes. We find all the items
4492 * from the current transaction and write them to the log.
4493 *
4494 * The recovery code scans the directory in the subvolume, and if it finds a
4495 * key in the range logged that is not present in the log tree, then it means
4496 * that dir entry was unlinked during the transaction.
4497 *
4498 * In order for that scan to work, we must include one key smaller than
4499 * the smallest logged by this transaction and one key larger than the largest
4500 * key logged by this transaction.
4501 */
4502 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4503 struct btrfs_inode *inode,
4504 struct btrfs_path *path,
4505 struct btrfs_path *dst_path,
4506 struct btrfs_log_ctx *ctx)
4507 {
4508 u64 min_key;
4509 u64 max_key;
4510 int ret;
4511
4512 ret = update_last_dir_index_offset(inode, path, ctx);
4513 if (ret)
4514 return ret;
4515
4516 min_key = BTRFS_DIR_START_INDEX;
4517 max_key = 0;
4518
4519 while (1) {
4520 ret = log_dir_items(trans, inode, path, dst_path,
4521 ctx, min_key, &max_key);
4522 if (ret)
4523 return ret;
4524 if (max_key == (u64)-1)
4525 break;
4526 min_key = max_key + 1;
4527 }
4528
4529 return 0;
4530 }
4531
4532 /*
4533 * a helper function to drop items from the log before we relog an
4534 * inode. max_key_type indicates the highest item type to remove.
4535 * This cannot be run for file data extents because it does not
4536 * free the extents they point to.
4537 */
4538 static int drop_inode_items(struct btrfs_trans_handle *trans,
4539 struct btrfs_root *log,
4540 struct btrfs_path *path,
4541 struct btrfs_inode *inode,
4542 int max_key_type)
4543 {
4544 int ret;
4545 struct btrfs_key key;
4546 struct btrfs_key found_key;
4547 int start_slot;
4548
4549 key.objectid = btrfs_ino(inode);
4550 key.type = max_key_type;
4551 key.offset = (u64)-1;
4552
4553 while (1) {
4554 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4555 if (ret < 0) {
4556 break;
4557 } else if (ret > 0) {
4558 if (path->slots[0] == 0)
4559 break;
4560 path->slots[0]--;
4561 }
4562
4563 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4564 path->slots[0]);
4565
4566 if (found_key.objectid != key.objectid)
4567 break;
4568
4569 found_key.offset = 0;
4570 found_key.type = 0;
4571 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4572 if (ret < 0)
4573 break;
4574
4575 ret = btrfs_del_items(trans, log, path, start_slot,
4576 path->slots[0] - start_slot + 1);
4577 /*
4578 * If start slot isn't 0 then we don't need to re-search, we've
4579 * found the last guy with the objectid in this tree.
4580 */
4581 if (ret || start_slot != 0)
4582 break;
4583 btrfs_release_path(path);
4584 }
4585 btrfs_release_path(path);
4586 if (ret > 0)
4587 ret = 0;
4588 return ret;
4589 }
4590
4591 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4592 struct btrfs_root *log_root,
4593 struct btrfs_inode *inode,
4594 u64 new_size, u32 min_type)
4595 {
4596 struct btrfs_truncate_control control = {
4597 .new_size = new_size,
4598 .ino = btrfs_ino(inode),
4599 .min_type = min_type,
4600 .skip_ref_updates = true,
4601 };
4602
4603 return btrfs_truncate_inode_items(trans, log_root, &control);
4604 }
4605
4606 static void fill_inode_item(struct btrfs_trans_handle *trans,
4607 struct extent_buffer *leaf,
4608 struct btrfs_inode_item *item,
4609 struct inode *inode, bool log_inode_only,
4610 u64 logged_isize)
4611 {
4612 u64 flags;
4613
4614 if (log_inode_only) {
4615 /* set the generation to zero so the recovery code
4616 * can tell the difference between logging
4617 * just to say 'this inode exists' and logging
4618 * to say 'update this inode with these values'
4619 */
4620 btrfs_set_inode_generation(leaf, item, 0);
4621 btrfs_set_inode_size(leaf, item, logged_isize);
4622 } else {
4623 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
4624 btrfs_set_inode_size(leaf, item, inode->i_size);
4625 }
4626
4627 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
4628 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
4629 btrfs_set_inode_mode(leaf, item, inode->i_mode);
4630 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
4631
4632 btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
4633 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
4634
4635 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
4636 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
4637
4638 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
4639 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
4640
4641 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
4642 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4643
4644 /*
4645 * We do not need to set the nbytes field, in fact during a fast fsync
4646 * its value may not even be correct, since a fast fsync does not wait
4647 * for ordered extent completion, which is where we update nbytes; it
4648 * only waits for writeback to complete. During log replay as we find
4649 * file extent items and replay them, we adjust the nbytes field of the
4650 * inode item in subvolume tree as needed (see overwrite_item()).
4651 */
4652
4653 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
4654 btrfs_set_inode_transid(leaf, item, trans->transid);
4655 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
4656 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4657 BTRFS_I(inode)->ro_flags);
4658 btrfs_set_inode_flags(leaf, item, flags);
4659 btrfs_set_inode_block_group(leaf, item, 0);
4660 }
4661
4662 static int log_inode_item(struct btrfs_trans_handle *trans,
4663 struct btrfs_root *log, struct btrfs_path *path,
4664 struct btrfs_inode *inode, bool inode_item_dropped)
4665 {
4666 struct btrfs_inode_item *inode_item;
4667 struct btrfs_key key;
4668 int ret;
4669
4670 btrfs_get_inode_key(inode, &key);
4671 /*
4672 * If we are doing a fast fsync and the inode was logged before in the
4673 * current transaction, then we know the inode was previously logged and
4674 * it exists in the log tree. For performance reasons, in this case use
4675 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4676 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4677 * contention in case there are concurrent fsyncs for other inodes of the
4678 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4679 * already exists can also result in unnecessarily splitting a leaf.
4680 */
4681 if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4682 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
4683 ASSERT(ret <= 0);
4684 if (ret > 0)
4685 ret = -ENOENT;
4686 } else {
4687 /*
4688 * This means it is the first fsync in the current transaction,
4689 * so the inode item is not in the log and we need to insert it.
4690 * We can never get -EEXIST because we are only called for a fast
4691 * fsync, and if an inode eviction happens after the inode was
4692 * logged in the current transaction, then when we load the inode
4693 * again we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4694 * flags and set ->logged_trans to 0.
4695 */
4696 ret = btrfs_insert_empty_item(trans, log, path, &key,
4697 sizeof(*inode_item));
4698 ASSERT(ret != -EEXIST);
4699 }
4700 if (ret)
4701 return ret;
4702 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4703 struct btrfs_inode_item);
4704 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4705 false, 0);
4706 btrfs_release_path(path);
4707 return 0;
4708 }
4709
4710 static int log_csums(struct btrfs_trans_handle *trans,
4711 struct btrfs_inode *inode,
4712 struct btrfs_root *log_root,
4713 struct btrfs_ordered_sum *sums)
4714 {
4715 const u64 lock_end = sums->logical + sums->len - 1;
4716 struct extent_state *cached_state = NULL;
4717 int ret;
4718
4719 /*
4720 * If this inode was not used for reflink operations in the current
4721 * transaction with new extents, then do the fast path, no need to
4722 * worry about logging checksum items with overlapping ranges.
4723 */
4724 if (inode->last_reflink_trans < trans->transid)
4725 return btrfs_csum_file_blocks(trans, log_root, sums);
4726
4727 /*
4728 * Serialize logging for checksums. This is to avoid racing with the
4729 * same checksum being logged by another task that is logging another
4730 * file which happens to refer to the same extent as well. Such races
4731 * can leave checksum items in the log with overlapping ranges.
4732 */
4733 ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4734 &cached_state);
4735 if (ret)
4736 return ret;
4737 /*
4738 * Due to extent cloning, we might have logged a csum item that covers a
4739 * subrange of a cloned extent, and later we can end up logging a csum
4740 * item for a larger subrange of the same extent or the entire range.
4741 * This would leave csum items in the log tree that cover the same range
4742 * and break the searches for checksums in the log tree, resulting in
4743 * some checksums missing in the fs/subvolume tree. So just delete (or
4744 * trim and adjust) any existing csum items in the log for this range.
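*
* For example (ranges are illustrative): an fsync of one file may have
* logged a csum item for bytes [X, X + 64K) of a shared extent, while a
* later fsync of another file sharing it needs to log [X, X + 128K).
* Without the deletion below both items would remain and overlap.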
4745 */
4746 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4747 if (!ret)
4748 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4749
4750 btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4751 &cached_state);
4752
4753 return ret;
4754 }
4755
4756 static noinline int copy_items(struct btrfs_trans_handle *trans,
4757 struct btrfs_inode *inode,
4758 struct btrfs_path *dst_path,
4759 struct btrfs_path *src_path,
4760 int start_slot, int nr, int inode_only,
4761 u64 logged_isize, struct btrfs_log_ctx *ctx)
4762 {
4763 struct btrfs_root *log = inode->root->log_root;
4764 struct btrfs_file_extent_item *extent;
4765 struct extent_buffer *src;
4766 int ret;
4767 struct btrfs_key *ins_keys;
4768 u32 *ins_sizes;
4769 struct btrfs_item_batch batch;
4770 char AUTO_KFREE(ins_data);
4771 int dst_index;
4772 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4773 const u64 i_size = i_size_read(&inode->vfs_inode);
4774
4775 /*
4776 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4777 * use the clone. This is because otherwise we would be changing the log
4778 * tree, to insert items from the subvolume tree or insert csum items,
4779 * while holding a read lock on a leaf from the subvolume tree, which
4780 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4781 *
4782 * 1) Modifying the log tree triggers an extent buffer allocation while
4783 * holding a write lock on a parent extent buffer from the log tree.
4784 * Allocating the pages for an extent buffer, or the extent buffer
4785 * struct, can trigger inode eviction and finally the inode eviction
4786 * will trigger a release/remove of a delayed node, which requires
4787 * taking the delayed node's mutex;
4788 *
4789 * 2) Allocating a metadata extent for a log tree can trigger the async
4790 * reclaim thread and make us wait for it to release enough space and
4791 * unblock our reservation ticket. The reclaim thread can start
4792 * flushing delayed items, and that in turn results in the need to
4793 * lock delayed node mutexes and in the need to write lock extent
4794 * buffers of a subvolume tree - all this while holding a write lock
4795 * on the parent extent buffer in the log tree.
4796 *
4797 * So one task in scenario 1) running in parallel with another task in
4798 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4799 * node mutex while having a read lock on a leaf from the subvolume,
4800 * while the other is holding the delayed node's mutex and wants to
4801 * write lock the same subvolume leaf for flushing delayed items.
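*
* A compact sketch of the dependency (tasks are illustrative):
*
*   Task A (scenario 1): subvolume leaf (read) -> log tree ->
*                        delayed node mutex
*   Task B (scenario 2): delayed node mutex -> subvolume leaf (write)
*
* Cloning the source leaf lets us drop the read lock before touching
* the log tree, breaking this cycle.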
4802 */
4803 ret = clone_leaf(src_path, ctx);
4804 if (ret < 0)
4805 return ret;
4806
4807 src = src_path->nodes[0];
4808
4809 ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
4810 if (!ins_data)
4811 return -ENOMEM;
4812
4813 ins_sizes = (u32 *)ins_data;
4814 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4815 batch.keys = ins_keys;
4816 batch.data_sizes = ins_sizes;
4817 batch.total_data_size = 0;
4818 batch.nr = 0;
4819
4820 dst_index = 0;
4821 for (int i = 0; i < nr; i++) {
4822 const int src_slot = start_slot + i;
4823 struct btrfs_root *csum_root;
4824 struct btrfs_ordered_sum *sums;
4825 struct btrfs_ordered_sum *sums_next;
4826 LIST_HEAD(ordered_sums);
4827 u64 disk_bytenr;
4828 u64 disk_num_bytes;
4829 u64 extent_offset;
4830 u64 extent_num_bytes;
4831 bool is_old_extent;
4832
4833 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4834
4835 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4836 goto add_to_batch;
4837
4838 extent = btrfs_item_ptr(src, src_slot,
4839 struct btrfs_file_extent_item);
4840
4841 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4842 trans->transid);
4843
4844 /*
4845 * Don't copy extents from past generations. That would make us
4846 * log a lot more metadata for common cases like doing only a
4847 * few random writes into a file and then fsyncing it for the first
4848 * time or after the full sync flag is set on the inode. We can
4849 * get leaves full of extent items, most of which are from past
4850 * generations, so we can skip them - as long as the inode has
4851 * not been the target of a reflink operation in this transaction,
4852 * as in that case it might have had file extent items with old
4853 * generations copied into it. We also must always log prealloc
4854 * extents that start at or beyond eof, otherwise we would lose
4855 * them on log replay.
4856 */
4857 if (is_old_extent &&
4858 ins_keys[dst_index].offset < i_size &&
4859 inode->last_reflink_trans < trans->transid)
4860 continue;
4861
4862 if (skip_csum)
4863 goto add_to_batch;
4864
4865 /* Only regular extents have checksums. */
4866 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4867 goto add_to_batch;
4868
4869 /*
4870 * If it's an extent created in a past transaction, then its
4871 * checksums are already accessible from the committed csum tree,
4872 * no need to log them.
4873 */
4874 if (is_old_extent)
4875 goto add_to_batch;
4876
4877 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4878 /* If it's an explicit hole, there are no checksums. */
4879 if (disk_bytenr == 0)
4880 goto add_to_batch;
4881
4882 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4883
4884 if (btrfs_file_extent_compression(src, extent)) {
4885 extent_offset = 0;
4886 extent_num_bytes = disk_num_bytes;
4887 } else {
4888 extent_offset = btrfs_file_extent_offset(src, extent);
4889 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4890 }
4891
4892 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4893 disk_bytenr += extent_offset;
4894 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4895 disk_bytenr + extent_num_bytes - 1,
4896 &ordered_sums, false);
4897 if (ret < 0)
4898 return ret;
4899 ret = 0;
4900
4901 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4902 if (!ret)
4903 ret = log_csums(trans, inode, log, sums);
4904 list_del(&sums->list);
4905 kfree(sums);
4906 }
4907 if (ret)
4908 return ret;
4909
4910 add_to_batch:
4911 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4912 batch.total_data_size += ins_sizes[dst_index];
4913 batch.nr++;
4914 dst_index++;
4915 }
4916
4917 /*
4918 * We have a leaf full of old extent items that don't need to be logged,
4919 * so we don't need to do anything.
4920 */
4921 if (batch.nr == 0)
4922 return 0;
4923
4924 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4925 if (ret)
4926 return ret;
4927
4928 dst_index = 0;
4929 for (int i = 0; i < nr; i++) {
4930 const int src_slot = start_slot + i;
4931 const int dst_slot = dst_path->slots[0] + dst_index;
4932 struct btrfs_key key;
4933 unsigned long src_offset;
4934 unsigned long dst_offset;
4935
4936 /*
4937 * We're done, all the remaining items in the source leaf
4938 * correspond to old file extent items.
4939 */
4940 if (dst_index >= batch.nr)
4941 break;
4942
4943 btrfs_item_key_to_cpu(src, &key, src_slot);
4944
4945 if (key.type != BTRFS_EXTENT_DATA_KEY)
4946 goto copy_item;
4947
4948 extent = btrfs_item_ptr(src, src_slot,
4949 struct btrfs_file_extent_item);
4950
4951 /* See the comment in the previous loop, same logic. */
4952 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4953 key.offset < i_size &&
4954 inode->last_reflink_trans < trans->transid)
4955 continue;
4956
4957 copy_item:
4958 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4959 src_offset = btrfs_item_ptr_offset(src, src_slot);
4960
4961 if (key.type == BTRFS_INODE_ITEM_KEY) {
4962 struct btrfs_inode_item *inode_item;
4963
4964 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4965 struct btrfs_inode_item);
4966 fill_inode_item(trans, dst_path->nodes[0], inode_item,
4967 &inode->vfs_inode,
4968 inode_only == LOG_INODE_EXISTS,
4969 logged_isize);
4970 } else {
4971 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4972 src_offset, ins_sizes[dst_index]);
4973 }
4974
4975 dst_index++;
4976 }
4977
4978 btrfs_release_path(dst_path);
4979
4980 return ret;
4981 }
4982
4983 static int extent_cmp(void *priv, const struct list_head *a,
4984 const struct list_head *b)
4985 {
4986 const struct extent_map *em1, *em2;
4987
4988 em1 = list_entry(a, struct extent_map, list);
4989 em2 = list_entry(b, struct extent_map, list);
4990
4991 if (em1->start < em2->start)
4992 return -1;
4993 else if (em1->start > em2->start)
4994 return 1;
4995 return 0;
4996 }
4997
4998 static int log_extent_csums(struct btrfs_trans_handle *trans,
4999 struct btrfs_inode *inode,
5000 struct btrfs_root *log_root,
5001 const struct extent_map *em,
5002 struct btrfs_log_ctx *ctx)
5003 {
5004 struct btrfs_ordered_extent *ordered;
5005 struct btrfs_root *csum_root;
5006 u64 block_start;
5007 u64 csum_offset;
5008 u64 csum_len;
5009 u64 mod_start = em->start;
5010 u64 mod_len = em->len;
5011 LIST_HEAD(ordered_sums);
5012 int ret = 0;
5013
5014 if (inode->flags & BTRFS_INODE_NODATASUM ||
5015 (em->flags & EXTENT_FLAG_PREALLOC) ||
5016 em->disk_bytenr == EXTENT_MAP_HOLE)
5017 return 0;
5018
5019 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
5020 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
5021 const u64 mod_end = mod_start + mod_len;
5022 struct btrfs_ordered_sum *sums;
5023
5024 if (mod_len == 0)
5025 break;
5026
5027 if (ordered_end <= mod_start)
5028 continue;
5029 if (mod_end <= ordered->file_offset)
5030 break;
5031
5032 /*
5033 * We are going to copy all the csums on this ordered extent, so
5034 * go ahead and adjust mod_start and mod_len in case this ordered
5035 * extent has already been logged.
5036 */
5037 if (ordered->file_offset > mod_start) {
5038 if (ordered_end >= mod_end)
5039 mod_len = ordered->file_offset - mod_start;
5040 /*
5041 * If we have this case
5042 *
5043 * |--------- logged extent ---------|
5044 * |----- ordered extent ----|
5045 *
5046 * Just don't mess with mod_start and mod_len, we'll
5047 * just end up logging more csums than we need and it
5048 * will be ok.
5049 */
5050 } else {
5051 if (ordered_end < mod_end) {
5052 mod_len = mod_end - ordered_end;
5053 mod_start = ordered_end;
5054 } else {
5055 mod_len = 0;
5056 }
5057 }
5058
5059 /*
5060 * To keep us from looping for the above case of an ordered
5061 * extent that falls inside of the logged extent.
5062 */
5063 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
5064 continue;
5065
5066 list_for_each_entry(sums, &ordered->list, list) {
5067 ret = log_csums(trans, inode, log_root, sums);
5068 if (ret)
5069 return ret;
5070 }
5071 }
5072
5073 /* We're done, found all csums in the ordered extents. */
5074 if (mod_len == 0)
5075 return 0;
5076
5077 /* If we're compressed we have to save the entire range of csums. */
5078 if (btrfs_extent_map_is_compressed(em)) {
5079 csum_offset = 0;
5080 csum_len = em->disk_num_bytes;
5081 } else {
5082 csum_offset = mod_start - em->start;
5083 csum_len = mod_len;
5084 }
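/*
 * Illustrative example: a 128K extent compressed to 16K on disk has its
 * csums computed over the 16K of compressed data, so the whole
 * disk_num_bytes range is looked up no matter which file range changed.
 * For an uncompressed extent only the modified range needs csums.
 */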
5085
5086 /* block start is already adjusted for the file extent offset. */
5087 block_start = btrfs_extent_map_block_start(em);
5088 csum_root = btrfs_csum_root(trans->fs_info, block_start);
5089 ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
5090 block_start + csum_offset + csum_len - 1,
5091 &ordered_sums, false);
5092 if (ret < 0)
5093 return ret;
5094 ret = 0;
5095
5096 while (!list_empty(&ordered_sums)) {
5097 struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
5098 struct btrfs_ordered_sum,
5099 list);
5100 if (!ret)
5101 ret = log_csums(trans, inode, log_root, sums);
5102 list_del(&sums->list);
5103 kfree(sums);
5104 }
5105
5106 return ret;
5107 }
5108
5109 static int log_one_extent(struct btrfs_trans_handle *trans,
5110 struct btrfs_inode *inode,
5111 const struct extent_map *em,
5112 struct btrfs_path *path,
5113 struct btrfs_log_ctx *ctx)
5114 {
5115 struct btrfs_drop_extents_args drop_args = { 0 };
5116 struct btrfs_root *log = inode->root->log_root;
5117 struct btrfs_file_extent_item fi = { 0 };
5118 struct extent_buffer *leaf;
5119 struct btrfs_key key;
5120 enum btrfs_compression_type compress_type;
5121 u64 extent_offset = em->offset;
5122 u64 block_start = btrfs_extent_map_block_start(em);
5123 u64 block_len;
5124 int ret;
5125
5126 btrfs_set_stack_file_extent_generation(&fi, trans->transid);
5127 if (em->flags & EXTENT_FLAG_PREALLOC)
5128 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
5129 else
5130 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
5131
5132 block_len = em->disk_num_bytes;
5133 compress_type = btrfs_extent_map_compression(em);
5134 if (compress_type != BTRFS_COMPRESS_NONE) {
5135 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
5136 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
5137 } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
5138 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
5139 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
5140 }
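/*
 * Illustrative example for the non-compressed case: if the on-disk
 * extent starts at bytenr X and this file extent points 128K into it
 * (extent_offset == 128K), block_start is X + 128K, so subtracting
 * extent_offset stores X as the disk_bytenr, which is what a file
 * extent item in the subvolume tree records.
 */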
5141
5142 btrfs_set_stack_file_extent_offset(&fi, extent_offset);
5143 btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
5144 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
5145 btrfs_set_stack_file_extent_compression(&fi, compress_type);
5146
5147 ret = log_extent_csums(trans, inode, log, em, ctx);
5148 if (ret)
5149 return ret;
5150
5151 /*
5152 * If this is the first time we are logging the inode in the current
5153 * transaction, we can avoid btrfs_drop_extents(), which is expensive
5154 * because it does a deletion search, which always acquires write locks
5155 * for extent buffers at levels 2, 1 and 0. This not only wastes time
5156 * but also adds significant contention in a log tree, since log trees
5157 * are small, with a root at level 2 or 3 at most, due to their short
5158 * life span.
5159 */
5160 if (ctx->logged_before) {
5161 drop_args.path = path;
5162 drop_args.start = em->start;
5163 drop_args.end = em->start + em->len;
5164 drop_args.replace_extent = true;
5165 drop_args.extent_item_size = sizeof(fi);
5166 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
5167 if (ret)
5168 return ret;
5169 }
5170
5171 if (!drop_args.extent_inserted) {
5172 key.objectid = btrfs_ino(inode);
5173 key.type = BTRFS_EXTENT_DATA_KEY;
5174 key.offset = em->start;
5175
5176 ret = btrfs_insert_empty_item(trans, log, path, &key,
5177 sizeof(fi));
5178 if (ret)
5179 return ret;
5180 }
5181 leaf = path->nodes[0];
5182 write_extent_buffer(leaf, &fi,
5183 btrfs_item_ptr_offset(leaf, path->slots[0]),
5184 sizeof(fi));
5185
5186 btrfs_release_path(path);
5187
5188 return ret;
5189 }
5190
5191 /*
5192 * Log all prealloc extents beyond the inode's i_size to make sure we do not
5193 * lose them after doing a full/fast fsync and replaying the log. We scan the
5194 * subvolume's root instead of iterating the inode's extent map tree because
5195 * otherwise we can log incorrect extent items based on extent map conversion.
5196 * That can happen due to the fact that extent maps are merged when they
5197 * are not in the extent map tree's list of modified extents.
5198 */
5199 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
5200 struct btrfs_inode *inode,
5201 struct btrfs_path *path,
5202 struct btrfs_log_ctx *ctx)
5203 {
5204 struct btrfs_root *root = inode->root;
5205 struct btrfs_key key;
5206 const u64 i_size = i_size_read(&inode->vfs_inode);
5207 const u64 ino = btrfs_ino(inode);
5208 BTRFS_PATH_AUTO_FREE(dst_path);
5209 bool dropped_extents = false;
5210 u64 truncate_offset = i_size;
5211 struct extent_buffer *leaf;
5212 int slot;
5213 int ins_nr = 0;
5214 int start_slot = 0;
5215 int ret;
5216
5217 if (!(inode->flags & BTRFS_INODE_PREALLOC))
5218 return 0;
5219
5220 key.objectid = ino;
5221 key.type = BTRFS_EXTENT_DATA_KEY;
5222 key.offset = i_size;
5223 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5224 if (ret < 0)
5225 goto out;
5226
5227 /*
5228 * We must check if there is a prealloc extent that starts before the
5229 * i_size and crosses the i_size boundary. This is to ensure later we
5230 * truncate down to the end of that extent and not to the i_size, as
5231 * otherwise we end up losing part of the prealloc extent after a log
5232 * replay and with an implicit hole if there is another prealloc extent
5233 * that starts at an offset beyond i_size.
5234 */
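/*
 * Worked example (made-up sizes): i_size is 100K and a prealloc extent
 * covers [64K, 192K). We must use 192K (the extent's end) as the
 * truncate offset instead of 100K, otherwise replay would drop the
 * [100K, 192K) part of that extent.
 */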
5235 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
5236 if (ret < 0)
5237 goto out;
5238
5239 if (ret == 0) {
5240 struct btrfs_file_extent_item *ei;
5241
5242 leaf = path->nodes[0];
5243 slot = path->slots[0];
5244 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5245
5246 if (btrfs_file_extent_type(leaf, ei) ==
5247 BTRFS_FILE_EXTENT_PREALLOC) {
5248 u64 extent_end;
5249
5250 btrfs_item_key_to_cpu(leaf, &key, slot);
5251 extent_end = key.offset +
5252 btrfs_file_extent_num_bytes(leaf, ei);
5253
5254 if (extent_end > i_size)
5255 truncate_offset = extent_end;
5256 }
5257 } else {
5258 ret = 0;
5259 }
5260
5261 while (true) {
5262 leaf = path->nodes[0];
5263 slot = path->slots[0];
5264
5265 if (slot >= btrfs_header_nritems(leaf)) {
5266 if (ins_nr > 0) {
5267 ret = copy_items(trans, inode, dst_path, path,
5268 start_slot, ins_nr, 1, 0, ctx);
5269 if (ret < 0)
5270 goto out;
5271 ins_nr = 0;
5272 }
5273 ret = btrfs_next_leaf(root, path);
5274 if (ret < 0)
5275 goto out;
5276 if (ret > 0) {
5277 ret = 0;
5278 break;
5279 }
5280 continue;
5281 }
5282
5283 btrfs_item_key_to_cpu(leaf, &key, slot);
5284 if (key.objectid > ino)
5285 break;
5286 if (WARN_ON_ONCE(key.objectid < ino) ||
5287 key.type < BTRFS_EXTENT_DATA_KEY ||
5288 key.offset < i_size) {
5289 path->slots[0]++;
5290 continue;
5291 }
5292 /*
5293 * Avoid overlapping items in the log tree. The first time we
5294 * get here, get rid of everything from a past fsync. After
5295 * that, if the current extent starts before the end of the last
5296 * extent we copied, truncate the last one. This can happen if
5297 * an ordered extent completion modifies the subvolume tree
5298 * while btrfs_next_leaf() has the tree unlocked.
5299 */
5300 if (!dropped_extents || key.offset < truncate_offset) {
5301 ret = truncate_inode_items(trans, root->log_root, inode,
5302 min(key.offset, truncate_offset),
5303 BTRFS_EXTENT_DATA_KEY);
5304 if (ret)
5305 goto out;
5306 dropped_extents = true;
5307 }
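/*
 * Example of the above: a previous fsync may have logged a prealloc
 * extent starting at this same offset; the truncation removes that
 * stale copy before the current item is copied, so the log never ends
 * up with overlapping file extent items.
 */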
5308 truncate_offset = btrfs_file_extent_end(path);
5309 if (ins_nr == 0)
5310 start_slot = slot;
5311 ins_nr++;
5312 path->slots[0]++;
5313 if (!dst_path) {
5314 dst_path = btrfs_alloc_path();
5315 if (!dst_path) {
5316 ret = -ENOMEM;
5317 goto out;
5318 }
5319 }
5320 }
5321 if (ins_nr > 0)
5322 ret = copy_items(trans, inode, dst_path, path,
5323 start_slot, ins_nr, 1, 0, ctx);
5324 out:
5325 btrfs_release_path(path);
5326 return ret;
5327 }
5328
5329 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
5330 struct btrfs_inode *inode,
5331 struct btrfs_path *path,
5332 struct btrfs_log_ctx *ctx)
5333 {
5334 struct btrfs_ordered_extent *ordered;
5335 struct btrfs_ordered_extent *tmp;
5336 struct extent_map *em, *n;
5337 LIST_HEAD(extents);
5338 struct extent_map_tree *tree = &inode->extent_tree;
5339 int ret = 0;
5340 int num = 0;
5341
5342 write_lock(&tree->lock);
5343
5344 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
5345 list_del_init(&em->list);
5346 /*
5347 * This limit is just an arbitrary number. Logging can become very
5348 * CPU intensive once we are dealing with a lot of extents, and
5349 * once we have that many extents we just want to commit the
5350 * transaction since it will be faster.
5351 */
5352 if (++num > 32768) {
5353 list_del_init(&tree->modified_extents);
5354 ret = -EFBIG;
5355 goto process;
5356 }
5357
5358 if (em->generation < trans->transid)
5359 continue;
5360
5361 /* We log prealloc extents beyond eof later. */
5362 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
5363 em->start >= i_size_read(&inode->vfs_inode))
5364 continue;
5365
5366 /* Need a ref to keep it from getting evicted from cache */
5367 refcount_inc(&em->refs);
5368 em->flags |= EXTENT_FLAG_LOGGING;
5369 list_add_tail(&em->list, &extents);
5370 num++;
5371 }
5372
5373 list_sort(NULL, &extents, extent_cmp);
5374 process:
5375 while (!list_empty(&extents)) {
5376 em = list_first_entry(&extents, struct extent_map, list);
5377
5378 list_del_init(&em->list);
5379
5380 /*
5381 * If we had an error we just need to delete everybody from our
5382 * private list.
5383 */
5384 if (ret) {
5385 btrfs_clear_em_logging(inode, em);
5386 btrfs_free_extent_map(em);
5387 continue;
5388 }
5389
5390 write_unlock(&tree->lock);
5391
5392 ret = log_one_extent(trans, inode, em, path, ctx);
5393 write_lock(&tree->lock);
5394 btrfs_clear_em_logging(inode, em);
5395 btrfs_free_extent_map(em);
5396 }
5397 WARN_ON(!list_empty(&extents));
5398 write_unlock(&tree->lock);
5399
5400 if (!ret)
5401 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
5402 if (ret)
5403 return ret;
5404
5405 /*
5406 * We have logged all extents successfully, now make sure the commit of
5407 * the current transaction waits for the ordered extents to complete
5408 * before it commits and wipes out the log trees, otherwise we would
5409 * lose data if an ordered extent completes after the transaction
5410 * commits and a power failure happens after the transaction commit.
5411 */
5412 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
5413 list_del_init(&ordered->log_list);
5414 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
5415
5416 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5417 spin_lock(&inode->ordered_tree_lock);
5418 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
5419 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
5420 atomic_inc(&trans->transaction->pending_ordered);
5421 }
5422 spin_unlock(&inode->ordered_tree_lock);
5423 }
5424 btrfs_put_ordered_extent(ordered);
5425 }
5426
5427 return 0;
5428 }
5429
5430 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5431 struct btrfs_path *path, u64 *size_ret)
5432 {
5433 struct btrfs_key key;
5434 int ret;
5435
5436 key.objectid = btrfs_ino(inode);
5437 key.type = BTRFS_INODE_ITEM_KEY;
5438 key.offset = 0;
5439
5440 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5441 if (ret < 0) {
5442 return ret;
5443 } else if (ret > 0) {
5444 *size_ret = 0;
5445 } else {
5446 struct btrfs_inode_item *item;
5447
5448 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5449 struct btrfs_inode_item);
5450 *size_ret = btrfs_inode_size(path->nodes[0], item);
5451 /*
5452 * If the in-memory inode's i_size is smaller than the inode
5453 * size stored in the btree, return the inode's i_size, so
5454 * that we get a correct inode size after replaying the log
5455 * when before a power failure we had a shrinking truncate
5456 * followed by addition of a new name (rename / new hard link).
5457 * Otherwise return the inode size from the btree, to avoid
5458 * data loss when replaying a log due to previously doing a
5459 * write that expands the inode's size and logging a new name
5460 * immediately after.
5461 */
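/*
 * Example: before a power failure the file had an i_size of 1M, a
 * truncate in this transaction shrank it to 4K and then a rename logged
 * the inode. The log tree may still record 1M, but the correct size
 * after replay is 4K, so the smaller in-memory value wins here.
 */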
5462 if (*size_ret > inode->vfs_inode.i_size)
5463 *size_ret = inode->vfs_inode.i_size;
5464 }
5465
5466 btrfs_release_path(path);
5467 return 0;
5468 }
5469
5470 /*
5471 * At the moment we always log all xattrs. This is to figure out at log replay
5472 * time which xattrs must have their deletion replayed. If an xattr is missing
5473 * from the log tree but exists in the fs/subvol tree, we delete it. This is
5474 * because, if an xattr is deleted and then the inode is fsynced just before a
5475 * power failure, the log gets replayed the next time the fs is mounted and we
5476 * want the xattr to no longer exist afterwards (the same behaviour as other
5477 * filesystems with a journal: ext3/4, xfs, f2fs, etc).
5478 */
5479 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5480 struct btrfs_inode *inode,
5481 struct btrfs_path *path,
5482 struct btrfs_path *dst_path,
5483 struct btrfs_log_ctx *ctx)
5484 {
5485 struct btrfs_root *root = inode->root;
5486 int ret;
5487 struct btrfs_key key;
5488 const u64 ino = btrfs_ino(inode);
5489 int ins_nr = 0;
5490 int start_slot = 0;
5491 bool found_xattrs = false;
5492
5493 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5494 return 0;
5495
5496 key.objectid = ino;
5497 key.type = BTRFS_XATTR_ITEM_KEY;
5498 key.offset = 0;
5499
5500 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5501 if (ret < 0)
5502 return ret;
5503
5504 while (true) {
5505 int slot = path->slots[0];
5506 struct extent_buffer *leaf = path->nodes[0];
5507 int nritems = btrfs_header_nritems(leaf);
5508
5509 if (slot >= nritems) {
5510 if (ins_nr > 0) {
5511 ret = copy_items(trans, inode, dst_path, path,
5512 start_slot, ins_nr, 1, 0, ctx);
5513 if (ret < 0)
5514 return ret;
5515 ins_nr = 0;
5516 }
5517 ret = btrfs_next_leaf(root, path);
5518 if (ret < 0)
5519 return ret;
5520 else if (ret > 0)
5521 break;
5522 continue;
5523 }
5524
5525 btrfs_item_key_to_cpu(leaf, &key, slot);
5526 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5527 break;
5528
5529 if (ins_nr == 0)
5530 start_slot = slot;
5531 ins_nr++;
5532 path->slots[0]++;
5533 found_xattrs = true;
5534 cond_resched();
5535 }
5536 if (ins_nr > 0) {
5537 ret = copy_items(trans, inode, dst_path, path,
5538 start_slot, ins_nr, 1, 0, ctx);
5539 if (ret < 0)
5540 return ret;
5541 }
5542
5543 if (!found_xattrs)
5544 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5545
5546 return 0;
5547 }
5548
5549 /*
5550 * When using the NO_HOLES feature if we punched a hole that causes the
5551 * deletion of entire leafs or all the extent items of the first leaf (the one
5552 * that contains the inode item and references) we may end up not processing
5553 * any extents, because there are no leafs with a generation matching the
5554 * current transaction that have extent items for our inode. So we need to find
5555 * if any holes exist and then log them. We also need to log holes after any
5556 * truncate operation that changes the inode's size.
5557 */
5558 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5559 struct btrfs_inode *inode,
5560 struct btrfs_path *path)
5561 {
5562 struct btrfs_root *root = inode->root;
5563 struct btrfs_fs_info *fs_info = root->fs_info;
5564 struct btrfs_key key;
5565 const u64 ino = btrfs_ino(inode);
5566 const u64 i_size = i_size_read(&inode->vfs_inode);
5567 u64 prev_extent_end = 0;
5568 int ret;
5569
5570 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5571 return 0;
5572
5573 key.objectid = ino;
5574 key.type = BTRFS_EXTENT_DATA_KEY;
5575 key.offset = 0;
5576
5577 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5578 if (ret < 0)
5579 return ret;
5580
5581 while (true) {
5582 struct extent_buffer *leaf = path->nodes[0];
5583
5584 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5585 ret = btrfs_next_leaf(root, path);
5586 if (ret < 0)
5587 return ret;
5588 if (ret > 0) {
5589 ret = 0;
5590 break;
5591 }
5592 leaf = path->nodes[0];
5593 }
5594
5595 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5596 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5597 break;
5598
5599 /* We have a hole, log it. */
5600 if (prev_extent_end < key.offset) {
5601 const u64 hole_len = key.offset - prev_extent_end;
5602
5603 /*
5604 * Release the path to avoid deadlocks with other code
5605 * paths that search the root while holding locks on
5606 * leafs from the log root.
5607 */
5608 btrfs_release_path(path);
5609 ret = btrfs_insert_hole_extent(trans, root->log_root,
5610 ino, prev_extent_end,
5611 hole_len);
5612 if (ret < 0)
5613 return ret;
5614
5615 /*
5616 * Search for the same key again in the root. Since it's
5617 * an extent item and we are holding the inode lock, the
5618 * key must still exist. If it doesn't, just emit a warning
5619 * and return an error to fall back to a transaction
5620 * commit.
5621 */
5622 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5623 if (ret < 0)
5624 return ret;
5625 if (WARN_ON(ret > 0))
5626 return -ENOENT;
5627 leaf = path->nodes[0];
5628 }
5629
5630 prev_extent_end = btrfs_file_extent_end(path);
5631 path->slots[0]++;
5632 cond_resched();
5633 }
5634
5635 if (prev_extent_end < i_size) {
5636 u64 hole_len;
5637
5638 btrfs_release_path(path);
5639 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5640 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5641 prev_extent_end, hole_len);
5642 if (ret < 0)
5643 return ret;
5644 }
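/*
 * Illustrative numbers: with a 4K sectorsize, if the last extent ends
 * at 96K and i_size is 100100 bytes, the code above logs a hole item
 * for [96K, 96K + 4K), since ALIGN(100100 - 96K, 4K) == 4K.
 */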
5645
5646 return 0;
5647 }
5648
5649 /*
5650 * When we are logging a new inode X, check if it doesn't have a reference that
5651 * matches the reference from some other inode Y created in a past transaction
5652 * and that was renamed in the current transaction. If we don't do this, then at
5653 * log replay time we can lose inode Y (and all its files if it's a directory):
5654 *
5655 * mkdir /mnt/x
5656 * echo "hello world" > /mnt/x/foobar
5657 * sync
5658 * mv /mnt/x /mnt/y
5659 * mkdir /mnt/x # or touch /mnt/x
5660 * xfs_io -c fsync /mnt/x
5661 * <power fail>
5662 * mount fs, trigger log replay
5663 *
5664 * After the log replay procedure, we would lose the first directory and all its
5665 * files (file foobar).
5666 * For the case where inode Y is not a directory we simply end up losing it:
5667 *
5668 * echo "123" > /mnt/foo
5669 * sync
5670 * mv /mnt/foo /mnt/bar
5671 * echo "abc" > /mnt/foo
5672 * xfs_io -c fsync /mnt/foo
5673 * <power fail>
5674 *
5675 * We also need this for cases where a snapshot entry is replaced by some other
5676 * entry (file or directory) otherwise we end up with an unreplayable log due to
5677 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5678 * if it were a regular entry:
5679 *
5680 * mkdir /mnt/x
5681 * btrfs subvolume snapshot /mnt /mnt/x/snap
5682 * btrfs subvolume delete /mnt/x/snap
5683 * rmdir /mnt/x
5684 * mkdir /mnt/x
5685 * fsync /mnt/x or fsync some new file inside it
5686 * <power fail>
5687 *
5688 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5689 * the same transaction.
5690 */
5691 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5692 const int slot,
5693 const struct btrfs_key *key,
5694 struct btrfs_inode *inode,
5695 u64 *other_ino, u64 *other_parent)
5696 {
5697 BTRFS_PATH_AUTO_FREE(search_path);
5698 char AUTO_KFREE(name);
5699 u32 name_len = 0;
5700 u32 item_size = btrfs_item_size(eb, slot);
5701 u32 cur_offset = 0;
5702 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5703
5704 search_path = btrfs_alloc_path();
5705 if (!search_path)
5706 return -ENOMEM;
5707 search_path->search_commit_root = true;
5708 search_path->skip_locking = true;
5709
5710 while (cur_offset < item_size) {
5711 u64 parent;
5712 u32 this_name_len;
5713 u32 this_len;
5714 unsigned long name_ptr;
5715 struct btrfs_dir_item *di;
5716 struct fscrypt_str name_str;
5717
5718 if (key->type == BTRFS_INODE_REF_KEY) {
5719 struct btrfs_inode_ref *iref;
5720
5721 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5722 parent = key->offset;
5723 this_name_len = btrfs_inode_ref_name_len(eb, iref);
5724 name_ptr = (unsigned long)(iref + 1);
5725 this_len = sizeof(*iref) + this_name_len;
5726 } else {
5727 struct btrfs_inode_extref *extref;
5728
5729 extref = (struct btrfs_inode_extref *)(ptr +
5730 cur_offset);
5731 parent = btrfs_inode_extref_parent(eb, extref);
5732 this_name_len = btrfs_inode_extref_name_len(eb, extref);
5733 name_ptr = (unsigned long)&extref->name;
5734 this_len = sizeof(*extref) + this_name_len;
5735 }
5736
5737 if (this_name_len > name_len) {
5738 char *new_name;
5739
5740 new_name = krealloc(name, this_name_len, GFP_NOFS);
5741 if (!new_name)
5742 return -ENOMEM;
5743 name_len = this_name_len;
5744 name = new_name;
5745 }
5746
5747 read_extent_buffer(eb, name, name_ptr, this_name_len);
5748
5749 name_str.name = name;
5750 name_str.len = this_name_len;
5751 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5752 parent, &name_str, 0);
5753 if (di && !IS_ERR(di)) {
5754 struct btrfs_key di_key;
5755
5756 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5757 di, &di_key);
5758 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5759 if (di_key.objectid != key->objectid) {
5760 *other_ino = di_key.objectid;
5761 *other_parent = parent;
5762 return 1;
5763 } else {
5764 return 0;
5765 }
5766 } else {
5767 return -EAGAIN;
5768 }
5769 } else if (IS_ERR(di)) {
5770 return PTR_ERR(di);
5771 }
5772 btrfs_release_path(search_path);
5773
5774 cur_offset += this_len;
5775 }
5776
5777 return 0;
5778 }
5779
5780 /*
5781 * Check if we need to log an inode. This is used in contexts where while
5782 * logging an inode we need to log another inode (either that it exists or in
5783 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5784 * requires the inode to be in the log and have the log transaction committed,
5785 * while here we do not care if the log transaction was already committed - our
5786 * caller will commit the log later - and we want to avoid logging an inode
5787 * multiple times when multiple tasks have joined the same log transaction.
5788 */
5789 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5790 struct btrfs_inode *inode)
5791 {
5792 /*
5793 * If a directory was not modified, no dentries added or removed, we can
5794 * and should avoid logging it.
5795 */
5796 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5797 return false;
5798
5799 /*
5800 * If this inode does not have new/updated/deleted xattrs since the last
5801 * time it was logged and is flagged as logged in the current transaction,
5802 * we can skip logging it. As for new/deleted names, those are updated in
5803 * the log by link/unlink/rename operations.
5804 * In case the inode was logged and then evicted and reloaded, its
5805 * logged_trans will be 0, in which case we have to fully log it since
5806 * logged_trans is a transient field, not persisted.
5807 */
5808 if (inode_logged(trans, inode, NULL) == 1 &&
5809 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5810 return false;
5811
5812 return true;
5813 }
5814
5815 struct btrfs_dir_list {
5816 u64 ino;
5817 struct list_head list;
5818 };
5819
5820 /*
5821 * Log the inodes of the new dentries of a directory.
5822 * See process_dir_items_leaf() for details about why it is needed.
5823 * This is a recursive operation - if an existing dentry corresponds to a
5824 * directory, that directory's new entries are logged too (same behaviour as
5825 * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes
5826 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5827 * complains about the following circular lock dependency / possible deadlock:
5828 *
5829 * CPU0 CPU1
5830 * ---- ----
5831 * lock(&type->i_mutex_dir_key#3/2);
5832 * lock(sb_internal#2);
5833 * lock(&type->i_mutex_dir_key#3/2);
5834 * lock(&sb->s_type->i_mutex_key#14);
5835 *
5836 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5837 * sb_start_intwrite() in btrfs_start_transaction().
5838 * Not acquiring the VFS lock of the inodes is still safe because:
5839 *
5840 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5841 * that while logging the inode new references (names) are added or removed
5842 * from the inode, leaving the logged inode item with a link count that does
5843 * not match the number of logged inode reference items. This is fine because
5844 * at log replay time we compute the real number of links and correct the
5845 * link count in the inode item (see replay_one_buffer() and
5846 * link_to_fixup_dir());
5847 *
5848 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5849 * while logging the inode's items new index items (key type
5850 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5851 * has a size that doesn't match the sum of the lengths of all the logged
5852 * names - this is ok, not a problem, because at log replay time we set the
5853 * directory's i_size to the correct value (see replay_one_name() and
5854 * overwrite_item()).
5855 */
5856 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5857 struct btrfs_inode *start_inode,
5858 struct btrfs_log_ctx *ctx)
5859 {
5860 struct btrfs_root *root = start_inode->root;
5861 struct btrfs_path *path;
5862 LIST_HEAD(dir_list);
5863 struct btrfs_dir_list *dir_elem;
5864 u64 ino = btrfs_ino(start_inode);
5865 struct btrfs_inode *curr_inode = start_inode;
5866 int ret = 0;
5867
5868 /*
5869 * If we are logging a new name, as part of a link or rename operation,
5870 * don't bother logging new dentries, as we just want to log the names
5871 * of an inode and that any new parents exist.
5872 */
5873 if (ctx->logging_new_name)
5874 return 0;
5875
5876 path = btrfs_alloc_path();
5877 if (!path)
5878 return -ENOMEM;
5879
5880 /* Pairs with btrfs_add_delayed_iput below. */
5881 ihold(&curr_inode->vfs_inode);
5882
5883 while (true) {
5884 struct btrfs_key key;
5885 struct btrfs_key found_key;
5886 u64 next_index;
5887 bool continue_curr_inode = true;
5888 int iter_ret;
5889
5890 key.objectid = ino;
5891 key.type = BTRFS_DIR_INDEX_KEY;
5892 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5893 next_index = key.offset;
5894 again:
5895 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5896 struct extent_buffer *leaf = path->nodes[0];
5897 struct btrfs_dir_item *di;
5898 struct btrfs_key di_key;
5899 struct btrfs_inode *di_inode;
5900 int log_mode = LOG_INODE_EXISTS;
5901 int type;
5902
5903 if (found_key.objectid != ino ||
5904 found_key.type != BTRFS_DIR_INDEX_KEY) {
5905 continue_curr_inode = false;
5906 break;
5907 }
5908
5909 next_index = found_key.offset + 1;
5910
5911 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5912 type = btrfs_dir_ftype(leaf, di);
5913 if (btrfs_dir_transid(leaf, di) < trans->transid)
5914 continue;
5915 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5916 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5917 continue;
5918
5919 btrfs_release_path(path);
5920 di_inode = btrfs_iget_logging(di_key.objectid, root);
5921 if (IS_ERR(di_inode)) {
5922 ret = PTR_ERR(di_inode);
5923 goto out;
5924 }
5925
5926 if (!need_log_inode(trans, di_inode)) {
5927 btrfs_add_delayed_iput(di_inode);
5928 break;
5929 }
5930
5931 ctx->log_new_dentries = false;
5932 if (type == BTRFS_FT_DIR)
5933 log_mode = LOG_INODE_ALL;
5934 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
5935 btrfs_add_delayed_iput(di_inode);
5936 if (ret)
5937 goto out;
5938 if (ctx->log_new_dentries) {
5939 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5940 if (!dir_elem) {
5941 ret = -ENOMEM;
5942 goto out;
5943 }
5944 dir_elem->ino = di_key.objectid;
5945 list_add_tail(&dir_elem->list, &dir_list);
5946 }
5947 break;
5948 }
5949
5950 btrfs_release_path(path);
5951
5952 if (iter_ret < 0) {
5953 ret = iter_ret;
5954 goto out;
5955 } else if (iter_ret > 0) {
5956 continue_curr_inode = false;
5957 } else {
5958 key = found_key;
5959 }
5960
5961 if (continue_curr_inode && key.offset < (u64)-1) {
5962 key.offset++;
5963 goto again;
5964 }
5965
5966 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5967
5968 if (list_empty(&dir_list))
5969 break;
5970
5971 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5972 ino = dir_elem->ino;
5973 list_del(&dir_elem->list);
5974 kfree(dir_elem);
5975
5976 btrfs_add_delayed_iput(curr_inode);
5977
5978 curr_inode = btrfs_iget_logging(ino, root);
5979 if (IS_ERR(curr_inode)) {
5980 ret = PTR_ERR(curr_inode);
5981 curr_inode = NULL;
5982 break;
5983 }
5984 }
5985 out:
5986 btrfs_free_path(path);
5987 if (curr_inode)
5988 btrfs_add_delayed_iput(curr_inode);
5989
5990 if (ret) {
5991 struct btrfs_dir_list *next;
5992
5993 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5994 kfree(dir_elem);
5995 }
5996
5997 return ret;
5998 }
5999
6000 struct btrfs_ino_list {
6001 u64 ino;
6002 u64 parent;
6003 struct list_head list;
6004 };
6005
6006 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
6007 {
6008 struct btrfs_ino_list *curr;
6009 struct btrfs_ino_list *next;
6010
6011 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
6012 list_del(&curr->list);
6013 kfree(curr);
6014 }
6015 }
6016
6017 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
6018 struct btrfs_path *path)
6019 {
6020 struct btrfs_key key;
6021 int ret;
6022
6023 key.objectid = ino;
6024 key.type = BTRFS_INODE_ITEM_KEY;
6025 key.offset = 0;
6026
6027 path->search_commit_root = true;
6028 path->skip_locking = true;
6029
6030 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6031 if (WARN_ON_ONCE(ret > 0)) {
6032 /*
6033 * We have previously found the inode through the commit root
6034 * so this should not happen. If it does, just error out and
6035 * fallback to a transaction commit.
6036 */
6037 ret = -ENOENT;
6038 } else if (ret == 0) {
6039 struct btrfs_inode_item *item;
6040
6041 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6042 struct btrfs_inode_item);
6043 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
6044 ret = 1;
6045 }
6046
6047 btrfs_release_path(path);
6048 path->search_commit_root = false;
6049 path->skip_locking = false;
6050
6051 return ret;
6052 }
6053
6054 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
6055 struct btrfs_root *root,
6056 struct btrfs_path *path,
6057 u64 ino, u64 parent,
6058 struct btrfs_log_ctx *ctx)
6059 {
6060 struct btrfs_ino_list *ino_elem;
6061 struct btrfs_inode *inode;
6062
6063 /*
6064 * It's rare to have a lot of conflicting inodes, in practice it is not
6065 * common to have more than 1 or 2. We don't want to collect too many,
6066 * as we could end up logging too many inodes (even if only in
6067 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
6068 * commits.
6069 */
6070 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
6071 return BTRFS_LOG_FORCE_COMMIT;
6072
6073 inode = btrfs_iget_logging(ino, root);
6074 /*
6075 * If the other inode that had a conflicting dir entry was deleted in
6076 * the current transaction then we either:
6077 *
6078 * 1) Log the parent directory (later after adding it to the list) if
6079 * the inode is a directory. This is because it may be a deleted
6080 * subvolume/snapshot or it may be a regular directory that had
6081 * deleted subvolumes/snapshots (or subdirectories that had them),
6082 * and at the moment we can't deal with dropping subvolumes/snapshots
6083 * during log replay. So we just log the parent, which will result in
6084 * a fallback to a transaction commit if we are dealing with those
6085 * cases (last_unlink_trans will match the current transaction);
6086 *
6087 * 2) Do nothing if it's not a directory. During log replay we simply
6088 * unlink the conflicting dentry from the parent directory and then
6089 * add the dentry for our inode. Like this we can avoid logging the
6090 * parent directory (and maybe fallback to a transaction commit in
6091 * case it has a last_unlink_trans == trans->transid, due to moving
6092 * some inode from it to some other directory).
6093 */
6094 if (IS_ERR(inode)) {
6095 int ret = PTR_ERR(inode);
6096
6097 if (ret != -ENOENT)
6098 return ret;
6099
6100 ret = conflicting_inode_is_dir(root, ino, path);
6101 /* Not a directory or we got an error. */
6102 if (ret <= 0)
6103 return ret;
6104
6105 /* Conflicting inode is a directory, so we'll log its parent. */
6106 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6107 if (!ino_elem)
6108 return -ENOMEM;
6109 ino_elem->ino = ino;
6110 ino_elem->parent = parent;
6111 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
6112 ctx->num_conflict_inodes++;
6113
6114 return 0;
6115 }
6116
6117 /*
6118 * If the inode was already logged skip it - otherwise we can hit an
6119 * infinite loop. Example:
6120 *
6121 * From the commit root (previous transaction) we have the following
6122 * inodes:
6123 *
6124 * inode 257 a directory
6125 * inode 258 with references "zz" and "zz_link" on inode 257
6126 * inode 259 with reference "a" on inode 257
6127 *
6128 * And in the current (uncommitted) transaction we have:
6129 *
6130 * inode 257 a directory, unchanged
6131 * inode 258 with references "a" and "a2" on inode 257
6132 * inode 259 with reference "zz_link" on inode 257
6133 * inode 261 with reference "zz" on inode 257
6134 *
6135 * When logging inode 261 the following infinite loop could
6136 * happen if we don't skip already logged inodes:
6137 *
6138 * - we detect inode 258 as a conflicting inode, with inode 261
6139 * on reference "zz", and log it;
6140 *
6141 * - we detect inode 259 as a conflicting inode, with inode 258
6142 * on reference "a", and log it;
6143 *
6144 * - we detect inode 258 as a conflicting inode, with inode 259
6145 * on reference "zz_link", and log it - again! After this we
6146 * repeat the above steps forever.
6147 *
6148 * Here we can use need_log_inode() because we only need to log the
6149 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
6150 * so that the log ends up with the new name and without the old name.
6151 */
6152 if (!need_log_inode(trans, inode)) {
6153 btrfs_add_delayed_iput(inode);
6154 return 0;
6155 }
6156
6157 btrfs_add_delayed_iput(inode);
6158
6159 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
6160 if (!ino_elem)
6161 return -ENOMEM;
6162 ino_elem->ino = ino;
6163 ino_elem->parent = parent;
6164 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
6165 ctx->num_conflict_inodes++;
6166
6167 return 0;
6168 }
6169
6170 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
6171 struct btrfs_root *root,
6172 struct btrfs_log_ctx *ctx)
6173 {
6174 int ret = 0;
6175
6176 /*
6177 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
6178 * otherwise we could have unbounded recursion of btrfs_log_inode()
6179 * calls. This check guarantees we can have only 1 level of recursion.
6180 */
6181 if (ctx->logging_conflict_inodes)
6182 return 0;
6183
6184 ctx->logging_conflict_inodes = true;
6185
6186 /*
6187 * New conflicting inodes may be found and added to the list while we
6188 * are logging a conflicting inode, so keep iterating while the list is
6189 * not empty.
6190 */
6191 while (!list_empty(&ctx->conflict_inodes)) {
6192 struct btrfs_ino_list *curr;
6193 struct btrfs_inode *inode;
6194 u64 ino;
6195 u64 parent;
6196
6197 curr = list_first_entry(&ctx->conflict_inodes,
6198 struct btrfs_ino_list, list);
6199 ino = curr->ino;
6200 parent = curr->parent;
6201 list_del(&curr->list);
6202 kfree(curr);
6203
6204 inode = btrfs_iget_logging(ino, root);
6205 /*
6206 * If the other inode that had a conflicting dir entry was
6207 * deleted in the current transaction, we need to log its parent
6208 * directory. See the comment at add_conflicting_inode().
6209 */
6210 if (IS_ERR(inode)) {
6211 ret = PTR_ERR(inode);
6212 if (ret != -ENOENT)
6213 break;
6214
6215 inode = btrfs_iget_logging(parent, root);
6216 if (IS_ERR(inode)) {
6217 ret = PTR_ERR(inode);
6218 break;
6219 }
6220
6221 /*
6222 * Always log the directory, we cannot make this
6223 * conditional on need_log_inode() because the directory
6224 * might have been logged in LOG_INODE_EXISTS mode or
6225 * the dir index of the conflicting inode is not in a
6226 * dir index key range logged for the directory. So we
6227 * must make sure the deletion is recorded.
6228 */
6229 ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
6230 btrfs_add_delayed_iput(inode);
6231 if (ret)
6232 break;
6233 continue;
6234 }
6235
6236 /*
6237 * Here we can use need_log_inode() because we only need to log
6238 * the inode in LOG_INODE_EXISTS mode and rename operations
6239 * update the log, so that the log ends up with the new name and
6240 * without the old name.
6241 *
6242 * We did this check at add_conflicting_inode(), but here we do
6243 * it again because if some other task logged the inode after
6244 * that, we can avoid doing it again.
6245 */
6246 if (!need_log_inode(trans, inode)) {
6247 btrfs_add_delayed_iput(inode);
6248 continue;
6249 }
6250
6251 /*
6252 * We are safe logging the other inode without acquiring its
6253 * lock as long as we log with the LOG_INODE_EXISTS mode. We
6254 * are safe against concurrent renames of the other inode as
6255 * well because during a rename we pin the log and update the
6256 * log with the new name before we unpin it.
6257 */
6258 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
6259 btrfs_add_delayed_iput(inode);
6260 if (ret)
6261 break;
6262 }
6263
6264 ctx->logging_conflict_inodes = false;
6265 if (ret)
6266 free_conflicting_inodes(ctx);
6267
6268 return ret;
6269 }
6270
6271 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
6272 struct btrfs_inode *inode,
6273 struct btrfs_key *min_key,
6274 const struct btrfs_key *max_key,
6275 struct btrfs_path *path,
6276 struct btrfs_path *dst_path,
6277 const u64 logged_isize,
6278 const int inode_only,
6279 struct btrfs_log_ctx *ctx,
6280 bool *need_log_inode_item)
6281 {
6282 const u64 i_size = i_size_read(&inode->vfs_inode);
6283 struct btrfs_root *root = inode->root;
6284 int ins_start_slot = 0;
6285 int ins_nr = 0;
6286 int ret;
6287
6288 while (1) {
6289 ret = btrfs_search_forward(root, min_key, path, trans->transid);
6290 if (ret < 0)
6291 return ret;
6292 if (ret > 0) {
6293 ret = 0;
6294 break;
6295 }
6296 again:
6297 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
6298 if (min_key->objectid != max_key->objectid)
6299 break;
6300 if (min_key->type > max_key->type)
6301 break;
6302
6303 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
6304 *need_log_inode_item = false;
6305 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
6306 min_key->offset >= i_size) {
6307 /*
6308 * Extents at and beyond eof are logged with
6309 * btrfs_log_prealloc_extents().
6310 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
6311 * and no keys greater than that, so bail out.
6312 */
6313 break;
6314 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
6315 min_key->type == BTRFS_INODE_EXTREF_KEY) &&
6316 (inode->generation == trans->transid ||
6317 ctx->logging_conflict_inodes)) {
6318 u64 other_ino = 0;
6319 u64 other_parent = 0;
6320
6321 ret = btrfs_check_ref_name_override(path->nodes[0],
6322 path->slots[0], min_key, inode,
6323 &other_ino, &other_parent);
6324 if (ret < 0) {
6325 return ret;
6326 } else if (ret > 0 &&
6327 other_ino != btrfs_ino(ctx->inode)) {
6328 if (ins_nr > 0) {
6329 ins_nr++;
6330 } else {
6331 ins_nr = 1;
6332 ins_start_slot = path->slots[0];
6333 }
6334 ret = copy_items(trans, inode, dst_path, path,
6335 ins_start_slot, ins_nr,
6336 inode_only, logged_isize, ctx);
6337 if (ret < 0)
6338 return ret;
6339 ins_nr = 0;
6340
6341 btrfs_release_path(path);
6342 ret = add_conflicting_inode(trans, root, path,
6343 other_ino,
6344 other_parent, ctx);
6345 if (ret)
6346 return ret;
6347 goto next_key;
6348 }
6349 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
6350 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
6351 if (ins_nr == 0)
6352 goto next_slot;
6353 ret = copy_items(trans, inode, dst_path, path,
6354 ins_start_slot,
6355 ins_nr, inode_only, logged_isize, ctx);
6356 if (ret < 0)
6357 return ret;
6358 ins_nr = 0;
6359 goto next_slot;
6360 }
6361
6362 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
6363 ins_nr++;
6364 goto next_slot;
6365 } else if (!ins_nr) {
6366 ins_start_slot = path->slots[0];
6367 ins_nr = 1;
6368 goto next_slot;
6369 }
6370
6371 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6372 ins_nr, inode_only, logged_isize, ctx);
6373 if (ret < 0)
6374 return ret;
6375 ins_nr = 1;
6376 ins_start_slot = path->slots[0];
6377 next_slot:
6378 path->slots[0]++;
6379 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
6380 btrfs_item_key_to_cpu(path->nodes[0], min_key,
6381 path->slots[0]);
6382 goto again;
6383 }
6384 if (ins_nr) {
6385 ret = copy_items(trans, inode, dst_path, path,
6386 ins_start_slot, ins_nr, inode_only,
6387 logged_isize, ctx);
6388 if (ret < 0)
6389 return ret;
6390 ins_nr = 0;
6391 }
6392 btrfs_release_path(path);
6393 next_key:
6394 if (min_key->offset < (u64)-1) {
6395 min_key->offset++;
6396 } else if (min_key->type < max_key->type) {
6397 min_key->type++;
6398 min_key->offset = 0;
6399 } else {
6400 break;
6401 }
6402
6403 /*
6404 * We may process many leaves full of items for our inode, so
6405 * avoid monopolizing a cpu for too long by rescheduling while
6406 * not holding locks on any tree.
6407 */
6408 cond_resched();
6409 }
6410 if (ins_nr) {
6411 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
6412 ins_nr, inode_only, logged_isize, ctx);
6413 if (ret)
6414 return ret;
6415 }
6416
6417 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
6418 /*
6419 * Release the path because otherwise we might attempt to double
6420 * lock the same leaf with btrfs_log_prealloc_extents() below.
6421 */
6422 btrfs_release_path(path);
6423 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6424 }
6425
6426 return ret;
6427 }
6428
6429 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6430 struct btrfs_root *log,
6431 struct btrfs_path *path,
6432 const struct btrfs_item_batch *batch,
6433 const struct btrfs_delayed_item *first_item)
6434 {
6435 const struct btrfs_delayed_item *curr = first_item;
6436 int ret;
6437
6438 ret = btrfs_insert_empty_items(trans, log, path, batch);
6439 if (ret)
6440 return ret;
6441
6442 for (int i = 0; i < batch->nr; i++) {
6443 char *data_ptr;
6444
6445 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
6446 write_extent_buffer(path->nodes[0], &curr->data,
6447 (unsigned long)data_ptr, curr->data_len);
6448 curr = list_next_entry(curr, log_list);
6449 path->slots[0]++;
6450 }
6451
6452 btrfs_release_path(path);
6453
6454 return 0;
6455 }
6456
6457 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6458 struct btrfs_inode *inode,
6459 struct btrfs_path *path,
6460 const struct list_head *delayed_ins_list,
6461 struct btrfs_log_ctx *ctx)
6462 {
6463 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
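/*
 * That is 195 * (sizeof(struct btrfs_key) + sizeof(u32)) =
 * 195 * (17 + 4) = 4095 bytes, matching the kmalloc_array() call below.
 */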
6464 const int max_batch_size = 195;
6465 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6466 const u64 ino = btrfs_ino(inode);
6467 struct btrfs_root *log = inode->root->log_root;
6468 struct btrfs_item_batch batch = {
6469 .nr = 0,
6470 .total_data_size = 0,
6471 };
6472 const struct btrfs_delayed_item *first = NULL;
6473 const struct btrfs_delayed_item *curr;
6474 char *ins_data;
6475 struct btrfs_key *ins_keys;
6476 u32 *ins_sizes;
6477 u64 curr_batch_size = 0;
6478 int batch_idx = 0;
6479 int ret;
6480
6481 /* We are adding dir index items to the log tree. */
6482 lockdep_assert_held(&inode->log_mutex);
6483
6484 /*
6485 * We collect delayed items before copying index keys from the subvolume
6486 * to the log tree. However just after we collected them, they may have
6487 * been flushed (all of them or just some of them), and therefore we
6488 * could have copied them from the subvolume tree to the log tree.
6489 * So find the first delayed item that was not yet logged (they are
6490 * sorted by index number).
6491 */
6492 list_for_each_entry(curr, delayed_ins_list, log_list) {
6493 if (curr->index > inode->last_dir_index_offset) {
6494 first = curr;
6495 break;
6496 }
6497 }
6498
6499 /* Empty list or all delayed items were already logged. */
6500 if (!first)
6501 return 0;
6502
6503 ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
6504 if (!ins_data)
6505 return -ENOMEM;
6506 ins_sizes = (u32 *)ins_data;
6507 batch.data_sizes = ins_sizes;
6508 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6509 batch.keys = ins_keys;
6510
6511 curr = first;
6512 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6513 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6514
6515 if (curr_batch_size + curr_size > leaf_data_size ||
6516 batch.nr == max_batch_size) {
6517 ret = insert_delayed_items_batch(trans, log, path,
6518 &batch, first);
6519 if (ret)
6520 goto out;
6521 batch_idx = 0;
6522 batch.nr = 0;
6523 batch.total_data_size = 0;
6524 curr_batch_size = 0;
6525 first = curr;
6526 }
6527
6528 ins_sizes[batch_idx] = curr->data_len;
6529 ins_keys[batch_idx].objectid = ino;
6530 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6531 ins_keys[batch_idx].offset = curr->index;
6532 curr_batch_size += curr_size;
6533 batch.total_data_size += curr->data_len;
6534 batch.nr++;
6535 batch_idx++;
6536 curr = list_next_entry(curr, log_list);
6537 }
6538
6539 ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr);
6540 ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6541
6542 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6543 log_list);
6544 inode->last_dir_index_offset = curr->index;
6545 out:
6546 kfree(ins_data);
6547
6548 return ret;
6549 }
6550
6551 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6552 struct btrfs_inode *inode,
6553 struct btrfs_path *path,
6554 const struct list_head *delayed_del_list,
6555 struct btrfs_log_ctx *ctx)
6556 {
6557 const u64 ino = btrfs_ino(inode);
6558 const struct btrfs_delayed_item *curr;
6559
6560 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6561 log_list);
6562
6563 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6564 u64 first_dir_index = curr->index;
6565 u64 last_dir_index;
6566 const struct btrfs_delayed_item *next;
6567 int ret;
6568
6569 /*
6570 * Find a range of consecutive dir index items to delete. Like
6571 * this we log a single dir range item spanning several contiguous
6572 * dir items instead of logging one range item per dir index item.
6573 */
6574 next = list_next_entry(curr, log_list);
6575 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6576 if (next->index != curr->index + 1)
6577 break;
6578 curr = next;
6579 next = list_next_entry(next, log_list);
6580 }
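/*
 * For instance, delayed deletions for dir indexes 5, 6, 7 and 10
 * result in two dir log range items: one covering [5, 7] and another
 * covering [10, 10].
 */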
6581
6582 last_dir_index = curr->index;
6583 ASSERT(last_dir_index >= first_dir_index,
6584 "last_dir_index=%llu first_dir_index=%llu",
6585 last_dir_index, first_dir_index);
6586
6587 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6588 ino, first_dir_index, last_dir_index);
6589 if (ret)
6590 return ret;
6591 curr = list_next_entry(curr, log_list);
6592 }
6593
6594 return 0;
6595 }
6596
6597 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6598 struct btrfs_inode *inode,
6599 struct btrfs_path *path,
6600 const struct list_head *delayed_del_list,
6601 const struct btrfs_delayed_item *first,
6602 const struct btrfs_delayed_item **last_ret)
6603 {
6604 const struct btrfs_delayed_item *next;
6605 struct extent_buffer *leaf = path->nodes[0];
6606 const int last_slot = btrfs_header_nritems(leaf) - 1;
6607 int slot = path->slots[0] + 1;
6608 const u64 ino = btrfs_ino(inode);
6609
6610 next = list_next_entry(first, log_list);
6611
6612 while (slot < last_slot &&
6613 !list_entry_is_head(next, delayed_del_list, log_list)) {
6614 struct btrfs_key key;
6615
6616 btrfs_item_key_to_cpu(leaf, &key, slot);
6617 if (key.objectid != ino ||
6618 key.type != BTRFS_DIR_INDEX_KEY ||
6619 key.offset != next->index)
6620 break;
6621
6622 slot++;
6623 *last_ret = next;
6624 next = list_next_entry(next, log_list);
6625 }
6626
6627 return btrfs_del_items(trans, inode->root->log_root, path,
6628 path->slots[0], slot - path->slots[0]);
6629 }
6630
6631 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6632 struct btrfs_inode *inode,
6633 struct btrfs_path *path,
6634 const struct list_head *delayed_del_list,
6635 struct btrfs_log_ctx *ctx)
6636 {
6637 struct btrfs_root *log = inode->root->log_root;
6638 const struct btrfs_delayed_item *curr;
6639 u64 last_range_start = 0;
6640 u64 last_range_end = 0;
6641 struct btrfs_key key;
6642
6643 key.objectid = btrfs_ino(inode);
6644 key.type = BTRFS_DIR_INDEX_KEY;
6645 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6646 log_list);
6647
6648 while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6649 const struct btrfs_delayed_item *last = curr;
6650 u64 first_dir_index = curr->index;
6651 u64 last_dir_index;
6652 bool deleted_items = false;
6653 int ret;
6654
6655 key.offset = curr->index;
6656 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6657 if (ret < 0) {
6658 return ret;
6659 } else if (ret == 0) {
6660 ret = batch_delete_dir_index_items(trans, inode, path,
6661 delayed_del_list, curr,
6662 &last);
6663 if (ret)
6664 return ret;
6665 deleted_items = true;
6666 }
6667
6668 btrfs_release_path(path);
6669
6670 /*
6671 * If we deleted items from the leaf, it means we have a range
6672 * item logging their range, so no need to add one or update an
6673 * existing one. Otherwise we have to log a dir range item.
6674 */
6675 if (deleted_items)
6676 goto next_batch;
6677
6678 last_dir_index = last->index;
6679 ASSERT(last_dir_index >= first_dir_index,
6680 "last_dir_index=%llu first_dir_index=%llu",
6681 last_dir_index, first_dir_index);
6682 /*
6683 * If this range starts right after where the previous one ends,
6684 * then we want to reuse the previous range item and change its
6685 * end offset to the end of this range. This is just to minimize
6686 * leaf space usage, by avoiding adding a new range item.
6687 */
6688 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6689 first_dir_index = last_range_start;
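/*
 * E.g. if the previous iteration logged a range item for [10, 15] and
 * this batch covers [16, 20], we log [10, 20] so the two adjacent
 * ranges end up sharing a single range item.
 */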
6690
6691 ret = insert_dir_log_key(trans, log, path, key.objectid,
6692 first_dir_index, last_dir_index);
6693 if (ret)
6694 return ret;
6695
6696 last_range_start = first_dir_index;
6697 last_range_end = last_dir_index;
6698 next_batch:
6699 curr = list_next_entry(last, log_list);
6700 }
6701
6702 return 0;
6703 }
6704
6705 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6706 struct btrfs_inode *inode,
6707 struct btrfs_path *path,
6708 const struct list_head *delayed_del_list,
6709 struct btrfs_log_ctx *ctx)
6710 {
6711 /*
6712 * We are deleting dir index items from the log tree or adding range
6713 * items to it.
6714 */
6715 lockdep_assert_held(&inode->log_mutex);
6716
6717 if (list_empty(delayed_del_list))
6718 return 0;
6719
6720 if (ctx->logged_before)
6721 return log_delayed_deletions_incremental(trans, inode, path,
6722 delayed_del_list, ctx);
6723
6724 return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6725 ctx);
6726 }
6727
6728 /*
6729 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6730 * items instead of the subvolume tree.
6731 */
6732 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6733 struct btrfs_inode *inode,
6734 const struct list_head *delayed_ins_list,
6735 struct btrfs_log_ctx *ctx)
6736 {
6737 const bool orig_log_new_dentries = ctx->log_new_dentries;
6738 struct btrfs_delayed_item *item;
6739 int ret = 0;
6740
6741 /*
6742 * No need for the log mutex, plus to avoid potential deadlocks or
6743 * lockdep annotations due to nesting of delayed inode mutexes and log
6744 * mutexes.
6745 */
6746 lockdep_assert_not_held(&inode->log_mutex);
6747
6748 ASSERT(!ctx->logging_new_delayed_dentries,
6749 "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries);
6750 ctx->logging_new_delayed_dentries = true;
6751
6752 list_for_each_entry(item, delayed_ins_list, log_list) {
6753 struct btrfs_dir_item *dir_item;
6754 struct btrfs_inode *di_inode;
6755 struct btrfs_key key;
6756 int log_mode = LOG_INODE_EXISTS;
6757
6758 dir_item = (struct btrfs_dir_item *)item->data;
6759 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6760
6761 if (key.type == BTRFS_ROOT_ITEM_KEY)
6762 continue;
6763
6764 di_inode = btrfs_iget_logging(key.objectid, inode->root);
6765 if (IS_ERR(di_inode)) {
6766 ret = PTR_ERR(di_inode);
6767 break;
6768 }
6769
6770 if (!need_log_inode(trans, di_inode)) {
6771 btrfs_add_delayed_iput(di_inode);
6772 continue;
6773 }
6774
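/*
 * For directories we log everything (LOG_INODE_ALL) so that any
 * dentries they contain are captured too; for other inode types it's
 * enough to log that the inode exists.
 */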
6775 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6776 log_mode = LOG_INODE_ALL;
6777
6778 ctx->log_new_dentries = false;
6779 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
6780
6781 if (!ret && ctx->log_new_dentries)
6782 ret = log_new_dir_dentries(trans, di_inode, ctx);
6783
6784 btrfs_add_delayed_iput(di_inode);
6785
6786 if (ret)
6787 break;
6788 }
6789
6790 ctx->log_new_dentries = orig_log_new_dentries;
6791 ctx->logging_new_delayed_dentries = false;
6792
6793 return ret;
6794 }
6795
6796 /* log a single inode in the tree log.
6797 * At least one parent directory for this inode must exist in the tree
6798 * or be logged already.
6799 *
6800 * Any items from this inode changed by the current transaction are copied
6801 * to the log tree. An extra reference is taken on any extents in this
6802 * file, allowing us to avoid a whole pile of corner cases around logging
6803 * blocks that have been removed from the tree.
6804 *
6805 * See LOG_INODE_ALL and related defines for a description of what inode_only
6806 * does.
6807 *
6808 * This handles both files and directories.
6809 */
6810 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6811 struct btrfs_inode *inode,
6812 int inode_only,
6813 struct btrfs_log_ctx *ctx)
6814 {
6815 struct btrfs_path *path;
6816 struct btrfs_path *dst_path;
6817 struct btrfs_key min_key;
6818 struct btrfs_key max_key;
6819 struct btrfs_root *log = inode->root->log_root;
6820 int ret;
6821 bool fast_search = false;
6822 u64 ino = btrfs_ino(inode);
6823 struct extent_map_tree *em_tree = &inode->extent_tree;
6824 u64 logged_isize = 0;
6825 bool need_log_inode_item = true;
6826 bool xattrs_logged = false;
6827 bool inode_item_dropped = true;
6828 bool full_dir_logging = false;
6829 LIST_HEAD(delayed_ins_list);
6830 LIST_HEAD(delayed_del_list);
6831
6832 path = btrfs_alloc_path();
6833 if (!path)
6834 return -ENOMEM;
6835 dst_path = btrfs_alloc_path();
6836 if (!dst_path) {
6837 btrfs_free_path(path);
6838 return -ENOMEM;
6839 }
6840
6841 min_key.objectid = ino;
6842 min_key.type = BTRFS_INODE_ITEM_KEY;
6843 min_key.offset = 0;
6844
6845 max_key.objectid = ino;
6846
6847
6848 /* today the code can only do partial logging of directories */
6849 if (S_ISDIR(inode->vfs_inode.i_mode) ||
6850 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6851 &inode->runtime_flags) &&
6852 inode_only >= LOG_INODE_EXISTS))
6853 max_key.type = BTRFS_XATTR_ITEM_KEY;
6854 else
6855 max_key.type = (u8)-1;
6856 max_key.offset = (u64)-1;
6857
6858 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6859 full_dir_logging = true;
6860
6861 /*
6862 * If we are logging a directory while we are logging dentries of the
6863 * delayed items of some other inode, then we need to flush the delayed
6864 * items of this directory and not log the delayed items directly. This
6865 * is to prevent more than one level of recursion into btrfs_log_inode()
6866 * by having something like this:
6867 *
6868 * $ mkdir -p a/b/c/d/e/f/g/h/...
6869 * $ xfs_io -c "fsync" a
6870 *
6871 * Where all directories in the path did not exist before and are
6872 * created in the current transaction.
6873 * So in such a case we directly log the delayed items of the main
6874 * directory ("a") without flushing them first, while for each of its
6875 * subdirectories we flush their delayed items before logging them.
6876 * This prevents a potential unbounded recursion like this:
6877 *
6878 * btrfs_log_inode()
6879 * log_new_delayed_dentries()
6880 * btrfs_log_inode()
6881 * log_new_delayed_dentries()
6882 * btrfs_log_inode()
6883 * log_new_delayed_dentries()
6884 * (...)
6885 *
6886 * We have thresholds for the maximum number of delayed items to have in
6887 * memory, and once they are hit, the items are flushed asynchronously.
6888 * However the limit is quite high, so let's prevent deep levels of
6889 * recursion by limiting the maximum depth to 1.
6890 */
6891 if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6892 ret = btrfs_commit_inode_delayed_items(trans, inode);
6893 if (ret)
6894 goto out;
6895 }
6896
6897 mutex_lock(&inode->log_mutex);
6898
6899 /*
6900 * For symlinks, we must always log their content, which is stored in an
6901 * inline extent, otherwise we could end up with an empty symlink after
6902 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6903 * one attempts to create an empty symlink).
6904 * We don't need to worry about flushing delalloc, because the inline
6905 * extent is created when the symlink itself is created (we never have
6906 * delalloc for symlinks).
6907 */
6908 if (S_ISLNK(inode->vfs_inode.i_mode))
6909 inode_only = LOG_INODE_ALL;
6910
6911 /*
6912 * Before logging the inode item, cache the value returned by
6913 * inode_logged(), because once we log the inode item we can no longer
6914 * reliably tell whether the inode was previously logged in this transaction.
6915 */
6916 ret = inode_logged(trans, inode, path);
6917 if (ret < 0)
6918 goto out_unlock;
6919 ctx->logged_before = (ret == 1);
6920 ret = 0;
6921
6922 /*
6923 * This is for cases where logging a directory could result in losing a
6924 * file after replaying the log. For example, if we move a file from a
6925 * directory A to a directory B, then fsync directory A, we have no way
6926 * to know the file was moved from A to B, so logging just A would
6927 * result in losing the file after a log replay.
6928 */
6929 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6930 ret = BTRFS_LOG_FORCE_COMMIT;
6931 goto out_unlock;
6932 }
6933
6934 /*
6935 * a brute force approach to making sure we get the most up-to-date
6936 * copies of everything.
6937 */
6938 if (S_ISDIR(inode->vfs_inode.i_mode)) {
6939 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6940 if (ctx->logged_before)
6941 ret = drop_inode_items(trans, log, path, inode,
6942 BTRFS_XATTR_ITEM_KEY);
6943 } else {
6944 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6945 /*
6946 * Make sure the new inode item we write to the log has
6947 * the same isize as the current one (if it exists).
6948 * This is necessary to prevent data loss after log
6949 * replay, and also to prevent doing a wrong expanding
6950 * truncate - for e.g. create file, write 4K into offset
6951 * 0, fsync, write 4K into offset 4096, add hard link,
6952 * fsync some other file (to sync log), power fail - if
6953 * we use the inode's current i_size, after log replay
6954 * we get an 8K file, with the last 4K as a hole
6955 * (zeroes), as if an expanding truncate happened,
6956 * instead of getting a file of only 4K.
6957 */
6958 ret = logged_inode_size(log, inode, path, &logged_isize);
6959 if (ret)
6960 goto out_unlock;
6961 }
6962 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6963 &inode->runtime_flags)) {
6964 if (inode_only == LOG_INODE_EXISTS) {
6965 max_key.type = BTRFS_XATTR_ITEM_KEY;
6966 if (ctx->logged_before)
6967 ret = drop_inode_items(trans, log, path,
6968 inode, max_key.type);
6969 } else {
6970 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6971 &inode->runtime_flags);
6972 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6973 &inode->runtime_flags);
6974 if (ctx->logged_before)
6975 ret = truncate_inode_items(trans, log,
6976 inode, 0, 0);
6977 }
6978 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6979 &inode->runtime_flags) ||
6980 inode_only == LOG_INODE_EXISTS) {
6981 if (inode_only == LOG_INODE_ALL)
6982 fast_search = true;
6983 max_key.type = BTRFS_XATTR_ITEM_KEY;
6984 if (ctx->logged_before)
6985 ret = drop_inode_items(trans, log, path, inode,
6986 max_key.type);
6987 } else {
6988 if (inode_only == LOG_INODE_ALL)
6989 fast_search = true;
6990 inode_item_dropped = false;
6991 goto log_extents;
6992 }
6993
6994 }
6995 if (ret)
6996 goto out_unlock;
6997
6998 /*
6999 * If we are logging a directory in full mode, collect the delayed items
7000 * before iterating the subvolume tree, so that we don't miss any new
7001 * dir index items in case they get flushed while or right after we are
7002 * iterating the subvolume tree.
7003 */
7004 if (full_dir_logging && !ctx->logging_new_delayed_dentries)
7005 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
7006 &delayed_del_list);
7007
7008 /*
7009 * If we are fsyncing a file with 0 hard links, then commit the delayed
7010 * inode because the last inode ref (or extref) item may still be in the
7011 * subvolume tree and if we log it the file will still exist after a log
7012 * replay. So commit the delayed inode to delete that last ref and skip
7013 * logging it.
7014 */
7015 if (inode->vfs_inode.i_nlink == 0) {
7016 ret = btrfs_commit_inode_delayed_inode(inode);
7017 if (ret)
7018 goto out_unlock;
7019 }
7020
7021 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
7022 path, dst_path, logged_isize,
7023 inode_only, ctx,
7024 &need_log_inode_item);
7025 if (ret)
7026 goto out_unlock;
7027
7028 btrfs_release_path(path);
7029 btrfs_release_path(dst_path);
7030 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7031 if (ret)
7032 goto out_unlock;
7033 xattrs_logged = true;
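/*
 * When not using the fast path, log file extent holes explicitly so
 * that after log replay the file's holes are correct. This matters
 * mainly when the NO_HOLES feature is enabled, since then holes have
 * no extent items in the subvolume tree that would otherwise be copied.
 */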
7034 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
7035 btrfs_release_path(path);
7036 btrfs_release_path(dst_path);
7037 ret = btrfs_log_holes(trans, inode, path);
7038 if (ret)
7039 goto out_unlock;
7040 }
7041 log_extents:
7042 btrfs_release_path(path);
7043 btrfs_release_path(dst_path);
7044 if (need_log_inode_item) {
7045 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
7046 if (ret)
7047 goto out_unlock;
7048 /*
7049 * If we are doing a fast fsync and the inode was logged before
7050 * in this transaction, we don't need to log the xattrs because
7051 * they were logged before. If xattrs were added, changed or
7052 * deleted since the last time we logged the inode, then we have
7053 * already logged them because the inode had the runtime flag
7054 * BTRFS_INODE_COPY_EVERYTHING set.
7055 */
7056 if (!xattrs_logged && inode->logged_trans < trans->transid) {
7057 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
7058 if (ret)
7059 goto out_unlock;
7060 btrfs_release_path(path);
7061 }
7062 }
7063 if (fast_search) {
7064 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
7065 if (ret)
7066 goto out_unlock;
7067 } else if (inode_only == LOG_INODE_ALL) {
7068 struct extent_map *em, *n;
7069
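/*
 * All extents were copied from the subvolume tree above (slow path),
 * so the list of modified extents tracked for fast fsync is no longer
 * needed for this transaction - just drop it.
 */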
7070 write_lock(&em_tree->lock);
7071 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
7072 list_del_init(&em->list);
7073 write_unlock(&em_tree->lock);
7074 }
7075
7076 if (full_dir_logging) {
7077 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
7078 if (ret)
7079 goto out_unlock;
7080 ret = log_delayed_insertion_items(trans, inode, path,
7081 &delayed_ins_list, ctx);
7082 if (ret)
7083 goto out_unlock;
7084 ret = log_delayed_deletion_items(trans, inode, path,
7085 &delayed_del_list, ctx);
7086 if (ret)
7087 goto out_unlock;
7088 }
7089
7090 spin_lock(&inode->lock);
7091 inode->logged_trans = trans->transid;
7092 /*
7093 * Don't update last_log_commit if we logged that an inode exists.
7094 * We do this for three reasons:
7095 *
7096 * 1) We might have had buffered writes to this inode that were
7097 * flushed and had their ordered extents completed in this
7098 * transaction, but we did not previously log the inode with
7099 * LOG_INODE_ALL. Later the inode was evicted and after that
7100 * it was loaded again and this LOG_INODE_EXISTS log operation
7101 * happened. We must make sure that if an explicit fsync against
7102 * the inode is performed later, it logs the new extents, an
7103 * updated inode item, etc, and syncs the log. The same logic
7104 * applies to direct IO writes instead of buffered writes.
7105 *
7106 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
7107 * is logged with an i_size of 0 or whatever value was logged
7108 * before. If later the i_size of the inode is increased by a
7109 * truncate operation, the log is synced through an fsync of
7110 * some other inode and then finally an explicit fsync against
7111 * this inode is made, we must make sure this fsync logs the
7112 * inode with the new i_size, the hole between old i_size and
7113 * the new i_size, and syncs the log.
7114 *
7115 * 3) If we are logging that an ancestor inode exists as part of
7116 * logging a new name from a link or rename operation, don't update
7117 * its last_log_commit - otherwise if an explicit fsync is made
7118 * against an ancestor, the fsync considers the inode in the log
7119 * and doesn't sync the log, resulting in the ancestor missing after
7120 * a power failure unless the log was synced as part of an fsync
7121 * against any other unrelated inode.
7122 */
7123 if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
7124 inode->last_log_commit = inode->last_sub_trans;
7125 spin_unlock(&inode->lock);
7126
7127 /*
7128 * Reset the last_reflink_trans so that the next fsync does not need to
7129 * go through the slower path when logging extents and their checksums.
7130 */
7131 if (inode_only == LOG_INODE_ALL)
7132 inode->last_reflink_trans = 0;
7133
7134 out_unlock:
7135 mutex_unlock(&inode->log_mutex);
7136 out:
7137 btrfs_free_path(path);
7138 btrfs_free_path(dst_path);
7139
7140 if (ret)
7141 free_conflicting_inodes(ctx);
7142 else
7143 ret = log_conflicting_inodes(trans, inode->root, ctx);
7144
7145 if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
7146 if (!ret)
7147 ret = log_new_delayed_dentries(trans, inode,
7148 &delayed_ins_list, ctx);
7149
7150 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
7151 &delayed_del_list);
7152 }
7153
7154 return ret;
7155 }
7156
7157 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
7158 struct btrfs_inode *inode,
7159 struct btrfs_log_ctx *ctx)
7160 {
7161 int ret;
7162 BTRFS_PATH_AUTO_FREE(path);
7163 struct btrfs_key key;
7164 struct btrfs_root *root = inode->root;
7165 const u64 ino = btrfs_ino(inode);
7166
7167 path = btrfs_alloc_path();
7168 if (!path)
7169 return -ENOMEM;
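/*
 * We only read from the commit root here, which is never modified, so
 * it's safe to skip tree locking.
 */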
7170 path->skip_locking = true;
7171 path->search_commit_root = true;
7172
7173 key.objectid = ino;
7174 key.type = BTRFS_INODE_REF_KEY;
7175 key.offset = 0;
7176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7177 if (ret < 0)
7178 return ret;
7179
7180 while (true) {
7181 struct extent_buffer *leaf = path->nodes[0];
7182 int slot = path->slots[0];
7183 u32 cur_offset = 0;
7184 u32 item_size;
7185 unsigned long ptr;
7186
7187 if (slot >= btrfs_header_nritems(leaf)) {
7188 ret = btrfs_next_leaf(root, path);
7189 if (ret < 0)
7190 return ret;
7191 if (ret > 0)
7192 break;
7193 continue;
7194 }
7195
7196 btrfs_item_key_to_cpu(leaf, &key, slot);
7197 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
7198 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
7199 break;
7200
7201 item_size = btrfs_item_size(leaf, slot);
7202 ptr = btrfs_item_ptr_offset(leaf, slot);
7203 while (cur_offset < item_size) {
7204 u64 dir_id;
7205 struct btrfs_inode *dir_inode;
7206
7207 if (key.type == BTRFS_INODE_EXTREF_KEY) {
7208 struct btrfs_inode_extref *extref;
7209
7210 extref = (struct btrfs_inode_extref *)
7211 (ptr + cur_offset);
7212 dir_id = btrfs_inode_extref_parent(leaf, extref);
7213 cur_offset += sizeof(*extref);
7214 cur_offset += btrfs_inode_extref_name_len(leaf,
7215 extref);
7216 } else {
7217 dir_id = key.offset;
7218 cur_offset = item_size;
7219 }
7220
7221 dir_inode = btrfs_iget_logging(dir_id, root);
7222 /*
7223 * If the parent inode was deleted, return an error to
7224 * fallback to a transaction commit. This is to prevent
7225 * getting an inode that was moved from one parent A to
7226 * a parent B, got its former parent A deleted and then
7227 * it got fsync'ed, from existing at both parents after
7228 * a log replay (and the old parent still existing).
7229 * Example:
7230 *
7231 * mkdir /mnt/A
7232 * mkdir /mnt/B
7233 * touch /mnt/B/bar
7234 * sync
7235 * mv /mnt/B/bar /mnt/A/bar
7236 * mv -T /mnt/A /mnt/B
7237 * fsync /mnt/B/bar
7238 * <power fail>
7239 *
7240 * If we ignore the old parent B which got deleted,
7241 * after a log replay we would have file bar linked
7242 * at both parents and the old parent B would still
7243 * exist.
7244 */
7245 if (IS_ERR(dir_inode))
7246 return PTR_ERR(dir_inode);
7247
7248 if (!need_log_inode(trans, dir_inode)) {
7249 btrfs_add_delayed_iput(dir_inode);
7250 continue;
7251 }
7252
7253 ctx->log_new_dentries = false;
7254 ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
7255 if (!ret && ctx->log_new_dentries)
7256 ret = log_new_dir_dentries(trans, dir_inode, ctx);
7257 btrfs_add_delayed_iput(dir_inode);
7258 if (ret)
7259 return ret;
7260 }
7261 path->slots[0]++;
7262 }
7263 return 0;
7264 }
7265
7266 static int log_new_ancestors(struct btrfs_trans_handle *trans,
7267 struct btrfs_root *root,
7268 struct btrfs_path *path,
7269 struct btrfs_log_ctx *ctx)
7270 {
7271 struct btrfs_key found_key;
7272
7273 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
7274
7275 while (true) {
7276 struct extent_buffer *leaf;
7277 int slot;
7278 struct btrfs_key search_key;
7279 struct btrfs_inode *inode;
7280 u64 ino;
7281 int ret = 0;
7282
7283 btrfs_release_path(path);
7284
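/*
 * For an inode ref item, the key offset is the objectid of the parent
 * directory, which is the next ancestor to look at.
 */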
7285 ino = found_key.offset;
7286
7287 search_key.objectid = found_key.offset;
7288 search_key.type = BTRFS_INODE_ITEM_KEY;
7289 search_key.offset = 0;
7290 inode = btrfs_iget_logging(ino, root);
7291 if (IS_ERR(inode))
7292 return PTR_ERR(inode);
7293
7294 if (inode->generation >= trans->transid &&
7295 need_log_inode(trans, inode))
7296 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
7297 btrfs_add_delayed_iput(inode);
7298 if (ret)
7299 return ret;
7300
7301 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
7302 break;
7303
7304 search_key.type = BTRFS_INODE_REF_KEY;
7305 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
7306 if (ret < 0)
7307 return ret;
7308
7309 leaf = path->nodes[0];
7310 slot = path->slots[0];
7311 if (slot >= btrfs_header_nritems(leaf)) {
7312 ret = btrfs_next_leaf(root, path);
7313 if (ret < 0)
7314 return ret;
7315 else if (ret > 0)
7316 return -ENOENT;
7317 leaf = path->nodes[0];
7318 slot = path->slots[0];
7319 }
7320
7321 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7322 if (found_key.objectid != search_key.objectid ||
7323 found_key.type != BTRFS_INODE_REF_KEY)
7324 return -ENOENT;
7325 }
7326 return 0;
7327 }
7328
7329 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
7330 struct btrfs_inode *inode,
7331 struct dentry *parent,
7332 struct btrfs_log_ctx *ctx)
7333 {
7334 struct btrfs_root *root = inode->root;
7335 struct dentry *old_parent = NULL;
7336 struct super_block *sb = inode->vfs_inode.i_sb;
7337 int ret = 0;
7338
7339 while (true) {
7340 if (!parent || d_really_is_negative(parent) ||
7341 sb != parent->d_sb)
7342 break;
7343
7344 inode = BTRFS_I(d_inode(parent));
7345 if (root != inode->root)
7346 break;
7347
7348 if (inode->generation >= trans->transid &&
7349 need_log_inode(trans, inode)) {
7350 ret = btrfs_log_inode(trans, inode,
7351 LOG_INODE_EXISTS, ctx);
7352 if (ret)
7353 break;
7354 }
7355 if (IS_ROOT(parent))
7356 break;
7357
7358 parent = dget_parent(parent);
7359 dput(old_parent);
7360 old_parent = parent;
7361 }
7362 dput(old_parent);
7363
7364 return ret;
7365 }
7366
7367 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
7368 struct btrfs_inode *inode,
7369 struct dentry *parent,
7370 struct btrfs_log_ctx *ctx)
7371 {
7372 struct btrfs_root *root = inode->root;
7373 const u64 ino = btrfs_ino(inode);
7374 BTRFS_PATH_AUTO_FREE(path);
7375 struct btrfs_key search_key;
7376 int ret;
7377
7378 /*
7379 * For a single hard link case, go through a fast path that does not
7380 * need to iterate the fs/subvolume tree.
7381 */
7382 if (inode->vfs_inode.i_nlink < 2)
7383 return log_new_ancestors_fast(trans, inode, parent, ctx);
7384
7385 path = btrfs_alloc_path();
7386 if (!path)
7387 return -ENOMEM;
7388
7389 search_key.objectid = ino;
7390 search_key.type = BTRFS_INODE_REF_KEY;
7391 search_key.offset = 0;
7392 again:
7393 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
7394 if (ret < 0)
7395 return ret;
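/*
 * An exact match means we found the ref item we already processed in a
 * previous iteration (the initial search key, with offset 0, never
 * matches an existing ref), so move past it.
 */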
7396 if (ret == 0)
7397 path->slots[0]++;
7398
7399 while (true) {
7400 struct extent_buffer *leaf = path->nodes[0];
7401 int slot = path->slots[0];
7402 struct btrfs_key found_key;
7403
7404 if (slot >= btrfs_header_nritems(leaf)) {
7405 ret = btrfs_next_leaf(root, path);
7406 if (ret < 0)
7407 return ret;
7408 if (ret > 0)
7409 break;
7410 continue;
7411 }
7412
7413 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7414 if (found_key.objectid != ino ||
7415 found_key.type > BTRFS_INODE_EXTREF_KEY)
7416 break;
7417
7418 /*
7419 * Don't deal with extended references because they are rare
7420 * cases and too complex to deal with (we would need to keep
7421 * track of which subitem we are processing for each item in
7422 * this loop, etc). So just return some error to fallback to
7423 * a transaction commit.
7424 */
7425 if (found_key.type == BTRFS_INODE_EXTREF_KEY)
7426 return -EMLINK;
7427
7428 /*
7429 * Logging ancestors needs to do more searches on the fs/subvol
7430 * tree, so it releases the path as needed to avoid deadlocks.
7431 * Keep track of the last inode ref key and resume from that key
7432 * after logging all new ancestors for the current hard link.
7433 */
7434 memcpy(&search_key, &found_key, sizeof(search_key));
7435
7436 ret = log_new_ancestors(trans, root, path, ctx);
7437 if (ret)
7438 return ret;
7439 btrfs_release_path(path);
7440 goto again;
7441 }
7442 return 0;
7443 }
7444
7445 /*
7446 * helper function around btrfs_log_inode to make sure newly created
7447 * parent directories also end up in the log. A minimal inode item and
7448 * backref only logging is done for any parent directories that are newer
7449 * than the last committed transaction
7450 */
7451 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7452 struct btrfs_inode *inode,
7453 struct dentry *parent,
7454 int inode_only,
7455 struct btrfs_log_ctx *ctx)
7456 {
7457 struct btrfs_root *root = inode->root;
7458 struct btrfs_fs_info *fs_info = root->fs_info;
7459 int ret = 0;
7460 bool log_dentries;
7461
7462 if (btrfs_test_opt(fs_info, NOTREELOG))
7463 return BTRFS_LOG_FORCE_COMMIT;
7464
7465 if (btrfs_root_refs(&root->root_item) == 0)
7466 return BTRFS_LOG_FORCE_COMMIT;
7467
7468 /*
7469 * If we're logging an inode from a subvolume created in the current
7470 * transaction we must force a commit since the root is not persisted.
7471 */
7472 if (btrfs_root_generation(&root->root_item) == trans->transid)
7473 return BTRFS_LOG_FORCE_COMMIT;
7474
7475 /* Skip inodes that were already logged and that have no new ordered extents. */
7476 if (btrfs_inode_in_log(inode, trans->transid) &&
7477 list_empty(&ctx->ordered_extents))
7478 return BTRFS_NO_LOG_SYNC;
7479
7480 ret = start_log_trans(trans, root, ctx);
7481 if (ret)
7482 return ret;
7483
7484 ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7485 if (ret)
7486 goto end_trans;
7487
7488 /*
7489 * for regular files, if the inode is already on disk, we don't
7490 * have to worry about the parents at all. This is because
7491 * we can use the last_unlink_trans field to record renames
7492 * and other fun in this file.
7493 */
7494 if (S_ISREG(inode->vfs_inode.i_mode) &&
7495 inode->generation < trans->transid &&
7496 inode->last_unlink_trans < trans->transid) {
7497 ret = 0;
7498 goto end_trans;
7499 }
7500
7501 /*
7502 * Track if we need to log dentries because ctx->log_new_dentries can
7503 * be modified in the call chains below.
7504 */
7505 log_dentries = ctx->log_new_dentries;
7506
7507 /*
7508 * On unlink we must make sure all our current and old parent directory
7509 * inodes are fully logged. This is to prevent leaving dangling
7510 * directory index entries in directories that were our parents but are
7511 * not anymore. Not doing this results in the old parent directory being
7512 * impossible to delete after log replay (rmdir will always fail with
7513 * error -ENOTEMPTY).
7514 *
7515 * Example 1:
7516 *
7517 * mkdir testdir
7518 * touch testdir/foo
7519 * ln testdir/foo testdir/bar
7520 * sync
7521 * unlink testdir/bar
7522 * xfs_io -c fsync testdir/foo
7523 * <power failure>
7524 * mount fs, triggers log replay
7525 *
7526 * If we don't log the parent directory (testdir), after log replay the
7527 * directory still has an entry pointing to the file inode using the bar
7528 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7529 * the file inode has a link count of 1.
7530 *
7531 * Example 2:
7532 *
7533 * mkdir testdir
7534 * touch foo
7535 * ln foo testdir/foo2
7536 * ln foo testdir/foo3
7537 * sync
7538 * unlink testdir/foo3
7539 * xfs_io -c fsync foo
7540 * <power failure>
7541 * mount fs, triggers log replay
7542 *
7543 * Similar to the first example, after log replay the parent directory
7544 * testdir still has an entry pointing to the inode file with name foo3
7545 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7546 * and has a link count of 2.
7547 */
7548 if (inode->last_unlink_trans >= trans->transid) {
7549 ret = btrfs_log_all_parents(trans, inode, ctx);
7550 if (ret)
7551 goto end_trans;
7552 }
7553
7554 ret = log_all_new_ancestors(trans, inode, parent, ctx);
7555 if (ret)
7556 goto end_trans;
7557
7558 if (log_dentries)
7559 ret = log_new_dir_dentries(trans, inode, ctx);
7560 end_trans:
7561 if (ret < 0) {
7562 btrfs_set_log_full_commit(trans);
7563 ret = BTRFS_LOG_FORCE_COMMIT;
7564 }
7565
7566 if (ret)
7567 btrfs_remove_log_ctx(root, ctx);
7568 btrfs_end_log_trans(root);
7569
7570 return ret;
7571 }
7572
7573 /*
7574 * it is not safe to log the dentry if the chunk root has added new
7575 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
7576 * If this returns 1, you must commit the transaction to safely get your
7577 * data on disk.
7578 */
7579 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7580 struct dentry *dentry,
7581 struct btrfs_log_ctx *ctx)
7582 {
7583 struct dentry *parent = dget_parent(dentry);
7584 int ret;
7585
7586 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7587 LOG_INODE_ALL, ctx);
7588 dput(parent);
7589
7590 return ret;
7591 }
7592
7593 /*
7594 * should be called during mount to recover and replay any log trees
7595 * from the FS
7596 */
7597 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7598 {
7599 int ret;
7600 struct btrfs_path *path;
7601 struct btrfs_trans_handle *trans;
7602 struct btrfs_key key;
7603 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7604 struct walk_control wc = {
7605 .process_func = process_one_buffer,
7606 .stage = LOG_WALK_PIN_ONLY,
7607 };
7608
7609 path = btrfs_alloc_path();
7610 if (!path)
7611 return -ENOMEM;
7612
7613 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7614
7615 trans = btrfs_start_transaction(fs_info->tree_root, 0);
7616 if (IS_ERR(trans)) {
7617 ret = PTR_ERR(trans);
7618 goto error;
7619 }
7620
7621 wc.trans = trans;
7622 wc.pin = true;
7623 wc.log = log_root_tree;
7624
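/*
 * First pass over the tree of log roots: only pin down its metadata
 * extents (LOG_WALK_PIN_ONLY) so the blocks are not reused before we
 * are done replaying.
 */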
7625 ret = walk_log_tree(&wc);
7626 wc.log = NULL;
7627 if (unlikely(ret)) {
7628 btrfs_abort_transaction(trans, ret);
7629 goto error;
7630 }
7631
7632 again:
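/*
 * Log tree roots are stored in the tree of log roots with keys of the
 * form (BTRFS_TREE_LOG_OBJECTID, BTRFS_ROOT_ITEM_KEY, subvolume id).
 * Start from the highest possible offset and iterate backwards.
 */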
7633 key.objectid = BTRFS_TREE_LOG_OBJECTID;
7634 key.type = BTRFS_ROOT_ITEM_KEY;
7635 key.offset = (u64)-1;
7636
7637 while (1) {
7638 struct btrfs_key found_key;
7639
7640 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7641
7642 if (unlikely(ret < 0)) {
7643 btrfs_abort_transaction(trans, ret);
7644 goto error;
7645 }
7646 if (ret > 0) {
7647 if (path->slots[0] == 0)
7648 break;
7649 path->slots[0]--;
7650 }
7651 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7652 path->slots[0]);
7653 btrfs_release_path(path);
7654 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7655 break;
7656
7657 wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
7658 if (IS_ERR(wc.log)) {
7659 ret = PTR_ERR(wc.log);
7660 wc.log = NULL;
7661 btrfs_abort_transaction(trans, ret);
7662 goto error;
7663 }
7664
7665 wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
7666 if (IS_ERR(wc.root)) {
7667 ret = PTR_ERR(wc.root);
7668 wc.root = NULL;
7669 if (unlikely(ret != -ENOENT)) {
7670 btrfs_abort_transaction(trans, ret);
7671 goto error;
7672 }
7673
7674 /*
7675 * We didn't find the subvol, likely because it was
7676 * deleted. This is ok, simply skip this log and go to
7677 * the next one.
7678 *
7679 * We need to exclude the root because we can't have
7680 * other log replays overwriting this log as we'll read
7681 * it back in a few more times. This will keep our
7682 * block from being modified, and we'll just bail for
7683 * each subsequent pass.
7684 */
7685 ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
7686 if (unlikely(ret)) {
7687 btrfs_abort_transaction(trans, ret);
7688 goto error;
7689 }
7690 goto next;
7691 }
7692
7693 wc.root->log_root = wc.log;
7694 ret = btrfs_record_root_in_trans(trans, wc.root);
7695 if (unlikely(ret)) {
7696 btrfs_abort_transaction(trans, ret);
7697 goto next;
7698 }
7699
7700 ret = walk_log_tree(&wc);
7701 if (unlikely(ret)) {
7702 btrfs_abort_transaction(trans, ret);
7703 goto next;
7704 }
7705
7706 if (wc.stage == LOG_WALK_REPLAY_ALL) {
7707 struct btrfs_root *root = wc.root;
7708
7709 wc.subvol_path = path;
7710 ret = fixup_inode_link_counts(&wc);
7711 wc.subvol_path = NULL;
7712 if (unlikely(ret)) {
7713 btrfs_abort_transaction(trans, ret);
7714 goto next;
7715 }
7716 /*
7717 * We have just replayed everything, and the highest
7718 * objectid of fs roots probably has changed in case
7719 * some inode items got replayed.
7720 *
7721 * root->objectid_mutex is not acquired as log replay
7722 * could only happen during mount.
7723 */
7724 ret = btrfs_init_root_free_objectid(root);
7725 if (unlikely(ret)) {
7726 btrfs_abort_transaction(trans, ret);
7727 goto next;
7728 }
7729 }
7730 next:
7731 if (wc.root) {
7732 wc.root->log_root = NULL;
7733 btrfs_put_root(wc.root);
7734 }
7735 btrfs_put_root(wc.log);
7736 wc.log = NULL;
7737
7738 if (ret)
7739 goto error;
7740 if (found_key.offset == 0)
7741 break;
7742 key.offset = found_key.offset - 1;
7743 }
7744 btrfs_release_path(path);
7745
7746 /* step one is to pin it all, step two is to replay just inodes */
7747 if (wc.pin) {
7748 wc.pin = false;
7749 wc.process_func = replay_one_buffer;
7750 wc.stage = LOG_WALK_REPLAY_INODES;
7751 goto again;
7752 }
7753 /* step three is to replay everything */
7754 if (wc.stage < LOG_WALK_REPLAY_ALL) {
7755 wc.stage++;
7756 goto again;
7757 }
7758
7759 btrfs_free_path(path);
7760
7761 /* step 4: commit the transaction, which also unpins the blocks */
7762 ret = btrfs_commit_transaction(trans);
7763 if (ret)
7764 return ret;
7765
7766 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7767
7768 return 0;
7769 error:
7770 if (wc.trans)
7771 btrfs_end_transaction(wc.trans);
7772 btrfs_put_root(wc.log);
7773 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7774 btrfs_free_path(path);
7775 return ret;
7776 }
7777
7778 /*
7779 * there are some corner cases where we want to force a full
7780 * commit instead of allowing a directory to be logged.
7781 *
7782 * They revolve around files that were unlinked from the directory, and
7783 * this function updates the parent directory so that a full commit is
7784 * properly done if it is fsync'd later after the unlinks are done.
7785 *
7786 * Must be called before the unlink operations (updates to the subvolume tree,
7787 * inodes, etc) are done.
7788 */
7789 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7790 struct btrfs_inode *dir, struct btrfs_inode *inode,
7791 bool for_rename)
7792 {
7793 /*
7794 * when we're logging a file, if it hasn't been renamed
7795 * or unlinked, and its inode is fully committed on disk,
7796 * we don't have to worry about walking up the directory chain
7797 * to log its parents.
7798 *
7799 * So, we use the last_unlink_trans field to put this transid
7800 * into the file. When the file is logged we check it and
7801 * don't log the parents if the file is fully on disk.
7802 */
7803 mutex_lock(&inode->log_mutex);
7804 inode->last_unlink_trans = trans->transid;
7805 mutex_unlock(&inode->log_mutex);
7806
7807 if (!for_rename)
7808 return;
7809
7810 /*
7811 * If this directory was already logged, any new names will be logged
7812 * with btrfs_log_new_name() and old names will be deleted from the log
7813 * tree with btrfs_del_dir_entries_in_log() or with
7814 * btrfs_del_inode_ref_in_log().
7815 */
7816 if (inode_logged(trans, dir, NULL) == 1)
7817 return;
7818
7819 /*
7820 * If the inode we're about to unlink was logged before, the log will be
7821 * properly updated with the new name with btrfs_log_new_name() and the
7822 * old name removed with btrfs_del_dir_entries_in_log() or with
7823 * btrfs_del_inode_ref_in_log().
7824 */
7825 if (inode_logged(trans, inode, NULL) == 1)
7826 return;
7827
7828 /*
7829 * when renaming files across directories, if the directory
7830 * we're unlinking from gets fsync'd later on, there's
7831 * no way to find the destination directory later and fsync it
7832 * properly. So, we have to be conservative and force commits
7833 * so the new name gets discovered.
7834 */
7835 mutex_lock(&dir->log_mutex);
7836 dir->last_unlink_trans = trans->transid;
7837 mutex_unlock(&dir->log_mutex);
7838 }
7839
7840 /*
7841 * Make sure that if someone attempts to fsync the parent directory of a deleted
7842 * snapshot, it ends up triggering a transaction commit. This is to guarantee
7843 * that after replaying the log tree of the parent directory's root we will not
7844 * see the snapshot anymore and at log replay time we will not see any log tree
7845 * corresponding to the deleted snapshot's root, which could lead to replaying
7846 * it after replaying the log tree of the parent directory (which would replay
7847 * the snapshot delete operation).
7848 *
7849 * Must be called before the actual snapshot destroy operation (updates to the
7850 * parent root and tree of tree roots trees, etc) are done.
7851 */
7852 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7853 struct btrfs_inode *dir)
7854 {
7855 mutex_lock(&dir->log_mutex);
7856 dir->last_unlink_trans = trans->transid;
7857 mutex_unlock(&dir->log_mutex);
7858 }
7859
7860 /*
7861 * Call this when creating a subvolume in a directory.
7862 * Because we don't commit a transaction when creating a subvolume, we can't
7863 * allow the directory pointing to the subvolume to be logged with an entry that
7864 * points to an unpersisted root if we are still in the transaction used to
7865 * create the subvolume, so make any attempt to log the directory result in a
7866 * full log sync.
7867 * Also we don't need to worry with renames, since btrfs_rename() marks the log
7868 * for full commit when renaming a subvolume.
7869 *
7870 * Must be called before creating the subvolume entry in its parent directory.
7871 */
7872 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
7873 struct btrfs_inode *dir)
7874 {
7875 mutex_lock(&dir->log_mutex);
7876 dir->last_unlink_trans = trans->transid;
7877 mutex_unlock(&dir->log_mutex);
7878 }
7879
7880 /*
7881 * Update the log after adding a new name for an inode.
7882 *
7883 * @trans: Transaction handle.
7884 * @old_dentry: The dentry associated with the old name and the old
7885 * parent directory.
7886 * @old_dir: The inode of the previous parent directory for the case
7887 * of a rename. For a link operation, it must be NULL.
7888 * @old_dir_index: The index number associated with the old name, meaningful
7889 * only for rename operations (when @old_dir is not NULL).
7890 * Ignored for link operations.
7891 * @parent: The dentry associated with the directory under which the
7892 * new name is located.
7893 *
7894 * Call this after adding a new name for an inode, as a result of a link or
7895 * rename operation, and it will properly update the log to reflect the new name.
7896 */
7897 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7898 struct dentry *old_dentry, struct btrfs_inode *old_dir,
7899 u64 old_dir_index, struct dentry *parent)
7900 {
7901 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7902 struct btrfs_root *root = inode->root;
7903 struct btrfs_log_ctx ctx;
7904 bool log_pinned = false;
7905 int ret;
7906
7907 /* The inode has a new name (ref/extref), so make sure we log it. */
7908 set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
7909
7910 btrfs_init_log_ctx(&ctx, inode);
7911 ctx.logging_new_name = true;
7912
7913 /*
7914 * this will force the logging code to walk the dentry chain
7915 * up for the file
7916 */
7917 if (!S_ISDIR(inode->vfs_inode.i_mode))
7918 inode->last_unlink_trans = trans->transid;
7919
7920 /*
7921 * if this inode hasn't been logged and the directory we're renaming it
7922 * from hasn't been logged, we don't need to log it
7923 */
7924 ret = inode_logged(trans, inode, NULL);
7925 if (ret < 0) {
7926 goto out;
7927 } else if (ret == 0) {
7928 if (!old_dir)
7929 return;
7930 /*
7931 * If the inode was not logged and we are doing a rename (old_dir is not
7932 * NULL), check if old_dir was logged - if it was not we can return and
7933 * do nothing.
7934 */
7935 ret = inode_logged(trans, old_dir, NULL);
7936 if (ret < 0)
7937 goto out;
7938 else if (ret == 0)
7939 return;
7940 }
7941 ret = 0;
7942
7943 /*
7944 * Now that we know we need to update the log, allocate the scratch eb
7945 * for the context before joining a log transaction below, as this can
7946 * take time and therefore we could delay log commits from other tasks.
7947 */
7948 btrfs_init_log_ctx_scratch_eb(&ctx);
7949
7950 /*
7951 * If we are doing a rename (old_dir is not NULL) from a directory that
7952 * was previously logged, make sure that on log replay we get the old
7953 * dir entry deleted. This is needed because we will also log the new
7954 * name of the renamed inode, so we need to make sure that after log
7955 * replay we don't end up with both the new and old dir entries existing.
7956 */
7957 if (old_dir && old_dir->logged_trans == trans->transid) {
7958 struct btrfs_root *log = old_dir->root->log_root;
7959 struct btrfs_path *path;
7960 struct fscrypt_name fname;
7961
7962 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX,
7963 "old_dir_index=%llu", old_dir_index);
7964
7965 ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7966 &old_dentry->d_name, 0, &fname);
7967 if (ret)
7968 goto out;
7969
7970 path = btrfs_alloc_path();
7971 if (!path) {
7972 ret = -ENOMEM;
7973 fscrypt_free_filename(&fname);
7974 goto out;
7975 }
7976
7977 /*
7978 * We have two inodes to update in the log, the old directory and
7979 * the inode that got renamed, so we must pin the log to prevent
7980 * anyone from syncing the log until we have updated both inodes
7981 * in the log.
7982 */
7983 ret = join_running_log_trans(root);
7984 /*
7985 * At least one of the inodes was logged before, so this should
7986 * not fail, but if it does, it's not serious, just bail out and
7987 * mark the log for a full commit.
7988 */
7989 if (WARN_ON_ONCE(ret < 0)) {
7990 btrfs_free_path(path);
7991 fscrypt_free_filename(&fname);
7992 goto out;
7993 }
7994
7995 log_pinned = true;
7996
7997 /*
7998 * Another concurrent task might be logging the old directory,
7999 * as it can be triggered when logging another inode that had or
8000 * still has a dentry in the old directory. We lock the old
8001 * directory's log_mutex to ensure the deletion of the old
8002 * name is persisted, because during directory logging we
8003 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
8004 * the old name's dir index item is in the delayed items, so
8005 * it could be missed by an in progress directory logging.
8006 */
8007 mutex_lock(&old_dir->log_mutex);
8008 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
8009 &fname.disk_name, old_dir_index);
8010 if (ret > 0) {
8011 /*
8012 * The dentry does not exist in the log, so record its
8013 * deletion.
8014 */
8015 btrfs_release_path(path);
8016 ret = insert_dir_log_key(trans, log, path,
8017 btrfs_ino(old_dir),
8018 old_dir_index, old_dir_index);
8019 }
8020 mutex_unlock(&old_dir->log_mutex);
8021
8022 btrfs_free_path(path);
8023 fscrypt_free_filename(&fname);
8024 if (ret < 0)
8025 goto out;
8026 }
8027
8028 /*
8029 * We don't care about the return value. If we fail to log the new name
8030 * then we know the next attempt to sync the log will fallback to a full
8031 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
8032 * we don't need to worry about getting a log committed that has an
8033 * inconsistent state after a rename operation.
8034 */
8035 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
8036 ASSERT(list_empty(&ctx.conflict_inodes));
8037 out:
8038 /*
8039 * If an error happened mark the log for a full commit because it's not
8040 * consistent and up to date or we couldn't find out if one of the
8041 * inodes was logged before in this transaction. Do it before unpinning
8042 * the log, to avoid any races with someone else trying to commit it.
8043 */
8044 if (ret < 0)
8045 btrfs_set_log_full_commit(trans);
8046 if (log_pinned)
8047 btrfs_end_log_trans(root);
8048 free_extent_buffer(ctx.scratch_eb);
8049 }
8050
8051