/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

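/*
 * drop one reference on a transaction.  The final reference removes the
 * transaction from the fs-wide list and frees it.
 */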
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
		root->fs_info->generation++;
		root->fs_info->last_alloc = 0;
		root->fs_info->last_data_alloc = 0;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root.rb_node = NULL;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				     root->fs_info->btree_inode->i_mapping,
				     GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
	struct btrfs_dirty_root *dirty;
	u64 running_trans_id = root->fs_info->running_transaction->transid;
	if (root->ref_cows && root->last_trans < running_trans_id) {
		WARN_ON(root == root->fs_info->extent_root);
		if (root->root_item.refs != 0) {
			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);

			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
			BUG_ON(!dirty);
			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
			BUG_ON(!dirty->root);
			dirty->latest_root = root;
			INIT_LIST_HEAD(&dirty->list);

			root->commit_root = btrfs_root_node(root);

			memcpy(dirty->root, root, sizeof(*root));
			spin_lock_init(&dirty->root->node_lock);
			spin_lock_init(&dirty->root->list_lock);
			mutex_init(&dirty->root->objectid_mutex);
			mutex_init(&dirty->root->log_mutex);
			INIT_LIST_HEAD(&dirty->root->dead_list);
			dirty->root->node = root->commit_root;
			dirty->root->commit_root = NULL;

			spin_lock(&root->list_lock);
			list_add(&dirty->root->dead_list, &root->dead_list);
			spin_unlock(&root->list_lock);

			root->dirty_root = dirty;
		} else {
			WARN_ON(1);
		}
		root->last_trans = running_trans_id;
	}
	return 0;
}

/* wait for the commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}

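/*
 * common helper for the transaction start variants below.  The wait level
 * controls how we block on a commit in progress: 0 never waits, 1 waits
 * unless an ioctl-opened transaction is running, and 2 always waits.
 * Waiting is skipped entirely while the log tree is being replayed.
 */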
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
					     int num_blocks, int wait)
{
	struct btrfs_trans_handle *h =
		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->log_root_recovering &&
	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
		wait_current_trans(root);
	ret = join_transaction(root);
	BUG_ON(ret);

	btrfs_record_root_in_trans(root);
	h->transid = root->fs_info->running_transaction->transid;
	h->transaction = root->fs_info->running_transaction;
	h->blocks_reserved = num_blocks;
	h->blocks_used = 0;
	h->block_group = 0;
	h->alloc_exclude_nr = 0;
	h->alloc_exclude_start = 0;
	h->delayed_ref_updates = 0;

	root->fs_info->running_transaction->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);
	return h;
}

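/*
 * Each of the start/join helpers below must be paired with
 * btrfs_end_transaction().  As an illustrative sketch of the usual calling
 * pattern (see btrfs_defrag_root() later in this file for an in-tree user):
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	... modify the trees ...
 *	nr = trans->blocks_used;
 *	btrfs_end_transaction(trans, root);
 *	btrfs_btree_balance_dirty(root, nr);
 */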
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}

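/*
 * throttle callers that are generating new work: wait until any blocked
 * commit has moved past the point where it blocks new transactions (unless
 * an ioctl transaction is open), then rate limit against snapshot drops.
 */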
void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
	throttle_on_drops(root);
}

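/*
 * common end-of-transaction path: run a few batches of delayed ref updates
 * if a backlog has built up, drop this handle's writer count, wake anyone
 * waiting on the writers and free the handle.
 */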
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		throttle_on_drops(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
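	/*
	 * second pass over the dirty ranges: clear the dirty bits in the
	 * extent_io tree, write out any pages that were redirtied and wait
	 * for writeback to finish on all of them.
	 */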
	while (1) {
		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

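/*
 * write and wait on the dirty btree blocks belonging to this transaction.
 * With no transaction handle, fall back to flushing the whole btree inode
 * mapping.
 */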
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	btrfs_write_dirty_block_groups(trans, root);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start)
			break;
		btrfs_set_root_bytenr(&root->root_item,
				       root->node->start);
		btrfs_set_root_level(&root->root_item,
				     btrfs_header_level(root->node));
		btrfs_set_root_generation(&root->root_item, trans->transid);

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);
		btrfs_write_dirty_block_groups(trans, root);

		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
		BUG_ON(ret);
	}
	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);

		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
		BUG_ON(ret);
	}
	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
	struct btrfs_dirty_root *dirty;

	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
	if (!dirty)
		return -ENOMEM;
	dirty->root = root;
	dirty->latest_root = latest;

	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&dirty->list, &latest->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
				    struct radix_tree_root *radix,
				    struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_root *gang[8];
	struct btrfs_root *root;
	int i;
	int ret;
	int err = 0;
	u32 refs;

	while (1) {
		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(radix,
				     (unsigned long)root->root_key.objectid,
				     BTRFS_ROOT_TRANS_TAG);

			BUG_ON(!root->ref_tree);
			dirty = root->dirty_root;

			btrfs_free_log(trans, root);
			btrfs_free_reloc_root(trans, root);

			if (root->commit_root == root->node) {
				WARN_ON(root->node->start !=
					btrfs_root_bytenr(&root->root_item));

				free_extent_buffer(root->commit_root);
				root->commit_root = NULL;
				root->dirty_root = NULL;

				spin_lock(&root->list_lock);
				list_del_init(&dirty->root->dead_list);
				spin_unlock(&root->list_lock);

				kfree(dirty->root);
				kfree(dirty);

				/* make sure to update the root on disk
				 * so we get any updates to the block used
				 * counts
				 */
				err = btrfs_update_root(trans,
						root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
				continue;
			}

			memset(&root->root_item.drop_progress, 0,
			       sizeof(struct btrfs_disk_key));
			root->root_item.drop_level = 0;
			root->commit_root = NULL;
			root->dirty_root = NULL;
			root->root_key.offset = root->fs_info->generation;
			btrfs_set_root_bytenr(&root->root_item,
					      root->node->start);
			btrfs_set_root_level(&root->root_item,
					     btrfs_header_level(root->node));
			btrfs_set_root_generation(&root->root_item,
						  root->root_key.offset);

			err = btrfs_insert_root(trans, root->fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;

			refs = btrfs_root_refs(&dirty->root->root_item);
			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
			err = btrfs_update_root(trans, root->fs_info->tree_root,
						&dirty->root->root_key,
						&dirty->root->root_item);

			BUG_ON(err);
			if (refs == 1) {
				list_add(&dirty->list, list);
			} else {
				WARN_ON(1);
				free_extent_buffer(dirty->root->node);
				kfree(dirty->root);
				kfree(dirty);
			}
		}
	}
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}

/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);
		schedule();
		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
				     struct list_head *list)
{
	struct btrfs_dirty_root *dirty;
	struct btrfs_trans_handle *trans;
	unsigned long nr;
	u64 num_bytes;
	u64 bytes_used;
	u64 max_useless;
	int ret = 0;
	int err;

	while (!list_empty(list)) {
		struct btrfs_root *root;

		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
		list_del_init(&dirty->list);

		num_bytes = btrfs_root_used(&dirty->root->root_item);
		root = dirty->latest_root;
		atomic_inc(&root->fs_info->throttles);

		while (1) {
			/*
			 * we don't want to jump in and create a bunch of
			 * delayed refs if the transaction is starting to close
			 */
			wait_transaction_pre_flush(tree_root->fs_info);
			trans = btrfs_start_transaction(tree_root, 1);

			/*
			 * we've joined a transaction, make sure it isn't
			 * closing right now
			 */
			if (trans->transaction->delayed_refs.flushing) {
				btrfs_end_transaction(trans, tree_root);
				continue;
			}

			mutex_lock(&root->fs_info->drop_mutex);
			ret = btrfs_drop_snapshot(trans, dirty->root);
			if (ret != -EAGAIN)
				break;
			mutex_unlock(&root->fs_info->drop_mutex);

			err = btrfs_update_root(trans,
					tree_root,
					&dirty->root->root_key,
					&dirty->root->root_item);
			if (err)
				ret = err;
			nr = trans->blocks_used;
			ret = btrfs_end_transaction(trans, tree_root);
			BUG_ON(ret);

			btrfs_btree_balance_dirty(tree_root, nr);
			cond_resched();
		}
		BUG_ON(ret);
		atomic_dec(&root->fs_info->throttles);
		wake_up(&root->fs_info->transaction_throttle);

		num_bytes -= btrfs_root_used(&dirty->root->root_item);
		bytes_used = btrfs_root_used(&root->root_item);
		if (num_bytes) {
			mutex_lock(&root->fs_info->trans_mutex);
			btrfs_record_root_in_trans(root);
			mutex_unlock(&root->fs_info->trans_mutex);
			btrfs_set_root_used(&root->root_item,
					    bytes_used - num_bytes);
		}

		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
		if (ret) {
			BUG();
			break;
		}
		mutex_unlock(&root->fs_info->drop_mutex);

		spin_lock(&root->list_lock);
		list_del_init(&dirty->root->dead_list);
		if (!list_empty(&root->dead_list)) {
			struct btrfs_root *oldest;
			oldest = list_entry(root->dead_list.prev,
					    struct btrfs_root, dead_list);
			max_useless = oldest->root_key.offset - 1;
		} else {
			max_useless = root->root_key.offset - 1;
		}
		spin_unlock(&root->list_lock);

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
		BUG_ON(ret);

		free_extent_buffer(dirty->root->node);
		kfree(dirty->root);
		kfree(dirty);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	btrfs_record_root_in_trans(root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_bytenr(new_root_item, tmp->start);
	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
	btrfs_set_root_generation(new_root_item, trans->transid);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	kfree(new_root_item);
	return ret;
}

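/*
 * link a newly created snapshot into the namespace: insert the directory
 * entry in the parent and add the root backref and forward ref items.
 * This runs in its own joined transaction.
 */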
static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct inode *inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_join_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
			    pending->name, namelen,
			    parent_inode->i_ino,
			    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	/* add the backref first */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 BTRFS_ROOT_BACKREF_KEY,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

	/* now add the forward ref */
	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 parent_root->root_key.objectid,
				 BTRFS_ROOT_REF_KEY,
				 pending->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
	d_instantiate(pending->dentry, inode);
fail:
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

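/*
 * do the deferred directory and root-ref work for every pending snapshot,
 * then free the pending entries.
 */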
static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	while (!list_empty(head)) {
		pending = list_entry(head->next,
				     struct btrfs_pending_snapshot, list);
		ret = finish_pending_snapshot(fs_info, pending);
		BUG_ON(ret);
		list_del(&pending->list);
		kfree(pending->name);
		kfree(pending);
	}
	return 0;
}

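/*
 * commit the running transaction: flush the delayed refs, wait for (or join
 * with) other writers, create any pending snapshots, commit the cow-only
 * tree roots, write out the dirty btree blocks and the super block, and
 * finally wake up everyone waiting on the commit.
 */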
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct list_head dirty_fs_roots;
	struct extent_io_tree *pinned_copy;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	INIT_LIST_HEAD(&dirty_fs_roots);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
	if (!pinned_copy)
		return -ENOMEM;

	extent_io_tree_init(pinned_copy,
			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

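	/*
	 * give other writers a chance to finish (or to join a young
	 * transaction): loop while more than one writer holds the
	 * transaction open, or while new joiners keep arriving.
	 */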
	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else if (should_grow)
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (snap_pending) {
			ret = btrfs_wait_ordered_extents(root, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		smp_mb();
		if (cur_trans->num_writers > 1 || should_grow)
			schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);
	/*
	 * keep tree reloc code from adding new reloc trees
	 */
	mutex_lock(&root->fs_info->tree_reloc_mutex);


	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
			      &dirty_fs_roots);
	BUG_ON(ret);

	/* add_dirty_roots gets rid of all the tree log roots; it is now
	 * safe to free the root of the tree of log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = btrfs_commit_tree_roots(trans, root);
	BUG_ON(ret);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);
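	/*
	 * record the new tree root, chunk root and log root information in
	 * the in-memory copy of the super block; it is written to disk a
	 * little further down.
	 */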
	btrfs_set_super_generation(&root->fs_info->super_copy,
				   cur_trans->transid);
	btrfs_set_super_root(&root->fs_info->super_copy,
			     root->fs_info->tree_root->node->start);
	btrfs_set_super_root_level(&root->fs_info->super_copy,
			   btrfs_header_level(root->fs_info->tree_root->node));

	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
				   chunk_root->node->start);
	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
					 btrfs_header_level(chunk_root->node));
	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
				btrfs_header_generation(chunk_root->node));

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	btrfs_copy_pinned(root, pinned_copy);

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_throttle);
	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root, pinned_copy);
	kfree(pinned_copy);

	btrfs_drop_dead_reloc_roots(root);
	mutex_unlock(&root->fs_info->tree_reloc_mutex);

	/* do the directory inserts of any pending snapshot creations */
	finish_pending_snapshots(trans, root->fs_info);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;
	wake_up(&cur_trans->commit_wait);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
	if (root->fs_info->closing)
		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);

	mutex_unlock(&root->fs_info->trans_mutex);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (root->fs_info->closing)
		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	struct list_head dirty_roots;
	INIT_LIST_HEAD(&dirty_roots);
again:
	mutex_lock(&root->fs_info->trans_mutex);
	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
	mutex_unlock(&root->fs_info->trans_mutex);

	if (!list_empty(&dirty_roots)) {
		drop_dirty_roots(root, &dirty_roots);
		goto again;
	}
	return 0;
}