xref: /linux/fs/jbd2/commit.c (revision 2fe05e1139a555ae91f00a812cb9520e7d3022ab)
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

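/*
 * Rough usage sketch (illustrative only, not part of this file): a client
 * filesystem brackets its metadata updates in a handle, and the journal
 * thread later calls jbd2_journal_commit_transaction() below to push the
 * whole batch to the log:
 *
 *	handle_t *handle = jbd2_journal_start(journal, nblocks);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	...modify bh...
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 *	err = jbd2_journal_stop(handle);
 */
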
/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
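	/*
	 * If this temporary buffer shadows a live metadata buffer, clear
	 * the shadow bit and wake anyone sleeping on it (for instance a
	 * writer blocked in do_get_write_access()) now that the journal
	 * copy has been written out.
	 */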
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

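	/*
	 * Zero the checksum fields first so that the checksum computed
	 * below covers the whole commit block with its own checksum field
	 * cleared; the verifier recomputes it the same way on replay.
	 */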
	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec64 now = current_kernel_time64();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

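	/*
	 * With barriers enabled and synchronous commits, issue the commit
	 * record with a preflush (all journal blocks written so far reach
	 * stable media first) and FUA (the record itself is durable before
	 * the IO completes). Async-commit journals rely on the block
	 * checksums at recovery time instead and skip the ordering here.
	 */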
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(REQ_OP_WRITE,
			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
	else
		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(),
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't use
 * writepages() because with delayed allocation we may be doing block
 * allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
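		/*
		 * The 2x padding on nr_to_write presumably leaves headroom
		 * for pages dirtied while the writeback walk is running;
		 * this rationale is an assumption, not documented here.
		 */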
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait_keep_errors(
				jinode->i_vfs_inode->i_mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

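	/* The buffer's page may live in highmem, so map it before reading. */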
	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
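	/*
	 * The high half of the block number is only stored on journals
	 * with the 64bit feature. Splitting the shift in two is a
	 * defensive idiom so the shift count never reaches 32 in a single
	 * step, which would be undefined on a 32-bit operand.
	 */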
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

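	/*
	 * The per-block checksum covers the transaction sequence number
	 * followed by the block payload, so a stale block left over from
	 * an earlier pass through the circular log fails verification on
	 * replay.
	 */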
	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
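 *
 * Roughly, as the jbd_debug phase markers below trace it: lock down the
 * running transaction and drain its handles; submit the data buffers and
 * revoke records; write the metadata buffers with their descriptor
 * blocks; wait for that IO; write and wait on the commit record; then
 * refile buffers onto the checkpoint lists and wake commit waiters.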
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

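	/*
	 * Drain outstanding handles: once the transaction is T_LOCKED no
	 * new handle can join it, so when t_updates reaches zero the
	 * transaction's contents are stable and safe to write out.
	 */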
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flags to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
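	 * The journal keeps two revoke tables; the one filled by this
	 * transaction stays frozen for writeout below while revokes from
	 * the next transaction accumulate in the other.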
	 */
	jbd2_journal_switch_revoke_table(journal);

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
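		/*
		 * Bit 0 of the jbd2_journal_write_metadata_buffer() return
		 * value means the block began with the journal magic and
		 * was escaped (first four bytes zeroed) in the log copy;
		 * the flag tells recovery to restore those bytes on replay.
		 */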
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

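		/*
		 * The space check below reserves room for one more tag,
		 * the 16 bytes of UUID that can follow a tag, and the
		 * optional descriptor checksum tail.
		 */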
		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

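		/*
		 * The log is circular: if the new tail wrapped past the
		 * end of the journal, add back the full log size to get
		 * the amount of space actually freed.
		 */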
		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that, since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Exponentially weighted moving average: the old average gets
	 * three times the weight of the new sample, so we don't react
	 * too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}