/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
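
/*
 * For reference, a rough sketch of the commit block that the function
 * above fills in (see struct commit_header in include/linux/jbd2.h):
 *
 *	h_magic			JBD2_MAGIC_NUMBER
 *	h_blocktype		JBD2_COMMIT_BLOCK
 *	h_sequence		tid of the committing transaction
 *	h_chksum_type/size/[0]	transaction checksum, if COMPAT_CHECKSUM
 *	h_commit_sec/nsec	commit timestamp
 *
 * Recovery only considers a transaction committed once a commit block
 * with a matching sequence number is found in the log.
 */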

/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc: we avoid the
 * filesystem's writepages() because, with delayed allocation, it may be
 * doing block allocation, and we only want to write blocks that are
 * already allocated.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};
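	/*
	 * Note: nr_to_write is set to twice the number of pages in the
	 * mapping, presumably as slack so that writeback is not cut
	 * short if pages are redirtied or added while we run.
	 */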

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inodes can be
 * added to our inode list. We use the JI_COMMIT_RUNNING flag to protect
 * the inode we are currently operating on from being released while we
 * write out its pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
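
/*
 * The other half of the JI_COMMIT_RUNNING handshake above lives in
 * jbd2_journal_release_jbd_inode(), which (roughly) waits in
 * wait_on_bit(&jinode->i_flags, __JI_COMMIT_RUNNING, ...) until the
 * commit code clears the bit and issues the wake_up_bit().
 */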

/*
 * Wait for the data submitted for writeout and refile inodes to the
 * proper transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that the user process can get -EIO from
			 * fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}
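
/*
 * The running transaction checksum is seeded with ~0 in
 * jbd2_journal_commit_transaction() and folded over each journal block
 * in submission order; recovery recomputes the checksum the same way
 * and compares it against h_chksum[0] in the commit block.
 */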

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
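
/*
 * Note on write_tag_block(): the (block >> 31) >> 1 expression is just
 * block >> 32, written as two shifts so that it stays well-defined
 * even if the block argument is ever only 32 bits wide.
 */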

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
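/*
 * Broad outline of the commit (loosely matching the jbd_debug() phase
 * markers below): lock the running transaction and wait for its updates
 * to drain; release unused reserved buffers and clean the checkpoint
 * lists; submit the ordered-mode data and the revoke records; copy the
 * metadata into the log behind descriptor blocks; wait for all of that
 * IO; write and wait on the commit record; and finally move the
 * committed buffers onto the checkpoint lists and wake up anyone
 * waiting for this tid.
 */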
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
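
	/*
	 * The loop above is the standard prepare_to_wait()/recheck/
	 * schedule() idiom: both locks are dropped before sleeping so
	 * that running handles can complete and wake us through
	 * j_wait_updates.
	 */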

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction, which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}
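
		/*
		 * A rough sketch of the descriptor block built above and
		 * filled in below:
		 *
		 *	journal_header_t  (magic, DESCRIPTOR_BLOCK, tid)
		 *	tag 0             followed by the 16-byte j_uuid
		 *	tag 1, tag 2, ... flagged JBD2_FLAG_SAME_UUID
		 *
		 * Tags are appended at tagp until space_left runs out or
		 * the transaction has no more buffers; the final tag is
		 * marked JBD2_FLAG_LAST_TAG.
		 */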

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;
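
		/*
		 * JBD2_FLAG_ESCAPE (flags & 1 from the copy-out above)
		 * means jbd2_journal_write_metadata_buffer() found the
		 * block starting with JBD2_MAGIC_NUMBER and zeroed that
		 * word in the journal copy; recovery restores it on
		 * replay, so a data block can never masquerade as a
		 * journal control block.
		 */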

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);
	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
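
	/*
	 * With INCOMPAT_ASYNC_COMMIT the commit record may be submitted
	 * above without waiting for the descriptor and metadata writes
	 * to finish: recovery does not trust the commit block alone but
	 * verifies the transaction checksum, so strict ordering against
	 * the rest of the journal IO is unnecessary.
	 */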

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint lists they
           were previously on. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being journaled
		 * by a previous transaction may end up still being dirty
		 * here. We want to avoid writing back that buffer in the
		 * future after the "add to orphan" operation has been
		 * committed: that's not only a performance gain, it also
		 * stops aliasing problems if the buffer is left behind for
		 * writeback and gets reallocated for another use in a
		 * different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer is on the BJ_Forget list and not
			 * jbddirty, which means it has been freed by this
			 * transaction and hence could not have been
			 * reallocated until this transaction has committed.
			 * *BUT* it could be reallocated once we have written
			 * all the data to disk and before we process the
			 * buffer on the BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the historical average higher than the new commit time
	 * so that we don't react too strongly to swings in commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
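	/*
	 * i.e. an exponentially weighted moving average with a 1/4
	 * weight on the new sample: avg' = (commit_time + 3 * avg) / 4.
	 */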
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}
1054