xref: /linux/fs/jbd2/commit.c (revision 43f5b3085fdd27c4edf535d938b2cb0ccead4f75)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 
/*
 * I/O completion callback installed on buffers this file submits itself
 * (the temporary BJ_IO buffer_heads, descriptor blocks and the commit
 * record).
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded
 *
 * The outcome is recorded in the buffer's uptodate bit and the buffer is
 * then unlocked, which is what lets wait_on_buffer() callers proceed and
 * inspect buffer_uptodate() to learn whether the write worked.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
38 
39 /*
40  * When an ext3-ordered file is truncated, it is possible that many pages are
41  * not sucessfully freed, because they are attached to a committing transaction.
42  * After the transaction commits, these pages are left on the LRU, with no
43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
45  * the numbers in /proc/meminfo look odd.
46  *
47  * So here, we have a buffer which has just come off the forget list.  Look to
48  * see if we can strip all buffers from the backing page.
49  *
50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
51  * caller provided us with a ref against the buffer, and we drop that here.
52  */
53 static void release_buffer_page(struct buffer_head *bh)
54 {
55 	struct page *page;
56 
57 	if (buffer_dirty(bh))
58 		goto nope;
59 	if (atomic_read(&bh->b_count) != 1)
60 		goto nope;
61 	page = bh->b_page;
62 	if (!page)
63 		goto nope;
64 	if (page->mapping)
65 		goto nope;
66 
67 	/* OK, it's a truncated page */
68 	if (TestSetPageLocked(page))
69 		goto nope;
70 
71 	page_cache_get(page);
72 	__brelse(bh);
73 	try_to_free_buffers(page);
74 	unlock_page(page);
75 	page_cache_release(page);
76 	return;
77 
78 nope:
79 	__brelse(bh);
80 }
81 
82 /*
83  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
85  * return 0.  j_list_lock is dropped in this case.
86  */
87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88 {
89 	if (!jbd_trylock_bh_state(bh)) {
90 		spin_unlock(&journal->j_list_lock);
91 		schedule();
92 		return 0;
93 	}
94 	return 1;
95 }
96 
97 /*
98  * Done it all: now submit the commit record.  We should have
99  * cleaned up our previous buffers by now, so if we are in abort
100  * mode we can now just skip the rest of the journal write
101  * entirely.
102  *
103  * Returns 1 if the journal needs to be aborted or 0 on success
104  */
105 static int journal_submit_commit_record(journal_t *journal,
106 					transaction_t *commit_transaction,
107 					struct buffer_head **cbh,
108 					__u32 crc32_sum)
109 {
110 	struct journal_head *descriptor;
111 	struct commit_header *tmp;
112 	struct buffer_head *bh;
113 	int ret;
114 	int barrier_done = 0;
115 
116 	if (is_journal_aborted(journal))
117 		return 0;
118 
119 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
120 	if (!descriptor)
121 		return 1;
122 
123 	bh = jh2bh(descriptor);
124 
125 	tmp = (struct commit_header *)bh->b_data;
126 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
129 
130 	if (JBD2_HAS_COMPAT_FEATURE(journal,
131 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
132 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
133 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
134 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
135 	}
136 
137 	JBUFFER_TRACE(descriptor, "submit commit block");
138 	lock_buffer(bh);
139 	get_bh(bh);
140 	set_buffer_dirty(bh);
141 	set_buffer_uptodate(bh);
142 	bh->b_end_io = journal_end_buffer_io_sync;
143 
144 	if (journal->j_flags & JBD2_BARRIER &&
145 		!JBD2_HAS_INCOMPAT_FEATURE(journal,
146 					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
147 		set_buffer_ordered(bh);
148 		barrier_done = 1;
149 	}
150 	ret = submit_bh(WRITE, bh);
151 	if (barrier_done)
152 		clear_buffer_ordered(bh);
153 
154 	/* is it possible for another commit to fail at roughly
155 	 * the same time as this one?  If so, we don't want to
156 	 * trust the barrier flag in the super, but instead want
157 	 * to remember if we sent a barrier request
158 	 */
159 	if (ret == -EOPNOTSUPP && barrier_done) {
160 		char b[BDEVNAME_SIZE];
161 
162 		printk(KERN_WARNING
163 			"JBD: barrier-based sync failed on %s - "
164 			"disabling barriers\n",
165 			bdevname(journal->j_dev, b));
166 		spin_lock(&journal->j_state_lock);
167 		journal->j_flags &= ~JBD2_BARRIER;
168 		spin_unlock(&journal->j_state_lock);
169 
170 		/* And try again, without the barrier */
171 		set_buffer_uptodate(bh);
172 		set_buffer_dirty(bh);
173 		ret = submit_bh(WRITE, bh);
174 	}
175 	*cbh = bh;
176 	return ret;
177 }
178 
179 /*
180  * This function along with journal_submit_commit_record
181  * allows to write the commit record asynchronously.
182  */
183 static int journal_wait_on_commit_record(struct buffer_head *bh)
184 {
185 	int ret = 0;
186 
187 	clear_buffer_dirty(bh);
188 	wait_on_buffer(bh);
189 
190 	if (unlikely(!buffer_uptodate(bh)))
191 		ret = -EIO;
192 	put_bh(bh);            /* One for getblk() */
193 	jbd2_journal_put_journal_head(bh2jh(bh));
194 
195 	return ret;
196 }
197 
198 /*
199  * Wait for all submitted IO to complete.
200  */
201 static int journal_wait_on_locked_list(journal_t *journal,
202 				       transaction_t *commit_transaction)
203 {
204 	int ret = 0;
205 	struct journal_head *jh;
206 
207 	while (commit_transaction->t_locked_list) {
208 		struct buffer_head *bh;
209 
210 		jh = commit_transaction->t_locked_list->b_tprev;
211 		bh = jh2bh(jh);
212 		get_bh(bh);
213 		if (buffer_locked(bh)) {
214 			spin_unlock(&journal->j_list_lock);
215 			wait_on_buffer(bh);
216 			if (unlikely(!buffer_uptodate(bh)))
217 				ret = -EIO;
218 			spin_lock(&journal->j_list_lock);
219 		}
220 		if (!inverted_lock(journal, bh)) {
221 			put_bh(bh);
222 			spin_lock(&journal->j_list_lock);
223 			continue;
224 		}
225 		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
226 			__jbd2_journal_unfile_buffer(jh);
227 			jbd_unlock_bh_state(bh);
228 			jbd2_journal_remove_journal_head(bh);
229 			put_bh(bh);
230 		} else {
231 			jbd_unlock_bh_state(bh);
232 		}
233 		put_bh(bh);
234 		cond_resched_lock(&journal->j_list_lock);
235 	}
236 	return ret;
237   }
238 
239 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
240 {
241 	int i;
242 
243 	for (i = 0; i < bufs; i++) {
244 		wbuf[i]->b_end_io = end_buffer_write_sync;
245 		/* We use-up our safety reference in submit_bh() */
246 		submit_bh(WRITE, wbuf[i]);
247 	}
248 }
249 
250 /*
251  *  Submit all the data buffers to disk
252  */
253 static void journal_submit_data_buffers(journal_t *journal,
254 				transaction_t *commit_transaction)
255 {
256 	struct journal_head *jh;
257 	struct buffer_head *bh;
258 	int locked;
259 	int bufs = 0;
260 	struct buffer_head **wbuf = journal->j_wbuf;
261 
262 	/*
263 	 * Whenever we unlock the journal and sleep, things can get added
264 	 * onto ->t_sync_datalist, so we have to keep looping back to
265 	 * write_out_data until we *know* that the list is empty.
266 	 *
267 	 * Cleanup any flushed data buffers from the data list.  Even in
268 	 * abort mode, we want to flush this out as soon as possible.
269 	 */
270 write_out_data:
271 	cond_resched();
272 	spin_lock(&journal->j_list_lock);
273 
274 	while (commit_transaction->t_sync_datalist) {
275 		jh = commit_transaction->t_sync_datalist;
276 		bh = jh2bh(jh);
277 		locked = 0;
278 
279 		/* Get reference just to make sure buffer does not disappear
280 		 * when we are forced to drop various locks */
281 		get_bh(bh);
282 		/* If the buffer is dirty, we need to submit IO and hence
283 		 * we need the buffer lock. We try to lock the buffer without
284 		 * blocking. If we fail, we need to drop j_list_lock and do
285 		 * blocking lock_buffer().
286 		 */
287 		if (buffer_dirty(bh)) {
288 			if (test_set_buffer_locked(bh)) {
289 				BUFFER_TRACE(bh, "needs blocking lock");
290 				spin_unlock(&journal->j_list_lock);
291 				/* Write out all data to prevent deadlocks */
292 				journal_do_submit_data(wbuf, bufs);
293 				bufs = 0;
294 				lock_buffer(bh);
295 				spin_lock(&journal->j_list_lock);
296 			}
297 			locked = 1;
298 		}
299 		/* We have to get bh_state lock. Again out of order, sigh. */
300 		if (!inverted_lock(journal, bh)) {
301 			jbd_lock_bh_state(bh);
302 			spin_lock(&journal->j_list_lock);
303 		}
304 		/* Someone already cleaned up the buffer? */
305 		if (!buffer_jbd(bh)
306 			|| jh->b_transaction != commit_transaction
307 			|| jh->b_jlist != BJ_SyncData) {
308 			jbd_unlock_bh_state(bh);
309 			if (locked)
310 				unlock_buffer(bh);
311 			BUFFER_TRACE(bh, "already cleaned up");
312 			put_bh(bh);
313 			continue;
314 		}
315 		if (locked && test_clear_buffer_dirty(bh)) {
316 			BUFFER_TRACE(bh, "needs writeout, adding to array");
317 			wbuf[bufs++] = bh;
318 			__jbd2_journal_file_buffer(jh, commit_transaction,
319 						BJ_Locked);
320 			jbd_unlock_bh_state(bh);
321 			if (bufs == journal->j_wbufsize) {
322 				spin_unlock(&journal->j_list_lock);
323 				journal_do_submit_data(wbuf, bufs);
324 				bufs = 0;
325 				goto write_out_data;
326 			}
327 		} else if (!locked && buffer_locked(bh)) {
328 			__jbd2_journal_file_buffer(jh, commit_transaction,
329 						BJ_Locked);
330 			jbd_unlock_bh_state(bh);
331 			put_bh(bh);
332 		} else {
333 			BUFFER_TRACE(bh, "writeout complete: unfile");
334 			__jbd2_journal_unfile_buffer(jh);
335 			jbd_unlock_bh_state(bh);
336 			if (locked)
337 				unlock_buffer(bh);
338 			jbd2_journal_remove_journal_head(bh);
339 			/* Once for our safety reference, once for
340 			 * jbd2_journal_remove_journal_head() */
341 			put_bh(bh);
342 			put_bh(bh);
343 		}
344 
345 		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
346 			spin_unlock(&journal->j_list_lock);
347 			goto write_out_data;
348 		}
349 	}
350 	spin_unlock(&journal->j_list_lock);
351 	journal_do_submit_data(wbuf, bufs);
352 }
353 
354 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
355 {
356 	struct page *page = bh->b_page;
357 	char *addr;
358 	__u32 checksum;
359 
360 	addr = kmap_atomic(page, KM_USER0);
361 	checksum = crc32_be(crc32_sum,
362 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
363 	kunmap_atomic(addr, KM_USER0);
364 
365 	return checksum;
366 }
367 
368 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
369 				   unsigned long long block)
370 {
371 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
372 	if (tag_bytes > JBD2_TAG_SIZE32)
373 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
374 }
375 
376 /*
377  * jbd2_journal_commit_transaction
378  *
379  * The primary function for committing a transaction to the log.  This
380  * function is called by the journal thread to begin a complete commit.
381  */
382 void jbd2_journal_commit_transaction(journal_t *journal)
383 {
384 	struct transaction_stats_s stats;
385 	transaction_t *commit_transaction;
386 	struct journal_head *jh, *new_jh, *descriptor;
387 	struct buffer_head **wbuf = journal->j_wbuf;
388 	int bufs;
389 	int flags;
390 	int err;
391 	unsigned long long blocknr;
392 	char *tagp = NULL;
393 	journal_header_t *header;
394 	journal_block_tag_t *tag = NULL;
395 	int space_left = 0;
396 	int first_tag = 0;
397 	int tag_flag;
398 	int i;
399 	int tag_bytes = journal_tag_bytes(journal);
400 	struct buffer_head *cbh = NULL; /* For transactional checksums */
401 	__u32 crc32_sum = ~0;
402 
403 	/*
404 	 * First job: lock down the current transaction and wait for
405 	 * all outstanding updates to complete.
406 	 */
407 
408 #ifdef COMMIT_STATS
409 	spin_lock(&journal->j_list_lock);
410 	summarise_journal_usage(journal);
411 	spin_unlock(&journal->j_list_lock);
412 #endif
413 
414 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
415 	if (journal->j_flags & JBD2_FLUSHED) {
416 		jbd_debug(3, "super block updated\n");
417 		jbd2_journal_update_superblock(journal, 1);
418 	} else {
419 		jbd_debug(3, "superblock not updated\n");
420 	}
421 
422 	J_ASSERT(journal->j_running_transaction != NULL);
423 	J_ASSERT(journal->j_committing_transaction == NULL);
424 
425 	commit_transaction = journal->j_running_transaction;
426 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
427 
428 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
429 			commit_transaction->t_tid);
430 
431 	spin_lock(&journal->j_state_lock);
432 	commit_transaction->t_state = T_LOCKED;
433 
434 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
435 	stats.u.run.rs_locked = jiffies;
436 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
437 						stats.u.run.rs_locked);
438 
439 	spin_lock(&commit_transaction->t_handle_lock);
440 	while (commit_transaction->t_updates) {
441 		DEFINE_WAIT(wait);
442 
443 		prepare_to_wait(&journal->j_wait_updates, &wait,
444 					TASK_UNINTERRUPTIBLE);
445 		if (commit_transaction->t_updates) {
446 			spin_unlock(&commit_transaction->t_handle_lock);
447 			spin_unlock(&journal->j_state_lock);
448 			schedule();
449 			spin_lock(&journal->j_state_lock);
450 			spin_lock(&commit_transaction->t_handle_lock);
451 		}
452 		finish_wait(&journal->j_wait_updates, &wait);
453 	}
454 	spin_unlock(&commit_transaction->t_handle_lock);
455 
456 	J_ASSERT (commit_transaction->t_outstanding_credits <=
457 			journal->j_max_transaction_buffers);
458 
459 	/*
460 	 * First thing we are allowed to do is to discard any remaining
461 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
462 	 * that there are no such buffers: if a large filesystem
463 	 * operation like a truncate needs to split itself over multiple
464 	 * transactions, then it may try to do a jbd2_journal_restart() while
465 	 * there are still BJ_Reserved buffers outstanding.  These must
466 	 * be released cleanly from the current transaction.
467 	 *
468 	 * In this case, the filesystem must still reserve write access
469 	 * again before modifying the buffer in the new transaction, but
470 	 * we do not require it to remember exactly which old buffers it
471 	 * has reserved.  This is consistent with the existing behaviour
472 	 * that multiple jbd2_journal_get_write_access() calls to the same
473 	 * buffer are perfectly permissable.
474 	 */
475 	while (commit_transaction->t_reserved_list) {
476 		jh = commit_transaction->t_reserved_list;
477 		JBUFFER_TRACE(jh, "reserved, unused: refile");
478 		/*
479 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
480 		 * leave undo-committed data.
481 		 */
482 		if (jh->b_committed_data) {
483 			struct buffer_head *bh = jh2bh(jh);
484 
485 			jbd_lock_bh_state(bh);
486 			jbd2_free(jh->b_committed_data, bh->b_size);
487 			jh->b_committed_data = NULL;
488 			jbd_unlock_bh_state(bh);
489 		}
490 		jbd2_journal_refile_buffer(journal, jh);
491 	}
492 
493 	/*
494 	 * Now try to drop any written-back buffers from the journal's
495 	 * checkpoint lists.  We do this *before* commit because it potentially
496 	 * frees some memory
497 	 */
498 	spin_lock(&journal->j_list_lock);
499 	__jbd2_journal_clean_checkpoint_list(journal);
500 	spin_unlock(&journal->j_list_lock);
501 
502 	jbd_debug (3, "JBD: commit phase 1\n");
503 
504 	/*
505 	 * Switch to a new revoke table.
506 	 */
507 	jbd2_journal_switch_revoke_table(journal);
508 
509 	stats.u.run.rs_flushing = jiffies;
510 	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
511 					       stats.u.run.rs_flushing);
512 
513 	commit_transaction->t_state = T_FLUSH;
514 	journal->j_committing_transaction = commit_transaction;
515 	journal->j_running_transaction = NULL;
516 	commit_transaction->t_log_start = journal->j_head;
517 	wake_up(&journal->j_wait_transaction_locked);
518 	spin_unlock(&journal->j_state_lock);
519 
520 	jbd_debug (3, "JBD: commit phase 2\n");
521 
522 	/*
523 	 * Now start flushing things to disk, in the order they appear
524 	 * on the transaction lists.  Data blocks go first.
525 	 */
526 	err = 0;
527 	journal_submit_data_buffers(journal, commit_transaction);
528 
529 	/*
530 	 * Wait for all previously submitted IO to complete if commit
531 	 * record is to be written synchronously.
532 	 */
533 	spin_lock(&journal->j_list_lock);
534 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
535 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
536 		err = journal_wait_on_locked_list(journal,
537 						commit_transaction);
538 
539 	spin_unlock(&journal->j_list_lock);
540 
541 	if (err)
542 		jbd2_journal_abort(journal, err);
543 
544 	jbd2_journal_write_revoke_records(journal, commit_transaction);
545 
546 	jbd_debug(3, "JBD: commit phase 2\n");
547 
548 	/*
549 	 * If we found any dirty or locked buffers, then we should have
550 	 * looped back up to the write_out_data label.  If there weren't
551 	 * any then journal_clean_data_list should have wiped the list
552 	 * clean by now, so check that it is in fact empty.
553 	 */
554 	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
555 
556 	jbd_debug (3, "JBD: commit phase 3\n");
557 
558 	/*
559 	 * Way to go: we have now written out all of the data for a
560 	 * transaction!  Now comes the tricky part: we need to write out
561 	 * metadata.  Loop over the transaction's entire buffer list:
562 	 */
563 	commit_transaction->t_state = T_COMMIT;
564 
565 	stats.u.run.rs_logging = jiffies;
566 	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
567 						 stats.u.run.rs_logging);
568 	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
569 	stats.u.run.rs_blocks_logged = 0;
570 
571 	J_ASSERT(commit_transaction->t_nr_buffers <=
572 		 commit_transaction->t_outstanding_credits);
573 
574 	descriptor = NULL;
575 	bufs = 0;
576 	while (commit_transaction->t_buffers) {
577 
578 		/* Find the next buffer to be journaled... */
579 
580 		jh = commit_transaction->t_buffers;
581 
582 		/* If we're in abort mode, we just un-journal the buffer and
583 		   release it for background writing. */
584 
585 		if (is_journal_aborted(journal)) {
586 			JBUFFER_TRACE(jh, "journal is aborting: refile");
587 			jbd2_journal_refile_buffer(journal, jh);
588 			/* If that was the last one, we need to clean up
589 			 * any descriptor buffers which may have been
590 			 * already allocated, even if we are now
591 			 * aborting. */
592 			if (!commit_transaction->t_buffers)
593 				goto start_journal_io;
594 			continue;
595 		}
596 
597 		/* Make sure we have a descriptor block in which to
598 		   record the metadata buffer. */
599 
600 		if (!descriptor) {
601 			struct buffer_head *bh;
602 
603 			J_ASSERT (bufs == 0);
604 
605 			jbd_debug(4, "JBD: get descriptor\n");
606 
607 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
608 			if (!descriptor) {
609 				jbd2_journal_abort(journal, -EIO);
610 				continue;
611 			}
612 
613 			bh = jh2bh(descriptor);
614 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
615 				(unsigned long long)bh->b_blocknr, bh->b_data);
616 			header = (journal_header_t *)&bh->b_data[0];
617 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
618 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
619 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
620 
621 			tagp = &bh->b_data[sizeof(journal_header_t)];
622 			space_left = bh->b_size - sizeof(journal_header_t);
623 			first_tag = 1;
624 			set_buffer_jwrite(bh);
625 			set_buffer_dirty(bh);
626 			wbuf[bufs++] = bh;
627 
628 			/* Record it so that we can wait for IO
629                            completion later */
630 			BUFFER_TRACE(bh, "ph3: file as descriptor");
631 			jbd2_journal_file_buffer(descriptor, commit_transaction,
632 					BJ_LogCtl);
633 		}
634 
635 		/* Where is the buffer to be written? */
636 
637 		err = jbd2_journal_next_log_block(journal, &blocknr);
638 		/* If the block mapping failed, just abandon the buffer
639 		   and repeat this loop: we'll fall into the
640 		   refile-on-abort condition above. */
641 		if (err) {
642 			jbd2_journal_abort(journal, err);
643 			continue;
644 		}
645 
646 		/*
647 		 * start_this_handle() uses t_outstanding_credits to determine
648 		 * the free space in the log, but this counter is changed
649 		 * by jbd2_journal_next_log_block() also.
650 		 */
651 		commit_transaction->t_outstanding_credits--;
652 
653 		/* Bump b_count to prevent truncate from stumbling over
654                    the shadowed buffer!  @@@ This can go if we ever get
655                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
656 		atomic_inc(&jh2bh(jh)->b_count);
657 
658 		/* Make a temporary IO buffer with which to write it out
659                    (this will requeue both the metadata buffer and the
660                    temporary IO buffer). new_bh goes on BJ_IO*/
661 
662 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
663 		/*
664 		 * akpm: jbd2_journal_write_metadata_buffer() sets
665 		 * new_bh->b_transaction to commit_transaction.
666 		 * We need to clean this up before we release new_bh
667 		 * (which is of type BJ_IO)
668 		 */
669 		JBUFFER_TRACE(jh, "ph3: write metadata");
670 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
671 						      jh, &new_jh, blocknr);
672 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
673 		wbuf[bufs++] = jh2bh(new_jh);
674 
675 		/* Record the new block's tag in the current descriptor
676                    buffer */
677 
678 		tag_flag = 0;
679 		if (flags & 1)
680 			tag_flag |= JBD2_FLAG_ESCAPE;
681 		if (!first_tag)
682 			tag_flag |= JBD2_FLAG_SAME_UUID;
683 
684 		tag = (journal_block_tag_t *) tagp;
685 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
686 		tag->t_flags = cpu_to_be32(tag_flag);
687 		tagp += tag_bytes;
688 		space_left -= tag_bytes;
689 
690 		if (first_tag) {
691 			memcpy (tagp, journal->j_uuid, 16);
692 			tagp += 16;
693 			space_left -= 16;
694 			first_tag = 0;
695 		}
696 
697 		/* If there's no more to do, or if the descriptor is full,
698 		   let the IO rip! */
699 
700 		if (bufs == journal->j_wbufsize ||
701 		    commit_transaction->t_buffers == NULL ||
702 		    space_left < tag_bytes + 16) {
703 
704 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
705 
706 			/* Write an end-of-descriptor marker before
707                            submitting the IOs.  "tag" still points to
708                            the last tag we set up. */
709 
710 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
711 
712 start_journal_io:
713 			for (i = 0; i < bufs; i++) {
714 				struct buffer_head *bh = wbuf[i];
715 				/*
716 				 * Compute checksum.
717 				 */
718 				if (JBD2_HAS_COMPAT_FEATURE(journal,
719 					JBD2_FEATURE_COMPAT_CHECKSUM)) {
720 					crc32_sum =
721 					    jbd2_checksum_data(crc32_sum, bh);
722 				}
723 
724 				lock_buffer(bh);
725 				clear_buffer_dirty(bh);
726 				set_buffer_uptodate(bh);
727 				bh->b_end_io = journal_end_buffer_io_sync;
728 				submit_bh(WRITE, bh);
729 			}
730 			cond_resched();
731 			stats.u.run.rs_blocks_logged += bufs;
732 
733 			/* Force a new descriptor to be generated next
734                            time round the loop. */
735 			descriptor = NULL;
736 			bufs = 0;
737 		}
738 	}
739 
740 	/* Done it all: now write the commit record asynchronously. */
741 
742 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
743 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
744 		err = journal_submit_commit_record(journal, commit_transaction,
745 						 &cbh, crc32_sum);
746 		if (err)
747 			__jbd2_journal_abort_hard(journal);
748 
749 		spin_lock(&journal->j_list_lock);
750 		err = journal_wait_on_locked_list(journal,
751 						commit_transaction);
752 		spin_unlock(&journal->j_list_lock);
753 		if (err)
754 			__jbd2_journal_abort_hard(journal);
755 	}
756 
757 	/* Lo and behold: we have just managed to send a transaction to
758            the log.  Before we can commit it, wait for the IO so far to
759            complete.  Control buffers being written are on the
760            transaction's t_log_list queue, and metadata buffers are on
761            the t_iobuf_list queue.
762 
763 	   Wait for the buffers in reverse order.  That way we are
764 	   less likely to be woken up until all IOs have completed, and
765 	   so we incur less scheduling load.
766 	*/
767 
768 	jbd_debug(3, "JBD: commit phase 4\n");
769 
770 	/*
771 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
772 	 * See __journal_try_to_free_buffer.
773 	 */
774 wait_for_iobuf:
775 	while (commit_transaction->t_iobuf_list != NULL) {
776 		struct buffer_head *bh;
777 
778 		jh = commit_transaction->t_iobuf_list->b_tprev;
779 		bh = jh2bh(jh);
780 		if (buffer_locked(bh)) {
781 			wait_on_buffer(bh);
782 			goto wait_for_iobuf;
783 		}
784 		if (cond_resched())
785 			goto wait_for_iobuf;
786 
787 		if (unlikely(!buffer_uptodate(bh)))
788 			err = -EIO;
789 
790 		clear_buffer_jwrite(bh);
791 
792 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
793 		jbd2_journal_unfile_buffer(journal, jh);
794 
795 		/*
796 		 * ->t_iobuf_list should contain only dummy buffer_heads
797 		 * which were created by jbd2_journal_write_metadata_buffer().
798 		 */
799 		BUFFER_TRACE(bh, "dumping temporary bh");
800 		jbd2_journal_put_journal_head(jh);
801 		__brelse(bh);
802 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
803 		free_buffer_head(bh);
804 
805 		/* We also have to unlock and free the corresponding
806                    shadowed buffer */
807 		jh = commit_transaction->t_shadow_list->b_tprev;
808 		bh = jh2bh(jh);
809 		clear_bit(BH_JWrite, &bh->b_state);
810 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
811 
812 		/* The metadata is now released for reuse, but we need
813                    to remember it against this transaction so that when
814                    we finally commit, we can do any checkpointing
815                    required. */
816 		JBUFFER_TRACE(jh, "file as BJ_Forget");
817 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
818 		/* Wake up any transactions which were waiting for this
819 		   IO to complete */
820 		wake_up_bit(&bh->b_state, BH_Unshadow);
821 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
822 		__brelse(bh);
823 	}
824 
825 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
826 
827 	jbd_debug(3, "JBD: commit phase 5\n");
828 
829 	/* Here we wait for the revoke record and descriptor record buffers */
830  wait_for_ctlbuf:
831 	while (commit_transaction->t_log_list != NULL) {
832 		struct buffer_head *bh;
833 
834 		jh = commit_transaction->t_log_list->b_tprev;
835 		bh = jh2bh(jh);
836 		if (buffer_locked(bh)) {
837 			wait_on_buffer(bh);
838 			goto wait_for_ctlbuf;
839 		}
840 		if (cond_resched())
841 			goto wait_for_ctlbuf;
842 
843 		if (unlikely(!buffer_uptodate(bh)))
844 			err = -EIO;
845 
846 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
847 		clear_buffer_jwrite(bh);
848 		jbd2_journal_unfile_buffer(journal, jh);
849 		jbd2_journal_put_journal_head(jh);
850 		__brelse(bh);		/* One for getblk */
851 		/* AKPM: bforget here */
852 	}
853 
854 	jbd_debug(3, "JBD: commit phase 6\n");
855 
856 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
857 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
858 		err = journal_submit_commit_record(journal, commit_transaction,
859 						&cbh, crc32_sum);
860 		if (err)
861 			__jbd2_journal_abort_hard(journal);
862 	}
863 	if (!err && !is_journal_aborted(journal))
864 		err = journal_wait_on_commit_record(cbh);
865 
866 	if (err)
867 		jbd2_journal_abort(journal, err);
868 
869 	/* End of a transaction!  Finally, we can do checkpoint
870            processing: any buffers committed as a result of this
871            transaction can be removed from any checkpoint list it was on
872            before. */
873 
874 	jbd_debug(3, "JBD: commit phase 7\n");
875 
876 	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
877 	J_ASSERT(commit_transaction->t_buffers == NULL);
878 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
879 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
880 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
881 	J_ASSERT(commit_transaction->t_log_list == NULL);
882 
883 restart_loop:
884 	/*
885 	 * As there are other places (journal_unmap_buffer()) adding buffers
886 	 * to this list we have to be careful and hold the j_list_lock.
887 	 */
888 	spin_lock(&journal->j_list_lock);
889 	while (commit_transaction->t_forget) {
890 		transaction_t *cp_transaction;
891 		struct buffer_head *bh;
892 
893 		jh = commit_transaction->t_forget;
894 		spin_unlock(&journal->j_list_lock);
895 		bh = jh2bh(jh);
896 		jbd_lock_bh_state(bh);
897 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
898 			jh->b_transaction == journal->j_running_transaction);
899 
900 		/*
901 		 * If there is undo-protected committed data against
902 		 * this buffer, then we can remove it now.  If it is a
903 		 * buffer needing such protection, the old frozen_data
904 		 * field now points to a committed version of the
905 		 * buffer, so rotate that field to the new committed
906 		 * data.
907 		 *
908 		 * Otherwise, we can just throw away the frozen data now.
909 		 */
910 		if (jh->b_committed_data) {
911 			jbd2_free(jh->b_committed_data, bh->b_size);
912 			jh->b_committed_data = NULL;
913 			if (jh->b_frozen_data) {
914 				jh->b_committed_data = jh->b_frozen_data;
915 				jh->b_frozen_data = NULL;
916 			}
917 		} else if (jh->b_frozen_data) {
918 			jbd2_free(jh->b_frozen_data, bh->b_size);
919 			jh->b_frozen_data = NULL;
920 		}
921 
922 		spin_lock(&journal->j_list_lock);
923 		cp_transaction = jh->b_cp_transaction;
924 		if (cp_transaction) {
925 			JBUFFER_TRACE(jh, "remove from old cp transaction");
926 			cp_transaction->t_chp_stats.cs_dropped++;
927 			__jbd2_journal_remove_checkpoint(jh);
928 		}
929 
930 		/* Only re-checkpoint the buffer_head if it is marked
931 		 * dirty.  If the buffer was added to the BJ_Forget list
932 		 * by jbd2_journal_forget, it may no longer be dirty and
933 		 * there's no point in keeping a checkpoint record for
934 		 * it. */
935 
936 		/* A buffer which has been freed while still being
937 		 * journaled by a previous transaction may end up still
938 		 * being dirty here, but we want to avoid writing back
939 		 * that buffer in the future now that the last use has
940 		 * been committed.  That's not only a performance gain,
941 		 * it also stops aliasing problems if the buffer is left
942 		 * behind for writeback and gets reallocated for another
943 		 * use in a different page. */
944 		if (buffer_freed(bh)) {
945 			clear_buffer_freed(bh);
946 			clear_buffer_jbddirty(bh);
947 		}
948 
949 		if (buffer_jbddirty(bh)) {
950 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
951 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
952 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
953 			__jbd2_journal_refile_buffer(jh);
954 			jbd_unlock_bh_state(bh);
955 		} else {
956 			J_ASSERT_BH(bh, !buffer_dirty(bh));
957 			/* The buffer on BJ_Forget list and not jbddirty means
958 			 * it has been freed by this transaction and hence it
959 			 * could not have been reallocated until this
960 			 * transaction has committed. *BUT* it could be
961 			 * reallocated once we have written all the data to
962 			 * disk and before we process the buffer on BJ_Forget
963 			 * list. */
964 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
965 			__jbd2_journal_refile_buffer(jh);
966 			if (!jh->b_transaction) {
967 				jbd_unlock_bh_state(bh);
968 				 /* needs a brelse */
969 				jbd2_journal_remove_journal_head(bh);
970 				release_buffer_page(bh);
971 			} else
972 				jbd_unlock_bh_state(bh);
973 		}
974 		cond_resched_lock(&journal->j_list_lock);
975 	}
976 	spin_unlock(&journal->j_list_lock);
977 	/*
978 	 * This is a bit sleazy.  We use j_list_lock to protect transition
979 	 * of a transaction into T_FINISHED state and calling
980 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
981 	 * other checkpointing code processing the transaction...
982 	 */
983 	spin_lock(&journal->j_state_lock);
984 	spin_lock(&journal->j_list_lock);
985 	/*
986 	 * Now recheck whether any buffers were added to the transaction's
987 	 * forget list while the lock was dropped; if so, process them too...
988 	 */
989 	if (commit_transaction->t_forget) {
990 		spin_unlock(&journal->j_list_lock);
991 		spin_unlock(&journal->j_state_lock);
992 		goto restart_loop;
993 	}
994 
995 	/* Done with this transaction! */
996 
997 	jbd_debug(3, "JBD: commit phase 8\n");
998 
999 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
1000 
1001 	commit_transaction->t_start = jiffies;
1002 	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1003 						commit_transaction->t_start);
1004 
1005 	/*
1006 	 * File the transaction for history
1007 	 */
1008 	stats.ts_type = JBD2_STATS_RUN;
1009 	stats.ts_tid = commit_transaction->t_tid;
1010 	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1011 	spin_lock(&journal->j_history_lock);
1012 	memcpy(journal->j_history + journal->j_history_cur, &stats,
1013 			sizeof(stats));
1014 	if (++journal->j_history_cur == journal->j_history_max)
1015 		journal->j_history_cur = 0;
1016 
1017 	/*
1018 	 * Calculate overall stats
1019 	 */
1020 	journal->j_stats.ts_tid++;
1021 	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1022 	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1023 	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1024 	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1025 	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1026 	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1027 	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1028 	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1029 	spin_unlock(&journal->j_history_lock);
1030 
1031 	commit_transaction->t_state = T_FINISHED;
1032 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1033 	journal->j_commit_sequence = commit_transaction->t_tid;
1034 	journal->j_committing_transaction = NULL;
1035 	spin_unlock(&journal->j_state_lock);
1036 
1037 	if (commit_transaction->t_checkpoint_list == NULL &&
1038 	    commit_transaction->t_checkpoint_io_list == NULL) {
1039 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1040 	} else {
1041 		if (journal->j_checkpoint_transactions == NULL) {
1042 			journal->j_checkpoint_transactions = commit_transaction;
1043 			commit_transaction->t_cpnext = commit_transaction;
1044 			commit_transaction->t_cpprev = commit_transaction;
1045 		} else {
1046 			commit_transaction->t_cpnext =
1047 				journal->j_checkpoint_transactions;
1048 			commit_transaction->t_cpprev =
1049 				commit_transaction->t_cpnext->t_cpprev;
1050 			commit_transaction->t_cpnext->t_cpprev =
1051 				commit_transaction;
1052 			commit_transaction->t_cpprev->t_cpnext =
1053 				commit_transaction;
1054 		}
1055 	}
1056 	spin_unlock(&journal->j_list_lock);
1057 
1058 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1059 		  journal->j_commit_sequence, journal->j_tail_sequence);
1060 
1061 	wake_up(&journal->j_wait_done_commit);
1062 }
1063