xref: /linux/fs/jbd2/commit.c (revision 99b5aa3c10c7cff1e97239fda93649222fc12d25)
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext4-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on the
 * LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the VM
 * accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called with j_list_lock held.  The caller provided us with a ref against
 * the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        int i, ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);
        /* Stamp the commit header into each 512-byte sector of the block */
        for (i = 0; i < bh->b_size; i += 512) {
                journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
                tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
                tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        }

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
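        /*
         * With JBD2_BARRIER set, mark the bh ordered so that the block
         * layer submits the commit record as a barrier write and the
         * device cannot reorder it ahead of the transaction's blocks.
         */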
        if (journal->j_flags & JBD2_BARRIER) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = sync_dirty_buffer(bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: barrier-based sync failed on %s - "
                        "disabling barriers\n",
                        bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        jbd2_journal_put_journal_head(descriptor);

        return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use up our safety reference in submit_bh() */
                submit_bh(WRITE, wbuf[i]);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
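        /* j_wbuf is a journal-wide scratch array of j_wbufsize
         * buffer_head pointers, used here to batch IO submissions */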

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
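        /*
         * Each buffer on t_sync_datalist meets one of three fates below:
         * a dirty buffer is locked, queued in wbuf[] and refiled to
         * BJ_Locked so its completion can be waited on; a buffer locked
         * by somebody else (IO already in flight) is refiled to
         * BJ_Locked as well; a clean, unlocked buffer has finished its
         * writeout and is unfiled from the transaction entirely.
         */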
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        jbd2_journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * jbd2_journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                if (lock_need_resched(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        journal_do_submit_data(wbuf, bufs);
}

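/*
 * A block tag records where in the filesystem one logged buffer lives.
 * The low 32 bits of the block number always go in t_blocknr; only a
 * journal using 64-bit tags (tag_bytes > JBD_TAG_SIZE32) stores the
 * high half too.  The shift is split as (block >> 31) >> 1 so that the
 * expression remains well-defined even on configurations where block
 * numbers are effectively 32 bits wide.
 */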
static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                jbd2_journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

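        /*
         * T_LOCKED stops new handles from joining this transaction.
         * Now wait, with the usual prepare_to_wait()/schedule() pattern,
         * for the handles already started (t_updates) to drain to zero.
         */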
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_slab_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);
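        /* Revokes recorded from here on belong to the next transaction;
         * the table we just retired is the one flushed to the log below
         * by jbd2_journal_write_revoke_records(). */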

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug (3, "JBD: commit phase 2\n");

        /*
         * First, drop modified flag: all accesses to the buffers
         * will be tracked for a new transaction only -bzzz
         */
        spin_lock(&journal->j_list_lock);
        if (commit_transaction->t_buffers) {
                new_jh = jh = commit_transaction->t_buffers->b_tnext;
                do {
                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
                                        new_jh->b_modified == 0);
                        new_jh->b_modified = 0;
                        new_jh = new_jh->b_tnext;
                } while (new_jh != jh);
        }
        spin_unlock(&journal->j_list_lock);

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = 0;
        journal_submit_data_buffers(journal, commit_transaction);

        /*
         * Wait for all previously submitted IO to complete.
         */
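        /*
         * Walk t_locked_list, waiting on any buffer whose IO is still in
         * flight and unfiling each buffer once its write has finished;
         * a buffer that comes back !uptodate fails the commit with -EIO.
         */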
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        jbd2_journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (err)
                __jbd2_journal_abort_hard(journal);

        jbd2_journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        commit_transaction->t_state = T_COMMIT;

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                __jbd2_journal_abort_hard(journal);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        __jbd2_journal_abort_hard(journal);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
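                /*
                 * Bit 0 of the flags returned above reports escaping: a
                 * block whose first four bytes matched the JBD2 magic
                 * was written with those bytes zeroed, and the tag tells
                 * recovery to restore them on replay.
                 */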
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
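        /*
         * Each metadata buffer was doubled up above: the temporary copy
         * doing the IO sits on t_iobuf_list (BJ_IO) while the original
         * waits on t_shadow_list (BJ_Shadow).  The two lists move in
         * lock-step, so popping one entry from each below stays paired.
         */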
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                __jbd2_journal_abort_hard(journal);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd2_slab_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_slab_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __jbd2_journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                jbd2_journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We borrow j_list_lock to protect
         * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
         * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock,
         * but it's a bit of a hassle to hold that across
         * __jbd2_journal_remove_checkpoint.
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

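        /*
         * If nothing remains to be checkpointed the transaction can be
         * freed right away; otherwise link it onto the tail of the
         * circular j_checkpoint_transactions list so the checkpoint
         * code can drain it later.
         */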
        if (commit_transaction->t_checkpoint_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}