/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (TestSetPageLocked(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}
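
/*
 * A sketch of the calling pattern (used by the data-submission and
 * locked-list loops below): on failure we have already scheduled away
 * with j_list_lock dropped, so the caller falls back to taking both
 * locks in blocking fashion, in the correct order:
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */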

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	int i, ret;
	int barrier_done = 0;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Replicate the commit header into each 512-byte sector of the
	 * block. */
	for (i = 0; i < bh->b_size; i += 512) {
		journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
		tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
		tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	}

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);
	if (journal->j_flags & JBD2_BARRIER) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = sync_dirty_buffer(bh);
	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: barrier-based sync failed on %s - "
			"disabling barriers\n",
			bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		clear_buffer_ordered(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	put_bh(bh);		/* One for getblk() */
	jbd2_journal_put_journal_head(descriptor);

	return (ret == -EIO);
}
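
/*
 * For reference, the commit record written above is just a bare
 * journal_header_t, replicated in each 512-byte sector of the block:
 *
 *	h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER)
 *	h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK)
 *	h_sequence  = cpu_to_be32(tid of the committing transaction)
 *
 * Recovery will only replay a transaction whose descriptor blocks are
 * followed by a commit block carrying a matching h_sequence.
 */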

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use up our safety reference in submit_bh() */
		submit_bh(WRITE, wbuf[i]);
	}
}

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
				transaction_t *commit_transaction)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (test_set_buffer_locked(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh)
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			put_bh(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__jbd2_journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				goto write_out_data;
			}
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			__jbd2_journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			jbd2_journal_remove_journal_head(bh);
			/* Once for our safety reference, once for
			 * jbd2_journal_remove_journal_head() */
			put_bh(bh);
			put_bh(bh);
		}

		if (lock_need_resched(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	journal_do_submit_data(wbuf, bufs);
}
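
/*
 * Note the batching above: dirty data buffers are collected into
 * journal->j_wbuf (up to j_wbufsize entries) and submitted in bursts,
 * both when the array fills and whenever we must drop j_list_lock for
 * a blocking lock_buffer() -- submitting everything we already hold
 * before sleeping avoids deadlocking against the buffer's current
 * holder.
 */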

static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
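
/*
 * Worked example (hypothetical block number): for block 0x123456789 on
 * a journal using 64-bit tags (tag_bytes > JBD_TAG_SIZE32),
 *
 *	t_blocknr      = cpu_to_be32(0x23456789);	- low 32 bits
 *	t_blocknr_high = cpu_to_be32(0x00000001);	- high 32 bits
 *
 * The high word is computed as (block >> 31) >> 1 rather than
 * block >> 32, so the shift remains well-defined even if "block" is
 * ever narrowed to a 32-bit type.
 */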

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
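
	/*
	 * (t_updates is dropped by jbd2_journal_stop(), which wakes
	 * j_wait_updates once the last handle detaches.  The loop above
	 * is the usual prepare_to_wait()/schedule()/finish_wait()
	 * sequence, with both spinlocks released across the sleep and
	 * retaken afterwards.)
	 */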

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * First, drop modified flag: all accesses to the buffers
	 * will be tracked for a new transaction only -bzzz
	 */
	spin_lock(&journal->j_list_lock);
	if (commit_transaction->t_buffers) {
		new_jh = jh = commit_transaction->t_buffers->b_tnext;
		do {
			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
					new_jh->b_modified == 0);
			new_jh->b_modified = 0;
			new_jh = new_jh->b_tnext;
		} while (new_jh != jh);
	}
	spin_unlock(&journal->j_list_lock);

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = 0;
	journal_submit_data_buffers(journal, commit_transaction);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			spin_lock(&journal->j_list_lock);
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
			__jbd2_journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			jbd2_journal_remove_journal_head(bh);
			put_bh(bh);
		} else {
			jbd_unlock_bh_state(bh);
		}
		put_bh(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err)
		__jbd2_journal_abort_hard(journal);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	commit_transaction->t_state = T_COMMIT;

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				__jbd2_journal_abort_hard(journal);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			__jbd2_journal_abort_hard(journal);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
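
		/*
		 * At this point the descriptor block under construction
		 * looks like this (a sketch):
		 *
		 *	+--------------------------+
		 *	| journal_header_t         | magic/DESCRIPTOR_BLOCK/tid
		 *	+--------------------------+
		 *	| tag 0 | 16-byte j_uuid   | UUID follows the first tag
		 *	+--------------------------+
		 *	| tag 1 | tag 2 | ...      | JBD2_FLAG_SAME_UUID set
		 *	+--------------------------+
		 *	| last tag                 | JBD2_FLAG_LAST_TAG, below
		 *	+--------------------------+
		 */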

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
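
	/*
	 * To recap the BJ_IO/BJ_Shadow pairing unwound above: each
	 * metadata buffer was written to the log through a temporary
	 * buffer_head (BJ_IO) while the real one sat on BJ_Shadow.
	 * Once the IO completes, the temporary bh is freed outright and
	 * the shadowed original moves to BJ_Forget; the BH_Unshadow
	 * wakeup releases anyone who blocked wanting to modify the
	 * buffer while it was being shadowed.
	 */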

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 6\n");

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		__jbd2_journal_abort_hard(journal);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list they
           were on before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd2_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_slab_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on the
			 * BJ_Forget list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We borrow j_list_lock to protect
	 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
	 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
	 * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint.
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
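
	/*
	 * The insertion above keeps j_checkpoint_transactions circular
	 * and doubly linked, adding the new transaction just before the
	 * current head.  E.g. with an existing list T1 <-> T2, inserting
	 * T3 does:
	 *
	 *	T3->t_cpnext = T1;	T3->t_cpprev = T2;
	 *	T1->t_cpprev = T3;	T2->t_cpnext = T3;
	 *
	 * leaving T1 <-> T2 <-> T3 <-> T1 with the head unchanged.
	 */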
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}