xref: /linux/fs/jbd2/commit.c (revision 498ade1a133dffd0f3ee90952737045d56e6689a)
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (TestSetPageLocked(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}
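
/*
 * A minimal usage sketch (hypothetical caller, not from this file): the
 * caller's reference is consumed on every path, whether or not the page
 * could actually be freed.
 *
 *	get_bh(bh);			- caller's safety reference
 *	release_buffer_page(bh);	- drops it via __brelse()
 */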

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}
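
/*
 * A hedged sketch of the caller pattern this implies (the real callers
 * appear below): after a failed trylock the list lock has been dropped,
 * so it must be retaken and the list state revalidated from scratch.
 *
 *	spin_lock(&journal->j_list_lock);
 * restart:
 *	...
 *	if (!inverted_lock(journal, bh)) {
 *		spin_lock(&journal->j_list_lock);
 *		goto restart;		- the list may have changed meanwhile
 *	}
 *	...				- both locks held here
 *	jbd_unlock_bh_state(bh);
 */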

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);

	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE, bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: barrier-based sync failed on %s - "
			"disabling barriers\n",
			bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		clear_buffer_ordered(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = submit_bh(WRITE, bh);
	}
	*cbh = bh;
	return ret;
}

/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
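
/*
 * A minimal pairing sketch (hypothetical caller, assuming a fully
 * locked-down transaction and an up-to-date crc32_sum): submit first,
 * then reap the commit record.
 *
 *	struct buffer_head *cbh = NULL;
 *	int err;
 *
 *	err = journal_submit_commit_record(journal, commit_transaction,
 *					   &cbh, crc32_sum);
 *	if (!err && cbh)
 *		err = journal_wait_on_commit_record(cbh);
 */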

/*
 * Wait for all submitted IO to complete.
 */
static int journal_wait_on_locked_list(journal_t *journal,
				       transaction_t *commit_transaction)
{
	int ret = 0;
	struct journal_head *jh;

	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			if (unlikely(!buffer_uptodate(bh)))
				ret = -EIO;
			spin_lock(&journal->j_list_lock);
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
			__jbd2_journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			jbd2_journal_remove_journal_head(bh);
			put_bh(bh);
		} else {
			jbd_unlock_bh_state(bh);
		}
		put_bh(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	return ret;
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use-up our safety reference in submit_bh() */
		submit_bh(WRITE, wbuf[i]);
	}
}

/*
 * Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
				transaction_t *commit_transaction)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (test_set_buffer_locked(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh)
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			put_bh(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__jbd2_journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__jbd2_journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			__jbd2_journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			jbd2_journal_remove_journal_head(bh);
			/* Once for our safety reference, once for
			 * jbd2_journal_remove_journal_head() */
			put_bh(bh);
			put_bh(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	journal_do_submit_data(wbuf, bufs);
}
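
/*
 * A summary of the per-buffer cases above (descriptive sketch only):
 *
 *	dirty			-> lock it, clear dirty, queue in wbuf[]
 *				   and file as BJ_Locked
 *	clean but locked	-> IO already in flight: file as BJ_Locked
 *	clean and unlocked	-> writeout complete: unfile and release
 *
 * Everything filed as BJ_Locked is reaped afterwards by
 * journal_wait_on_locked_list().
 */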

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
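
/*
 * A hedged sketch of how the commit path below chains this helper: the
 * running checksum is seeded with ~0 and folded over each journal block
 * in submission order before the commit record is written.
 *
 *	__u32 crc = ~0;
 *	int i;
 *
 *	for (i = 0; i < bufs; i++)
 *		crc = jbd2_checksum_data(crc, wbuf[i]);
 */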

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
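
/*
 * Worked example (assuming 64-bit tags): for block 0x123456789,
 * t_blocknr receives the low 32 bits (0x23456789) and t_blocknr_high
 * the remaining bits (0x1).  The (block >> 31) >> 1 double shift keeps
 * each shift count below 32, so the expression stays well-defined even
 * if block were ever a 32-bit type.
 */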

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "superblock updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	stats.u.run.rs_wait = commit_transaction->t_max_wait;
	stats.u.run.rs_locked = jiffies;
	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
						stats.u.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	stats.u.run.rs_flushing = jiffies;
	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
					       stats.u.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD: commit phase 2a\n");

	/*
	 * First, drop modified flag: all accesses to the buffers
	 * will be tracked for a new transaction only -bzzz
	 */
	spin_lock(&journal->j_list_lock);
	if (commit_transaction->t_buffers) {
		new_jh = jh = commit_transaction->t_buffers->b_tnext;
		do {
			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
					new_jh->b_modified == 0);
			new_jh->b_modified = 0;
			new_jh = new_jh->b_tnext;
		} while (new_jh != jh);
	}
	spin_unlock(&journal->j_list_lock);

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = 0;
	journal_submit_data_buffers(journal, commit_transaction);

	/*
	 * Wait for all previously submitted IO to complete if commit
	 * record is to be written synchronously.
	 */
	spin_lock(&journal->j_list_lock);
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		err = journal_wait_on_locked_list(journal,
						commit_transaction);

	spin_unlock(&journal->j_list_lock);

	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	jbd_debug(3, "JBD: commit phase 2b\n");

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT(commit_transaction->t_sync_datalist == NULL);

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	commit_transaction->t_state = T_COMMIT;

	stats.u.run.rs_logging = jiffies;
	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
						 stats.u.run.rs_logging);
	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.u.run.rs_blocks_logged = 0;

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
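
		/*
		 * Hedged sketch of the descriptor block assembled here
		 * (tag width depends on tag_bytes; the UUID follows only
		 * the first tag):
		 *
		 *	journal_header_t  magic / DESCRIPTOR_BLOCK / tid
		 *	tag 0             t_blocknr [+ t_blocknr_high], t_flags
		 *	uuid[16]          only after the first tag
		 *	tag 1..n          JBD2_FLAG_SAME_UUID set
		 *	last tag          JBD2_FLAG_LAST_TAG set
		 */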

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();
			stats.u.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Done it all: now write the commit record asynchronously. */

	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);

		spin_lock(&journal->j_list_lock);
		err = journal_wait_on_locked_list(journal,
						commit_transaction);
		spin_unlock(&journal->j_list_lock);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 6\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	/* Don't dereference a NULL cbh if the submit path bailed out early
	 * because the journal was already aborted. */
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
						commit_transaction->t_start);

	/*
	 * File the transaction for history
	 */
	stats.ts_type = JBD2_STATS_RUN;
	stats.ts_tid = commit_transaction->t_tid;
	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
	spin_lock(&journal->j_history_lock);
	memcpy(journal->j_history + journal->j_history_cur, &stats,
			sizeof(stats));
	if (++journal->j_history_cur == journal->j_history_max)
		journal->j_history_cur = 0;

	/*
	 * Calculate overall stats
	 */
	journal->j_stats.ts_tid++;
	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}