xref: /linux/fs/jbd2/commit.c (revision cc4589ebfae6f8dbb5cf880a0a67eedab3416492)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <trace/events/jbd2.h>
30 
31 /*
32  * Default IO end handler for temporary BJ_IO buffer_heads.
33  */
34 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
35 {
36 	BUFFER_TRACE(bh, "");
37 	if (uptodate)
38 		set_buffer_uptodate(bh);
39 	else
40 		clear_buffer_uptodate(bh);
41 	unlock_buffer(bh);
42 }
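
/*
 * Editor's note (illustrative sketch, not built): this handler is the
 * completion half of the submit/wait pattern used throughout this file.
 * The submitter locks the buffer, points b_end_io here and submits; a
 * later wait_on_buffer() returns once this handler has run
 * unlock_buffer().
 */
#if 0
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;
	submit_bh(WRITE, bh);
	/* ... */
	wait_on_buffer(bh);
	if (unlikely(!buffer_uptodate(bh)))
		err = -EIO;		/* the write failed */
#endif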
43 
44 /*
45  * When an ext4 file is truncated, it is possible that some pages are not
46  * successfully freed, because they are attached to a committing transaction.
47  * After the transaction commits, these pages are left on the LRU, with no
48  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
49  * by the VM, but their apparent absence upsets the VM accounting, and it makes
50  * the numbers in /proc/meminfo look odd.
51  *
52  * So here, we have a buffer which has just come off the forget list.  Look to
53  * see if we can strip all buffers from the backing page.
54  *
55  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
56  * caller provided us with a ref against the buffer, and we drop that here.
57  */
58 static void release_buffer_page(struct buffer_head *bh)
59 {
60 	struct page *page;
61 
62 	if (buffer_dirty(bh))
63 		goto nope;
64 	if (atomic_read(&bh->b_count) != 1)
65 		goto nope;
66 	page = bh->b_page;
67 	if (!page)
68 		goto nope;
69 	if (page->mapping)
70 		goto nope;
71 
72 	/* OK, it's a truncated page */
73 	if (!trylock_page(page))
74 		goto nope;
75 
76 	page_cache_get(page);
77 	__brelse(bh);
78 	try_to_free_buffers(page);
79 	unlock_page(page);
80 	page_cache_release(page);
81 	return;
82 
83 nope:
84 	__brelse(bh);
85 }
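
/*
 * Editor's note (sketch, not built): the caller's side of the contract
 * above -- the reference the caller holds is consumed on every path, as
 * in the t_forget processing near the end of this file:
 */
#if 0
	jbd2_journal_remove_journal_head(bh);	/* leaves one ref on bh */
	release_buffer_page(bh);		/* consumes that ref */
#endif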
86 
87 /*
88  * Done it all: now submit the commit record.  We should have
89  * cleaned up our previous buffers by now, so if we are in abort
90  * mode we can now just skip the rest of the journal write
91  * entirely.
92  *
93  * Returns 0 on success, 1 on descriptor allocation failure, or a submit_bh() error
94  */
95 static int journal_submit_commit_record(journal_t *journal,
96 					transaction_t *commit_transaction,
97 					struct buffer_head **cbh,
98 					__u32 crc32_sum)
99 {
100 	struct journal_head *descriptor;
101 	struct commit_header *tmp;
102 	struct buffer_head *bh;
103 	int ret;
104 	int barrier_done = 0;
105 	struct timespec now = current_kernel_time();
106 
107 	if (is_journal_aborted(journal))
108 		return 0;
109 
110 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
111 	if (!descriptor)
112 		return 1;
113 
114 	bh = jh2bh(descriptor);
115 
116 	tmp = (struct commit_header *)bh->b_data;
117 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
118 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
119 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
120 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
121 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
122 
123 	if (JBD2_HAS_COMPAT_FEATURE(journal,
124 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
125 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
126 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
127 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
128 	}
129 
130 	JBUFFER_TRACE(descriptor, "submit commit block");
131 	lock_buffer(bh);
132 	clear_buffer_dirty(bh);
133 	set_buffer_uptodate(bh);
134 	bh->b_end_io = journal_end_buffer_io_sync;
135 
136 	if (journal->j_flags & JBD2_BARRIER &&
137 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
139 		set_buffer_ordered(bh);
140 		barrier_done = 1;
141 	}
142 	ret = submit_bh(WRITE_SYNC_PLUG, bh);
143 	if (barrier_done)
144 		clear_buffer_ordered(bh);
145 
146 	/* is it possible for another commit to fail at roughly
147 	 * the same time as this one?  If so, we don't want to
148 	 * trust the barrier flag in the super, but instead want
149 	 * to remember if we sent a barrier request
150 	 */
151 	if (ret == -EOPNOTSUPP && barrier_done) {
152 		printk(KERN_WARNING
153 		       "JBD2: Disabling barriers on %s, "
154 		       "not supported by device\n", journal->j_devname);
155 		write_lock(&journal->j_state_lock);
156 		journal->j_flags &= ~JBD2_BARRIER;
157 		write_unlock(&journal->j_state_lock);
158 
159 		/* And try again, without the barrier */
160 		lock_buffer(bh);
161 		set_buffer_uptodate(bh);
162 		clear_buffer_dirty(bh);
163 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 	}
165 	*cbh = bh;
166 	return ret;
167 }
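
/*
 * Editor's note (hypothetical helper, sketched for illustration only):
 * the inverse of the header setup above, roughly what recovery does when
 * it meets a commit block -- check the magic, block type and sequence:
 */
#if 0
static int looks_like_commit_block(struct buffer_head *bh, tid_t expect_tid)
{
	struct commit_header *h = (struct commit_header *)bh->b_data;

	return be32_to_cpu(h->h_magic) == JBD2_MAGIC_NUMBER &&
	       be32_to_cpu(h->h_blocktype) == JBD2_COMMIT_BLOCK &&
	       be32_to_cpu(h->h_sequence) == expect_tid;
}
#endif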
168 
169 /*
170  * This function, together with journal_submit_commit_record(),
171  * makes it possible to write the commit record asynchronously.
172  */
173 static int journal_wait_on_commit_record(journal_t *journal,
174 					 struct buffer_head *bh)
175 {
176 	int ret = 0;
177 
178 retry:
179 	clear_buffer_dirty(bh);
180 	wait_on_buffer(bh);
181 	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 		printk(KERN_WARNING
183 		       "JBD2: %s: disabling barriers on %s - not supported "
184 		       "by device\n", __func__, journal->j_devname);
185 		write_lock(&journal->j_state_lock);
186 		journal->j_flags &= ~JBD2_BARRIER;
187 		write_unlock(&journal->j_state_lock);
188 
189 		lock_buffer(bh);
190 		clear_buffer_dirty(bh);
191 		set_buffer_uptodate(bh);
192 		bh->b_end_io = journal_end_buffer_io_sync;
193 
194 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
195 		if (ret) {
196 			unlock_buffer(bh);
197 			return ret;
198 		}
199 		goto retry;
200 	}
201 
202 	if (unlikely(!buffer_uptodate(bh)))
203 		ret = -EIO;
204 	put_bh(bh);            /* One for getblk() */
205 	jbd2_journal_put_journal_head(bh2jh(bh));
206 
207 	return ret;
208 }
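
/*
 * Editor's note (condensed sketch of this file's own usage): with
 * JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT the submit and the wait are split so
 * that inode data writeback can be finished in between:
 */
#if 0
	struct buffer_head *cbh = NULL;

	err = journal_submit_commit_record(journal, commit_transaction,
					   &cbh, crc32_sum);
	/* ... journal_finish_inode_data_buffers(), wait for IO ... */
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);
#endif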
209 
210 /*
211  * Write the filemap data using the writepage() address_space_operation.
212  * We don't do block allocation here, even for delalloc: we avoid
213  * writepages() because, with delayed allocation, writepages() may end
214  * up doing block allocation.
215  */
216 static int journal_submit_inode_data_buffers(struct address_space *mapping)
217 {
218 	int ret;
219 	struct writeback_control wbc = {
220 		.sync_mode =  WB_SYNC_ALL,
221 		.nr_to_write = mapping->nrpages * 2,
222 		.range_start = 0,
223 		.range_end = i_size_read(mapping->host),
224 	};
225 
226 	ret = generic_writepages(mapping, &wbc);
227 	return ret;
228 }
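
/*
 * Editor's note: nr_to_write = nrpages * 2 gives one sweep enough budget
 * even if pages are redirtied meanwhile.  For contrast (sketch, not
 * built): the obvious alternative below is NOT equivalent, because it
 * goes through ->writepages and so may allocate blocks under delayed
 * allocation -- exactly what generic_writepages() above avoids.
 */
#if 0
	ret = filemap_fdatawrite_range(mapping, 0,
				       i_size_read(mapping->host));
#endif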
229 
230 /*
231  * Submit all the data buffers of inodes associated with the transaction
232  * to disk.
233  *
234  * We are in a committing transaction. Therefore no new inode can be added
235  * to our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode
236  * we are currently operating on from being released while we write out pages.
237  */
238 static int journal_submit_data_buffers(journal_t *journal,
239 		transaction_t *commit_transaction)
240 {
241 	struct jbd2_inode *jinode;
242 	int err, ret = 0;
243 	struct address_space *mapping;
244 
245 	spin_lock(&journal->j_list_lock);
246 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
247 		mapping = jinode->i_vfs_inode->i_mapping;
248 		jinode->i_flags |= JI_COMMIT_RUNNING;
249 		spin_unlock(&journal->j_list_lock);
250 		/*
251 		 * Submit the inode data buffers. We use writepage
252 		 * instead of writepages because writepages can do
253 		 * block allocation with delalloc; we need to write
254 		 * only already-allocated blocks here.
255 		 */
256 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
257 		err = journal_submit_inode_data_buffers(mapping);
258 		if (!ret)
259 			ret = err;
260 		spin_lock(&journal->j_list_lock);
261 		J_ASSERT(jinode->i_transaction == commit_transaction);
262 		commit_transaction->t_flushed_data_blocks = 1;
263 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
264 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
265 	}
266 	spin_unlock(&journal->j_list_lock);
267 	return ret;
268 }
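
/*
 * Editor's note (condensed from fs/jbd2/journal.c of this era; sketch,
 * not built): the waiter side of JI_COMMIT_RUNNING.  A task releasing
 * the jbd2_inode must block until the wake_up_bit() above fires:
 */
#if 0
restart:
	spin_lock(&journal->j_list_lock);
	if (jinode->i_flags & JI_COMMIT_RUNNING) {
		wait_queue_head_t *wq;
		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);

		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&journal->j_list_lock);
		schedule();		/* woken by wake_up_bit() above */
		finish_wait(wq, &wait.wait);
		goto restart;
	}
#endif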
269 
270 /*
271  * Wait for data submitted for writeout and refile inodes to the
272  * proper transaction if needed.
273  *
274  */
275 static int journal_finish_inode_data_buffers(journal_t *journal,
276 		transaction_t *commit_transaction)
277 {
278 	struct jbd2_inode *jinode, *next_i;
279 	int err, ret = 0;
280 
281 	/* For locking, see the comment in journal_submit_data_buffers() */
282 	spin_lock(&journal->j_list_lock);
283 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
284 		jinode->i_flags |= JI_COMMIT_RUNNING;
285 		spin_unlock(&journal->j_list_lock);
286 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
287 		if (err) {
288 			/*
289 			 * Because AS_EIO is cleared by
290 			 * filemap_fdatawait_range(), set it again so
291 			 * that user process can get -EIO from fsync().
292 			 */
293 			set_bit(AS_EIO,
294 				&jinode->i_vfs_inode->i_mapping->flags);
295 
296 			if (!ret)
297 				ret = err;
298 		}
299 		spin_lock(&journal->j_list_lock);
300 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
301 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
302 	}
303 
304 	/* Now refile inodes to the proper lists */
305 	list_for_each_entry_safe(jinode, next_i,
306 				 &commit_transaction->t_inode_list, i_list) {
307 		list_del(&jinode->i_list);
308 		if (jinode->i_next_transaction) {
309 			jinode->i_transaction = jinode->i_next_transaction;
310 			jinode->i_next_transaction = NULL;
311 			list_add(&jinode->i_list,
312 				&jinode->i_transaction->t_inode_list);
313 		} else {
314 			jinode->i_transaction = NULL;
315 		}
316 	}
317 	spin_unlock(&journal->j_list_lock);
318 
319 	return ret;
320 }
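
/*
 * Editor's note: the set_bit(AS_EIO) above matters because the generic
 * wait path consumes the flag.  Illustrative consumer, modelled on
 * filemap_fdatawait_range() (sketch, not built):
 */
#if 0
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;	/* would be lost had commit not re-set it */
#endif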
321 
322 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
323 {
324 	struct page *page = bh->b_page;
325 	char *addr;
326 	__u32 checksum;
327 
328 	addr = kmap_atomic(page, KM_USER0);
329 	checksum = crc32_be(crc32_sum,
330 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
331 	kunmap_atomic(addr, KM_USER0);
332 
333 	return checksum;
334 }
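
/*
 * Editor's note (condensed sketch of the commit loop below; wbuf and bufs
 * are that loop's names): the checksum is seeded with ~0 and chained
 * across every journal block of the transaction before being stored in
 * the commit header:
 */
#if 0
	__u32 crc32_sum = ~0;
	int i;

	for (i = 0; i < bufs; i++)
		crc32_sum = jbd2_checksum_data(crc32_sum, wbuf[i]);
	/* crc32_sum is then passed to journal_submit_commit_record() */
#endif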
335 
336 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
337 				   unsigned long long block)
338 {
339 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
340 	if (tag_bytes > JBD2_TAG_SIZE32)
341 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
342 }
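
/*
 * Editor's note: the (block >> 31) >> 1 idiom stores the high half while
 * staying well-defined even if it were ever evaluated on a 32-bit block
 * type (a plain >> 32 would be undefined there).  The matching decoder
 * lives in fs/jbd2/recovery.c; reproduced here for illustration only
 * (not built):
 */
#if 0
static unsigned long long read_tag_block(int tag_bytes,
					 journal_block_tag_t *tag)
{
	unsigned long long block = be32_to_cpu(tag->t_blocknr);

	if (tag_bytes > JBD2_TAG_SIZE32)
		block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
	return block;
}
#endif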
343 
344 /*
345  * jbd2_journal_commit_transaction
346  *
347  * The primary function for committing a transaction to the log.  This
348  * function is called by the journal thread to begin a complete commit.
349  */
350 void jbd2_journal_commit_transaction(journal_t *journal)
351 {
352 	struct transaction_stats_s stats;
353 	transaction_t *commit_transaction;
354 	struct journal_head *jh, *new_jh, *descriptor;
355 	struct buffer_head **wbuf = journal->j_wbuf;
356 	int bufs;
357 	int flags;
358 	int err;
359 	unsigned long long blocknr;
360 	ktime_t start_time;
361 	u64 commit_time;
362 	char *tagp = NULL;
363 	journal_header_t *header;
364 	journal_block_tag_t *tag = NULL;
365 	int space_left = 0;
366 	int first_tag = 0;
367 	int tag_flag;
368 	int i, to_free = 0;
369 	int tag_bytes = journal_tag_bytes(journal);
370 	struct buffer_head *cbh = NULL; /* For transactional checksums */
371 	__u32 crc32_sum = ~0;
372 	int write_op = WRITE;
373 
374 	/*
375 	 * First job: lock down the current transaction and wait for
376 	 * all outstanding updates to complete.
377 	 */
378 
379 #ifdef COMMIT_STATS
380 	spin_lock(&journal->j_list_lock);
381 	summarise_journal_usage(journal);
382 	spin_unlock(&journal->j_list_lock);
383 #endif
384 
385 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
386 	if (journal->j_flags & JBD2_FLUSHED) {
387 		jbd_debug(3, "super block updated\n");
388 		jbd2_journal_update_superblock(journal, 1);
389 	} else {
390 		jbd_debug(3, "superblock not updated\n");
391 	}
392 
393 	J_ASSERT(journal->j_running_transaction != NULL);
394 	J_ASSERT(journal->j_committing_transaction == NULL);
395 
396 	commit_transaction = journal->j_running_transaction;
397 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
398 
399 	trace_jbd2_start_commit(journal, commit_transaction);
400 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 			commit_transaction->t_tid);
402 
403 	write_lock(&journal->j_state_lock);
404 	commit_transaction->t_state = T_LOCKED;
405 
406 	/*
407 	 * Use plugged writes here, since we want to submit several before
408 	 * we unplug the device. We don't do explicit unplugging in here,
409 	 * instead we rely on sync_buffer() doing the unplug for us.
410 	 */
411 	if (commit_transaction->t_synchronous_commit)
412 		write_op = WRITE_SYNC_PLUG;
413 	trace_jbd2_commit_locking(journal, commit_transaction);
414 	stats.run.rs_wait = commit_transaction->t_max_wait;
415 	stats.run.rs_locked = jiffies;
416 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
417 					      stats.run.rs_locked);
418 
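	/*
	 * Editor's note: classic prepare_to_wait() pattern.  The task is
	 * queued on j_wait_updates *before* t_updates is rechecked, so a
	 * final jbd2_journal_stop() that drops the count and issues the
	 * wake-up between our check and schedule() cannot be missed.
	 */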
419 	spin_lock(&commit_transaction->t_handle_lock);
420 	while (atomic_read(&commit_transaction->t_updates)) {
421 		DEFINE_WAIT(wait);
422 
423 		prepare_to_wait(&journal->j_wait_updates, &wait,
424 					TASK_UNINTERRUPTIBLE);
425 		if (atomic_read(&commit_transaction->t_updates)) {
426 			spin_unlock(&commit_transaction->t_handle_lock);
427 			write_unlock(&journal->j_state_lock);
428 			schedule();
429 			write_lock(&journal->j_state_lock);
430 			spin_lock(&commit_transaction->t_handle_lock);
431 		}
432 		finish_wait(&journal->j_wait_updates, &wait);
433 	}
434 	spin_unlock(&commit_transaction->t_handle_lock);
435 
436 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
437 			journal->j_max_transaction_buffers);
438 
439 	/*
440 	 * First thing we are allowed to do is to discard any remaining
441 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
442 	 * that there are no such buffers: if a large filesystem
443 	 * operation like a truncate needs to split itself over multiple
444 	 * transactions, then it may try to do a jbd2_journal_restart() while
445 	 * there are still BJ_Reserved buffers outstanding.  These must
446 	 * be released cleanly from the current transaction.
447 	 *
448 	 * In this case, the filesystem must still reserve write access
449 	 * again before modifying the buffer in the new transaction, but
450 	 * we do not require it to remember exactly which old buffers it
451 	 * has reserved.  This is consistent with the existing behaviour
452 	 * that multiple jbd2_journal_get_write_access() calls to the same
453  * buffer are perfectly permissible.
454 	 */
455 	while (commit_transaction->t_reserved_list) {
456 		jh = commit_transaction->t_reserved_list;
457 		JBUFFER_TRACE(jh, "reserved, unused: refile");
458 		/*
459 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
460 		 * leave undo-committed data.
461 		 */
462 		if (jh->b_committed_data) {
463 			struct buffer_head *bh = jh2bh(jh);
464 
465 			jbd_lock_bh_state(bh);
466 			jbd2_free(jh->b_committed_data, bh->b_size);
467 			jh->b_committed_data = NULL;
468 			jbd_unlock_bh_state(bh);
469 		}
470 		jbd2_journal_refile_buffer(journal, jh);
471 	}
472 
473 	/*
474 	 * Now try to drop any written-back buffers from the journal's
475 	 * checkpoint lists.  We do this *before* commit because it potentially
476 	 * frees some memory
477 	 */
478 	spin_lock(&journal->j_list_lock);
479 	__jbd2_journal_clean_checkpoint_list(journal);
480 	spin_unlock(&journal->j_list_lock);
481 
482 	jbd_debug(3, "JBD: commit phase 1\n");
483 
484 	/*
485 	 * Switch to a new revoke table.
486 	 */
487 	jbd2_journal_switch_revoke_table(journal);
488 
489 	trace_jbd2_commit_flushing(journal, commit_transaction);
490 	stats.run.rs_flushing = jiffies;
491 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
492 					     stats.run.rs_flushing);
493 
494 	commit_transaction->t_state = T_FLUSH;
495 	journal->j_committing_transaction = commit_transaction;
496 	journal->j_running_transaction = NULL;
497 	start_time = ktime_get();
498 	commit_transaction->t_log_start = journal->j_head;
499 	wake_up(&journal->j_wait_transaction_locked);
500 	write_unlock(&journal->j_state_lock);
501 
502 	jbd_debug(3, "JBD: commit phase 2a\n");
503 
504 	/*
505 	 * Now start flushing things to disk, in the order they appear
506 	 * on the transaction lists.  Data blocks go first.
507 	 */
508 	err = journal_submit_data_buffers(journal, commit_transaction);
509 	if (err)
510 		jbd2_journal_abort(journal, err);
511 
512 	jbd2_journal_write_revoke_records(journal, commit_transaction,
513 					  write_op);
514 
515 	jbd_debug(3, "JBD: commit phase 2b\n");
516 
517 	/*
518 	 * Way to go: we have now written out all of the data for a
519 	 * transaction!  Now comes the tricky part: we need to write out
520 	 * metadata.  Loop over the transaction's entire buffer list:
521 	 */
522 	write_lock(&journal->j_state_lock);
523 	commit_transaction->t_state = T_COMMIT;
524 	write_unlock(&journal->j_state_lock);
525 
526 	trace_jbd2_commit_logging(journal, commit_transaction);
527 	stats.run.rs_logging = jiffies;
528 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 					       stats.run.rs_logging);
530 	stats.run.rs_blocks =
531 		atomic_read(&commit_transaction->t_outstanding_credits);
532 	stats.run.rs_blocks_logged = 0;
533 
534 	J_ASSERT(commit_transaction->t_nr_buffers <=
535 		 atomic_read(&commit_transaction->t_outstanding_credits));
536 
537 	err = 0;
538 	descriptor = NULL;
539 	bufs = 0;
540 	while (commit_transaction->t_buffers) {
541 
542 		/* Find the next buffer to be journaled... */
543 
544 		jh = commit_transaction->t_buffers;
545 
546 		/* If we're in abort mode, we just un-journal the buffer and
547 		   release it. */
548 
549 		if (is_journal_aborted(journal)) {
550 			clear_buffer_jbddirty(jh2bh(jh));
551 			JBUFFER_TRACE(jh, "journal is aborting: refile");
552 			jbd2_buffer_abort_trigger(jh,
553 						  jh->b_frozen_data ?
554 						  jh->b_frozen_triggers :
555 						  jh->b_triggers);
556 			jbd2_journal_refile_buffer(journal, jh);
557 			/* If that was the last one, we need to clean up
558 			 * any descriptor buffers which may have been
559 			 * already allocated, even if we are now
560 			 * aborting. */
561 			if (!commit_transaction->t_buffers)
562 				goto start_journal_io;
563 			continue;
564 		}
565 
566 		/* Make sure we have a descriptor block in which to
567 		   record the metadata buffer. */
568 
569 		if (!descriptor) {
570 			struct buffer_head *bh;
571 
572 			J_ASSERT (bufs == 0);
573 
574 			jbd_debug(4, "JBD: get descriptor\n");
575 
576 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
577 			if (!descriptor) {
578 				jbd2_journal_abort(journal, -EIO);
579 				continue;
580 			}
581 
582 			bh = jh2bh(descriptor);
583 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
584 				(unsigned long long)bh->b_blocknr, bh->b_data);
585 			header = (journal_header_t *)&bh->b_data[0];
586 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
587 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
588 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
589 
590 			tagp = &bh->b_data[sizeof(journal_header_t)];
591 			space_left = bh->b_size - sizeof(journal_header_t);
592 			first_tag = 1;
593 			set_buffer_jwrite(bh);
594 			set_buffer_dirty(bh);
595 			wbuf[bufs++] = bh;
596 
597 			/* Record it so that we can wait for IO
598                            completion later */
599 			BUFFER_TRACE(bh, "ph3: file as descriptor");
600 			jbd2_journal_file_buffer(descriptor, commit_transaction,
601 					BJ_LogCtl);
602 		}
603 
604 		/* Where is the buffer to be written? */
605 
606 		err = jbd2_journal_next_log_block(journal, &blocknr);
607 		/* If the block mapping failed, just abandon the buffer
608 		   and repeat this loop: we'll fall into the
609 		   refile-on-abort condition above. */
610 		if (err) {
611 			jbd2_journal_abort(journal, err);
612 			continue;
613 		}
614 
615 		/*
616 		 * start_this_handle() uses t_outstanding_credits to determine
617 		 * the free space in the log, but this counter is changed
618 		 * by jbd2_journal_next_log_block() also.
619 		 */
620 		atomic_dec(&commit_transaction->t_outstanding_credits);
621 
622 		/* Bump b_count to prevent truncate from stumbling over
623                    the shadowed buffer!  @@@ This can go if we ever get
624                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
625 		atomic_inc(&jh2bh(jh)->b_count);
626 
627 		/* Make a temporary IO buffer with which to write it out
628                    (this will requeue both the metadata buffer and the
629                    temporary IO buffer).  new_bh goes on BJ_IO. */
630 
631 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
632 		/*
633 		 * akpm: jbd2_journal_write_metadata_buffer() sets
634 		 * new_bh->b_transaction to commit_transaction.
635 		 * We need to clean this up before we release new_bh
636 		 * (which is of type BJ_IO)
637 		 */
638 		JBUFFER_TRACE(jh, "ph3: write metadata");
639 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
640 						      jh, &new_jh, blocknr);
641 		if (flags < 0) {
642 			jbd2_journal_abort(journal, flags);
643 			continue;
644 		}
645 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
646 		wbuf[bufs++] = jh2bh(new_jh);
647 
648 		/* Record the new block's tag in the current descriptor
649                    buffer */
650 
651 		tag_flag = 0;
652 		if (flags & 1)
653 			tag_flag |= JBD2_FLAG_ESCAPE;
654 		if (!first_tag)
655 			tag_flag |= JBD2_FLAG_SAME_UUID;
656 
657 		tag = (journal_block_tag_t *) tagp;
658 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
659 		tag->t_flags = cpu_to_be32(tag_flag);
660 		tagp += tag_bytes;
661 		space_left -= tag_bytes;
662 
663 		if (first_tag) {
664 			memcpy (tagp, journal->j_uuid, 16);
665 			tagp += 16;
666 			space_left -= 16;
667 			first_tag = 0;
668 		}
669 
670 		/* If there's no more to do, or if the descriptor is full,
671 		   let the IO rip! */
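		/*
		 * Editor's note (worked example, illustrative figures):
		 * with a 4096-byte journal block, sizeof(journal_header_t)
		 * = 12 leaves space_left = 4084.  The first tag also eats
		 * the 16-byte UUID and a 64-bit tag is 12 bytes, so one
		 * descriptor holds roughly (4084 - 16) / 12 = 339 tags.
		 * Reserving tag_bytes + 16 below keeps room for one more
		 * tag plus a possible UUID before we give up.
		 */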
672 
673 		if (bufs == journal->j_wbufsize ||
674 		    commit_transaction->t_buffers == NULL ||
675 		    space_left < tag_bytes + 16) {
676 
677 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
678 
679 			/* Write an end-of-descriptor marker before
680                            submitting the IOs.  "tag" still points to
681                            the last tag we set up. */
682 
683 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
684 
685 start_journal_io:
686 			for (i = 0; i < bufs; i++) {
687 				struct buffer_head *bh = wbuf[i];
688 				/*
689 				 * Compute checksum.
690 				 */
691 				if (JBD2_HAS_COMPAT_FEATURE(journal,
692 					JBD2_FEATURE_COMPAT_CHECKSUM)) {
693 					crc32_sum =
694 					    jbd2_checksum_data(crc32_sum, bh);
695 				}
696 
697 				lock_buffer(bh);
698 				clear_buffer_dirty(bh);
699 				set_buffer_uptodate(bh);
700 				bh->b_end_io = journal_end_buffer_io_sync;
701 				submit_bh(write_op, bh);
702 			}
703 			cond_resched();
704 			stats.run.rs_blocks_logged += bufs;
705 
706 			/* Force a new descriptor to be generated next
707                            time round the loop. */
708 			descriptor = NULL;
709 			bufs = 0;
710 		}
711 	}
712 
713 	/*
714 	 * If the journal is not located on the file system device,
715 	 * then we must flush the file system device before we issue
716 	 * the commit record
717 	 */
718 	if (commit_transaction->t_flushed_data_blocks &&
719 	    (journal->j_fs_dev != journal->j_dev) &&
720 	    (journal->j_flags & JBD2_BARRIER))
721 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
722 			BLKDEV_IFL_WAIT);
723 
724 	/* Done it all: now write the commit record asynchronously. */
725 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
726 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
727 		err = journal_submit_commit_record(journal, commit_transaction,
728 						 &cbh, crc32_sum);
729 		if (err)
730 			__jbd2_journal_abort_hard(journal);
731 		if (journal->j_flags & JBD2_BARRIER)
732 			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
733 				BLKDEV_IFL_WAIT);
734 	}
735 
736 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
737 	if (err) {
738 		printk(KERN_WARNING
739 			"JBD2: Detected IO errors while flushing file data "
740 		       "on %s\n", journal->j_devname);
741 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
742 			jbd2_journal_abort(journal, err);
743 		err = 0;
744 	}
745 
746 	/* Lo and behold: we have just managed to send a transaction to
747            the log.  Before we can commit it, wait for the IO so far to
748            complete.  Control buffers being written are on the
749            transaction's t_log_list queue, and metadata buffers are on
750            the t_iobuf_list queue.
751 
752 	   Wait for the buffers in reverse order.  That way we are
753 	   less likely to be woken up until all IOs have completed, and
754 	   so we incur less scheduling load.
755 	*/
756 
757 	jbd_debug(3, "JBD: commit phase 3\n");
758 
759 	/*
760 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
761 	 * See __journal_try_to_free_buffer.
762 	 */
763 wait_for_iobuf:
764 	while (commit_transaction->t_iobuf_list != NULL) {
765 		struct buffer_head *bh;
766 
767 		jh = commit_transaction->t_iobuf_list->b_tprev;
768 		bh = jh2bh(jh);
769 		if (buffer_locked(bh)) {
770 			wait_on_buffer(bh);
771 			goto wait_for_iobuf;
772 		}
773 		if (cond_resched())
774 			goto wait_for_iobuf;
775 
776 		if (unlikely(!buffer_uptodate(bh)))
777 			err = -EIO;
778 
779 		clear_buffer_jwrite(bh);
780 
781 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
782 		jbd2_journal_unfile_buffer(journal, jh);
783 
784 		/*
785 		 * ->t_iobuf_list should contain only dummy buffer_heads
786 		 * which were created by jbd2_journal_write_metadata_buffer().
787 		 */
788 		BUFFER_TRACE(bh, "dumping temporary bh");
789 		jbd2_journal_put_journal_head(jh);
790 		__brelse(bh);
791 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
792 		free_buffer_head(bh);
793 
794 		/* We also have to unlock and free the corresponding
795                    shadowed buffer */
796 		jh = commit_transaction->t_shadow_list->b_tprev;
797 		bh = jh2bh(jh);
798 		clear_bit(BH_JWrite, &bh->b_state);
799 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
800 
801 		/* The metadata is now released for reuse, but we need
802                    to remember it against this transaction so that when
803                    we finally commit, we can do any checkpointing
804                    required. */
805 		JBUFFER_TRACE(jh, "file as BJ_Forget");
806 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
807 		/* Wake up any transactions which were waiting for this
808 		   IO to complete */
809 		wake_up_bit(&bh->b_state, BH_Unshadow);
810 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
811 		__brelse(bh);
812 	}
813 
814 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
815 
816 	jbd_debug(3, "JBD: commit phase 4\n");
817 
818 	/* Here we wait for the revoke record and descriptor record buffers */
819  wait_for_ctlbuf:
820 	while (commit_transaction->t_log_list != NULL) {
821 		struct buffer_head *bh;
822 
823 		jh = commit_transaction->t_log_list->b_tprev;
824 		bh = jh2bh(jh);
825 		if (buffer_locked(bh)) {
826 			wait_on_buffer(bh);
827 			goto wait_for_ctlbuf;
828 		}
829 		if (cond_resched())
830 			goto wait_for_ctlbuf;
831 
832 		if (unlikely(!buffer_uptodate(bh)))
833 			err = -EIO;
834 
835 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
836 		clear_buffer_jwrite(bh);
837 		jbd2_journal_unfile_buffer(journal, jh);
838 		jbd2_journal_put_journal_head(jh);
839 		__brelse(bh);		/* One for getblk */
840 		/* AKPM: bforget here */
841 	}
842 
843 	if (err)
844 		jbd2_journal_abort(journal, err);
845 
846 	jbd_debug(3, "JBD: commit phase 5\n");
847 
848 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
849 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
850 		err = journal_submit_commit_record(journal, commit_transaction,
851 						&cbh, crc32_sum);
852 		if (err)
853 			__jbd2_journal_abort_hard(journal);
854 	}
855 	if (!err && !is_journal_aborted(journal))
856 		err = journal_wait_on_commit_record(journal, cbh);
857 
858 	if (err)
859 		jbd2_journal_abort(journal, err);
860 
861 	/* End of a transaction!  Finally, we can do checkpoint
862            processing: any buffers committed as a result of this
863            transaction can be removed from any checkpoint list it was on
864            before. */
865 
866 	jbd_debug(3, "JBD: commit phase 6\n");
867 
868 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
869 	J_ASSERT(commit_transaction->t_buffers == NULL);
870 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
871 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
872 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
873 	J_ASSERT(commit_transaction->t_log_list == NULL);
874 
875 restart_loop:
876 	/*
877 	 * As there are other places (journal_unmap_buffer()) adding buffers
878 	 * to this list we have to be careful and hold the j_list_lock.
879 	 */
880 	spin_lock(&journal->j_list_lock);
881 	while (commit_transaction->t_forget) {
882 		transaction_t *cp_transaction;
883 		struct buffer_head *bh;
884 
885 		jh = commit_transaction->t_forget;
886 		spin_unlock(&journal->j_list_lock);
887 		bh = jh2bh(jh);
888 		jbd_lock_bh_state(bh);
889 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
890 
891 		/*
892 		 * If there is undo-protected committed data against
893 		 * this buffer, then we can remove it now.  If it is a
894 		 * buffer needing such protection, the old frozen_data
895 		 * field now points to a committed version of the
896 		 * buffer, so rotate that field to the new committed
897 		 * data.
898 		 *
899 		 * Otherwise, we can just throw away the frozen data now.
900 		 *
901 		 * We also know that the frozen data has already fired
902 		 * its triggers if they exist, so we can clear that too.
903 		 */
904 		if (jh->b_committed_data) {
905 			jbd2_free(jh->b_committed_data, bh->b_size);
906 			jh->b_committed_data = NULL;
907 			if (jh->b_frozen_data) {
908 				jh->b_committed_data = jh->b_frozen_data;
909 				jh->b_frozen_data = NULL;
910 				jh->b_frozen_triggers = NULL;
911 			}
912 		} else if (jh->b_frozen_data) {
913 			jbd2_free(jh->b_frozen_data, bh->b_size);
914 			jh->b_frozen_data = NULL;
915 			jh->b_frozen_triggers = NULL;
916 		}
917 
918 		spin_lock(&journal->j_list_lock);
919 		cp_transaction = jh->b_cp_transaction;
920 		if (cp_transaction) {
921 			JBUFFER_TRACE(jh, "remove from old cp transaction");
922 			cp_transaction->t_chp_stats.cs_dropped++;
923 			__jbd2_journal_remove_checkpoint(jh);
924 		}
925 
926 		/* Only re-checkpoint the buffer_head if it is marked
927 		 * dirty.  If the buffer was added to the BJ_Forget list
928 		 * by jbd2_journal_forget, it may no longer be dirty and
929 		 * there's no point in keeping a checkpoint record for
930 		 * it. */
931 
932 		/* A buffer which has been freed while still being
933 		 * journaled by a previous transaction may end up still
934 		 * being dirty here, but we want to avoid writing back
935 		 * that buffer in the future after the "add to orphan"
936 		 * operation has been committed.  That's not only a performance
937 		 * gain, it also stops aliasing problems if the buffer is
938 		 * left behind for writeback and gets reallocated for another
939 		 * use in a different page. */
940 		if (buffer_freed(bh) && !jh->b_next_transaction) {
941 			clear_buffer_freed(bh);
942 			clear_buffer_jbddirty(bh);
943 		}
944 
945 		if (buffer_jbddirty(bh)) {
946 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
947 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
948 			if (is_journal_aborted(journal))
949 				clear_buffer_jbddirty(bh);
950 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
951 			__jbd2_journal_refile_buffer(jh);
952 			jbd_unlock_bh_state(bh);
953 		} else {
954 			J_ASSERT_BH(bh, !buffer_dirty(bh));
955 			/* The buffer on BJ_Forget list and not jbddirty means
956 			 * it has been freed by this transaction and hence it
957 			 * could not have been reallocated until this
958 			 * transaction has committed. *BUT* it could be
959 			 * reallocated once we have written all the data to
960 			 * disk and before we process the buffer on BJ_Forget
961 			 * list. */
962 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
963 			__jbd2_journal_refile_buffer(jh);
964 			if (!jh->b_transaction) {
965 				jbd_unlock_bh_state(bh);
966 				 /* needs a brelse */
967 				jbd2_journal_remove_journal_head(bh);
968 				release_buffer_page(bh);
969 			} else
970 				jbd_unlock_bh_state(bh);
971 		}
972 		cond_resched_lock(&journal->j_list_lock);
973 	}
974 	spin_unlock(&journal->j_list_lock);
975 	/*
976 	 * This is a bit sleazy.  We use j_list_lock to protect transition
977 	 * of a transaction into T_FINISHED state and calling
978 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
979 	 * other checkpointing code processing the transaction...
980 	 */
981 	write_lock(&journal->j_state_lock);
982 	spin_lock(&journal->j_list_lock);
983 	/*
984 	 * Now recheck if some buffers did not get attached to the transaction
985 	 * while the lock was dropped...
986 	 */
987 	if (commit_transaction->t_forget) {
988 		spin_unlock(&journal->j_list_lock);
989 		write_unlock(&journal->j_state_lock);
990 		goto restart_loop;
991 	}
992 
993 	/* Done with this transaction! */
994 
995 	jbd_debug(3, "JBD: commit phase 7\n");
996 
997 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
998 
999 	commit_transaction->t_start = jiffies;
1000 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1001 					      commit_transaction->t_start);
1002 
1003 	/*
1004 	 * File the transaction statistics
1005 	 */
1006 	stats.ts_tid = commit_transaction->t_tid;
1007 	stats.run.rs_handle_count =
1008 		atomic_read(&commit_transaction->t_handle_count);
1009 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1010 			     commit_transaction->t_tid, &stats.run);
1011 
1012 	/*
1013 	 * Calculate overall stats
1014 	 */
1015 	spin_lock(&journal->j_history_lock);
1016 	journal->j_stats.ts_tid++;
1017 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1018 	journal->j_stats.run.rs_running += stats.run.rs_running;
1019 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1020 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1021 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1022 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1023 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1024 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1025 	spin_unlock(&journal->j_history_lock);
1026 
1027 	commit_transaction->t_state = T_FINISHED;
1028 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1029 	journal->j_commit_sequence = commit_transaction->t_tid;
1030 	journal->j_committing_transaction = NULL;
1031 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1032 
1033 	/*
1034 	 * Weight the stored average higher than the new commit time so that
1035 	 * we don't react too strongly to vast changes in the commit time.
1036 	 */
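	/*
	 * Editor's note: a 3/4-old, 1/4-new exponential moving average.
	 * Worked example (illustrative numbers): previous average 8 ms and
	 * a 20 ms commit give (20 + 3 * 8) / 4 = 11 ms, so one outlier
	 * moves the estimate only a quarter of the way.
	 */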
1037 	if (likely(journal->j_average_commit_time))
1038 		journal->j_average_commit_time = (commit_time +
1039 				journal->j_average_commit_time*3) / 4;
1040 	else
1041 		journal->j_average_commit_time = commit_time;
1042 	write_unlock(&journal->j_state_lock);
1043 
1044 	if (commit_transaction->t_checkpoint_list == NULL &&
1045 	    commit_transaction->t_checkpoint_io_list == NULL) {
1046 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1047 		to_free = 1;
1048 	} else {
1049 		if (journal->j_checkpoint_transactions == NULL) {
1050 			journal->j_checkpoint_transactions = commit_transaction;
1051 			commit_transaction->t_cpnext = commit_transaction;
1052 			commit_transaction->t_cpprev = commit_transaction;
1053 		} else {
1054 			commit_transaction->t_cpnext =
1055 				journal->j_checkpoint_transactions;
1056 			commit_transaction->t_cpprev =
1057 				commit_transaction->t_cpnext->t_cpprev;
1058 			commit_transaction->t_cpnext->t_cpprev =
1059 				commit_transaction;
1060 			commit_transaction->t_cpprev->t_cpnext =
1061 				commit_transaction;
1062 		}
1063 	}
1064 	spin_unlock(&journal->j_list_lock);
1065 
1066 	if (journal->j_commit_callback)
1067 		journal->j_commit_callback(journal, commit_transaction);
1068 
1069 	trace_jbd2_end_commit(journal, commit_transaction);
1070 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1071 		  journal->j_commit_sequence, journal->j_tail_sequence);
1072 	if (to_free)
1073 		kfree(commit_transaction);
1074 
1075 	wake_up(&journal->j_wait_done_commit);
1076 }
1077