/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

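/*
 * Checksum the commit block: zero the checksum fields in the commit
 * header, checksum the whole descriptor buffer and store the result in
 * h_chksum[0].  A no-op unless the CSUM_V2 incompat feature is enabled.
 */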
static void jbd2_commit_block_csum_set(journal_t *j,
				       struct journal_head *descriptor)
{
	struct commit_header *h;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	h = (struct commit_header *)(jh2bh(descriptor)->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
			   j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, descriptor);

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function along with journal_submit_commit_record
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

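/*
 * Fold the contents of a buffer into the running CRC32 that the
 * JBD2_FEATURE_COMPAT_CHECKSUM code accumulates across the transaction
 * and finally stores in the commit block.
 */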
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

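/*
 * Store the on-disk block number in a descriptor tag.  The low 32 bits
 * always go in t_blocknr; tags wider than JBD2_TAG_SIZE32 also carry
 * the high half of the block number in t_blocknr_high.
 */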
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

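/*
 * Checksum a descriptor block and store the result in the
 * jbd2_journal_block_tail at the end of the block.  A no-op unless the
 * CSUM_V2 incompat feature is enabled.
 */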
static void jbd2_descr_block_csum_set(journal_t *j,
				      struct journal_head *descriptor)
{
	struct jbd2_journal_block_tail *tail;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	tail = (struct jbd2_journal_block_tail *)
			(jh2bh(descriptor)->b_data + j->j_blocksize -
			sizeof(struct jbd2_journal_block_tail));
	tail->t_checksum = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
			   j->j_blocksize);
	tail->t_checksum = cpu_to_be32(csum);
}

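/*
 * Checksum a metadata block and store the result in its descriptor tag.
 * The checksum covers the commit sequence number as well as the block
 * contents, so a stale block left over from an older transaction cannot
 * pass verification during replay.
 */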
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum;

	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		return;

	sequence = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
			  sizeof(sequence));
	csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
			  bh->b_size);
	kunmap_atomic(addr);

	tag->t_checksum = cpu_to_be32(csum);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
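 *
 * The commit proceeds through the numbered phases traced by the
 * jbd_debug() calls below: flush data buffers and revoke records to
 * disk, write the metadata blocks to the log, wait for that IO to
 * complete, write the commit record, and finally move the committed
 * buffers over to the checkpoint lists.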
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;

	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "superblock updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

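	/*
	 * The open-coded prepare_to_wait()/finish_wait() loop below avoids
	 * a lost wakeup: the task is queued on j_wait_updates before
	 * t_updates is re-checked, so the wake_up() issued when the last
	 * handle on this transaction is stopped cannot slip in between the
	 * check and the sleep.
	 */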
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it
	 * potentially frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
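
	/*
	 * Each pass of the loop below files one metadata buffer into the
	 * current descriptor block and queues a temporary copy of it for
	 * write-out.  Roughly, a finished descriptor block contains:
	 *
	 *	a journal_header_t,
	 *	tag 0, followed by the 16-byte journal UUID (first tag only),
	 *	tag 1 .. tag N, the last one flagged JBD2_FLAG_LAST_TAG,
	 *	a jbd2_journal_block_tail checksum (CSUM_V2 only).
	 */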
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled by
		 * a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that, since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

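	/*
	 * If nothing remains to be checkpointed, the transaction can be
	 * dropped right away; otherwise link it into the journal's circular
	 * list of transactions awaiting checkpoint.
	 */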
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		jbd2_journal_free_transaction(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}
1179