xref: /linux/fs/jbd2/commit.c (revision 07100877ea8fd9b2feabb4dd78f3322892f6bd77)
1  /*
2   * linux/fs/jbd2/commit.c
3   *
4   * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5   *
6   * Copyright 1998 Red Hat corp --- All Rights Reserved
7   *
8   * This file is part of the Linux kernel and is made available under
9   * the terms of the GNU General Public License, version 2, or at your
10   * option, any later version, incorporated herein by reference.
11   *
12   * Journal commit routines for the generic filesystem journaling code;
13   * part of the ext2fs journaling system.
14   */
15  
16  #include <linux/time.h>
17  #include <linux/fs.h>
18  #include <linux/jbd2.h>
19  #include <linux/errno.h>
20  #include <linux/slab.h>
21  #include <linux/mm.h>
22  #include <linux/pagemap.h>
23  #include <linux/jiffies.h>
24  #include <linux/crc32.h>
25  #include <linux/writeback.h>
26  #include <linux/backing-dev.h>
27  #include <linux/bio.h>
28  #include <linux/blkdev.h>
29  #include <linux/bitops.h>
30  #include <trace/events/jbd2.h>
31  
32  /*
33   * IO end handler for temporary buffer_heads handling writes to the journal.
34   */
35  static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36  {
37  	struct buffer_head *orig_bh = bh->b_private;
38  
39  	BUFFER_TRACE(bh, "");
40  	if (uptodate)
41  		set_buffer_uptodate(bh);
42  	else
43  		clear_buffer_uptodate(bh);
44  	if (orig_bh) {
45  		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46  		smp_mb__after_atomic();
47  		wake_up_bit(&orig_bh->b_state, BH_Shadow);
48  	}
49  	unlock_buffer(bh);
50  }
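
/*
 * Hedged illustration (not part of this file): the writer that blocks on
 * the shadow bit cleared above.  do_get_write_access() in transaction.c
 * waits for the shadow state to end roughly as follows, assuming the
 * generic wait_on_bit_io() helper:
 *
 *	wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 *
 * which returns once this handler has cleared BH_Shadow and issued the
 * wake_up_bit() above.
 */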
51  
52  /*
53   * When an ext4 file is truncated, it is possible that some pages are not
54   * successfully freed, because they are attached to a committing transaction.
55   * After the transaction commits, these pages are left on the LRU, with no
56   * ->mapping, and with attached buffers.  These pages are trivially reclaimable
57   * by the VM, but their apparent absence upsets the VM accounting, and it makes
58   * the numbers in /proc/meminfo look odd.
59   *
60   * So here, we have a buffer which has just come off the forget list.  Look to
61   * see if we can strip all buffers from the backing page.
62   *
63   * Called under lock_journal(), and possibly under journal_datalist_lock.  The
64   * caller provided us with a ref against the buffer, and we drop that here.
65   */
66  static void release_buffer_page(struct buffer_head *bh)
67  {
68  	struct page *page;
69  
70  	if (buffer_dirty(bh))
71  		goto nope;
72  	if (atomic_read(&bh->b_count) != 1)
73  		goto nope;
74  	page = bh->b_page;
75  	if (!page)
76  		goto nope;
77  	if (page->mapping)
78  		goto nope;
79  
80  	/* OK, it's a truncated page */
81  	if (!trylock_page(page))
82  		goto nope;
83  
84  	page_cache_get(page);
85  	__brelse(bh);
86  	try_to_free_buffers(page);
87  	unlock_page(page);
88  	page_cache_release(page);
89  	return;
90  
91  nope:
92  	__brelse(bh);
93  }
94  
95  static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
96  {
97  	struct commit_header *h;
98  	__u32 csum;
99  
100  	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
101  		return;
102  
103  	h = (struct commit_header *)(bh->b_data);
104  	h->h_chksum_type = 0;
105  	h->h_chksum_size = 0;
106  	h->h_chksum[0] = 0;
107  	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
108  	h->h_chksum[0] = cpu_to_be32(csum);
109  }
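
/*
 * A minimal sketch of the matching verify step (an assumption here; the
 * real check lives in recovery.c as jbd2_commit_block_csum_verify()):
 * zero the stored field, recompute over the whole block, then compare.
 */
static inline int jbd2_commit_block_csum_ok_sketch(journal_t *j, void *buf)
{
	struct commit_header *h = buf;
	__be32 provided = h->h_chksum[0];
	__u32 calculated;

	h->h_chksum[0] = 0;
	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
	h->h_chksum[0] = provided;	/* restore the on-disk value */

	return provided == cpu_to_be32(calculated);
}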
110  
111  /*
112   * Done it all: now submit the commit record.  We should have
113   * cleaned up our previous buffers by now, so if we are in abort
114   * mode we can now just skip the rest of the journal write
115   * entirely.
116   *
117   * Returns 1 if the journal needs to be aborted or 0 on success
118   */
119  static int journal_submit_commit_record(journal_t *journal,
120  					transaction_t *commit_transaction,
121  					struct buffer_head **cbh,
122  					__u32 crc32_sum)
123  {
124  	struct commit_header *tmp;
125  	struct buffer_head *bh;
126  	int ret;
127  	struct timespec now = current_kernel_time();
128  
129  	*cbh = NULL;
130  
131  	if (is_journal_aborted(journal))
132  		return 0;
133  
134  	bh = jbd2_journal_get_descriptor_buffer(journal);
135  	if (!bh)
136  		return 1;
137  
138  	tmp = (struct commit_header *)bh->b_data;
139  	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140  	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141  	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142  	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143  	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
144  
145  	if (JBD2_HAS_COMPAT_FEATURE(journal,
146  				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
147  		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
148  		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
149  		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
150  	}
151  	jbd2_commit_block_csum_set(journal, bh);
152  
153  	BUFFER_TRACE(bh, "submit commit block");
154  	lock_buffer(bh);
155  	clear_buffer_dirty(bh);
156  	set_buffer_uptodate(bh);
157  	bh->b_end_io = journal_end_buffer_io_sync;
158  
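	/*
	 * With barriers enabled and a synchronous commit, FLUSH+FUA orders
	 * the commit record after all previously submitted journal IO and
	 * makes it durable on media; async commit instead relies on the
	 * explicit flush issued later in the commit path.
	 */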
159  	if (journal->j_flags & JBD2_BARRIER &&
160  	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
161  				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
162  		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
163  	else
164  		ret = submit_bh(WRITE_SYNC, bh);
165  
166  	*cbh = bh;
167  	return ret;
168  }
169  
170  /*
171   * This function, along with journal_submit_commit_record(),
172   * allows the commit record to be written asynchronously.
173   */
174  static int journal_wait_on_commit_record(journal_t *journal,
175  					 struct buffer_head *bh)
176  {
177  	int ret = 0;
178  
179  	clear_buffer_dirty(bh);
180  	wait_on_buffer(bh);
181  
182  	if (unlikely(!buffer_uptodate(bh)))
183  		ret = -EIO;
184  	put_bh(bh);            /* One for getblk() */
185  
186  	return ret;
187  }
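
/*
 * Hedged usage note: with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT the commit
 * path below pairs these two helpers roughly as:
 *
 *	err = journal_submit_commit_record(journal, tx, &cbh, crc32_sum);
 *	... drain io_bufs and log_bufs ...
 *	if (cbh)
 *		err = journal_wait_on_commit_record(journal, cbh);
 *
 * so the commit record IO overlaps with the wait for the metadata writes.
 */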
188  
189  /*
190   * Write the filemap data using the writepage() address_space_operation.
191   * We don't do block allocation here even for delalloc. We don't
192   * use writepages() because with delayed allocation we may be doing
193   * block allocation in writepages().
194   */
195  static int journal_submit_inode_data_buffers(struct address_space *mapping)
196  {
197  	int ret;
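	/*
	 * nr_to_write is doubled as a safety margin so writeback does not
	 * stop short if pages are redirtied while we scan; range_end stops
	 * at i_size so we never touch pages beyond EOF.
	 */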
198  	struct writeback_control wbc = {
199  		.sync_mode =  WB_SYNC_ALL,
200  		.nr_to_write = mapping->nrpages * 2,
201  		.range_start = 0,
202  		.range_end = i_size_read(mapping->host),
203  	};
204  
205  	ret = generic_writepages(mapping, &wbc);
206  	return ret;
207  }
208  
209  /*
210   * Submit all the data buffers of inode associated with the transaction to
211   * disk.
212   *
213   * We are in a committing transaction. Therefore no new inode can be added to
214   * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode
215   * we currently operate on from being released while we write out pages.
216   */
217  static int journal_submit_data_buffers(journal_t *journal,
218  		transaction_t *commit_transaction)
219  {
220  	struct jbd2_inode *jinode;
221  	int err, ret = 0;
222  	struct address_space *mapping;
223  
224  	spin_lock(&journal->j_list_lock);
225  	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
226  		mapping = jinode->i_vfs_inode->i_mapping;
227  		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
228  		spin_unlock(&journal->j_list_lock);
229  		/*
230  		 * Submit the inode data buffers. We use writepage
231  		 * instead of writepages because writepages can do
232  		 * block allocation with delalloc. We need to write
233  		 * only allocated blocks here.
234  		 */
235  		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
236  		err = journal_submit_inode_data_buffers(mapping);
237  		if (!ret)
238  			ret = err;
239  		spin_lock(&journal->j_list_lock);
240  		J_ASSERT(jinode->i_transaction == commit_transaction);
241  		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
242  		smp_mb__after_atomic();
243  		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
244  	}
245  	spin_unlock(&journal->j_list_lock);
246  	return ret;
247  }
248  
249  /*
250   * Wait for data submitted for writeout, and refile inodes to the
251   * proper transaction if needed.
252   */
254  static int journal_finish_inode_data_buffers(journal_t *journal,
255  		transaction_t *commit_transaction)
256  {
257  	struct jbd2_inode *jinode, *next_i;
258  	int err, ret = 0;
259  
260  	/* For locking, see the comment in journal_submit_data_buffers() */
261  	spin_lock(&journal->j_list_lock);
262  	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
263  		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
264  		spin_unlock(&journal->j_list_lock);
265  		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
266  		if (err) {
267  			/*
268  			 * Because AS_EIO is cleared by
269  			 * filemap_fdatawait_range(), set it again so
270  			 * that user process can get -EIO from fsync().
271  			 */
272  			set_bit(AS_EIO,
273  				&jinode->i_vfs_inode->i_mapping->flags);
274  
275  			if (!ret)
276  				ret = err;
277  		}
278  		spin_lock(&journal->j_list_lock);
279  		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
280  		smp_mb__after_atomic();
281  		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
282  	}
283  
284  	/* Now refile inode to proper lists */
285  	list_for_each_entry_safe(jinode, next_i,
286  				 &commit_transaction->t_inode_list, i_list) {
287  		list_del(&jinode->i_list);
288  		if (jinode->i_next_transaction) {
289  			jinode->i_transaction = jinode->i_next_transaction;
290  			jinode->i_next_transaction = NULL;
291  			list_add(&jinode->i_list,
292  				&jinode->i_transaction->t_inode_list);
293  		} else {
294  			jinode->i_transaction = NULL;
295  		}
296  	}
297  	spin_unlock(&journal->j_list_lock);
298  
299  	return ret;
300  }
301  
302  static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
303  {
304  	struct page *page = bh->b_page;
305  	char *addr;
306  	__u32 checksum;
307  
308  	addr = kmap_atomic(page);
309  	checksum = crc32_be(crc32_sum,
310  		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
311  	kunmap_atomic(addr);
312  
313  	return checksum;
314  }
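
/*
 * Hedged usage note: with JBD2_FEATURE_COMPAT_CHECKSUM the commit loop
 * below folds every journal block into one running crc32, seeded with ~0:
 *
 *	__u32 crc32_sum = ~0;
 *	for (i = 0; i < bufs; i++)
 *		crc32_sum = jbd2_checksum_data(crc32_sum, wbuf[i]);
 *
 * journal_submit_commit_record() then stores the final value in the
 * commit header's h_chksum[0].
 */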
315  
316  static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
317  				   unsigned long long block)
318  {
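	/*
	 * The low 32 bits always go in t_blocknr; t_blocknr_high is written
	 * only for journals using 64-bit tags.  The (>> 31) >> 1 double
	 * shift is the portable way to shift right by 32, which would be
	 * undefined if the operand were ever a 32-bit type.
	 */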
319  	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
320  	if (tag_bytes > JBD2_TAG_SIZE32)
321  		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
322  }
323  
324  static void jbd2_descr_block_csum_set(journal_t *j,
325  				      struct buffer_head *bh)
326  {
327  	struct jbd2_journal_block_tail *tail;
328  	__u32 csum;
329  
330  	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
331  		return;
332  
333  	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
334  			sizeof(struct jbd2_journal_block_tail));
335  	tail->t_checksum = 0;
336  	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
337  	tail->t_checksum = cpu_to_be32(csum);
338  }
339  
340  static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
341  				    struct buffer_head *bh, __u32 sequence)
342  {
343  	struct page *page = bh->b_page;
344  	__u8 *addr;
345  	__u32 csum32;
346  	__be32 seq;
347  
348  	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349  		return;
350  
351  	seq = cpu_to_be32(sequence);
352  	addr = kmap_atomic(page);
353  	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
354  	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
355  			     bh->b_size);
356  	kunmap_atomic(addr);
357  
358  	/* We only have space to store the lower 16 bits of the crc32c. */
359  	tag->t_checksum = cpu_to_be16(csum32);
360  }
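
/*
 * A minimal sketch (an assumption; cf. jbd2_block_tag_csum_verify() in
 * recovery.c) of what recovery must do with the truncated checksum: mask
 * the recomputed crc32c down to 16 bits the same way before comparing.
 */
static inline int jbd2_tag_csum_ok_sketch(journal_block_tag_t *tag,
					  __u32 csum32)
{
	return tag->t_checksum == cpu_to_be16((__u16)csum32);
}
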
361  /*
362   * jbd2_journal_commit_transaction
363   *
364   * The primary function for committing a transaction to the log.  This
365   * function is called by the journal thread to begin a complete commit.
366   */
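/*
 * Roughly, the numbered phases below (matching the jbd_debug() markers)
 * are:
 *   1  - discard reserved buffers, switch the revoke table
 *   2a - submit the data buffers of all inodes in the transaction
 *   2b - write revoke records and the transaction's metadata to the log
 *   3  - wait for the temporary metadata buffers (io_bufs) to complete
 *   4  - wait for the descriptor/revoke control buffers (log_bufs)
 *   5  - write and wait on the commit record, issuing barriers as needed
 *   6  - process the forget list and do checkpoint accounting
 *   7  - update statistics and mark the transaction T_FINISHED
 */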
367  void jbd2_journal_commit_transaction(journal_t *journal)
368  {
369  	struct transaction_stats_s stats;
370  	transaction_t *commit_transaction;
371  	struct journal_head *jh;
372  	struct buffer_head *descriptor;
373  	struct buffer_head **wbuf = journal->j_wbuf;
374  	int bufs;
375  	int flags;
376  	int err;
377  	unsigned long long blocknr;
378  	ktime_t start_time;
379  	u64 commit_time;
380  	char *tagp = NULL;
381  	journal_header_t *header;
382  	journal_block_tag_t *tag = NULL;
383  	int space_left = 0;
384  	int first_tag = 0;
385  	int tag_flag;
386  	int i;
387  	int tag_bytes = journal_tag_bytes(journal);
388  	struct buffer_head *cbh = NULL; /* For transactional checksums */
389  	__u32 crc32_sum = ~0;
390  	struct blk_plug plug;
391  	/* Tail of the journal */
392  	unsigned long first_block;
393  	tid_t first_tid;
394  	int update_tail;
395  	int csum_size = 0;
396  	LIST_HEAD(io_bufs);
397  	LIST_HEAD(log_bufs);
398  
399  	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
400  		csum_size = sizeof(struct jbd2_journal_block_tail);
401  
402  	/*
403  	 * First job: lock down the current transaction and wait for
404  	 * all outstanding updates to complete.
405  	 */
406  
407  	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
408  	if (journal->j_flags & JBD2_FLUSHED) {
409  		jbd_debug(3, "super block updated\n");
410  		mutex_lock(&journal->j_checkpoint_mutex);
411  		/*
412  		 * We hold j_checkpoint_mutex so tail cannot change under us.
413  		 * We don't need any special data guarantees for writing sb
414  		 * since journal is empty and it is ok for write to be
415  		 * flushed only with transaction commit.
416  		 */
417  		jbd2_journal_update_sb_log_tail(journal,
418  						journal->j_tail_sequence,
419  						journal->j_tail,
420  						WRITE_SYNC);
421  		mutex_unlock(&journal->j_checkpoint_mutex);
422  	} else {
423  		jbd_debug(3, "superblock not updated\n");
424  	}
425  
426  	J_ASSERT(journal->j_running_transaction != NULL);
427  	J_ASSERT(journal->j_committing_transaction == NULL);
428  
429  	commit_transaction = journal->j_running_transaction;
430  
431  	trace_jbd2_start_commit(journal, commit_transaction);
432  	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
433  			commit_transaction->t_tid);
434  
435  	write_lock(&journal->j_state_lock);
436  	J_ASSERT(commit_transaction->t_state == T_RUNNING);
437  	commit_transaction->t_state = T_LOCKED;
438  
439  	trace_jbd2_commit_locking(journal, commit_transaction);
440  	stats.run.rs_wait = commit_transaction->t_max_wait;
441  	stats.run.rs_request_delay = 0;
442  	stats.run.rs_locked = jiffies;
443  	if (commit_transaction->t_requested)
444  		stats.run.rs_request_delay =
445  			jbd2_time_diff(commit_transaction->t_requested,
446  				       stats.run.rs_locked);
447  	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
448  					      stats.run.rs_locked);
449  
450  	spin_lock(&commit_transaction->t_handle_lock);
451  	while (atomic_read(&commit_transaction->t_updates)) {
452  		DEFINE_WAIT(wait);
453  
454  		prepare_to_wait(&journal->j_wait_updates, &wait,
455  					TASK_UNINTERRUPTIBLE);
456  		if (atomic_read(&commit_transaction->t_updates)) {
457  			spin_unlock(&commit_transaction->t_handle_lock);
458  			write_unlock(&journal->j_state_lock);
459  			schedule();
460  			write_lock(&journal->j_state_lock);
461  			spin_lock(&commit_transaction->t_handle_lock);
462  		}
463  		finish_wait(&journal->j_wait_updates, &wait);
464  	}
465  	spin_unlock(&commit_transaction->t_handle_lock);
466  
467  	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
468  			journal->j_max_transaction_buffers);
469  
470  	/*
471  	 * First thing we are allowed to do is to discard any remaining
472  	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
473  	 * that there are no such buffers: if a large filesystem
474  	 * operation like a truncate needs to split itself over multiple
475  	 * transactions, then it may try to do a jbd2_journal_restart() while
476  	 * there are still BJ_Reserved buffers outstanding.  These must
477  	 * be released cleanly from the current transaction.
478  	 *
479  	 * In this case, the filesystem must still reserve write access
480  	 * again before modifying the buffer in the new transaction, but
481  	 * we do not require it to remember exactly which old buffers it
482  	 * has reserved.  This is consistent with the existing behaviour
483  	 * that multiple jbd2_journal_get_write_access() calls to the same
484  	 * buffer are perfectly permissible.
485  	 */
486  	while (commit_transaction->t_reserved_list) {
487  		jh = commit_transaction->t_reserved_list;
488  		JBUFFER_TRACE(jh, "reserved, unused: refile");
489  		/*
490  		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
491  		 * leave undo-committed data.
492  		 */
493  		if (jh->b_committed_data) {
494  			struct buffer_head *bh = jh2bh(jh);
495  
496  			jbd_lock_bh_state(bh);
497  			jbd2_free(jh->b_committed_data, bh->b_size);
498  			jh->b_committed_data = NULL;
499  			jbd_unlock_bh_state(bh);
500  		}
501  		jbd2_journal_refile_buffer(journal, jh);
502  	}
503  
504  	/*
505  	 * Now try to drop any written-back buffers from the journal's
506  	 * checkpoint lists.  We do this *before* commit because it potentially
507  	 * frees some memory.
508  	 */
509  	spin_lock(&journal->j_list_lock);
510  	__jbd2_journal_clean_checkpoint_list(journal);
511  	spin_unlock(&journal->j_list_lock);
512  
513  	jbd_debug(3, "JBD2: commit phase 1\n");
514  
515  	/*
516  	 * Clear the revoked flag to reflect that there are no revoked
517  	 * buffers in the next transaction, which is about to be started.
518  	 */
519  	jbd2_clear_buffer_revoked_flags(journal);
520  
521  	/*
522  	 * Switch to a new revoke table.
523  	 */
524  	jbd2_journal_switch_revoke_table(journal);
525  
526  	/*
527  	 * Reserved credits cannot be claimed anymore, free them
528  	 */
529  	atomic_sub(atomic_read(&journal->j_reserved_credits),
530  		   &commit_transaction->t_outstanding_credits);
531  
532  	trace_jbd2_commit_flushing(journal, commit_transaction);
533  	stats.run.rs_flushing = jiffies;
534  	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
535  					     stats.run.rs_flushing);
536  
537  	commit_transaction->t_state = T_FLUSH;
538  	journal->j_committing_transaction = commit_transaction;
539  	journal->j_running_transaction = NULL;
540  	start_time = ktime_get();
541  	commit_transaction->t_log_start = journal->j_head;
542  	wake_up(&journal->j_wait_transaction_locked);
543  	write_unlock(&journal->j_state_lock);
544  
545  	jbd_debug(3, "JBD2: commit phase 2a\n");
546  
547  	/*
548  	 * Now start flushing things to disk, in the order they appear
549  	 * on the transaction lists.  Data blocks go first.
550  	 */
551  	err = journal_submit_data_buffers(journal, commit_transaction);
552  	if (err)
553  		jbd2_journal_abort(journal, err);
554  
555  	blk_start_plug(&plug);
556  	jbd2_journal_write_revoke_records(journal, commit_transaction,
557  					  &log_bufs, WRITE_SYNC);
558  
559  	jbd_debug(3, "JBD2: commit phase 2b\n");
560  
561  	/*
562  	 * Way to go: we have now written out all of the data for a
563  	 * transaction!  Now comes the tricky part: we need to write out
564  	 * metadata.  Loop over the transaction's entire buffer list:
565  	 */
566  	write_lock(&journal->j_state_lock);
567  	commit_transaction->t_state = T_COMMIT;
568  	write_unlock(&journal->j_state_lock);
569  
570  	trace_jbd2_commit_logging(journal, commit_transaction);
571  	stats.run.rs_logging = jiffies;
572  	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
573  					       stats.run.rs_logging);
574  	stats.run.rs_blocks =
575  		atomic_read(&commit_transaction->t_outstanding_credits);
576  	stats.run.rs_blocks_logged = 0;
577  
578  	J_ASSERT(commit_transaction->t_nr_buffers <=
579  		 atomic_read(&commit_transaction->t_outstanding_credits));
580  
581  	err = 0;
582  	bufs = 0;
583  	descriptor = NULL;
584  	while (commit_transaction->t_buffers) {
585  
586  		/* Find the next buffer to be journaled... */
587  
588  		jh = commit_transaction->t_buffers;
589  
590  		/* If we're in abort mode, we just un-journal the buffer and
591  		   release it. */
592  
593  		if (is_journal_aborted(journal)) {
594  			clear_buffer_jbddirty(jh2bh(jh));
595  			JBUFFER_TRACE(jh, "journal is aborting: refile");
596  			jbd2_buffer_abort_trigger(jh,
597  						  jh->b_frozen_data ?
598  						  jh->b_frozen_triggers :
599  						  jh->b_triggers);
600  			jbd2_journal_refile_buffer(journal, jh);
601  			/* If that was the last one, we need to clean up
602  			 * any descriptor buffers which may have been
603  			 * already allocated, even if we are now
604  			 * aborting. */
605  			if (!commit_transaction->t_buffers)
606  				goto start_journal_io;
607  			continue;
608  		}
609  
610  		/* Make sure we have a descriptor block in which to
611  		   record the metadata buffer. */
612  
613  		if (!descriptor) {
614  			J_ASSERT (bufs == 0);
615  
616  			jbd_debug(4, "JBD2: get descriptor\n");
617  
618  			descriptor = jbd2_journal_get_descriptor_buffer(journal);
619  			if (!descriptor) {
620  				jbd2_journal_abort(journal, -EIO);
621  				continue;
622  			}
623  
624  			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
625  				(unsigned long long)descriptor->b_blocknr,
626  				descriptor->b_data);
627  			header = (journal_header_t *)descriptor->b_data;
628  			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
629  			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
630  			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
631  
632  			tagp = &descriptor->b_data[sizeof(journal_header_t)];
633  			space_left = descriptor->b_size -
634  						sizeof(journal_header_t);
635  			first_tag = 1;
636  			set_buffer_jwrite(descriptor);
637  			set_buffer_dirty(descriptor);
638  			wbuf[bufs++] = descriptor;
639  
640  			/* Record it so that we can wait for IO
641                             completion later */
642  			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
643  			jbd2_file_log_bh(&log_bufs, descriptor);
644  		}
645  
646  		/* Where is the buffer to be written? */
647  
648  		err = jbd2_journal_next_log_block(journal, &blocknr);
649  		/* If the block mapping failed, just abandon the buffer
650  		   and repeat this loop: we'll fall into the
651  		   refile-on-abort condition above. */
652  		if (err) {
653  			jbd2_journal_abort(journal, err);
654  			continue;
655  		}
656  
657  		/*
658  		 * start_this_handle() uses t_outstanding_credits to determine
659  		 * the free space in the log, but this counter is also
660  		 * changed by jbd2_journal_next_log_block().
661  		 */
662  		atomic_dec(&commit_transaction->t_outstanding_credits);
663  
664  		/* Bump b_count to prevent truncate from stumbling over
665                     the shadowed buffer!  @@@ This can go if we ever get
666                     rid of the shadow pairing of buffers. */
667  		atomic_inc(&jh2bh(jh)->b_count);
668  
669  		/*
670  		 * Make a temporary IO buffer with which to write it out
671  		 * (this will requeue the metadata buffer to BJ_Shadow).
672  		 */
673  		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
674  		JBUFFER_TRACE(jh, "ph3: write metadata");
675  		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
676  						jh, &wbuf[bufs], blocknr);
677  		if (flags < 0) {
678  			jbd2_journal_abort(journal, flags);
679  			continue;
680  		}
681  		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
682  
683  		/* Record the new block's tag in the current descriptor
684                     buffer */
685  
686  		tag_flag = 0;
687  		if (flags & 1)
688  			tag_flag |= JBD2_FLAG_ESCAPE;
689  		if (!first_tag)
690  			tag_flag |= JBD2_FLAG_SAME_UUID;
691  
692  		tag = (journal_block_tag_t *) tagp;
693  		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
694  		tag->t_flags = cpu_to_be16(tag_flag);
695  		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
696  					commit_transaction->t_tid);
697  		tagp += tag_bytes;
698  		space_left -= tag_bytes;
699  		bufs++;
700  
701  		if (first_tag) {
702  			memcpy (tagp, journal->j_uuid, 16);
703  			tagp += 16;
704  			space_left -= 16;
705  			first_tag = 0;
706  		}
707  
708  		/* If there's no more to do, or if the descriptor is full,
709  		   let the IO rip! */
710  
711  		if (bufs == journal->j_wbufsize ||
712  		    commit_transaction->t_buffers == NULL ||
713  		    space_left < tag_bytes + 16 + csum_size) {
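			/*
			 * "16" reserves room for the journal UUID that
			 * follows the first tag; csum_size leaves room for
			 * the descriptor block's tail checksum.
			 */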
714  
715  			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
716  
717  			/* Write an end-of-descriptor marker before
718                             submitting the IOs.  "tag" still points to
719                             the last tag we set up. */
720  
721  			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
722  
723  			jbd2_descr_block_csum_set(journal, descriptor);
724  start_journal_io:
725  			for (i = 0; i < bufs; i++) {
726  				struct buffer_head *bh = wbuf[i];
727  				/*
728  				 * Compute checksum.
729  				 */
730  				if (JBD2_HAS_COMPAT_FEATURE(journal,
731  					JBD2_FEATURE_COMPAT_CHECKSUM)) {
732  					crc32_sum =
733  					    jbd2_checksum_data(crc32_sum, bh);
734  				}
735  
736  				lock_buffer(bh);
737  				clear_buffer_dirty(bh);
738  				set_buffer_uptodate(bh);
739  				bh->b_end_io = journal_end_buffer_io_sync;
740  				submit_bh(WRITE_SYNC, bh);
741  			}
742  			cond_resched();
743  			stats.run.rs_blocks_logged += bufs;
744  
745  			/* Force a new descriptor to be generated next
746                             time round the loop. */
747  			descriptor = NULL;
748  			bufs = 0;
749  		}
750  	}
751  
752  	err = journal_finish_inode_data_buffers(journal, commit_transaction);
753  	if (err) {
754  		printk(KERN_WARNING
755  			"JBD2: Detected IO errors while flushing file data "
756  		       "on %s\n", journal->j_devname);
757  		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
758  			jbd2_journal_abort(journal, err);
759  		err = 0;
760  	}
761  
762  	/*
763  	 * Get current oldest transaction in the log before we issue flush
764  	 * to the filesystem device. After the flush we can be sure that
765  	 * blocks of all older transactions are checkpointed to persistent
766  	 * storage and we will be safe to update journal start in the
767  	 * superblock with the numbers we get here.
768  	 */
769  	update_tail =
770  		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
771  
772  	write_lock(&journal->j_state_lock);
773  	if (update_tail) {
774  		long freed = first_block - journal->j_tail;
775  
776  		if (first_block < journal->j_tail)
777  			freed += journal->j_last - journal->j_first;
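		/*
		 * The log is circular: if the tail wrapped past the old
		 * position, the subtraction above went negative, so add
		 * back the usable log size (j_last - j_first).
		 */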
778  		/* Update tail only if we free significant amount of space */
779  		if (freed < journal->j_maxlen / 4)
780  			update_tail = 0;
781  	}
782  	J_ASSERT(commit_transaction->t_state == T_COMMIT);
783  	commit_transaction->t_state = T_COMMIT_DFLUSH;
784  	write_unlock(&journal->j_state_lock);
785  
786  	/*
787  	 * If the journal is not located on the file system device,
788  	 * then we must flush the file system device before we issue
789  	 * the commit record
790  	 */
791  	if (commit_transaction->t_need_data_flush &&
792  	    (journal->j_fs_dev != journal->j_dev) &&
793  	    (journal->j_flags & JBD2_BARRIER))
794  		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
795  
796  	/* Done it all: now write the commit record asynchronously. */
797  	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
798  				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
799  		err = journal_submit_commit_record(journal, commit_transaction,
800  						 &cbh, crc32_sum);
801  		if (err)
802  			__jbd2_journal_abort_hard(journal);
803  	}
804  
805  	blk_finish_plug(&plug);
806  
807  	/* Lo and behold: we have just managed to send a transaction to
808             the log.  Before we can commit it, wait for the IO so far to
809             complete.  Control buffers being written are on the
810             transaction's t_log_list queue, and metadata buffers are on
811             the io_bufs list.
812  
813  	   Wait for the buffers in reverse order.  That way we are
814  	   less likely to be woken up until all IOs have completed, and
815  	   so we incur less scheduling load.
816  	*/
817  
818  	jbd_debug(3, "JBD2: commit phase 3\n");
819  
820  	while (!list_empty(&io_bufs)) {
821  		struct buffer_head *bh = list_entry(io_bufs.prev,
822  						    struct buffer_head,
823  						    b_assoc_buffers);
824  
825  		wait_on_buffer(bh);
826  		cond_resched();
827  
828  		if (unlikely(!buffer_uptodate(bh)))
829  			err = -EIO;
830  		jbd2_unfile_log_bh(bh);
831  
832  		/*
833  		 * The list contains temporary buffer heads created by
834  		 * jbd2_journal_write_metadata_buffer().
835  		 */
836  		BUFFER_TRACE(bh, "dumping temporary bh");
837  		__brelse(bh);
838  		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
839  		free_buffer_head(bh);
840  
841  		/* We also have to refile the corresponding shadowed buffer */
842  		jh = commit_transaction->t_shadow_list->b_tprev;
843  		bh = jh2bh(jh);
844  		clear_buffer_jwrite(bh);
845  		J_ASSERT_BH(bh, buffer_jbddirty(bh));
846  		J_ASSERT_BH(bh, !buffer_shadow(bh));
847  
848  		/* The metadata is now released for reuse, but we need
849                     to remember it against this transaction so that when
850                     we finally commit, we can do any checkpointing
851                     required. */
852  		JBUFFER_TRACE(jh, "file as BJ_Forget");
853  		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
854  		JBUFFER_TRACE(jh, "brelse shadowed buffer");
855  		__brelse(bh);
856  	}
857  
858  	J_ASSERT (commit_transaction->t_shadow_list == NULL);
859  
860  	jbd_debug(3, "JBD2: commit phase 4\n");
861  
862  	/* Here we wait for the revoke record and descriptor record buffers */
863  	while (!list_empty(&log_bufs)) {
864  		struct buffer_head *bh;
865  
866  		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
867  		wait_on_buffer(bh);
868  		cond_resched();
869  
870  		if (unlikely(!buffer_uptodate(bh)))
871  			err = -EIO;
872  
873  		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
874  		clear_buffer_jwrite(bh);
875  		jbd2_unfile_log_bh(bh);
876  		__brelse(bh);		/* One for getblk */
877  		/* AKPM: bforget here */
878  	}
879  
880  	if (err)
881  		jbd2_journal_abort(journal, err);
882  
883  	jbd_debug(3, "JBD2: commit phase 5\n");
884  	write_lock(&journal->j_state_lock);
885  	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
886  	commit_transaction->t_state = T_COMMIT_JFLUSH;
887  	write_unlock(&journal->j_state_lock);
888  
889  	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
890  				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
891  		err = journal_submit_commit_record(journal, commit_transaction,
892  						&cbh, crc32_sum);
893  		if (err)
894  			__jbd2_journal_abort_hard(journal);
895  	}
896  	if (cbh)
897  		err = journal_wait_on_commit_record(journal, cbh);
898  	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
899  				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
900  	    journal->j_flags & JBD2_BARRIER) {
901  		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
902  	}
903  
904  	if (err)
905  		jbd2_journal_abort(journal, err);
906  
907  	/*
908  	 * Now disk caches for filesystem device are flushed so we are safe to
909  	 * erase checkpointed transactions from the log by updating journal
910  	 * superblock.
911  	 */
912  	if (update_tail)
913  		jbd2_update_log_tail(journal, first_tid, first_block);
914  
915  	/* End of a transaction!  Finally, we can do checkpoint
916             processing: any buffers committed as a result of this
917             transaction can be removed from any checkpoint list it was on
918             before. */
919  
920  	jbd_debug(3, "JBD2: commit phase 6\n");
921  
922  	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
923  	J_ASSERT(commit_transaction->t_buffers == NULL);
924  	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
925  	J_ASSERT(commit_transaction->t_shadow_list == NULL);
926  
927  restart_loop:
928  	/*
929  	 * As there are other places (journal_unmap_buffer()) adding buffers
930  	 * to this list we have to be careful and hold the j_list_lock.
931  	 */
932  	spin_lock(&journal->j_list_lock);
933  	while (commit_transaction->t_forget) {
934  		transaction_t *cp_transaction;
935  		struct buffer_head *bh;
936  		int try_to_free = 0;
937  
938  		jh = commit_transaction->t_forget;
939  		spin_unlock(&journal->j_list_lock);
940  		bh = jh2bh(jh);
941  		/*
942  		 * Get a reference so that bh cannot be freed before we are
943  		 * done with it.
944  		 */
945  		get_bh(bh);
946  		jbd_lock_bh_state(bh);
947  		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
948  
949  		/*
950  		 * If there is undo-protected committed data against
951  		 * this buffer, then we can remove it now.  If it is a
952  		 * buffer needing such protection, the old frozen_data
953  		 * field now points to a committed version of the
954  		 * buffer, so rotate that field to the new committed
955  		 * data.
956  		 *
957  		 * Otherwise, we can just throw away the frozen data now.
958  		 *
959  		 * We also know that the frozen data has already fired
960  		 * its triggers if they exist, so we can clear that too.
961  		 */
962  		if (jh->b_committed_data) {
963  			jbd2_free(jh->b_committed_data, bh->b_size);
964  			jh->b_committed_data = NULL;
965  			if (jh->b_frozen_data) {
966  				jh->b_committed_data = jh->b_frozen_data;
967  				jh->b_frozen_data = NULL;
968  				jh->b_frozen_triggers = NULL;
969  			}
970  		} else if (jh->b_frozen_data) {
971  			jbd2_free(jh->b_frozen_data, bh->b_size);
972  			jh->b_frozen_data = NULL;
973  			jh->b_frozen_triggers = NULL;
974  		}
975  
976  		spin_lock(&journal->j_list_lock);
977  		cp_transaction = jh->b_cp_transaction;
978  		if (cp_transaction) {
979  			JBUFFER_TRACE(jh, "remove from old cp transaction");
980  			cp_transaction->t_chp_stats.cs_dropped++;
981  			__jbd2_journal_remove_checkpoint(jh);
982  		}
983  
984  		/* Only re-checkpoint the buffer_head if it is marked
985  		 * dirty.  If the buffer was added to the BJ_Forget list
986  		 * by jbd2_journal_forget, it may no longer be dirty and
987  		 * there's no point in keeping a checkpoint record for
988  		 * it. */
989  
990  		/*
991  		 * A buffer which has been freed while still being journaled by
992  		 * a previous transaction.
993  		 */
994  		if (buffer_freed(bh)) {
995  			/*
996  			 * If the running transaction is the one containing
997  			 * "add to orphan" operation (b_next_transaction !=
998  			 * NULL), we have to wait for that transaction to
999  			 * commit before we can really get rid of the buffer.
1000  			 * So just clear b_modified to not confuse transaction
1001  			 * credit accounting and refile the buffer to
1002  			 * BJ_Forget of the running transaction. If the just
1003  			 * committed transaction contains "add to orphan"
1004  			 * operation, we can completely invalidate the buffer
1005  			 * now. We are rather thorough in that, since the
1006  			 * buffer may still be accessible when blocksize <
1007  			 * pagesize and it is attached to the last partial
1008  			 * page.
1009  			 */
1010  			jh->b_modified = 0;
1011  			if (!jh->b_next_transaction) {
1012  				clear_buffer_freed(bh);
1013  				clear_buffer_jbddirty(bh);
1014  				clear_buffer_mapped(bh);
1015  				clear_buffer_new(bh);
1016  				clear_buffer_req(bh);
1017  				bh->b_bdev = NULL;
1018  			}
1019  		}
1020  
1021  		if (buffer_jbddirty(bh)) {
1022  			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1023  			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1024  			if (is_journal_aborted(journal))
1025  				clear_buffer_jbddirty(bh);
1026  		} else {
1027  			J_ASSERT_BH(bh, !buffer_dirty(bh));
1028  			/*
1029  			 * The buffer on BJ_Forget list and not jbddirty means
1030  			 * it has been freed by this transaction and hence it
1031  			 * could not have been reallocated until this
1032  			 * transaction has committed. *BUT* it could be
1033  			 * reallocated once we have written all the data to
1034  			 * disk and before we process the buffer on BJ_Forget
1035  			 * list.
1036  			 */
1037  			if (!jh->b_next_transaction)
1038  				try_to_free = 1;
1039  		}
1040  		JBUFFER_TRACE(jh, "refile or unfile buffer");
1041  		__jbd2_journal_refile_buffer(jh);
1042  		jbd_unlock_bh_state(bh);
1043  		if (try_to_free)
1044  			release_buffer_page(bh);	/* Drops bh reference */
1045  		else
1046  			__brelse(bh);
1047  		cond_resched_lock(&journal->j_list_lock);
1048  	}
1049  	spin_unlock(&journal->j_list_lock);
1050  	/*
1051  	 * This is a bit sleazy.  We use j_list_lock to protect transition
1052  	 * of a transaction into T_FINISHED state and calling
1053  	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1054  	 * other checkpointing code processing the transaction...
1055  	 */
1056  	write_lock(&journal->j_state_lock);
1057  	spin_lock(&journal->j_list_lock);
1058  	/*
1059  	 * Now recheck if some buffers did not get attached to the transaction
1060  	 * while the lock was dropped...
1061  	 */
1062  	if (commit_transaction->t_forget) {
1063  		spin_unlock(&journal->j_list_lock);
1064  		write_unlock(&journal->j_state_lock);
1065  		goto restart_loop;
1066  	}
1067  
1068  	/* Add the transaction to the checkpoint list
1069  	 * __journal_remove_checkpoint() can not destroy transaction
1070  	 * under us because it is not marked as T_FINISHED yet */
1071  	if (journal->j_checkpoint_transactions == NULL) {
1072  		journal->j_checkpoint_transactions = commit_transaction;
1073  		commit_transaction->t_cpnext = commit_transaction;
1074  		commit_transaction->t_cpprev = commit_transaction;
1075  	} else {
1076  		commit_transaction->t_cpnext =
1077  			journal->j_checkpoint_transactions;
1078  		commit_transaction->t_cpprev =
1079  			commit_transaction->t_cpnext->t_cpprev;
1080  		commit_transaction->t_cpnext->t_cpprev =
1081  			commit_transaction;
1082  		commit_transaction->t_cpprev->t_cpnext =
1083  				commit_transaction;
1084  	}
1085  	spin_unlock(&journal->j_list_lock);
1086  
1087  	/* Done with this transaction! */
1088  
1089  	jbd_debug(3, "JBD2: commit phase 7\n");
1090  
1091  	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1092  
1093  	commit_transaction->t_start = jiffies;
1094  	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1095  					      commit_transaction->t_start);
1096  
1097  	/*
1098  	 * File the transaction statistics
1099  	 */
1100  	stats.ts_tid = commit_transaction->t_tid;
1101  	stats.run.rs_handle_count =
1102  		atomic_read(&commit_transaction->t_handle_count);
1103  	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1104  			     commit_transaction->t_tid, &stats.run);
1105  	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1106  
1107  	commit_transaction->t_state = T_COMMIT_CALLBACK;
1108  	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1109  	journal->j_commit_sequence = commit_transaction->t_tid;
1110  	journal->j_committing_transaction = NULL;
1111  	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1112  
1113  	/*
1114  	 * weight the commit time higher than the average time so we don't
1115  	 * react too strongly to vast changes in the commit time
1116  	 */
1117  	if (likely(journal->j_average_commit_time))
1118  		journal->j_average_commit_time = (commit_time +
1119  				journal->j_average_commit_time*3) / 4;
1120  	else
1121  		journal->j_average_commit_time = commit_time;
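	/*
	 * i.e. an exponential moving average with weight 1/4 on the new
	 * sample: an 8ms average and a 16ms commit move it to
	 * (16 + 3*8)/4 = 10ms.
	 */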
1122  
1123  	write_unlock(&journal->j_state_lock);
1124  
1125  	if (journal->j_commit_callback)
1126  		journal->j_commit_callback(journal, commit_transaction);
1127  
1128  	trace_jbd2_end_commit(journal, commit_transaction);
1129  	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1130  		  journal->j_commit_sequence, journal->j_tail_sequence);
1131  
1132  	write_lock(&journal->j_state_lock);
1133  	spin_lock(&journal->j_list_lock);
1134  	commit_transaction->t_state = T_FINISHED;
1135  	/* Check if the transaction can be dropped now that we are finished */
1136  	if (commit_transaction->t_checkpoint_list == NULL &&
1137  	    commit_transaction->t_checkpoint_io_list == NULL) {
1138  		__jbd2_journal_drop_transaction(journal, commit_transaction);
1139  		jbd2_journal_free_transaction(commit_transaction);
1140  	}
1141  	spin_unlock(&journal->j_list_lock);
1142  	write_unlock(&journal->j_state_lock);
1143  	wake_up(&journal->j_wait_done_commit);
1144  
1145  	/*
1146  	 * Calculate overall stats
1147  	 */
1148  	spin_lock(&journal->j_history_lock);
1149  	journal->j_stats.ts_tid++;
1150  	journal->j_stats.ts_requested += stats.ts_requested;
1151  	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152  	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153  	journal->j_stats.run.rs_running += stats.run.rs_running;
1154  	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155  	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156  	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157  	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158  	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159  	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160  	spin_unlock(&journal->j_history_lock);
1161  }
1162