// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
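
/*
 * Note on the BH_Shadow handshake above: while a metadata buffer is being
 * written to the journal, its live counterpart carries BH_Shadow so that
 * writers do not modify the data mid-IO.  The waiter side (see
 * do_get_write_access() in transaction.c) sleeps on that bit, roughly:
 *
 *	wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
 *
 * clear_bit_unlock() together with smp_mb__after_atomic() orders the flag
 * clear before the wake_up_bit(), so a waiter cannot sample the bit, miss
 * the wakeup and sleep forever.
 */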

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}
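
/*
 * Why trylock_page() rather than lock_page() above: the commit-path caller
 * (the BJ_Forget loop in jbd2_journal_commit_transaction()) invokes this
 * while holding journal->j_list_lock, a spinlock, so sleeping on the page
 * lock is not an option.  Losing the race just means the page gets freed a
 * little later by the VM instead.
 */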
91  
92  static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
93  {
94  	struct commit_header *h;
95  	__u32 csum;
96  
97  	if (!jbd2_journal_has_csum_v2or3(j))
98  		return;
99  
100  	h = (struct commit_header *)(bh->b_data);
101  	h->h_chksum_type = 0;
102  	h->h_chksum_size = 0;
103  	h->h_chksum[0] = 0;
104  	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
105  	h->h_chksum[0] = cpu_to_be32(csum);
106  }
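
/*
 * The checksum above is computed over the whole commit block with
 * h_chksum[0] zeroed, so the verifying side must mirror that.  A minimal
 * sketch of the check done during recovery (cf. the csum_verify helpers in
 * recovery.c):
 *
 *	__be32 provided = h->h_chksum[0];
 *	__u32 calculated;
 *
 *	h->h_chksum[0] = 0;
 *	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
 *	h->h_chksum[0] = provided;
 *	return provided == cpu_to_be32(calculated);
 */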

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec64 now;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(REQ_OP_WRITE,
			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
	else
		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

	*cbh = bh;
	return ret;
}
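
/*
 * Flush semantics of the submission above: REQ_PREFLUSH forces the device
 * cache to be flushed before the commit block is written, making all the
 * previously submitted blocks of this transaction durable first, and
 * REQ_FUA makes the commit block itself hit stable storage.  With the
 * async-commit feature the commit block instead carries a checksum that
 * recovery can validate, so the ordering guarantee is not needed here and
 * a single cache flush is issued later in the commit path.
 */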

/*
 * This function, together with journal_submit_commit_record(), allows
 * writing the commit record asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc: we avoid
 * writepages() because with delayed allocation writepages() may allocate
 * blocks, and we must write out only already-allocated blocks here.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping,
		loff_t dirty_start, loff_t dirty_end)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = dirty_start,
		.range_end = dirty_end,
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
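
/*
 * A note on the writeback_control settings above: WB_SYNC_ALL makes
 * writeback wait on pages that are locked or already under IO instead of
 * skipping them, and nr_to_write = nrpages * 2 is, presumably, headroom so
 * the sweep is not cut short if pages are redirtied or added while we scan.
 */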

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out its pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		loff_t dirty_start = jinode->i_dirty_start;
		loff_t dirty_end = jinode->i_dirty_end;

		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to
		 * write out only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping, dirty_start,
				dirty_end);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
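
/*
 * The JI_COMMIT_RUNNING dance above (and in
 * journal_finish_inode_data_buffers() below) is a bit-wait protocol:
 * whoever wants to drop the jbd2_inode must wait until commit is done with
 * it.  Conceptually the waiter side does something like:
 *
 *	wait_on_bit(&jinode->i_flags, __JI_COMMIT_RUNNING,
 *		    TASK_UNINTERRUPTIBLE);
 *
 * which is why the flag clear is followed by smp_mb() and wake_up_bit().
 */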

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		loff_t dirty_start = jinode->i_dirty_start;
		loff_t dirty_end = jinode->i_dirty_end;

		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait_range_keep_errors(
				jinode->i_vfs_inode->i_mapping, dirty_start,
				dirty_end);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}
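
/*
 * kmap_atomic() is needed above because the buffer's page may live in
 * highmem on 32-bit configurations; the checksum is computed through a
 * temporary kernel mapping, and the buffer's offset within the page is
 * recovered with offset_in_page(bh->b_data).
 */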

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
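
/*
 * The split shift above, (block >> 31) >> 1, is equivalent to block >> 32
 * but avoids a literal shift by 32, which is undefined behaviour if the
 * operand happens to be only 32 bits wide on some configuration.  For
 * example, block = 0x123456789ULL stores 0x23456789 in t_blocknr and 0x1
 * in t_blocknr_high.
 */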

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
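
/*
 * Note the asymmetry at the end of jbd2_block_tag_csum_set(): the v3 tag
 * format has room for the full 32-bit checksum, while the older v2 tag
 * only carries 16 bits, so cpu_to_be16() silently keeps just the low half
 * of csum32.  The checksum itself chains the transaction sequence number
 * with the block contents, tying each journal block to its transaction.
 */
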
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

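	/*
	 * Wait for all outstanding handles on this transaction to drop
	 * their references.  prepare_to_wait() puts us on j_wait_updates
	 * *before* t_updates is re-checked, so a handle that drops the
	 * last update and issues the wakeup in between cannot be missed
	 * (the classic lost-wakeup pattern).
	 */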
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
	commit_transaction->t_state = T_SWITCH;
	write_unlock(&journal->j_state_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	write_lock(&journal->j_state_lock);
	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */
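
		/*
		 * Space accounting in the descriptor block: each entry
		 * consumes tag_bytes, the first entry is followed by the
		 * 16-byte journal UUID, and csum_size bytes are reserved at
		 * the end for the descriptor tail checksum.  The check
		 * below therefore closes the descriptor once fewer than
		 * tag_bytes + 16 + csum_size bytes remain (the extra 16
		 * presumably being conservative slack for a UUID).
		 */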

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
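
			/*
			 * JBD2_FLAG_LAST_TAG is what recovery keys off:
			 * when replaying a descriptor block it walks the
			 * tags until it sees this flag, so the marker must
			 * be set before the descriptor goes to disk.
			 */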
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}
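
				/*
				 * The running crc32 must fold in the blocks
				 * in exactly the order they are written to
				 * the journal: with async commit, recovery
				 * recomputes this sum over the scanned
				 * blocks and compares it against h_chksum[]
				 * in the commit header.
				 */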

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
		       "JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
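
	/*
	 * Ordering rationale for the flush above: once the commit record is
	 * durable, recovery will consider the transaction valid and replay
	 * its metadata, so any ordered-mode file data sitting in the
	 * filesystem device's write cache must reach stable storage first.
	 * When data and journal share a device, the commit record's own
	 * REQ_PREFLUSH already provides this guarantee.
	 */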

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
			commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the average more heavily than the latest commit time so
	 * we don't react too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
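
	/*
	 * This is a 3:1 exponentially weighted moving average.  Worked
	 * example: with an average of 8ms and a one-off 40ms commit, the
	 * new average becomes (40 + 3 * 8) / 4 = 16ms rather than jumping
	 * straight to 40ms.
	 */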

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}