xref: /linux/fs/jbd2/commit.c (revision 72421f35540c3e8830be8897ef1b99d2b7aa0981)
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
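	/*
	 * If this was the temporary copy of a journalled buffer, clear the
	 * BH_Shadow bit on the original buffer and wake anyone waiting for
	 * the shadow write to finish (see do_get_write_access()).
	 */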
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct folio *folio;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	folio = bh->b_folio;
	if (folio->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!folio_trylock(folio))
		goto nope;

	folio_get(folio);
	__brelse(bh);
	try_to_free_buffers(folio);
	folio_unlock(folio);
	folio_put(folio);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
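	/*
	 * Zero the checksum fields first: the checksum covers the whole
	 * commit block, so a verifier can recompute it over the block with
	 * these fields cleared and compare the result.
	 */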
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	struct timespec64 now;
	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

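	/*
	 * With barriers enabled and synchronous commit, the commit record
	 * must not reach the media before the rest of the transaction and
	 * must itself be durable, so issue it with a cache preflush and
	 * FUA.
	 */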
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		write_flags |= REQ_PREFLUSH | REQ_FUA;

	submit_bh(write_flags, bh);
	*cbh = bh;
	return 0;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return journal->j_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
		!jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
		return 0;
	return filemap_fdatawait_range_keep_errors(
		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
		jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

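/*
 * Used as the default ->j_finish_inode_data_buffers callback: wait for
 * writeback of the inode's dirty range to complete. The _keep_errors
 * variant leaves any IO error flags set on the mapping so a later
 * fsync() can still observe them.
 */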
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

	return filemap_fdatawait_range_keep_errors(mapping,
						   jinode->i_dirty_start,
						   jinode->i_dirty_end);
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		cond_resched();
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

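/*
 * Fold one buffer's contents into the running CRC32 that the async
 * commit scheme (JBD2_FEATURE_COMPAT_CHECKSUM) stores in the commit
 * block.
 */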
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	char *addr;
	__u32 checksum;

	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	checksum = crc32_be(crc32_sum, addr, bh->b_size);
	kunmap_local(addr);

	return checksum;
}

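/*
 * Record @block in an on-disk block tag. The low 32 bits always go in
 * t_blocknr; t_blocknr_high is filled in only when the 64-bit feature
 * is set. The split (block >> 31) >> 1 form avoids a shift by the full
 * type width, presumably as a guard for 32-bit arithmetic.
 */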
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

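	/*
	 * Chain the transaction sequence number into the checksum ahead
	 * of the block contents, so a leftover block from an older
	 * transaction cannot pass verification as part of this one.
	 */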
	seq = cpu_to_be32(sequence);
	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
	kunmap_local(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
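 *
 * The commit runs in the numbered phases that the jbd2_debug messages
 * below trace: lock down the running transaction and drain updates,
 * submit data and revoke records, write the metadata blocks with their
 * descriptors, wait for that IO, write the commit record, and finally
 * move the committed buffers to the checkpoint lists.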
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd2_debug(3, "superblock updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail, 0);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd2_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new fast
		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	/* Wait for any outstanding t_updates to finish. */
	jbd2_journal_wait_updates(journal);

	commit_transaction->t_state = T_SWITCH;

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
	spin_unlock(&journal->j_list_lock);

	jbd2_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them.
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd2_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd2_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT(bufs == 0);

			jbd2_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}
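		/*
		 * Descriptor block layout: a journal_header_t followed by a
		 * packed array of block tags. The 16-byte journal UUID is
		 * stored after the first tag only; subsequent tags carry
		 * JBD2_FLAG_SAME_UUID instead.
		 */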

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

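		/*
		 * Bit 0 of jbd2_journal_write_metadata_buffer()'s return
		 * value reports that the block started with the journal
		 * magic number and had to be escaped in the log copy;
		 * recovery restores the magic when replaying the block.
		 */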
		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];

				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
					  bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
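		/*
		 * The log is circular, so the new tail may have wrapped past
		 * the old one; account for the wrap when computing how much
		 * log space this commit would free.
		 */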
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < jbd2_journal_get_max_txn_bufs(journal))
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd2_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd2_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list they
	   were on before. */

	jbd2_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * If a buffer has been freed while still being journaled by a
		 * previous transaction, refile it to BJ_Forget of the running
		 * transaction. If the just committed transaction contains an
		 * "add to orphan" operation, we can completely invalidate the
		 * buffer now. We are rather thorough in that, since the buffer
		 * may still be accessible when blocksize < pagesize and it is
		 * attached to the last partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits. For the file mapping buffers (i.e.
			 * journalled data) we need to unmap buffer and clear
			 * more bits. We also need to be careful about the check
			 * because the data page mapping can get cleared under
			 * our hands. Note that if mapping == NULL, we don't
			 * need to make buffer unmapped because the page is
			 * already detached from the mapping and buffers cannot
			 * get reused.
			 */
			mapping = READ_ONCE(bh->b_folio->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list.
	 * __journal_remove_checkpoint() cannot destroy the transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd2_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}