xref: /linux/fs/jbd2/commit.c (revision 6b3f7af57881f6d6250c6dcc4d910fe8e855a607)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * linux/fs/jbd2/commit.c
4  *
5  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6  *
7  * Copyright 1998 Red Hat corp --- All Rights Reserved
8  *
9  * Journal commit routines for the generic filesystem journaling code;
10  * part of the ext2fs journaling system.
11  */
12 
13 #include <linux/time.h>
14 #include <linux/fs.h>
15 #include <linux/jbd2.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/mm.h>
19 #include <linux/pagemap.h>
20 #include <linux/jiffies.h>
21 #include <linux/crc32.h>
22 #include <linux/writeback.h>
23 #include <linux/backing-dev.h>
24 #include <linux/bio.h>
25 #include <linux/blkdev.h>
26 #include <linux/bitops.h>
27 #include <trace/events/jbd2.h>
28 
29 /*
30  * IO end handler for temporary buffer_heads handling writes to the journal.
31  */
32 static void journal_end_buffer_io_sync(struct bio *bio)
33 {
34 	struct buffer_head *bh;
35 	bool uptodate = bio_endio_bh(bio, &bh);
36 	struct buffer_head *orig_bh = bh->b_private;
37 
38 	BUFFER_TRACE(bh, "");
39 	if (uptodate)
40 		set_buffer_uptodate(bh);
41 	else
42 		clear_buffer_uptodate(bh);
43 	if (orig_bh) {
44 		clear_and_wake_up_bit(BH_Shadow, &orig_bh->b_state);
45 	}
46 	unlock_buffer(bh);
47 }
48 
49 /*
50  * When an ext4 file is truncated, it is possible that some pages are not
51  * successfully freed, because they are attached to a committing transaction.
52  * After the transaction commits, these pages are left on the LRU, with no
53  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
54  * by the VM, but their apparent absence upsets the VM accounting, and it makes
55  * the numbers in /proc/meminfo look odd.
56  *
57  * So here, we have a buffer which has just come off the forget list.  Look to
58  * see if we can strip all buffers from the backing page.
59  *
60  * Called under j_list_lock. The caller provided us with a ref against the
61  * buffer, and we drop that here.
62  */
63 static void release_buffer_page(struct buffer_head *bh)
64 {
65 	struct folio *folio;
66 
67 	if (buffer_dirty(bh))
68 		goto nope;
69 	if (atomic_read(&bh->b_count) != 1)
70 		goto nope;
71 	folio = bh->b_folio;
72 	if (folio->mapping)
73 		goto nope;
74 
75 	/* OK, it's a truncated page */
76 	if (!folio_trylock(folio))
77 		goto nope;
78 
79 	folio_get(folio);
80 	__brelse(bh);
81 	try_to_free_buffers(folio);
82 	folio_unlock(folio);
83 	folio_put(folio);
84 	return;
85 
86 nope:
87 	__brelse(bh);
88 }
89 
90 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
91 {
92 	struct commit_header *h;
93 	__u32 csum;
94 
95 	if (!jbd2_journal_has_csum_v2or3(j))
96 		return;
97 
98 	h = (struct commit_header *)(bh->b_data);
99 	h->h_chksum_type = 0;
100 	h->h_chksum_size = 0;
101 	h->h_chksum[0] = 0;
102 	csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
103 	h->h_chksum[0] = cpu_to_be32(csum);
104 }
105 
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if no buffer could be obtained for the commit block (the
 * journal needs to be aborted), or 0 otherwise.  Note that 0 is also
 * returned, with *cbh left NULL, when the journal is already aborted and
 * nothing is submitted.
 */
114 static int journal_submit_commit_record(journal_t *journal,
115 					transaction_t *commit_transaction,
116 					struct buffer_head **cbh,
117 					__u32 crc32_sum)
118 {
119 	struct commit_header *tmp;
120 	struct buffer_head *bh;
121 	struct timespec64 now;
122 	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
123 
124 	*cbh = NULL;
125 
126 	if (is_journal_aborted(journal))
127 		return 0;
128 
129 	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
130 						JBD2_COMMIT_BLOCK);
131 	if (!bh)
132 		return 1;
133 
134 	tmp = (struct commit_header *)bh->b_data;
135 	ktime_get_coarse_real_ts64(&now);
136 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
137 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
138 
139 	if (jbd2_has_feature_checksum(journal)) {
140 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
141 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
142 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
143 	}
144 	jbd2_commit_block_csum_set(journal, bh);
145 
146 	BUFFER_TRACE(bh, "submit commit block");
147 	lock_buffer(bh);
148 	clear_buffer_dirty(bh);
149 	set_buffer_uptodate(bh);
150 
151 	if (journal->j_flags & JBD2_BARRIER &&
152 	    !jbd2_has_feature_async_commit(journal))
153 		write_flags |= REQ_PREFLUSH | REQ_FUA;
154 
155 	bh_submit(bh, write_flags, journal_end_buffer_io_sync);
156 	*cbh = bh;
157 	return 0;
158 }
159 
/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
164 static int journal_wait_on_commit_record(journal_t *journal,
165 					 struct buffer_head *bh)
166 {
167 	int ret = 0;
168 
169 	clear_buffer_dirty(bh);
170 	wait_on_buffer(bh);
171 
172 	if (unlikely(!buffer_uptodate(bh)))
173 		ret = -EIO;
174 	put_bh(bh);            /* One for getblk() */
175 
176 	return ret;
177 }
178 
179 /* Send all the data buffers related to an inode */
180 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
181 {
182 	unsigned long flags;
183 
184 	if (!jinode)
185 		return 0;
186 
187 	flags = READ_ONCE(jinode->i_flags);
188 	if (!(flags & JI_WRITE_DATA))
189 		return 0;
190 
191 	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
192 	return journal->j_submit_inode_data_buffers(jinode);
193 
194 }
195 EXPORT_SYMBOL(jbd2_submit_inode_data);
196 
197 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
198 {
199 	struct address_space *mapping;
200 	struct inode *inode;
201 	unsigned long flags;
202 	loff_t start_byte, end_byte;
203 
204 	if (!jinode)
205 		return 0;
206 
207 	flags = READ_ONCE(jinode->i_flags);
208 	if (!(flags & JI_WAIT_DATA))
209 		return 0;
210 
211 	inode = jinode->i_vfs_inode;
212 	if (!inode)
213 		return 0;
214 
215 	mapping = inode->i_mapping;
216 	if (!mapping)
217 		return 0;
218 
219 	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
220 		return 0;
221 	return filemap_fdatawait_range_keep_errors(
222 		mapping, start_byte, end_byte);
223 }
224 EXPORT_SYMBOL(jbd2_wait_inode_data);
225 
226 /*
227  * Submit all the data buffers of inode associated with the transaction to
228  * disk.
229  *
230  * We are in a committing transaction. Therefore no new inode can be added to
231  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
232  * operate on from being released while we write out pages.
233  */
234 static int journal_submit_data_buffers(journal_t *journal,
235 		transaction_t *commit_transaction)
236 {
237 	struct jbd2_inode *jinode;
238 	int err, ret = 0;
239 
240 	spin_lock(&journal->j_list_lock);
241 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
242 		if (!(jinode->i_flags & JI_WRITE_DATA))
243 			continue;
244 		WRITE_ONCE(jinode->i_flags,
245 			   jinode->i_flags | JI_COMMIT_RUNNING);
246 		spin_unlock(&journal->j_list_lock);
247 		/* submit the inode data buffers. */
248 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
249 		if (journal->j_submit_inode_data_buffers) {
250 			err = journal->j_submit_inode_data_buffers(jinode);
251 			if (!ret)
252 				ret = err;
253 		}
254 		spin_lock(&journal->j_list_lock);
255 		J_ASSERT(jinode->i_transaction == commit_transaction);
256 		WRITE_ONCE(jinode->i_flags,
257 			   jinode->i_flags & ~JI_COMMIT_RUNNING);
258 		smp_mb();
259 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
260 	}
261 	spin_unlock(&journal->j_list_lock);
262 	return ret;
263 }
264 
265 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
266 {
267 	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
268 	loff_t start_byte, end_byte;
269 
270 	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
271 		return 0;
272 
273 	return filemap_fdatawait_range_keep_errors(mapping,
274 						   start_byte, end_byte);
275 }
276 
277 /*
278  * Wait for data submitted for writeout, refile inodes to proper
279  * transaction if needed.
280  *
281  */
282 static int journal_finish_inode_data_buffers(journal_t *journal,
283 		transaction_t *commit_transaction)
284 {
285 	struct jbd2_inode *jinode, *next_i;
286 	int err, ret = 0;
287 
288 	/* For locking, see the comment in journal_submit_data_buffers() */
289 	spin_lock(&journal->j_list_lock);
290 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
291 		if (!(jinode->i_flags & JI_WAIT_DATA))
292 			continue;
293 		WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING);
294 		spin_unlock(&journal->j_list_lock);
295 		/* wait for the inode data buffers writeout. */
296 		if (journal->j_finish_inode_data_buffers) {
297 			err = journal->j_finish_inode_data_buffers(jinode);
298 			if (!ret)
299 				ret = err;
300 		}
301 		cond_resched();
302 		spin_lock(&journal->j_list_lock);
303 		WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING);
304 		smp_mb();
305 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
306 	}
307 
308 	/* Now refile inode to proper lists */
309 	list_for_each_entry_safe(jinode, next_i,
310 				 &commit_transaction->t_inode_list, i_list) {
311 		list_del(&jinode->i_list);
312 		if (jinode->i_next_transaction) {
313 			jinode->i_transaction = jinode->i_next_transaction;
314 			jinode->i_next_transaction = NULL;
315 			list_add(&jinode->i_list,
316 				&jinode->i_transaction->t_inode_list);
317 		} else {
318 			jinode->i_transaction = NULL;
319 			WRITE_ONCE(jinode->i_dirty_start_page, 0);
320 			WRITE_ONCE(jinode->i_dirty_end_page, 0);
321 		}
322 	}
323 	spin_unlock(&journal->j_list_lock);
324 
325 	return ret;
326 }
327 
328 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
329 {
330 	char *addr;
331 	__u32 checksum;
332 
333 	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
334 	checksum = crc32_be(crc32_sum, addr, bh->b_size);
335 	kunmap_local(addr);
336 
337 	return checksum;
338 }
339 
340 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
341 				   unsigned long long block)
342 {
343 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
344 	if (jbd2_has_feature_64bit(j))
345 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
346 }
347 
348 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
349 				    struct buffer_head *bh, __u32 sequence)
350 {
351 	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
352 	__u8 *addr;
353 	__u32 csum32;
354 	__be32 seq;
355 
356 	if (!jbd2_journal_has_csum_v2or3(j))
357 		return;
358 
359 	seq = cpu_to_be32(sequence);
360 	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
361 	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
362 	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
363 	kunmap_local(addr);
364 
365 	if (jbd2_has_feature_csum3(j))
366 		tag3->t_checksum = cpu_to_be32(csum32);
367 	else
368 		tag->t_checksum = cpu_to_be16(csum32);
369 }
370 /*
371  * jbd2_journal_commit_transaction
372  *
373  * The primary function for committing a transaction to the log.  This
374  * function is called by the journal thread to begin a complete commit.
375  */
376 void jbd2_journal_commit_transaction(journal_t *journal)
377 {
378 	struct transaction_stats_s stats;
379 	transaction_t *commit_transaction;
380 	struct journal_head *jh;
381 	struct buffer_head *descriptor;
382 	struct buffer_head **wbuf = journal->j_wbuf;
383 	int bufs;
384 	int escape;
385 	int err;
386 	unsigned long long blocknr;
387 	ktime_t start_time;
388 	u64 commit_time;
389 	char *tagp = NULL;
390 	journal_block_tag_t *tag = NULL;
391 	int space_left = 0;
392 	int first_tag = 0;
393 	int tag_flag;
394 	int i;
395 	int tag_bytes = journal_tag_bytes(journal);
396 	struct buffer_head *cbh = NULL; /* For transactional checksums */
397 	__u32 crc32_sum = ~0;
398 	struct blk_plug plug;
399 	/* Tail of the journal */
400 	unsigned long first_block;
401 	tid_t first_tid;
402 	int update_tail;
403 	int csum_size = 0;
404 	LIST_HEAD(io_bufs);
405 	LIST_HEAD(log_bufs);
406 
407 	if (jbd2_journal_has_csum_v2or3(journal))
408 		csum_size = sizeof(struct jbd2_journal_block_tail);
409 
410 	/*
411 	 * First job: lock down the current transaction and wait for
412 	 * all outstanding updates to complete.
413 	 */
414 
415 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
416 	if (journal->j_flags & JBD2_FLUSHED) {
417 		jbd2_debug(3, "super block updated\n");
418 		mutex_lock_io(&journal->j_checkpoint_mutex);
419 		/*
420 		 * We hold j_checkpoint_mutex so tail cannot change under us.
421 		 * We don't need any special data guarantees for writing sb
422 		 * since journal is empty and it is ok for write to be
423 		 * flushed only with transaction commit.
424 		 */
425 		jbd2_journal_update_sb_log_tail(journal,
426 						journal->j_tail_sequence,
427 						journal->j_tail, 0);
428 		mutex_unlock(&journal->j_checkpoint_mutex);
429 	} else {
430 		jbd2_debug(3, "superblock not updated\n");
431 	}
432 
433 	J_ASSERT(journal->j_running_transaction != NULL);
434 	J_ASSERT(journal->j_committing_transaction == NULL);
435 
436 	write_lock(&journal->j_state_lock);
437 	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
438 	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
439 		DEFINE_WAIT(wait);
440 
441 		prepare_to_wait(&journal->j_fc_wait, &wait,
442 				TASK_UNINTERRUPTIBLE);
443 		write_unlock(&journal->j_state_lock);
444 		schedule();
445 		write_lock(&journal->j_state_lock);
446 		finish_wait(&journal->j_fc_wait, &wait);
447 		/*
448 		 * TODO: by blocking fast commits here, we are increasing
449 		 * fsync() latency slightly. Strictly speaking, we don't need
450 		 * to block fast commits until the transaction enters T_FLUSH
451 		 * state. So an optimization is possible where we block new fast
452 		 * commits here and wait for existing ones to complete
453 		 * just before we enter T_FLUSH. That way, the existing fast
454 		 * commits and this full commit can proceed parallely.
455 		 */
456 	}
457 	write_unlock(&journal->j_state_lock);
458 
459 	commit_transaction = journal->j_running_transaction;
460 
461 	trace_jbd2_start_commit(journal, commit_transaction);
462 	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
463 			commit_transaction->t_tid);
464 
465 	write_lock(&journal->j_state_lock);
466 	journal->j_fc_off = 0;
467 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
468 	commit_transaction->t_state = T_LOCKED;
469 
470 	trace_jbd2_commit_locking(journal, commit_transaction);
471 	stats.run.rs_wait = commit_transaction->t_max_wait;
472 	stats.run.rs_request_delay = 0;
473 	stats.run.rs_locked = jiffies;
474 	if (commit_transaction->t_requested)
475 		stats.run.rs_request_delay =
476 			jbd2_time_diff(commit_transaction->t_requested,
477 				       stats.run.rs_locked);
478 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
479 					      stats.run.rs_locked);
480 
481 	// waits for any t_updates to finish
482 	jbd2_journal_wait_updates(journal);
483 
484 	commit_transaction->t_state = T_SWITCH;
485 
486 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
487 			journal->j_max_transaction_buffers);
488 
489 	/*
490 	 * First thing we are allowed to do is to discard any remaining
491 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
492 	 * that there are no such buffers: if a large filesystem
493 	 * operation like a truncate needs to split itself over multiple
494 	 * transactions, then it may try to do a jbd2_journal_restart() while
495 	 * there are still BJ_Reserved buffers outstanding.  These must
496 	 * be released cleanly from the current transaction.
497 	 *
498 	 * In this case, the filesystem must still reserve write access
499 	 * again before modifying the buffer in the new transaction, but
500 	 * we do not require it to remember exactly which old buffers it
501 	 * has reserved.  This is consistent with the existing behaviour
502 	 * that multiple jbd2_journal_get_write_access() calls to the same
503 	 * buffer are perfectly permissible.
504 	 * We use journal->j_state_lock here to serialize processing of
505 	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
506 	 */
507 	while (commit_transaction->t_reserved_list) {
508 		jh = commit_transaction->t_reserved_list;
509 		JBUFFER_TRACE(jh, "reserved, unused: refile");
510 		/*
511 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
512 		 * leave undo-committed data.
513 		 */
514 		if (jh->b_committed_data) {
515 			struct buffer_head *bh = jh2bh(jh);
516 
517 			spin_lock(&jh->b_state_lock);
518 			jbd2_free(jh->b_committed_data, bh->b_size);
519 			jh->b_committed_data = NULL;
520 			spin_unlock(&jh->b_state_lock);
521 		}
522 		jbd2_journal_refile_buffer(journal, jh);
523 	}
524 
525 	write_unlock(&journal->j_state_lock);
526 	/*
527 	 * Now try to drop any written-back buffers from the journal's
528 	 * checkpoint lists.  We do this *before* commit because it potentially
529 	 * frees some memory
530 	 */
531 	spin_lock(&journal->j_list_lock);
532 	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
533 	spin_unlock(&journal->j_list_lock);
534 
535 	jbd2_debug(3, "JBD2: commit phase 1\n");
536 
537 	/*
538 	 * Clear revoked flag to reflect there is no revoked buffers
539 	 * in the next transaction which is going to be started.
540 	 */
541 	jbd2_clear_buffer_revoked_flags(journal);
542 
543 	/*
544 	 * Switch to a new revoke table.
545 	 */
546 	jbd2_journal_switch_revoke_table(journal);
547 
548 	write_lock(&journal->j_state_lock);
549 	/*
550 	 * Reserved credits cannot be claimed anymore, free them
551 	 */
552 	atomic_sub(atomic_read(&journal->j_reserved_credits),
553 		   &commit_transaction->t_outstanding_credits);
554 
555 	trace_jbd2_commit_flushing(journal, commit_transaction);
556 	stats.run.rs_flushing = jiffies;
557 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
558 					     stats.run.rs_flushing);
559 
560 	commit_transaction->t_state = T_FLUSH;
561 	journal->j_committing_transaction = commit_transaction;
562 	journal->j_running_transaction = NULL;
563 	start_time = ktime_get();
564 	commit_transaction->t_log_start = journal->j_head;
565 	wake_up_all(&journal->j_wait_transaction_locked);
566 	write_unlock(&journal->j_state_lock);
567 
568 	jbd2_debug(3, "JBD2: commit phase 2a\n");
569 
570 	/*
571 	 * Now start flushing things to disk, in the order they appear
572 	 * on the transaction lists.  Data blocks go first.
573 	 */
574 	err = journal_submit_data_buffers(journal, commit_transaction);
575 	if (err)
576 		jbd2_journal_abort(journal, err);
577 
578 	blk_start_plug(&plug);
579 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
580 
581 	jbd2_debug(3, "JBD2: commit phase 2b\n");
582 
583 	/*
584 	 * Way to go: we have now written out all of the data for a
585 	 * transaction!  Now comes the tricky part: we need to write out
586 	 * metadata.  Loop over the transaction's entire buffer list:
587 	 */
588 	write_lock(&journal->j_state_lock);
589 	commit_transaction->t_state = T_COMMIT;
590 	write_unlock(&journal->j_state_lock);
591 
592 	trace_jbd2_commit_logging(journal, commit_transaction);
593 	stats.run.rs_logging = jiffies;
594 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
595 					       stats.run.rs_logging);
596 	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
597 	stats.run.rs_blocks_logged = 0;
598 
599 	J_ASSERT(commit_transaction->t_nr_buffers <=
600 		 atomic_read(&commit_transaction->t_outstanding_credits));
601 
602 	bufs = 0;
603 	descriptor = NULL;
604 	while (commit_transaction->t_buffers) {
605 
606 		/* Find the next buffer to be journaled... */
607 
608 		jh = commit_transaction->t_buffers;
609 
610 		/* If we're in abort mode, we just un-journal the buffer and
611 		   release it. */
612 
613 		if (is_journal_aborted(journal)) {
614 			clear_buffer_jbddirty(jh2bh(jh));
615 			JBUFFER_TRACE(jh, "journal is aborting: refile");
616 			jbd2_buffer_abort_trigger(jh,
617 						  jh->b_frozen_data ?
618 						  jh->b_frozen_triggers :
619 						  jh->b_triggers);
620 			jbd2_journal_refile_buffer(journal, jh);
621 			/* If that was the last one, we need to clean up
622 			 * any descriptor buffers which may have been
623 			 * already allocated, even if we are now
624 			 * aborting. */
625 			if (!commit_transaction->t_buffers)
626 				goto start_journal_io;
627 			continue;
628 		}
629 
630 		/* Make sure we have a descriptor block in which to
631 		   record the metadata buffer. */
632 
633 		if (!descriptor) {
634 			J_ASSERT (bufs == 0);
635 
636 			jbd2_debug(4, "JBD2: get descriptor\n");
637 
638 			descriptor = jbd2_journal_get_descriptor_buffer(
639 							commit_transaction,
640 							JBD2_DESCRIPTOR_BLOCK);
641 			if (!descriptor) {
642 				jbd2_journal_abort(journal, -EIO);
643 				continue;
644 			}
645 
646 			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
647 				(unsigned long long)descriptor->b_blocknr,
648 				descriptor->b_data);
649 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
650 			space_left = descriptor->b_size -
651 						sizeof(journal_header_t);
652 			first_tag = 1;
653 			set_buffer_jwrite(descriptor);
654 			set_buffer_dirty(descriptor);
655 			wbuf[bufs++] = descriptor;
656 
657 			/* Record it so that we can wait for IO
658                            completion later */
659 			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
660 			jbd2_file_log_bh(&log_bufs, descriptor);
661 		}
662 
663 		/* Where is the buffer to be written? */
664 
665 		err = jbd2_journal_next_log_block(journal, &blocknr);
666 		/* If the block mapping failed, just abandon the buffer
667 		   and repeat this loop: we'll fall into the
668 		   refile-on-abort condition above. */
669 		if (err) {
670 			jbd2_journal_abort(journal, err);
671 			continue;
672 		}
673 
674 		/*
675 		 * start_this_handle() uses t_outstanding_credits to determine
676 		 * the free space in the log.
677 		 */
678 		atomic_dec(&commit_transaction->t_outstanding_credits);
679 
680 		/* Bump b_count to prevent truncate from stumbling over
681                    the shadowed buffer!  @@@ This can go if we ever get
682                    rid of the shadow pairing of buffers. */
683 		atomic_inc(&jh2bh(jh)->b_count);
684 
685 		/*
686 		 * Make a temporary IO buffer with which to write it out
687 		 * (this will requeue the metadata buffer to BJ_Shadow).
688 		 */
689 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
690 		JBUFFER_TRACE(jh, "ph3: write metadata");
691 		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
692 						jh, &wbuf[bufs], blocknr);
693 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
694 
695 		/* Record the new block's tag in the current descriptor
696                    buffer */
697 
698 		tag_flag = 0;
699 		if (escape)
700 			tag_flag |= JBD2_FLAG_ESCAPE;
701 		if (!first_tag)
702 			tag_flag |= JBD2_FLAG_SAME_UUID;
703 
704 		tag = (journal_block_tag_t *) tagp;
705 		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
706 		tag->t_flags = cpu_to_be16(tag_flag);
707 		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
708 					commit_transaction->t_tid);
709 		tagp += tag_bytes;
710 		space_left -= tag_bytes;
711 		bufs++;
712 
713 		if (first_tag) {
714 			memcpy (tagp, journal->j_uuid, 16);
715 			tagp += 16;
716 			space_left -= 16;
717 			first_tag = 0;
718 		}
719 
720 		/* If there's no more to do, or if the descriptor is full,
721 		   let the IO rip! */
722 
723 		if (bufs == journal->j_wbufsize ||
724 		    commit_transaction->t_buffers == NULL ||
725 		    space_left < tag_bytes + 16 + csum_size) {
726 
727 			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
728 
729 			/* Write an end-of-descriptor marker before
730                            submitting the IOs.  "tag" still points to
731                            the last tag we set up. */
732 
733 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
734 start_journal_io:
735 			if (descriptor)
736 				jbd2_descriptor_block_csum_set(journal,
737 							descriptor);
738 
739 			for (i = 0; i < bufs; i++) {
740 				struct buffer_head *bh = wbuf[i];
741 
742 				/*
743 				 * Compute checksum.
744 				 */
745 				if (jbd2_has_feature_checksum(journal)) {
746 					crc32_sum =
747 					    jbd2_checksum_data(crc32_sum, bh);
748 				}
749 
750 				lock_buffer(bh);
751 				clear_buffer_dirty(bh);
752 				set_buffer_uptodate(bh);
753 				bh_submit(bh,
754 					REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
755 					journal_end_buffer_io_sync);
756 			}
757 			cond_resched();
758 
759 			/* Force a new descriptor to be generated next
760                            time round the loop. */
761 			descriptor = NULL;
762 			bufs = 0;
763 		}
764 	}
765 
766 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
767 	if (err) {
768 		printk(KERN_WARNING
769 			"JBD2: Detected IO errors %d while flushing file data on %s\n",
770 			err, journal->j_devname);
771 		err = 0;
772 	}
773 
774 	/*
775 	 * Get current oldest transaction in the log before we issue flush
776 	 * to the filesystem device. After the flush we can be sure that
777 	 * blocks of all older transactions are checkpointed to persistent
778 	 * storage and we will be safe to update journal start in the
779 	 * superblock with the numbers we get here.
780 	 */
781 	update_tail =
782 		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
783 
784 	write_lock(&journal->j_state_lock);
785 	if (update_tail) {
786 		long freed = first_block - journal->j_tail;
787 
788 		if (first_block < journal->j_tail)
789 			freed += journal->j_last - journal->j_first;
790 		/* Update tail only if we free significant amount of space */
791 		if (freed < journal->j_max_transaction_buffers)
792 			update_tail = 0;
793 	}
794 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
795 	commit_transaction->t_state = T_COMMIT_DFLUSH;
796 	write_unlock(&journal->j_state_lock);
797 
798 	/*
799 	 * If the journal is not located on the file system device,
800 	 * then we must flush the file system device before we issue
801 	 * the commit record and update the journal tail sequence.
802 	 */
803 	if ((commit_transaction->t_need_data_flush || update_tail) &&
804 	    (journal->j_fs_dev != journal->j_dev) &&
805 	    (journal->j_flags & JBD2_BARRIER))
806 		blkdev_issue_flush(journal->j_fs_dev);
807 
808 	/* Done it all: now write the commit record asynchronously. */
809 	if (jbd2_has_feature_async_commit(journal)) {
810 		err = journal_submit_commit_record(journal, commit_transaction,
811 						 &cbh, crc32_sum);
812 		if (err)
813 			jbd2_journal_abort(journal, err);
814 	}
815 
816 	blk_finish_plug(&plug);
817 
818 	/* Lo and behold: we have just managed to send a transaction to
819            the log.  Before we can commit it, wait for the IO so far to
820            complete.  Control buffers being written are on the
821            transaction's t_log_list queue, and metadata buffers are on
822            the io_bufs list.
823 
824 	   Wait for the buffers in reverse order.  That way we are
825 	   less likely to be woken up until all IOs have completed, and
826 	   so we incur less scheduling load.
827 	*/
828 
829 	jbd2_debug(3, "JBD2: commit phase 3\n");
830 
831 	while (!list_empty(&io_bufs)) {
832 		struct buffer_head *bh = list_entry(io_bufs.prev,
833 						    struct buffer_head,
834 						    b_assoc_buffers);
835 
836 		wait_on_buffer(bh);
837 		cond_resched();
838 
839 		if (unlikely(!buffer_uptodate(bh)))
840 			err = -EIO;
841 		jbd2_unfile_log_bh(bh);
842 		stats.run.rs_blocks_logged++;
843 
844 		/*
845 		 * The list contains temporary buffer heads created by
846 		 * jbd2_journal_write_metadata_buffer().
847 		 */
848 		BUFFER_TRACE(bh, "dumping temporary bh");
849 		__brelse(bh);
850 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
851 		free_buffer_head(bh);
852 
853 		/* We also have to refile the corresponding shadowed buffer */
854 		jh = commit_transaction->t_shadow_list->b_tprev;
855 		bh = jh2bh(jh);
856 		clear_buffer_jwrite(bh);
857 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
858 		J_ASSERT_BH(bh, !buffer_shadow(bh));
859 
860 		/* The metadata is now released for reuse, but we need
861                    to remember it against this transaction so that when
862                    we finally commit, we can do any checkpointing
863                    required. */
864 		JBUFFER_TRACE(jh, "file as BJ_Forget");
865 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
866 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
867 		__brelse(bh);
868 	}
869 
870 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
871 
872 	jbd2_debug(3, "JBD2: commit phase 4\n");
873 
874 	/* Here we wait for the revoke record and descriptor record buffers */
875 	while (!list_empty(&log_bufs)) {
876 		struct buffer_head *bh;
877 
878 		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
879 		wait_on_buffer(bh);
880 		cond_resched();
881 
882 		if (unlikely(!buffer_uptodate(bh)))
883 			err = -EIO;
884 
885 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
886 		clear_buffer_jwrite(bh);
887 		jbd2_unfile_log_bh(bh);
888 		stats.run.rs_blocks_logged++;
889 		__brelse(bh);		/* One for getblk */
890 		/* AKPM: bforget here */
891 	}
892 
893 	if (err)
894 		jbd2_journal_abort(journal, err);
895 
896 	jbd2_debug(3, "JBD2: commit phase 5\n");
897 	write_lock(&journal->j_state_lock);
898 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
899 	commit_transaction->t_state = T_COMMIT_JFLUSH;
900 	write_unlock(&journal->j_state_lock);
901 
902 	if (!jbd2_has_feature_async_commit(journal)) {
903 		err = journal_submit_commit_record(journal, commit_transaction,
904 						&cbh, crc32_sum);
905 		if (err)
906 			jbd2_journal_abort(journal, err);
907 	}
908 	if (cbh)
909 		err = journal_wait_on_commit_record(journal, cbh);
910 	stats.run.rs_blocks_logged++;
911 	if (jbd2_has_feature_async_commit(journal) &&
912 	    journal->j_flags & JBD2_BARRIER) {
913 		blkdev_issue_flush(journal->j_dev);
914 	}
915 
916 	if (err)
917 		jbd2_journal_abort(journal, err);
918 
919 	WARN_ON_ONCE(
920 		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
921 
922 	/*
923 	 * Now disk caches for filesystem device are flushed so we are safe to
924 	 * erase checkpointed transactions from the log by updating journal
925 	 * superblock.
926 	 */
927 	if (update_tail)
928 		jbd2_update_log_tail(journal, first_tid, first_block);
929 
930 	/* End of a transaction!  Finally, we can do checkpoint
931            processing: any buffers committed as a result of this
932            transaction can be removed from any checkpoint list it was on
933            before. */
934 
935 	jbd2_debug(3, "JBD2: commit phase 6\n");
936 
937 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
938 	J_ASSERT(commit_transaction->t_buffers == NULL);
939 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
940 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
941 
942 restart_loop:
943 	/*
944 	 * As there are other places (journal_unmap_buffer()) adding buffers
945 	 * to this list we have to be careful and hold the j_list_lock.
946 	 */
947 	spin_lock(&journal->j_list_lock);
948 	while (commit_transaction->t_forget) {
949 		transaction_t *cp_transaction;
950 		struct buffer_head *bh;
951 		int try_to_free = 0;
952 		bool drop_ref;
953 
954 		jh = commit_transaction->t_forget;
955 		spin_unlock(&journal->j_list_lock);
956 		bh = jh2bh(jh);
957 		/*
958 		 * Get a reference so that bh cannot be freed before we are
959 		 * done with it.
960 		 */
961 		get_bh(bh);
962 		spin_lock(&jh->b_state_lock);
963 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
964 
965 		/*
966 		 * If there is undo-protected committed data against
967 		 * this buffer, then we can remove it now.  If it is a
968 		 * buffer needing such protection, the old frozen_data
969 		 * field now points to a committed version of the
970 		 * buffer, so rotate that field to the new committed
971 		 * data.
972 		 *
973 		 * Otherwise, we can just throw away the frozen data now.
974 		 *
975 		 * We also know that the frozen data has already fired
976 		 * its triggers if they exist, so we can clear that too.
977 		 */
978 		if (jh->b_committed_data) {
979 			jbd2_free(jh->b_committed_data, bh->b_size);
980 			jh->b_committed_data = NULL;
981 			if (jh->b_frozen_data) {
982 				jh->b_committed_data = jh->b_frozen_data;
983 				jh->b_frozen_data = NULL;
984 				jh->b_frozen_triggers = NULL;
985 			}
986 		} else if (jh->b_frozen_data) {
987 			jbd2_free(jh->b_frozen_data, bh->b_size);
988 			jh->b_frozen_data = NULL;
989 			jh->b_frozen_triggers = NULL;
990 		}
991 
992 		spin_lock(&journal->j_list_lock);
993 		cp_transaction = jh->b_cp_transaction;
994 		if (cp_transaction) {
995 			JBUFFER_TRACE(jh, "remove from old cp transaction");
996 			cp_transaction->t_chp_stats.cs_dropped++;
997 			__jbd2_journal_remove_checkpoint(jh);
998 		}
999 
1000 		/* Only re-checkpoint the buffer_head if it is marked
1001 		 * dirty.  If the buffer was added to the BJ_Forget list
1002 		 * by jbd2_journal_forget, it may no longer be dirty and
1003 		 * there's no point in keeping a checkpoint record for
1004 		 * it. */
1005 
1006 		/*
1007 		 * A buffer which has been freed while still being journaled
1008 		 * by a previous transaction, refile the buffer to BJ_Forget of
1009 		 * the running transaction. If the just committed transaction
1010 		 * contains "add to orphan" operation, we can completely
1011 		 * invalidate the buffer now. We are rather thorough in that
1012 		 * since the buffer may be still accessible when blocksize <
1013 		 * pagesize and it is attached to the last partial page.
1014 		 */
1015 		if (buffer_freed(bh) && !jh->b_next_transaction) {
1016 			struct address_space *mapping;
1017 
1018 			clear_buffer_freed(bh);
1019 			clear_buffer_jbddirty(bh);
1020 
1021 			/*
1022 			 * Block device buffers need to stay mapped all the
1023 			 * time, so it is enough to clear buffer_jbddirty and
1024 			 * buffer_freed bits. For the file mapping buffers (i.e.
1025 			 * journalled data) we need to unmap buffer and clear
1026 			 * more bits. We also need to be careful about the check
1027 			 * because the data page mapping can get cleared under
1028 			 * our hands. Note that if mapping == NULL, we don't
1029 			 * need to make buffer unmapped because the page is
1030 			 * already detached from the mapping and buffers cannot
1031 			 * get reused.
1032 			 */
1033 			mapping = READ_ONCE(bh->b_folio->mapping);
1034 			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
1035 				clear_buffer_mapped(bh);
1036 				clear_buffer_new(bh);
1037 				clear_buffer_req(bh);
1038 				bh->b_bdev = NULL;
1039 			}
1040 		}
1041 
1042 		if (buffer_jbddirty(bh)) {
1043 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1044 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1045 			if (is_journal_aborted(journal))
1046 				clear_buffer_jbddirty(bh);
1047 		} else {
1048 			J_ASSERT_BH(bh, !buffer_dirty(bh));
1049 			/*
1050 			 * The buffer on BJ_Forget list and not jbddirty means
1051 			 * it has been freed by this transaction and hence it
1052 			 * could not have been reallocated until this
1053 			 * transaction has committed. *BUT* it could be
1054 			 * reallocated once we have written all the data to
1055 			 * disk and before we process the buffer on BJ_Forget
1056 			 * list.
1057 			 */
1058 			if (!jh->b_next_transaction)
1059 				try_to_free = 1;
1060 		}
1061 		JBUFFER_TRACE(jh, "refile or unfile buffer");
1062 		drop_ref = __jbd2_journal_refile_buffer(jh);
1063 		spin_unlock(&jh->b_state_lock);
1064 		if (drop_ref)
1065 			jbd2_journal_put_journal_head(jh);
1066 		if (try_to_free)
1067 			release_buffer_page(bh);	/* Drops bh reference */
1068 		else
1069 			__brelse(bh);
1070 		cond_resched_lock(&journal->j_list_lock);
1071 	}
1072 	spin_unlock(&journal->j_list_lock);
1073 	/*
1074 	 * This is a bit sleazy.  We use j_list_lock to protect transition
1075 	 * of a transaction into T_FINISHED state and calling
1076 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1077 	 * other checkpointing code processing the transaction...
1078 	 */
1079 	write_lock(&journal->j_state_lock);
1080 	spin_lock(&journal->j_list_lock);
1081 	/*
1082 	 * Now recheck if some buffers did not get attached to the transaction
1083 	 * while the lock was dropped...
1084 	 */
1085 	if (commit_transaction->t_forget) {
1086 		spin_unlock(&journal->j_list_lock);
1087 		write_unlock(&journal->j_state_lock);
1088 		goto restart_loop;
1089 	}
1090 
1091 	/* Add the transaction to the checkpoint list.
1092 	 * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1093 	 * under us because it is not marked as T_FINISHED yet. */
1094 	if (journal->j_checkpoint_transactions == NULL) {
1095 		journal->j_checkpoint_transactions = commit_transaction;
1096 		commit_transaction->t_cpnext = commit_transaction;
1097 		commit_transaction->t_cpprev = commit_transaction;
1098 	} else {
1099 		commit_transaction->t_cpnext =
1100 			journal->j_checkpoint_transactions;
1101 		commit_transaction->t_cpprev =
1102 			commit_transaction->t_cpnext->t_cpprev;
1103 		commit_transaction->t_cpnext->t_cpprev =
1104 			commit_transaction;
1105 		commit_transaction->t_cpprev->t_cpnext =
1106 				commit_transaction;
1107 	}
1108 	spin_unlock(&journal->j_list_lock);
1109 
1110 	/* Done with this transaction! */
1111 
1112 	jbd2_debug(3, "JBD2: commit phase 7\n");
1113 
1114 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1115 
1116 	commit_transaction->t_start = jiffies;
1117 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1118 					      commit_transaction->t_start);
1119 
1120 	/*
1121 	 * File the transaction statistics
1122 	 */
1123 	stats.ts_tid = commit_transaction->t_tid;
1124 	stats.run.rs_handle_count =
1125 		atomic_read(&commit_transaction->t_handle_count);
1126 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1127 			     commit_transaction->t_tid, &stats.run);
1128 	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1129 
1130 	commit_transaction->t_state = T_COMMIT_CALLBACK;
1131 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1132 	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
1133 	journal->j_committing_transaction = NULL;
1134 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1135 
1136 	/*
1137 	 * weight the commit time higher than the average time so we don't
1138 	 * react too strongly to vast changes in the commit time
1139 	 */
1140 	if (likely(journal->j_average_commit_time))
1141 		journal->j_average_commit_time = (commit_time +
1142 				journal->j_average_commit_time*3) / 4;
1143 	else
1144 		journal->j_average_commit_time = commit_time;
1145 
1146 	write_unlock(&journal->j_state_lock);
1147 
1148 	if (journal->j_commit_callback)
1149 		journal->j_commit_callback(journal, commit_transaction);
1150 	if (journal->j_fc_cleanup_callback)
1151 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
1152 
1153 	trace_jbd2_end_commit(journal, commit_transaction);
1154 	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
1155 		  journal->j_commit_sequence, journal->j_tail_sequence);
1156 
1157 	write_lock(&journal->j_state_lock);
1158 	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1159 	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1160 	spin_lock(&journal->j_list_lock);
1161 	commit_transaction->t_state = T_FINISHED;
1162 	/* Check if the transaction can be dropped now that we are finished */
1163 	if (commit_transaction->t_checkpoint_list == NULL) {
1164 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1165 		jbd2_journal_free_transaction(commit_transaction);
1166 	}
1167 	spin_unlock(&journal->j_list_lock);
1168 	write_unlock(&journal->j_state_lock);
1169 	wake_up(&journal->j_wait_done_commit);
1170 	wake_up(&journal->j_fc_wait);
1171 
1172 	/*
1173 	 * Calculate overall stats
1174 	 */
1175 	spin_lock(&journal->j_history_lock);
1176 	journal->j_stats.ts_tid++;
1177 	journal->j_stats.ts_requested += stats.ts_requested;
1178 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1179 	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1180 	journal->j_stats.run.rs_running += stats.run.rs_running;
1181 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1182 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1183 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1184 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1185 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1186 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1187 	spin_unlock(&journal->j_history_lock);
1188 }
1189