xref: /linux/fs/jbd2/commit.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * linux/fs/jbd2/commit.c
4  *
5  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6  *
7  * Copyright 1998 Red Hat corp --- All Rights Reserved
8  *
9  * Journal commit routines for the generic filesystem journaling code;
10  * part of the ext2fs journaling system.
11  */
12 
13 #include <linux/time.h>
14 #include <linux/fs.h>
15 #include <linux/jbd2.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/mm.h>
19 #include <linux/pagemap.h>
20 #include <linux/jiffies.h>
21 #include <linux/crc32.h>
22 #include <linux/writeback.h>
23 #include <linux/backing-dev.h>
24 #include <linux/bio.h>
25 #include <linux/blkdev.h>
26 #include <linux/bitops.h>
27 #include <trace/events/jbd2.h>
28 
29 /*
30  * IO end handler for temporary buffer_heads handling writes to the journal.
31  */
32 static void journal_end_buffer_io_sync(struct bio *bio)
33 {
34 	struct buffer_head *bh;
35 	bool uptodate = bio_endio_bh(bio, &bh);
36 	struct buffer_head *orig_bh = bh->b_private;
37 
38 	BUFFER_TRACE(bh, "");
39 	if (uptodate)
40 		set_buffer_uptodate(bh);
41 	else
42 		clear_buffer_uptodate(bh);
43 	if (orig_bh) {
44 		clear_and_wake_up_bit(BH_Shadow, &orig_bh->b_state);
45 	}
46 	unlock_buffer(bh);
47 }
48 
49 /*
50  * When an ext4 file is truncated, it is possible that some pages are not
51  * successfully freed, because they are attached to a committing transaction.
52  * After the transaction commits, these pages are left on the LRU, with no
53  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
54  * by the VM, but their apparent absence upsets the VM accounting, and it makes
55  * the numbers in /proc/meminfo look odd.
56  *
57  * So here, we have a buffer which has just come off the forget list.  Look to
58  * see if we can strip all buffers from the backing page.
59  *
60  * Called under j_list_lock. The caller provided us with a ref against the
61  * buffer, and we drop that here.
62  */
63 static void release_buffer_page(struct buffer_head *bh)
64 {
65 	struct folio *folio;
66 
67 	if (buffer_dirty(bh))
68 		goto nope;
69 	if (atomic_read(&bh->b_count) != 1)
70 		goto nope;
71 	folio = bh->b_folio;
72 	if (folio->mapping)
73 		goto nope;
74 
75 	/* OK, it's a truncated page */
76 	if (!folio_trylock(folio))
77 		goto nope;
78 
79 	folio_get(folio);
80 	__brelse(bh);
81 	try_to_free_buffers(folio);
82 	folio_unlock(folio);
83 	folio_put(folio);
84 	return;
85 
86 nope:
87 	__brelse(bh);
88 }
89 
90 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
91 {
92 	struct commit_header *h;
93 	__u32 csum;
94 
95 	if (!jbd2_journal_has_csum_v2or3(j))
96 		return;
97 
98 	h = (struct commit_header *)(bh->b_data);
99 	h->h_chksum_type = 0;
100 	h->h_chksum_size = 0;
101 	h->h_chksum[0] = 0;
102 	csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
103 	h->h_chksum[0] = cpu_to_be32(csum);
104 }
105 
106 /*
107  * Done it all: now submit the commit record.  We should have
108  * cleaned up our previous buffers by now, so if we are in abort
109  * mode we can now just skip the rest of the journal write
110  * entirely.
111  *
112  * Returns 1 if the journal needs to be aborted or 0 on success
113  */
114 static int journal_submit_commit_record(journal_t *journal,
115 					transaction_t *commit_transaction,
116 					struct buffer_head **cbh,
117 					__u32 crc32_sum)
118 {
119 	struct commit_header *tmp;
120 	struct buffer_head *bh;
121 	struct timespec64 now;
122 	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
123 
124 	*cbh = NULL;
125 
126 	if (is_journal_aborted(journal))
127 		return 0;
128 
129 	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
130 						JBD2_COMMIT_BLOCK);
131 	if (!bh)
132 		return 1;
133 
134 	tmp = (struct commit_header *)bh->b_data;
135 	ktime_get_coarse_real_ts64(&now);
136 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
137 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
138 
139 	if (jbd2_has_feature_checksum(journal)) {
140 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
141 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
142 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
143 	}
144 	jbd2_commit_block_csum_set(journal, bh);
145 
146 	BUFFER_TRACE(bh, "submit commit block");
147 	lock_buffer(bh);
148 	clear_buffer_dirty(bh);
149 	set_buffer_uptodate(bh);
150 
151 	if (journal->j_flags & JBD2_BARRIER &&
152 	    !jbd2_has_feature_async_commit(journal))
153 		write_flags |= REQ_PREFLUSH | REQ_FUA;
154 
155 	bh_submit(bh, write_flags, journal_end_buffer_io_sync);
156 	*cbh = bh;
157 	return 0;
158 }
159 
160 /*
161  * This function along with journal_submit_commit_record
162  * allows to write the commit record asynchronously.
163  */
164 static int journal_wait_on_commit_record(journal_t *journal,
165 					 struct buffer_head *bh)
166 {
167 	int ret = 0;
168 
169 	clear_buffer_dirty(bh);
170 	wait_on_buffer(bh);
171 
172 	if (unlikely(!buffer_uptodate(bh)))
173 		ret = -EIO;
174 	put_bh(bh);            /* One for getblk() */
175 
176 	return ret;
177 }
178 
179 /* Send all the data buffers related to an inode */
180 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
181 {
182 	unsigned long flags;
183 
184 	if (!jinode)
185 		return 0;
186 
187 	flags = READ_ONCE(jinode->i_flags);
188 	if (!(flags & JI_WRITE_DATA))
189 		return 0;
190 
191 	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
192 	return journal->j_submit_inode_data_buffers(jinode);
193 
194 }
195 EXPORT_SYMBOL(jbd2_submit_inode_data);
196 
197 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
198 {
199 	struct address_space *mapping;
200 	struct inode *inode;
201 	unsigned long flags;
202 	loff_t start_byte, end_byte;
203 
204 	if (!jinode)
205 		return 0;
206 
207 	flags = READ_ONCE(jinode->i_flags);
208 	if (!(flags & JI_WAIT_DATA))
209 		return 0;
210 
211 	inode = jinode->i_vfs_inode;
212 	if (!inode)
213 		return 0;
214 
215 	mapping = inode->i_mapping;
216 	if (!mapping)
217 		return 0;
218 
219 	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
220 		return 0;
221 	return filemap_fdatawait_range_keep_errors(
222 		mapping, start_byte, end_byte);
223 }
224 EXPORT_SYMBOL(jbd2_wait_inode_data);
225 
226 /*
227  * Submit all the data buffers of inode associated with the transaction to
228  * disk.
229  *
230  * We are in a committing transaction. Therefore no new inode can be added to
231  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
232  * operate on from being released while we write out pages.
233  */
234 static int journal_submit_data_buffers(journal_t *journal,
235 		transaction_t *commit_transaction)
236 {
237 	struct jbd2_inode *jinode;
238 	int err, ret = 0;
239 
240 	spin_lock(&journal->j_list_lock);
241 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
242 		if (!(jinode->i_flags & JI_WRITE_DATA))
243 			continue;
244 		WRITE_ONCE(jinode->i_flags,
245 			   jinode->i_flags | JI_COMMIT_RUNNING);
246 		spin_unlock(&journal->j_list_lock);
247 		/* submit the inode data buffers. */
248 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
249 		if (journal->j_submit_inode_data_buffers) {
250 			err = journal->j_submit_inode_data_buffers(jinode);
251 			if (!ret)
252 				ret = err;
253 		}
254 		spin_lock(&journal->j_list_lock);
255 		J_ASSERT(jinode->i_transaction == commit_transaction);
256 		WRITE_ONCE(jinode->i_flags,
257 			   jinode->i_flags & ~JI_COMMIT_RUNNING);
258 		smp_mb();
259 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
260 	}
261 	spin_unlock(&journal->j_list_lock);
262 	return ret;
263 }
264 
265 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
266 {
267 	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
268 	loff_t start_byte, end_byte;
269 
270 	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
271 		return 0;
272 
273 	return filemap_fdatawait_range_keep_errors(mapping,
274 						   start_byte, end_byte);
275 }
276 
277 /*
278  * Wait for data submitted for writeout, refile inodes to proper
279  * transaction if needed.
280  *
281  */
282 static int journal_finish_inode_data_buffers(journal_t *journal,
283 		transaction_t *commit_transaction)
284 {
285 	struct jbd2_inode *jinode, *next_i;
286 	int err, ret = 0;
287 
288 	/* For locking, see the comment in journal_submit_data_buffers() */
289 	spin_lock(&journal->j_list_lock);
290 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
291 		if (!(jinode->i_flags & JI_WAIT_DATA))
292 			continue;
293 		WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING);
294 		spin_unlock(&journal->j_list_lock);
295 		/* wait for the inode data buffers writeout. */
296 		if (journal->j_finish_inode_data_buffers) {
297 			err = journal->j_finish_inode_data_buffers(jinode);
298 			if (!ret)
299 				ret = err;
300 		}
301 		cond_resched();
302 		spin_lock(&journal->j_list_lock);
303 		WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING);
304 		smp_mb();
305 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
306 	}
307 
308 	/* Now refile inode to proper lists */
309 	list_for_each_entry_safe(jinode, next_i,
310 				 &commit_transaction->t_inode_list, i_list) {
311 		list_del(&jinode->i_list);
312 		if (jinode->i_next_transaction) {
313 			jinode->i_transaction = jinode->i_next_transaction;
314 			jinode->i_next_transaction = NULL;
315 			list_add(&jinode->i_list,
316 				&jinode->i_transaction->t_inode_list);
317 		} else {
318 			jinode->i_transaction = NULL;
319 			WRITE_ONCE(jinode->i_dirty_start_page, 0);
320 			WRITE_ONCE(jinode->i_dirty_end_page, 0);
321 		}
322 	}
323 	spin_unlock(&journal->j_list_lock);
324 
325 	return ret;
326 }
327 
328 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
329 {
330 	char *addr;
331 	__u32 checksum;
332 
333 	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
334 	checksum = crc32_be(crc32_sum, addr, bh->b_size);
335 	kunmap_local(addr);
336 
337 	return checksum;
338 }
339 
340 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
341 				   unsigned long long block)
342 {
343 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
344 	if (jbd2_has_feature_64bit(j))
345 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
346 }
347 
348 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
349 				    struct buffer_head *bh, __u32 sequence)
350 {
351 	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
352 	__u8 *addr;
353 	__u32 csum32;
354 	__be32 seq;
355 
356 	if (!jbd2_journal_has_csum_v2or3(j))
357 		return;
358 
359 	seq = cpu_to_be32(sequence);
360 	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
361 	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
362 	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
363 	kunmap_local(addr);
364 
365 	if (jbd2_has_feature_csum3(j))
366 		tag3->t_checksum = cpu_to_be32(csum32);
367 	else
368 		tag->t_checksum = cpu_to_be16(csum32);
369 }
370 /*
371  * jbd2_journal_commit_transaction
372  *
373  * The primary function for committing a transaction to the log.  This
374  * function is called by the journal thread to begin a complete commit.
375  */
376 void jbd2_journal_commit_transaction(journal_t *journal)
377 {
378 	struct transaction_stats_s stats;
379 	transaction_t *commit_transaction;
380 	struct journal_head *jh;
381 	struct buffer_head *descriptor;
382 	struct buffer_head **wbuf = journal->j_wbuf;
383 	int bufs;
384 	int escape;
385 	int err;
386 	unsigned long long blocknr;
387 	ktime_t start_time;
388 	u64 commit_time;
389 	char *tagp = NULL;
390 	journal_block_tag_t *tag = NULL;
391 	int space_left = 0;
392 	int first_tag = 0;
393 	int tag_flag;
394 	int i;
395 	int tag_bytes = journal_tag_bytes(journal);
396 	struct buffer_head *cbh = NULL; /* For transactional checksums */
397 	__u32 crc32_sum = ~0;
398 	struct blk_plug plug;
399 	/* Tail of the journal */
400 	unsigned long first_block;
401 	tid_t first_tid;
402 	int update_tail;
403 	int csum_size = 0;
404 	LIST_HEAD(io_bufs);
405 	LIST_HEAD(log_bufs);
406 
407 	if (jbd2_journal_has_csum_v2or3(journal))
408 		csum_size = sizeof(struct jbd2_journal_block_tail);
409 
410 	/*
411 	 * First job: lock down the current transaction and wait for
412 	 * all outstanding updates to complete.
413 	 */
414 
415 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
416 	if (journal->j_flags & JBD2_FLUSHED) {
417 		jbd2_debug(3, "super block updated\n");
418 		mutex_lock_io(&journal->j_checkpoint_mutex);
419 		/*
420 		 * We hold j_checkpoint_mutex so tail cannot change under us.
421 		 * We don't need any special data guarantees for writing sb
422 		 * since journal is empty and it is ok for write to be
423 		 * flushed only with transaction commit.
424 		 */
425 		jbd2_journal_update_sb_log_tail(journal,
426 						journal->j_tail_sequence,
427 						journal->j_tail, 0);
428 		mutex_unlock(&journal->j_checkpoint_mutex);
429 	} else {
430 		jbd2_debug(3, "superblock not updated\n");
431 	}
432 
433 	J_ASSERT(journal->j_running_transaction != NULL);
434 	J_ASSERT(journal->j_committing_transaction == NULL);
435 
436 	write_lock(&journal->j_state_lock);
437 	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
438 	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
439 		DEFINE_WAIT(wait);
440 
441 		prepare_to_wait(&journal->j_fc_wait, &wait,
442 				TASK_UNINTERRUPTIBLE);
443 		write_unlock(&journal->j_state_lock);
444 		schedule();
445 		write_lock(&journal->j_state_lock);
446 		finish_wait(&journal->j_fc_wait, &wait);
447 		/*
448 		 * TODO: by blocking fast commits here, we are increasing
449 		 * fsync() latency slightly. Strictly speaking, we don't need
450 		 * to block fast commits until the transaction enters T_FLUSH
451 		 * state. So an optimization is possible where we block new fast
452 		 * commits here and wait for existing ones to complete
453 		 * just before we enter T_FLUSH. That way, the existing fast
454 		 * commits and this full commit can proceed parallely.
455 		 */
456 	}
457 	write_unlock(&journal->j_state_lock);
458 
459 	commit_transaction = journal->j_running_transaction;
460 
461 	trace_jbd2_start_commit(journal, commit_transaction);
462 	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
463 			commit_transaction->t_tid);
464 
465 	write_lock(&journal->j_state_lock);
466 	journal->j_fc_off = 0;
467 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
468 	commit_transaction->t_state = T_LOCKED;
469 
470 	trace_jbd2_commit_locking(journal, commit_transaction);
471 	stats.run.rs_wait = commit_transaction->t_max_wait;
472 	stats.run.rs_request_delay = 0;
473 	stats.run.rs_locked = jiffies;
474 	if (commit_transaction->t_requested)
475 		stats.run.rs_request_delay =
476 			jbd2_time_diff(commit_transaction->t_requested,
477 				       stats.run.rs_locked);
478 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
479 					      stats.run.rs_locked);
480 
481 	// waits for any t_updates to finish
482 	jbd2_journal_wait_updates(journal);
483 
484 	commit_transaction->t_state = T_SWITCH;
485 
486 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
487 			journal->j_max_transaction_buffers);
488 
489 	/*
490 	 * First thing we are allowed to do is to discard any remaining
491 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
492 	 * that there are no such buffers: if a large filesystem
493 	 * operation like a truncate needs to split itself over multiple
494 	 * transactions, then it may try to do a jbd2_journal_restart() while
495 	 * there are still BJ_Reserved buffers outstanding.  These must
496 	 * be released cleanly from the current transaction.
497 	 *
498 	 * In this case, the filesystem must still reserve write access
499 	 * again before modifying the buffer in the new transaction, but
500 	 * we do not require it to remember exactly which old buffers it
501 	 * has reserved.  This is consistent with the existing behaviour
502 	 * that multiple jbd2_journal_get_write_access() calls to the same
503 	 * buffer are perfectly permissible.
504 	 * We use journal->j_state_lock here to serialize processing of
505 	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
506 	 */
507 	while (commit_transaction->t_reserved_list) {
508 		jh = commit_transaction->t_reserved_list;
509 		JBUFFER_TRACE(jh, "reserved, unused: refile");
510 		/*
511 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
512 		 * leave undo-committed data.
513 		 */
514 		if (jh->b_committed_data) {
515 			spin_lock(&jh->b_state_lock);
516 			kfree(jh->b_committed_data);
517 			jh->b_committed_data = NULL;
518 			spin_unlock(&jh->b_state_lock);
519 		}
520 		jbd2_journal_refile_buffer(journal, jh);
521 	}
522 
523 	write_unlock(&journal->j_state_lock);
524 	/*
525 	 * Now try to drop any written-back buffers from the journal's
526 	 * checkpoint lists.  We do this *before* commit because it potentially
527 	 * frees some memory
528 	 */
529 	spin_lock(&journal->j_list_lock);
530 	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
531 	spin_unlock(&journal->j_list_lock);
532 
533 	jbd2_debug(3, "JBD2: commit phase 1\n");
534 
535 	/*
536 	 * Clear revoked flag to reflect there is no revoked buffers
537 	 * in the next transaction which is going to be started.
538 	 */
539 	jbd2_clear_buffer_revoked_flags(journal);
540 
541 	/*
542 	 * Switch to a new revoke table.
543 	 */
544 	jbd2_journal_switch_revoke_table(journal);
545 
546 	write_lock(&journal->j_state_lock);
547 	/*
548 	 * Reserved credits cannot be claimed anymore, free them
549 	 */
550 	atomic_sub(atomic_read(&journal->j_reserved_credits),
551 		   &commit_transaction->t_outstanding_credits);
552 
553 	trace_jbd2_commit_flushing(journal, commit_transaction);
554 	stats.run.rs_flushing = jiffies;
555 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
556 					     stats.run.rs_flushing);
557 
558 	commit_transaction->t_state = T_FLUSH;
559 	journal->j_committing_transaction = commit_transaction;
560 	journal->j_running_transaction = NULL;
561 	start_time = ktime_get();
562 	commit_transaction->t_log_start = journal->j_head;
563 	wake_up_all(&journal->j_wait_transaction_locked);
564 	write_unlock(&journal->j_state_lock);
565 
566 	jbd2_debug(3, "JBD2: commit phase 2a\n");
567 
568 	/*
569 	 * Now start flushing things to disk, in the order they appear
570 	 * on the transaction lists.  Data blocks go first.
571 	 */
572 	err = journal_submit_data_buffers(journal, commit_transaction);
573 	if (err)
574 		jbd2_journal_abort(journal, err);
575 
576 	blk_start_plug(&plug);
577 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
578 
579 	jbd2_debug(3, "JBD2: commit phase 2b\n");
580 
581 	/*
582 	 * Way to go: we have now written out all of the data for a
583 	 * transaction!  Now comes the tricky part: we need to write out
584 	 * metadata.  Loop over the transaction's entire buffer list:
585 	 */
586 	write_lock(&journal->j_state_lock);
587 	commit_transaction->t_state = T_COMMIT;
588 	write_unlock(&journal->j_state_lock);
589 
590 	trace_jbd2_commit_logging(journal, commit_transaction);
591 	stats.run.rs_logging = jiffies;
592 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
593 					       stats.run.rs_logging);
594 	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
595 	stats.run.rs_blocks_logged = 0;
596 
597 	J_ASSERT(commit_transaction->t_nr_buffers <=
598 		 atomic_read(&commit_transaction->t_outstanding_credits));
599 
600 	bufs = 0;
601 	descriptor = NULL;
602 	while (commit_transaction->t_buffers) {
603 
604 		/* Find the next buffer to be journaled... */
605 
606 		jh = commit_transaction->t_buffers;
607 
608 		/* If we're in abort mode, we just un-journal the buffer and
609 		   release it. */
610 
611 		if (is_journal_aborted(journal)) {
612 			clear_buffer_jbddirty(jh2bh(jh));
613 			JBUFFER_TRACE(jh, "journal is aborting: refile");
614 			jbd2_buffer_abort_trigger(jh,
615 						  jh->b_frozen_data ?
616 						  jh->b_frozen_triggers :
617 						  jh->b_triggers);
618 			jbd2_journal_refile_buffer(journal, jh);
619 			/* If that was the last one, we need to clean up
620 			 * any descriptor buffers which may have been
621 			 * already allocated, even if we are now
622 			 * aborting. */
623 			if (!commit_transaction->t_buffers)
624 				goto start_journal_io;
625 			continue;
626 		}
627 
628 		/* Make sure we have a descriptor block in which to
629 		   record the metadata buffer. */
630 
631 		if (!descriptor) {
632 			J_ASSERT (bufs == 0);
633 
634 			jbd2_debug(4, "JBD2: get descriptor\n");
635 
636 			descriptor = jbd2_journal_get_descriptor_buffer(
637 							commit_transaction,
638 							JBD2_DESCRIPTOR_BLOCK);
639 			if (!descriptor) {
640 				jbd2_journal_abort(journal, -EIO);
641 				continue;
642 			}
643 
644 			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
645 				(unsigned long long)descriptor->b_blocknr,
646 				descriptor->b_data);
647 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
648 			space_left = descriptor->b_size -
649 						sizeof(journal_header_t);
650 			first_tag = 1;
651 			set_buffer_jwrite(descriptor);
652 			set_buffer_dirty(descriptor);
653 			wbuf[bufs++] = descriptor;
654 
655 			/* Record it so that we can wait for IO
656                            completion later */
657 			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
658 			jbd2_file_log_bh(&log_bufs, descriptor);
659 		}
660 
661 		/* Where is the buffer to be written? */
662 
663 		err = jbd2_journal_next_log_block(journal, &blocknr);
664 		/* If the block mapping failed, just abandon the buffer
665 		   and repeat this loop: we'll fall into the
666 		   refile-on-abort condition above. */
667 		if (err) {
668 			jbd2_journal_abort(journal, err);
669 			continue;
670 		}
671 
672 		/*
673 		 * start_this_handle() uses t_outstanding_credits to determine
674 		 * the free space in the log.
675 		 */
676 		atomic_dec(&commit_transaction->t_outstanding_credits);
677 
678 		/* Bump b_count to prevent truncate from stumbling over
679                    the shadowed buffer!  @@@ This can go if we ever get
680                    rid of the shadow pairing of buffers. */
681 		atomic_inc(&jh2bh(jh)->b_count);
682 
683 		/*
684 		 * Make a temporary IO buffer with which to write it out
685 		 * (this will requeue the metadata buffer to BJ_Shadow).
686 		 */
687 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
688 		JBUFFER_TRACE(jh, "ph3: write metadata");
689 		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
690 						jh, &wbuf[bufs], blocknr);
691 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
692 
693 		/* Record the new block's tag in the current descriptor
694                    buffer */
695 
696 		tag_flag = 0;
697 		if (escape)
698 			tag_flag |= JBD2_FLAG_ESCAPE;
699 		if (!first_tag)
700 			tag_flag |= JBD2_FLAG_SAME_UUID;
701 
702 		tag = (journal_block_tag_t *) tagp;
703 		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
704 		tag->t_flags = cpu_to_be16(tag_flag);
705 		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
706 					commit_transaction->t_tid);
707 		tagp += tag_bytes;
708 		space_left -= tag_bytes;
709 		bufs++;
710 
711 		if (first_tag) {
712 			memcpy (tagp, journal->j_uuid, 16);
713 			tagp += 16;
714 			space_left -= 16;
715 			first_tag = 0;
716 		}
717 
718 		/* If there's no more to do, or if the descriptor is full,
719 		   let the IO rip! */
720 
721 		if (bufs == journal->j_wbufsize ||
722 		    commit_transaction->t_buffers == NULL ||
723 		    space_left < tag_bytes + 16 + csum_size) {
724 
725 			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
726 
727 			/* Write an end-of-descriptor marker before
728                            submitting the IOs.  "tag" still points to
729                            the last tag we set up. */
730 
731 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
732 start_journal_io:
733 			if (descriptor)
734 				jbd2_descriptor_block_csum_set(journal,
735 							descriptor);
736 
737 			for (i = 0; i < bufs; i++) {
738 				struct buffer_head *bh = wbuf[i];
739 
740 				/*
741 				 * Compute checksum.
742 				 */
743 				if (jbd2_has_feature_checksum(journal)) {
744 					crc32_sum =
745 					    jbd2_checksum_data(crc32_sum, bh);
746 				}
747 
748 				lock_buffer(bh);
749 				clear_buffer_dirty(bh);
750 				set_buffer_uptodate(bh);
751 				bh_submit(bh,
752 					REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
753 					journal_end_buffer_io_sync);
754 			}
755 			cond_resched();
756 
757 			/* Force a new descriptor to be generated next
758                            time round the loop. */
759 			descriptor = NULL;
760 			bufs = 0;
761 		}
762 	}
763 
764 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
765 	if (err) {
766 		printk(KERN_WARNING
767 			"JBD2: Detected IO errors %d while flushing file data on %s\n",
768 			err, journal->j_devname);
769 		err = 0;
770 	}
771 
772 	/*
773 	 * Get current oldest transaction in the log before we issue flush
774 	 * to the filesystem device. After the flush we can be sure that
775 	 * blocks of all older transactions are checkpointed to persistent
776 	 * storage and we will be safe to update journal start in the
777 	 * superblock with the numbers we get here.
778 	 */
779 	update_tail =
780 		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
781 
782 	write_lock(&journal->j_state_lock);
783 	if (update_tail) {
784 		long freed = first_block - journal->j_tail;
785 
786 		if (first_block < journal->j_tail)
787 			freed += journal->j_last - journal->j_first;
788 		/* Update tail only if we free significant amount of space */
789 		if (freed < journal->j_max_transaction_buffers)
790 			update_tail = 0;
791 	}
792 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
793 	commit_transaction->t_state = T_COMMIT_DFLUSH;
794 	write_unlock(&journal->j_state_lock);
795 
796 	/*
797 	 * If the journal is not located on the file system device,
798 	 * then we must flush the file system device before we issue
799 	 * the commit record and update the journal tail sequence.
800 	 */
801 	if ((commit_transaction->t_need_data_flush || update_tail) &&
802 	    (journal->j_fs_dev != journal->j_dev) &&
803 	    (journal->j_flags & JBD2_BARRIER))
804 		blkdev_issue_flush(journal->j_fs_dev);
805 
806 	/* Done it all: now write the commit record asynchronously. */
807 	if (jbd2_has_feature_async_commit(journal)) {
808 		err = journal_submit_commit_record(journal, commit_transaction,
809 						 &cbh, crc32_sum);
810 		if (err)
811 			jbd2_journal_abort(journal, err);
812 	}
813 
814 	blk_finish_plug(&plug);
815 
816 	/* Lo and behold: we have just managed to send a transaction to
817            the log.  Before we can commit it, wait for the IO so far to
818            complete.  Control buffers being written are on the
819            transaction's t_log_list queue, and metadata buffers are on
820            the io_bufs list.
821 
822 	   Wait for the buffers in reverse order.  That way we are
823 	   less likely to be woken up until all IOs have completed, and
824 	   so we incur less scheduling load.
825 	*/
826 
827 	jbd2_debug(3, "JBD2: commit phase 3\n");
828 
829 	while (!list_empty(&io_bufs)) {
830 		struct buffer_head *bh = list_entry(io_bufs.prev,
831 						    struct buffer_head,
832 						    b_assoc_buffers);
833 
834 		wait_on_buffer(bh);
835 		cond_resched();
836 
837 		if (unlikely(!buffer_uptodate(bh)))
838 			err = -EIO;
839 		jbd2_unfile_log_bh(bh);
840 		stats.run.rs_blocks_logged++;
841 
842 		/*
843 		 * The list contains temporary buffer heads created by
844 		 * jbd2_journal_write_metadata_buffer().
845 		 */
846 		BUFFER_TRACE(bh, "dumping temporary bh");
847 		__brelse(bh);
848 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
849 		free_buffer_head(bh);
850 
851 		/* We also have to refile the corresponding shadowed buffer */
852 		jh = commit_transaction->t_shadow_list->b_tprev;
853 		bh = jh2bh(jh);
854 		clear_buffer_jwrite(bh);
855 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
856 		J_ASSERT_BH(bh, !buffer_shadow(bh));
857 
858 		/* The metadata is now released for reuse, but we need
859                    to remember it against this transaction so that when
860                    we finally commit, we can do any checkpointing
861                    required. */
862 		JBUFFER_TRACE(jh, "file as BJ_Forget");
863 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
864 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
865 		__brelse(bh);
866 	}
867 
868 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
869 
870 	jbd2_debug(3, "JBD2: commit phase 4\n");
871 
872 	/* Here we wait for the revoke record and descriptor record buffers */
873 	while (!list_empty(&log_bufs)) {
874 		struct buffer_head *bh;
875 
876 		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
877 		wait_on_buffer(bh);
878 		cond_resched();
879 
880 		if (unlikely(!buffer_uptodate(bh)))
881 			err = -EIO;
882 
883 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
884 		clear_buffer_jwrite(bh);
885 		jbd2_unfile_log_bh(bh);
886 		stats.run.rs_blocks_logged++;
887 		__brelse(bh);		/* One for getblk */
888 		/* AKPM: bforget here */
889 	}
890 
891 	if (err)
892 		jbd2_journal_abort(journal, err);
893 
894 	jbd2_debug(3, "JBD2: commit phase 5\n");
895 	write_lock(&journal->j_state_lock);
896 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
897 	commit_transaction->t_state = T_COMMIT_JFLUSH;
898 	write_unlock(&journal->j_state_lock);
899 
900 	if (!jbd2_has_feature_async_commit(journal)) {
901 		err = journal_submit_commit_record(journal, commit_transaction,
902 						&cbh, crc32_sum);
903 		if (err)
904 			jbd2_journal_abort(journal, err);
905 	}
906 	if (cbh)
907 		err = journal_wait_on_commit_record(journal, cbh);
908 	stats.run.rs_blocks_logged++;
909 	if (jbd2_has_feature_async_commit(journal) &&
910 	    journal->j_flags & JBD2_BARRIER) {
911 		blkdev_issue_flush(journal->j_dev);
912 	}
913 
914 	if (err)
915 		jbd2_journal_abort(journal, err);
916 
917 	WARN_ON_ONCE(
918 		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
919 
920 	/*
921 	 * Now disk caches for filesystem device are flushed so we are safe to
922 	 * erase checkpointed transactions from the log by updating journal
923 	 * superblock.
924 	 */
925 	if (update_tail)
926 		jbd2_update_log_tail(journal, first_tid, first_block);
927 
928 	/* End of a transaction!  Finally, we can do checkpoint
929            processing: any buffers committed as a result of this
930            transaction can be removed from any checkpoint list it was on
931            before. */
932 
933 	jbd2_debug(3, "JBD2: commit phase 6\n");
934 
935 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
936 	J_ASSERT(commit_transaction->t_buffers == NULL);
937 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
938 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
939 
940 restart_loop:
941 	/*
942 	 * As there are other places (journal_unmap_buffer()) adding buffers
943 	 * to this list we have to be careful and hold the j_list_lock.
944 	 */
945 	spin_lock(&journal->j_list_lock);
946 	while (commit_transaction->t_forget) {
947 		transaction_t *cp_transaction;
948 		struct buffer_head *bh;
949 		int try_to_free = 0;
950 		bool drop_ref;
951 
952 		jh = commit_transaction->t_forget;
953 		spin_unlock(&journal->j_list_lock);
954 		bh = jh2bh(jh);
955 		/*
956 		 * Get a reference so that bh cannot be freed before we are
957 		 * done with it.
958 		 */
959 		get_bh(bh);
960 		spin_lock(&jh->b_state_lock);
961 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
962 
963 		/*
964 		 * If there is undo-protected committed data against
965 		 * this buffer, then we can remove it now.  If it is a
966 		 * buffer needing such protection, the old frozen_data
967 		 * field now points to a committed version of the
968 		 * buffer, so rotate that field to the new committed
969 		 * data.
970 		 *
971 		 * Otherwise, we can just throw away the frozen data now.
972 		 *
973 		 * We also know that the frozen data has already fired
974 		 * its triggers if they exist, so we can clear that too.
975 		 */
976 		if (jh->b_committed_data) {
977 			kfree(jh->b_committed_data);
978 			jh->b_committed_data = NULL;
979 			if (jh->b_frozen_data) {
980 				jh->b_committed_data = jh->b_frozen_data;
981 				jh->b_frozen_data = NULL;
982 				jh->b_frozen_triggers = NULL;
983 			}
984 		} else if (jh->b_frozen_data) {
985 			kfree(jh->b_frozen_data);
986 			jh->b_frozen_data = NULL;
987 			jh->b_frozen_triggers = NULL;
988 		}
989 
990 		spin_lock(&journal->j_list_lock);
991 		cp_transaction = jh->b_cp_transaction;
992 		if (cp_transaction) {
993 			JBUFFER_TRACE(jh, "remove from old cp transaction");
994 			cp_transaction->t_chp_stats.cs_dropped++;
995 			__jbd2_journal_remove_checkpoint(jh);
996 		}
997 
998 		/* Only re-checkpoint the buffer_head if it is marked
999 		 * dirty.  If the buffer was added to the BJ_Forget list
1000 		 * by jbd2_journal_forget, it may no longer be dirty and
1001 		 * there's no point in keeping a checkpoint record for
1002 		 * it. */
1003 
1004 		/*
1005 		 * A buffer which has been freed while still being journaled
1006 		 * by a previous transaction, refile the buffer to BJ_Forget of
1007 		 * the running transaction. If the just committed transaction
1008 		 * contains "add to orphan" operation, we can completely
1009 		 * invalidate the buffer now. We are rather through in that
1010 		 * since the buffer may be still accessible when blocksize <
1011 		 * pagesize and it is attached to the last partial page.
1012 		 */
1013 		if (buffer_freed(bh) && !jh->b_next_transaction) {
1014 			struct address_space *mapping;
1015 
1016 			clear_buffer_freed(bh);
1017 			clear_buffer_jbddirty(bh);
1018 
1019 			/*
1020 			 * Block device buffers need to stay mapped all the
1021 			 * time, so it is enough to clear buffer_jbddirty and
1022 			 * buffer_freed bits. For the file mapping buffers (i.e.
1023 			 * journalled data) we need to unmap buffer and clear
1024 			 * more bits. We also need to be careful about the check
1025 			 * because the data page mapping can get cleared under
1026 			 * our hands. Note that if mapping == NULL, we don't
1027 			 * need to make buffer unmapped because the page is
1028 			 * already detached from the mapping and buffers cannot
1029 			 * get reused.
1030 			 */
1031 			mapping = READ_ONCE(bh->b_folio->mapping);
1032 			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
1033 				clear_buffer_mapped(bh);
1034 				clear_buffer_new(bh);
1035 				clear_buffer_req(bh);
1036 				bh->b_bdev = NULL;
1037 			}
1038 		}
1039 
1040 		if (buffer_jbddirty(bh)) {
1041 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1042 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1043 			if (is_journal_aborted(journal))
1044 				clear_buffer_jbddirty(bh);
1045 		} else {
1046 			J_ASSERT_BH(bh, !buffer_dirty(bh));
1047 			/*
1048 			 * The buffer on BJ_Forget list and not jbddirty means
1049 			 * it has been freed by this transaction and hence it
1050 			 * could not have been reallocated until this
1051 			 * transaction has committed. *BUT* it could be
1052 			 * reallocated once we have written all the data to
1053 			 * disk and before we process the buffer on BJ_Forget
1054 			 * list.
1055 			 */
1056 			if (!jh->b_next_transaction)
1057 				try_to_free = 1;
1058 		}
1059 		JBUFFER_TRACE(jh, "refile or unfile buffer");
1060 		drop_ref = __jbd2_journal_refile_buffer(jh);
1061 		spin_unlock(&jh->b_state_lock);
1062 		if (drop_ref)
1063 			jbd2_journal_put_journal_head(jh);
1064 		if (try_to_free)
1065 			release_buffer_page(bh);	/* Drops bh reference */
1066 		else
1067 			__brelse(bh);
1068 		cond_resched_lock(&journal->j_list_lock);
1069 	}
1070 	spin_unlock(&journal->j_list_lock);
1071 	/*
1072 	 * This is a bit sleazy.  We use j_list_lock to protect transition
1073 	 * of a transaction into T_FINISHED state and calling
1074 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1075 	 * other checkpointing code processing the transaction...
1076 	 */
1077 	write_lock(&journal->j_state_lock);
1078 	spin_lock(&journal->j_list_lock);
1079 	/*
1080 	 * Now recheck if some buffers did not get attached to the transaction
1081 	 * while the lock was dropped...
1082 	 */
1083 	if (commit_transaction->t_forget) {
1084 		spin_unlock(&journal->j_list_lock);
1085 		write_unlock(&journal->j_state_lock);
1086 		goto restart_loop;
1087 	}
1088 
1089 	/* Add the transaction to the checkpoint list
1090 	 * __journal_remove_checkpoint() can not destroy transaction
1091 	 * under us because it is not marked as T_FINISHED yet */
1092 	if (journal->j_checkpoint_transactions == NULL) {
1093 		journal->j_checkpoint_transactions = commit_transaction;
1094 		commit_transaction->t_cpnext = commit_transaction;
1095 		commit_transaction->t_cpprev = commit_transaction;
1096 	} else {
1097 		commit_transaction->t_cpnext =
1098 			journal->j_checkpoint_transactions;
1099 		commit_transaction->t_cpprev =
1100 			commit_transaction->t_cpnext->t_cpprev;
1101 		commit_transaction->t_cpnext->t_cpprev =
1102 			commit_transaction;
1103 		commit_transaction->t_cpprev->t_cpnext =
1104 				commit_transaction;
1105 	}
1106 	spin_unlock(&journal->j_list_lock);
1107 
1108 	/* Done with this transaction! */
1109 
1110 	jbd2_debug(3, "JBD2: commit phase 7\n");
1111 
1112 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1113 
1114 	commit_transaction->t_start = jiffies;
1115 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1116 					      commit_transaction->t_start);
1117 
1118 	/*
1119 	 * File the transaction statistics
1120 	 */
1121 	stats.ts_tid = commit_transaction->t_tid;
1122 	stats.run.rs_handle_count =
1123 		atomic_read(&commit_transaction->t_handle_count);
1124 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1125 			     commit_transaction->t_tid, &stats.run);
1126 	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1127 
1128 	commit_transaction->t_state = T_COMMIT_CALLBACK;
1129 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1130 	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
1131 	journal->j_committing_transaction = NULL;
1132 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1133 
1134 	/*
1135 	 * weight the commit time higher than the average time so we don't
1136 	 * react too strongly to vast changes in the commit time
1137 	 */
1138 	if (likely(journal->j_average_commit_time))
1139 		journal->j_average_commit_time = (commit_time +
1140 				journal->j_average_commit_time*3) / 4;
1141 	else
1142 		journal->j_average_commit_time = commit_time;
1143 
1144 	write_unlock(&journal->j_state_lock);
1145 
1146 	if (journal->j_commit_callback)
1147 		journal->j_commit_callback(journal, commit_transaction);
1148 	if (journal->j_fc_cleanup_callback)
1149 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
1150 
1151 	trace_jbd2_end_commit(journal, commit_transaction);
1152 	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
1153 		  journal->j_commit_sequence, journal->j_tail_sequence);
1154 
1155 	write_lock(&journal->j_state_lock);
1156 	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1157 	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1158 	spin_lock(&journal->j_list_lock);
1159 	commit_transaction->t_state = T_FINISHED;
1160 	/* Check if the transaction can be dropped now that we are finished */
1161 	if (commit_transaction->t_checkpoint_list == NULL) {
1162 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1163 		jbd2_journal_free_transaction(commit_transaction);
1164 	}
1165 	spin_unlock(&journal->j_list_lock);
1166 	write_unlock(&journal->j_state_lock);
1167 	wake_up(&journal->j_wait_done_commit);
1168 	wake_up(&journal->j_fc_wait);
1169 
1170 	/*
1171 	 * Calculate overall stats
1172 	 */
1173 	spin_lock(&journal->j_history_lock);
1174 	journal->j_stats.ts_tid++;
1175 	journal->j_stats.ts_requested += stats.ts_requested;
1176 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1177 	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1178 	journal->j_stats.run.rs_running += stats.run.rs_running;
1179 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1180 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1181 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1182 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1183 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1184 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1185 	spin_unlock(&journal->j_history_lock);
1186 }
1187