xref: /linux/fs/jbd2/commit.c (revision 7a13a2eef645f2d2e3018d6ea518f121b35a87c8)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * linux/fs/jbd2/commit.c
4  *
5  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6  *
7  * Copyright 1998 Red Hat corp --- All Rights Reserved
8  *
9  * Journal commit routines for the generic filesystem journaling code;
10  * part of the ext2fs journaling system.
11  */
12 
13 #include <linux/time.h>
14 #include <linux/fs.h>
15 #include <linux/jbd2.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/mm.h>
19 #include <linux/pagemap.h>
20 #include <linux/jiffies.h>
21 #include <linux/crc32.h>
22 #include <linux/writeback.h>
23 #include <linux/backing-dev.h>
24 #include <linux/bio.h>
25 #include <linux/blkdev.h>
26 #include <linux/bitops.h>
27 #include <trace/events/jbd2.h>
28 
29 /*
30  * IO end handler for temporary buffer_heads handling writes to the journal.
31  */
32 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
33 {
34 	struct buffer_head *orig_bh = bh->b_private;
35 
36 	BUFFER_TRACE(bh, "");
37 	if (uptodate)
38 		set_buffer_uptodate(bh);
39 	else
40 		clear_buffer_uptodate(bh);
41 	if (orig_bh) {
42 		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
43 		smp_mb__after_atomic();
44 		wake_up_bit(&orig_bh->b_state, BH_Shadow);
45 	}
46 	unlock_buffer(bh);
47 }
48 
49 /*
50  * When an ext4 file is truncated, it is possible that some pages are not
51  * successfully freed, because they are attached to a committing transaction.
52  * After the transaction commits, these pages are left on the LRU, with no
53  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
54  * by the VM, but their apparent absence upsets the VM accounting, and it makes
55  * the numbers in /proc/meminfo look odd.
56  *
57  * So here, we have a buffer which has just come off the forget list.  Look to
58  * see if we can strip all buffers from the backing page.
59  *
60  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
61  * caller provided us with a ref against the buffer, and we drop that here.
62  */
63 static void release_buffer_page(struct buffer_head *bh)
64 {
65 	struct folio *folio;
66 	struct page *page;
67 
68 	if (buffer_dirty(bh))
69 		goto nope;
70 	if (atomic_read(&bh->b_count) != 1)
71 		goto nope;
72 	page = bh->b_page;
73 	if (!page)
74 		goto nope;
75 	folio = page_folio(page);
76 	if (folio->mapping)
77 		goto nope;
78 
79 	/* OK, it's a truncated page */
80 	if (!folio_trylock(folio))
81 		goto nope;
82 
83 	folio_get(folio);
84 	__brelse(bh);
85 	try_to_free_buffers(folio);
86 	folio_unlock(folio);
87 	folio_put(folio);
88 	return;
89 
90 nope:
91 	__brelse(bh);
92 }
93 
94 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
95 {
96 	struct commit_header *h;
97 	__u32 csum;
98 
99 	if (!jbd2_journal_has_csum_v2or3(j))
100 		return;
101 
102 	h = (struct commit_header *)(bh->b_data);
103 	h->h_chksum_type = 0;
104 	h->h_chksum_size = 0;
105 	h->h_chksum[0] = 0;
106 	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
107 	h->h_chksum[0] = cpu_to_be32(csum);
108 }
109 
110 /*
111  * Done it all: now submit the commit record.  We should have
112  * cleaned up our previous buffers by now, so if we are in abort
113  * mode we can now just skip the rest of the journal write
114  * entirely.
115  *
116  * Returns 1 if the journal needs to be aborted or 0 on success
117  */
118 static int journal_submit_commit_record(journal_t *journal,
119 					transaction_t *commit_transaction,
120 					struct buffer_head **cbh,
121 					__u32 crc32_sum)
122 {
123 	struct commit_header *tmp;
124 	struct buffer_head *bh;
125 	struct timespec64 now;
126 	blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
127 
128 	*cbh = NULL;
129 
130 	if (is_journal_aborted(journal))
131 		return 0;
132 
133 	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
134 						JBD2_COMMIT_BLOCK);
135 	if (!bh)
136 		return 1;
137 
138 	tmp = (struct commit_header *)bh->b_data;
139 	ktime_get_coarse_real_ts64(&now);
140 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
141 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
142 
143 	if (jbd2_has_feature_checksum(journal)) {
144 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
145 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
146 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
147 	}
148 	jbd2_commit_block_csum_set(journal, bh);
149 
150 	BUFFER_TRACE(bh, "submit commit block");
151 	lock_buffer(bh);
152 	clear_buffer_dirty(bh);
153 	set_buffer_uptodate(bh);
154 	bh->b_end_io = journal_end_buffer_io_sync;
155 
156 	if (journal->j_flags & JBD2_BARRIER &&
157 	    !jbd2_has_feature_async_commit(journal))
158 		write_flags |= REQ_PREFLUSH | REQ_FUA;
159 
160 	submit_bh(write_flags, bh);
161 	*cbh = bh;
162 	return 0;
163 }
164 
165 /*
166  * This function along with journal_submit_commit_record
167  * allows to write the commit record asynchronously.
168  */
169 static int journal_wait_on_commit_record(journal_t *journal,
170 					 struct buffer_head *bh)
171 {
172 	int ret = 0;
173 
174 	clear_buffer_dirty(bh);
175 	wait_on_buffer(bh);
176 
177 	if (unlikely(!buffer_uptodate(bh)))
178 		ret = -EIO;
179 	put_bh(bh);            /* One for getblk() */
180 
181 	return ret;
182 }
183 
184 /*
185  * write the filemap data using writepage() address_space_operations.
186  * We don't do block allocation here even for delalloc. We don't
187  * use writepages() because with delayed allocation we may be doing
188  * block allocation in writepages().
189  */
190 int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
191 {
192 	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
193 	struct writeback_control wbc = {
194 		.sync_mode =  WB_SYNC_ALL,
195 		.nr_to_write = mapping->nrpages * 2,
196 		.range_start = jinode->i_dirty_start,
197 		.range_end = jinode->i_dirty_end,
198 	};
199 
200 	/*
201 	 * submit the inode data buffers. We use writepage
202 	 * instead of writepages. Because writepages can do
203 	 * block allocation with delalloc. We need to write
204 	 * only allocated blocks here.
205 	 */
206 	return generic_writepages(mapping, &wbc);
207 }
208 
209 /* Send all the data buffers related to an inode */
210 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
211 {
212 	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
213 		return 0;
214 
215 	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
216 	return journal->j_submit_inode_data_buffers(jinode);
217 
218 }
219 EXPORT_SYMBOL(jbd2_submit_inode_data);
220 
221 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
222 {
223 	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
224 		!jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
225 		return 0;
226 	return filemap_fdatawait_range_keep_errors(
227 		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
228 		jinode->i_dirty_end);
229 }
230 EXPORT_SYMBOL(jbd2_wait_inode_data);
231 
232 /*
233  * Submit all the data buffers of inode associated with the transaction to
234  * disk.
235  *
236  * We are in a committing transaction. Therefore no new inode can be added to
237  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
238  * operate on from being released while we write out pages.
239  */
240 static int journal_submit_data_buffers(journal_t *journal,
241 		transaction_t *commit_transaction)
242 {
243 	struct jbd2_inode *jinode;
244 	int err, ret = 0;
245 
246 	spin_lock(&journal->j_list_lock);
247 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
248 		if (!(jinode->i_flags & JI_WRITE_DATA))
249 			continue;
250 		jinode->i_flags |= JI_COMMIT_RUNNING;
251 		spin_unlock(&journal->j_list_lock);
252 		/* submit the inode data buffers. */
253 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
254 		if (journal->j_submit_inode_data_buffers) {
255 			err = journal->j_submit_inode_data_buffers(jinode);
256 			if (!ret)
257 				ret = err;
258 		}
259 		spin_lock(&journal->j_list_lock);
260 		J_ASSERT(jinode->i_transaction == commit_transaction);
261 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
262 		smp_mb();
263 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
264 	}
265 	spin_unlock(&journal->j_list_lock);
266 	return ret;
267 }
268 
269 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
270 {
271 	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
272 
273 	return filemap_fdatawait_range_keep_errors(mapping,
274 						   jinode->i_dirty_start,
275 						   jinode->i_dirty_end);
276 }
277 
278 /*
279  * Wait for data submitted for writeout, refile inodes to proper
280  * transaction if needed.
281  *
282  */
283 static int journal_finish_inode_data_buffers(journal_t *journal,
284 		transaction_t *commit_transaction)
285 {
286 	struct jbd2_inode *jinode, *next_i;
287 	int err, ret = 0;
288 
289 	/* For locking, see the comment in journal_submit_data_buffers() */
290 	spin_lock(&journal->j_list_lock);
291 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
292 		if (!(jinode->i_flags & JI_WAIT_DATA))
293 			continue;
294 		jinode->i_flags |= JI_COMMIT_RUNNING;
295 		spin_unlock(&journal->j_list_lock);
296 		/* wait for the inode data buffers writeout. */
297 		if (journal->j_finish_inode_data_buffers) {
298 			err = journal->j_finish_inode_data_buffers(jinode);
299 			if (!ret)
300 				ret = err;
301 		}
302 		spin_lock(&journal->j_list_lock);
303 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
304 		smp_mb();
305 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
306 	}
307 
308 	/* Now refile inode to proper lists */
309 	list_for_each_entry_safe(jinode, next_i,
310 				 &commit_transaction->t_inode_list, i_list) {
311 		list_del(&jinode->i_list);
312 		if (jinode->i_next_transaction) {
313 			jinode->i_transaction = jinode->i_next_transaction;
314 			jinode->i_next_transaction = NULL;
315 			list_add(&jinode->i_list,
316 				&jinode->i_transaction->t_inode_list);
317 		} else {
318 			jinode->i_transaction = NULL;
319 			jinode->i_dirty_start = 0;
320 			jinode->i_dirty_end = 0;
321 		}
322 	}
323 	spin_unlock(&journal->j_list_lock);
324 
325 	return ret;
326 }
327 
328 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
329 {
330 	struct page *page = bh->b_page;
331 	char *addr;
332 	__u32 checksum;
333 
334 	addr = kmap_atomic(page);
335 	checksum = crc32_be(crc32_sum,
336 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
337 	kunmap_atomic(addr);
338 
339 	return checksum;
340 }
341 
342 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
343 				   unsigned long long block)
344 {
345 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
346 	if (jbd2_has_feature_64bit(j))
347 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
348 }
349 
350 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
351 				    struct buffer_head *bh, __u32 sequence)
352 {
353 	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
354 	struct page *page = bh->b_page;
355 	__u8 *addr;
356 	__u32 csum32;
357 	__be32 seq;
358 
359 	if (!jbd2_journal_has_csum_v2or3(j))
360 		return;
361 
362 	seq = cpu_to_be32(sequence);
363 	addr = kmap_atomic(page);
364 	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
365 	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
366 			     bh->b_size);
367 	kunmap_atomic(addr);
368 
369 	if (jbd2_has_feature_csum3(j))
370 		tag3->t_checksum = cpu_to_be32(csum32);
371 	else
372 		tag->t_checksum = cpu_to_be16(csum32);
373 }
374 /*
375  * jbd2_journal_commit_transaction
376  *
377  * The primary function for committing a transaction to the log.  This
378  * function is called by the journal thread to begin a complete commit.
379  */
380 void jbd2_journal_commit_transaction(journal_t *journal)
381 {
382 	struct transaction_stats_s stats;
383 	transaction_t *commit_transaction;
384 	struct journal_head *jh;
385 	struct buffer_head *descriptor;
386 	struct buffer_head **wbuf = journal->j_wbuf;
387 	int bufs;
388 	int flags;
389 	int err;
390 	unsigned long long blocknr;
391 	ktime_t start_time;
392 	u64 commit_time;
393 	char *tagp = NULL;
394 	journal_block_tag_t *tag = NULL;
395 	int space_left = 0;
396 	int first_tag = 0;
397 	int tag_flag;
398 	int i;
399 	int tag_bytes = journal_tag_bytes(journal);
400 	struct buffer_head *cbh = NULL; /* For transactional checksums */
401 	__u32 crc32_sum = ~0;
402 	struct blk_plug plug;
403 	/* Tail of the journal */
404 	unsigned long first_block;
405 	tid_t first_tid;
406 	int update_tail;
407 	int csum_size = 0;
408 	LIST_HEAD(io_bufs);
409 	LIST_HEAD(log_bufs);
410 
411 	if (jbd2_journal_has_csum_v2or3(journal))
412 		csum_size = sizeof(struct jbd2_journal_block_tail);
413 
414 	/*
415 	 * First job: lock down the current transaction and wait for
416 	 * all outstanding updates to complete.
417 	 */
418 
419 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
420 	if (journal->j_flags & JBD2_FLUSHED) {
421 		jbd2_debug(3, "super block updated\n");
422 		mutex_lock_io(&journal->j_checkpoint_mutex);
423 		/*
424 		 * We hold j_checkpoint_mutex so tail cannot change under us.
425 		 * We don't need any special data guarantees for writing sb
426 		 * since journal is empty and it is ok for write to be
427 		 * flushed only with transaction commit.
428 		 */
429 		jbd2_journal_update_sb_log_tail(journal,
430 						journal->j_tail_sequence,
431 						journal->j_tail,
432 						REQ_SYNC);
433 		mutex_unlock(&journal->j_checkpoint_mutex);
434 	} else {
435 		jbd2_debug(3, "superblock not updated\n");
436 	}
437 
438 	J_ASSERT(journal->j_running_transaction != NULL);
439 	J_ASSERT(journal->j_committing_transaction == NULL);
440 
441 	write_lock(&journal->j_state_lock);
442 	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
443 	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
444 		DEFINE_WAIT(wait);
445 
446 		prepare_to_wait(&journal->j_fc_wait, &wait,
447 				TASK_UNINTERRUPTIBLE);
448 		write_unlock(&journal->j_state_lock);
449 		schedule();
450 		write_lock(&journal->j_state_lock);
451 		finish_wait(&journal->j_fc_wait, &wait);
452 		/*
453 		 * TODO: by blocking fast commits here, we are increasing
454 		 * fsync() latency slightly. Strictly speaking, we don't need
455 		 * to block fast commits until the transaction enters T_FLUSH
456 		 * state. So an optimization is possible where we block new fast
457 		 * commits here and wait for existing ones to complete
458 		 * just before we enter T_FLUSH. That way, the existing fast
459 		 * commits and this full commit can proceed parallely.
460 		 */
461 	}
462 	write_unlock(&journal->j_state_lock);
463 
464 	commit_transaction = journal->j_running_transaction;
465 
466 	trace_jbd2_start_commit(journal, commit_transaction);
467 	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
468 			commit_transaction->t_tid);
469 
470 	write_lock(&journal->j_state_lock);
471 	journal->j_fc_off = 0;
472 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
473 	commit_transaction->t_state = T_LOCKED;
474 
475 	trace_jbd2_commit_locking(journal, commit_transaction);
476 	stats.run.rs_wait = commit_transaction->t_max_wait;
477 	stats.run.rs_request_delay = 0;
478 	stats.run.rs_locked = jiffies;
479 	if (commit_transaction->t_requested)
480 		stats.run.rs_request_delay =
481 			jbd2_time_diff(commit_transaction->t_requested,
482 				       stats.run.rs_locked);
483 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
484 					      stats.run.rs_locked);
485 
486 	// waits for any t_updates to finish
487 	jbd2_journal_wait_updates(journal);
488 
489 	commit_transaction->t_state = T_SWITCH;
490 
491 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
492 			journal->j_max_transaction_buffers);
493 
494 	/*
495 	 * First thing we are allowed to do is to discard any remaining
496 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
497 	 * that there are no such buffers: if a large filesystem
498 	 * operation like a truncate needs to split itself over multiple
499 	 * transactions, then it may try to do a jbd2_journal_restart() while
500 	 * there are still BJ_Reserved buffers outstanding.  These must
501 	 * be released cleanly from the current transaction.
502 	 *
503 	 * In this case, the filesystem must still reserve write access
504 	 * again before modifying the buffer in the new transaction, but
505 	 * we do not require it to remember exactly which old buffers it
506 	 * has reserved.  This is consistent with the existing behaviour
507 	 * that multiple jbd2_journal_get_write_access() calls to the same
508 	 * buffer are perfectly permissible.
509 	 * We use journal->j_state_lock here to serialize processing of
510 	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
511 	 */
512 	while (commit_transaction->t_reserved_list) {
513 		jh = commit_transaction->t_reserved_list;
514 		JBUFFER_TRACE(jh, "reserved, unused: refile");
515 		/*
516 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
517 		 * leave undo-committed data.
518 		 */
519 		if (jh->b_committed_data) {
520 			struct buffer_head *bh = jh2bh(jh);
521 
522 			spin_lock(&jh->b_state_lock);
523 			jbd2_free(jh->b_committed_data, bh->b_size);
524 			jh->b_committed_data = NULL;
525 			spin_unlock(&jh->b_state_lock);
526 		}
527 		jbd2_journal_refile_buffer(journal, jh);
528 	}
529 
530 	write_unlock(&journal->j_state_lock);
531 	/*
532 	 * Now try to drop any written-back buffers from the journal's
533 	 * checkpoint lists.  We do this *before* commit because it potentially
534 	 * frees some memory
535 	 */
536 	spin_lock(&journal->j_list_lock);
537 	__jbd2_journal_clean_checkpoint_list(journal, false);
538 	spin_unlock(&journal->j_list_lock);
539 
540 	jbd2_debug(3, "JBD2: commit phase 1\n");
541 
542 	/*
543 	 * Clear revoked flag to reflect there is no revoked buffers
544 	 * in the next transaction which is going to be started.
545 	 */
546 	jbd2_clear_buffer_revoked_flags(journal);
547 
548 	/*
549 	 * Switch to a new revoke table.
550 	 */
551 	jbd2_journal_switch_revoke_table(journal);
552 
553 	write_lock(&journal->j_state_lock);
554 	/*
555 	 * Reserved credits cannot be claimed anymore, free them
556 	 */
557 	atomic_sub(atomic_read(&journal->j_reserved_credits),
558 		   &commit_transaction->t_outstanding_credits);
559 
560 	trace_jbd2_commit_flushing(journal, commit_transaction);
561 	stats.run.rs_flushing = jiffies;
562 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
563 					     stats.run.rs_flushing);
564 
565 	commit_transaction->t_state = T_FLUSH;
566 	journal->j_committing_transaction = commit_transaction;
567 	journal->j_running_transaction = NULL;
568 	start_time = ktime_get();
569 	commit_transaction->t_log_start = journal->j_head;
570 	wake_up_all(&journal->j_wait_transaction_locked);
571 	write_unlock(&journal->j_state_lock);
572 
573 	jbd2_debug(3, "JBD2: commit phase 2a\n");
574 
575 	/*
576 	 * Now start flushing things to disk, in the order they appear
577 	 * on the transaction lists.  Data blocks go first.
578 	 */
579 	err = journal_submit_data_buffers(journal, commit_transaction);
580 	if (err)
581 		jbd2_journal_abort(journal, err);
582 
583 	blk_start_plug(&plug);
584 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
585 
586 	jbd2_debug(3, "JBD2: commit phase 2b\n");
587 
588 	/*
589 	 * Way to go: we have now written out all of the data for a
590 	 * transaction!  Now comes the tricky part: we need to write out
591 	 * metadata.  Loop over the transaction's entire buffer list:
592 	 */
593 	write_lock(&journal->j_state_lock);
594 	commit_transaction->t_state = T_COMMIT;
595 	write_unlock(&journal->j_state_lock);
596 
597 	trace_jbd2_commit_logging(journal, commit_transaction);
598 	stats.run.rs_logging = jiffies;
599 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
600 					       stats.run.rs_logging);
601 	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
602 	stats.run.rs_blocks_logged = 0;
603 
604 	J_ASSERT(commit_transaction->t_nr_buffers <=
605 		 atomic_read(&commit_transaction->t_outstanding_credits));
606 
607 	err = 0;
608 	bufs = 0;
609 	descriptor = NULL;
610 	while (commit_transaction->t_buffers) {
611 
612 		/* Find the next buffer to be journaled... */
613 
614 		jh = commit_transaction->t_buffers;
615 
616 		/* If we're in abort mode, we just un-journal the buffer and
617 		   release it. */
618 
619 		if (is_journal_aborted(journal)) {
620 			clear_buffer_jbddirty(jh2bh(jh));
621 			JBUFFER_TRACE(jh, "journal is aborting: refile");
622 			jbd2_buffer_abort_trigger(jh,
623 						  jh->b_frozen_data ?
624 						  jh->b_frozen_triggers :
625 						  jh->b_triggers);
626 			jbd2_journal_refile_buffer(journal, jh);
627 			/* If that was the last one, we need to clean up
628 			 * any descriptor buffers which may have been
629 			 * already allocated, even if we are now
630 			 * aborting. */
631 			if (!commit_transaction->t_buffers)
632 				goto start_journal_io;
633 			continue;
634 		}
635 
636 		/* Make sure we have a descriptor block in which to
637 		   record the metadata buffer. */
638 
639 		if (!descriptor) {
640 			J_ASSERT (bufs == 0);
641 
642 			jbd2_debug(4, "JBD2: get descriptor\n");
643 
644 			descriptor = jbd2_journal_get_descriptor_buffer(
645 							commit_transaction,
646 							JBD2_DESCRIPTOR_BLOCK);
647 			if (!descriptor) {
648 				jbd2_journal_abort(journal, -EIO);
649 				continue;
650 			}
651 
652 			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
653 				(unsigned long long)descriptor->b_blocknr,
654 				descriptor->b_data);
655 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
656 			space_left = descriptor->b_size -
657 						sizeof(journal_header_t);
658 			first_tag = 1;
659 			set_buffer_jwrite(descriptor);
660 			set_buffer_dirty(descriptor);
661 			wbuf[bufs++] = descriptor;
662 
663 			/* Record it so that we can wait for IO
664                            completion later */
665 			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
666 			jbd2_file_log_bh(&log_bufs, descriptor);
667 		}
668 
669 		/* Where is the buffer to be written? */
670 
671 		err = jbd2_journal_next_log_block(journal, &blocknr);
672 		/* If the block mapping failed, just abandon the buffer
673 		   and repeat this loop: we'll fall into the
674 		   refile-on-abort condition above. */
675 		if (err) {
676 			jbd2_journal_abort(journal, err);
677 			continue;
678 		}
679 
680 		/*
681 		 * start_this_handle() uses t_outstanding_credits to determine
682 		 * the free space in the log.
683 		 */
684 		atomic_dec(&commit_transaction->t_outstanding_credits);
685 
686 		/* Bump b_count to prevent truncate from stumbling over
687                    the shadowed buffer!  @@@ This can go if we ever get
688                    rid of the shadow pairing of buffers. */
689 		atomic_inc(&jh2bh(jh)->b_count);
690 
691 		/*
692 		 * Make a temporary IO buffer with which to write it out
693 		 * (this will requeue the metadata buffer to BJ_Shadow).
694 		 */
695 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
696 		JBUFFER_TRACE(jh, "ph3: write metadata");
697 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
698 						jh, &wbuf[bufs], blocknr);
699 		if (flags < 0) {
700 			jbd2_journal_abort(journal, flags);
701 			continue;
702 		}
703 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
704 
705 		/* Record the new block's tag in the current descriptor
706                    buffer */
707 
708 		tag_flag = 0;
709 		if (flags & 1)
710 			tag_flag |= JBD2_FLAG_ESCAPE;
711 		if (!first_tag)
712 			tag_flag |= JBD2_FLAG_SAME_UUID;
713 
714 		tag = (journal_block_tag_t *) tagp;
715 		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
716 		tag->t_flags = cpu_to_be16(tag_flag);
717 		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
718 					commit_transaction->t_tid);
719 		tagp += tag_bytes;
720 		space_left -= tag_bytes;
721 		bufs++;
722 
723 		if (first_tag) {
724 			memcpy (tagp, journal->j_uuid, 16);
725 			tagp += 16;
726 			space_left -= 16;
727 			first_tag = 0;
728 		}
729 
730 		/* If there's no more to do, or if the descriptor is full,
731 		   let the IO rip! */
732 
733 		if (bufs == journal->j_wbufsize ||
734 		    commit_transaction->t_buffers == NULL ||
735 		    space_left < tag_bytes + 16 + csum_size) {
736 
737 			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
738 
739 			/* Write an end-of-descriptor marker before
740                            submitting the IOs.  "tag" still points to
741                            the last tag we set up. */
742 
743 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
744 start_journal_io:
745 			if (descriptor)
746 				jbd2_descriptor_block_csum_set(journal,
747 							descriptor);
748 
749 			for (i = 0; i < bufs; i++) {
750 				struct buffer_head *bh = wbuf[i];
751 				/*
752 				 * Compute checksum.
753 				 */
754 				if (jbd2_has_feature_checksum(journal)) {
755 					crc32_sum =
756 					    jbd2_checksum_data(crc32_sum, bh);
757 				}
758 
759 				lock_buffer(bh);
760 				clear_buffer_dirty(bh);
761 				set_buffer_uptodate(bh);
762 				bh->b_end_io = journal_end_buffer_io_sync;
763 				submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
764 			}
765 			cond_resched();
766 
767 			/* Force a new descriptor to be generated next
768                            time round the loop. */
769 			descriptor = NULL;
770 			bufs = 0;
771 		}
772 	}
773 
774 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
775 	if (err) {
776 		printk(KERN_WARNING
777 			"JBD2: Detected IO errors while flushing file data "
778 		       "on %s\n", journal->j_devname);
779 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
780 			jbd2_journal_abort(journal, err);
781 		err = 0;
782 	}
783 
784 	/*
785 	 * Get current oldest transaction in the log before we issue flush
786 	 * to the filesystem device. After the flush we can be sure that
787 	 * blocks of all older transactions are checkpointed to persistent
788 	 * storage and we will be safe to update journal start in the
789 	 * superblock with the numbers we get here.
790 	 */
791 	update_tail =
792 		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
793 
794 	write_lock(&journal->j_state_lock);
795 	if (update_tail) {
796 		long freed = first_block - journal->j_tail;
797 
798 		if (first_block < journal->j_tail)
799 			freed += journal->j_last - journal->j_first;
800 		/* Update tail only if we free significant amount of space */
801 		if (freed < jbd2_journal_get_max_txn_bufs(journal))
802 			update_tail = 0;
803 	}
804 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
805 	commit_transaction->t_state = T_COMMIT_DFLUSH;
806 	write_unlock(&journal->j_state_lock);
807 
808 	/*
809 	 * If the journal is not located on the file system device,
810 	 * then we must flush the file system device before we issue
811 	 * the commit record
812 	 */
813 	if (commit_transaction->t_need_data_flush &&
814 	    (journal->j_fs_dev != journal->j_dev) &&
815 	    (journal->j_flags & JBD2_BARRIER))
816 		blkdev_issue_flush(journal->j_fs_dev);
817 
818 	/* Done it all: now write the commit record asynchronously. */
819 	if (jbd2_has_feature_async_commit(journal)) {
820 		err = journal_submit_commit_record(journal, commit_transaction,
821 						 &cbh, crc32_sum);
822 		if (err)
823 			jbd2_journal_abort(journal, err);
824 	}
825 
826 	blk_finish_plug(&plug);
827 
828 	/* Lo and behold: we have just managed to send a transaction to
829            the log.  Before we can commit it, wait for the IO so far to
830            complete.  Control buffers being written are on the
831            transaction's t_log_list queue, and metadata buffers are on
832            the io_bufs list.
833 
834 	   Wait for the buffers in reverse order.  That way we are
835 	   less likely to be woken up until all IOs have completed, and
836 	   so we incur less scheduling load.
837 	*/
838 
839 	jbd2_debug(3, "JBD2: commit phase 3\n");
840 
841 	while (!list_empty(&io_bufs)) {
842 		struct buffer_head *bh = list_entry(io_bufs.prev,
843 						    struct buffer_head,
844 						    b_assoc_buffers);
845 
846 		wait_on_buffer(bh);
847 		cond_resched();
848 
849 		if (unlikely(!buffer_uptodate(bh)))
850 			err = -EIO;
851 		jbd2_unfile_log_bh(bh);
852 		stats.run.rs_blocks_logged++;
853 
854 		/*
855 		 * The list contains temporary buffer heads created by
856 		 * jbd2_journal_write_metadata_buffer().
857 		 */
858 		BUFFER_TRACE(bh, "dumping temporary bh");
859 		__brelse(bh);
860 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
861 		free_buffer_head(bh);
862 
863 		/* We also have to refile the corresponding shadowed buffer */
864 		jh = commit_transaction->t_shadow_list->b_tprev;
865 		bh = jh2bh(jh);
866 		clear_buffer_jwrite(bh);
867 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
868 		J_ASSERT_BH(bh, !buffer_shadow(bh));
869 
870 		/* The metadata is now released for reuse, but we need
871                    to remember it against this transaction so that when
872                    we finally commit, we can do any checkpointing
873                    required. */
874 		JBUFFER_TRACE(jh, "file as BJ_Forget");
875 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
876 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
877 		__brelse(bh);
878 	}
879 
880 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
881 
882 	jbd2_debug(3, "JBD2: commit phase 4\n");
883 
884 	/* Here we wait for the revoke record and descriptor record buffers */
885 	while (!list_empty(&log_bufs)) {
886 		struct buffer_head *bh;
887 
888 		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
889 		wait_on_buffer(bh);
890 		cond_resched();
891 
892 		if (unlikely(!buffer_uptodate(bh)))
893 			err = -EIO;
894 
895 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
896 		clear_buffer_jwrite(bh);
897 		jbd2_unfile_log_bh(bh);
898 		stats.run.rs_blocks_logged++;
899 		__brelse(bh);		/* One for getblk */
900 		/* AKPM: bforget here */
901 	}
902 
903 	if (err)
904 		jbd2_journal_abort(journal, err);
905 
906 	jbd2_debug(3, "JBD2: commit phase 5\n");
907 	write_lock(&journal->j_state_lock);
908 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
909 	commit_transaction->t_state = T_COMMIT_JFLUSH;
910 	write_unlock(&journal->j_state_lock);
911 
912 	if (!jbd2_has_feature_async_commit(journal)) {
913 		err = journal_submit_commit_record(journal, commit_transaction,
914 						&cbh, crc32_sum);
915 		if (err)
916 			jbd2_journal_abort(journal, err);
917 	}
918 	if (cbh)
919 		err = journal_wait_on_commit_record(journal, cbh);
920 	stats.run.rs_blocks_logged++;
921 	if (jbd2_has_feature_async_commit(journal) &&
922 	    journal->j_flags & JBD2_BARRIER) {
923 		blkdev_issue_flush(journal->j_dev);
924 	}
925 
926 	if (err)
927 		jbd2_journal_abort(journal, err);
928 
929 	WARN_ON_ONCE(
930 		atomic_read(&commit_transaction->t_outstanding_credits) < 0);
931 
932 	/*
933 	 * Now disk caches for filesystem device are flushed so we are safe to
934 	 * erase checkpointed transactions from the log by updating journal
935 	 * superblock.
936 	 */
937 	if (update_tail)
938 		jbd2_update_log_tail(journal, first_tid, first_block);
939 
940 	/* End of a transaction!  Finally, we can do checkpoint
941            processing: any buffers committed as a result of this
942            transaction can be removed from any checkpoint list it was on
943            before. */
944 
945 	jbd2_debug(3, "JBD2: commit phase 6\n");
946 
947 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
948 	J_ASSERT(commit_transaction->t_buffers == NULL);
949 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
950 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
951 
952 restart_loop:
953 	/*
954 	 * As there are other places (journal_unmap_buffer()) adding buffers
955 	 * to this list we have to be careful and hold the j_list_lock.
956 	 */
957 	spin_lock(&journal->j_list_lock);
958 	while (commit_transaction->t_forget) {
959 		transaction_t *cp_transaction;
960 		struct buffer_head *bh;
961 		int try_to_free = 0;
962 		bool drop_ref;
963 
964 		jh = commit_transaction->t_forget;
965 		spin_unlock(&journal->j_list_lock);
966 		bh = jh2bh(jh);
967 		/*
968 		 * Get a reference so that bh cannot be freed before we are
969 		 * done with it.
970 		 */
971 		get_bh(bh);
972 		spin_lock(&jh->b_state_lock);
973 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
974 
975 		/*
976 		 * If there is undo-protected committed data against
977 		 * this buffer, then we can remove it now.  If it is a
978 		 * buffer needing such protection, the old frozen_data
979 		 * field now points to a committed version of the
980 		 * buffer, so rotate that field to the new committed
981 		 * data.
982 		 *
983 		 * Otherwise, we can just throw away the frozen data now.
984 		 *
985 		 * We also know that the frozen data has already fired
986 		 * its triggers if they exist, so we can clear that too.
987 		 */
988 		if (jh->b_committed_data) {
989 			jbd2_free(jh->b_committed_data, bh->b_size);
990 			jh->b_committed_data = NULL;
991 			if (jh->b_frozen_data) {
992 				jh->b_committed_data = jh->b_frozen_data;
993 				jh->b_frozen_data = NULL;
994 				jh->b_frozen_triggers = NULL;
995 			}
996 		} else if (jh->b_frozen_data) {
997 			jbd2_free(jh->b_frozen_data, bh->b_size);
998 			jh->b_frozen_data = NULL;
999 			jh->b_frozen_triggers = NULL;
1000 		}
1001 
1002 		spin_lock(&journal->j_list_lock);
1003 		cp_transaction = jh->b_cp_transaction;
1004 		if (cp_transaction) {
1005 			JBUFFER_TRACE(jh, "remove from old cp transaction");
1006 			cp_transaction->t_chp_stats.cs_dropped++;
1007 			__jbd2_journal_remove_checkpoint(jh);
1008 		}
1009 
1010 		/* Only re-checkpoint the buffer_head if it is marked
1011 		 * dirty.  If the buffer was added to the BJ_Forget list
1012 		 * by jbd2_journal_forget, it may no longer be dirty and
1013 		 * there's no point in keeping a checkpoint record for
1014 		 * it. */
1015 
1016 		/*
1017 		 * A buffer which has been freed while still being journaled
1018 		 * by a previous transaction, refile the buffer to BJ_Forget of
1019 		 * the running transaction. If the just committed transaction
1020 		 * contains "add to orphan" operation, we can completely
1021 		 * invalidate the buffer now. We are rather thorough in that
1022 		 * since the buffer may be still accessible when blocksize <
1023 		 * pagesize and it is attached to the last partial page.
1024 		 */
1025 		if (buffer_freed(bh) && !jh->b_next_transaction) {
1026 			struct address_space *mapping;
1027 
1028 			clear_buffer_freed(bh);
1029 			clear_buffer_jbddirty(bh);
1030 
1031 			/*
1032 			 * Block device buffers need to stay mapped all the
1033 			 * time, so it is enough to clear buffer_jbddirty and
1034 			 * buffer_freed bits. For the file mapping buffers (i.e.
1035 			 * journalled data) we need to unmap buffer and clear
1036 			 * more bits. We also need to be careful about the check
1037 			 * because the data page mapping can get cleared under
1038 			 * our hands. Note that if mapping == NULL, we don't
1039 			 * need to make buffer unmapped because the page is
1040 			 * already detached from the mapping and buffers cannot
1041 			 * get reused.
1042 			 */
1043 			mapping = READ_ONCE(bh->b_page->mapping);
1044 			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
1045 				clear_buffer_mapped(bh);
1046 				clear_buffer_new(bh);
1047 				clear_buffer_req(bh);
1048 				bh->b_bdev = NULL;
1049 			}
1050 		}
1051 
1052 		if (buffer_jbddirty(bh)) {
1053 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1054 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1055 			if (is_journal_aborted(journal))
1056 				clear_buffer_jbddirty(bh);
1057 		} else {
1058 			J_ASSERT_BH(bh, !buffer_dirty(bh));
1059 			/*
1060 			 * The buffer on BJ_Forget list and not jbddirty means
1061 			 * it has been freed by this transaction and hence it
1062 			 * could not have been reallocated until this
1063 			 * transaction has committed. *BUT* it could be
1064 			 * reallocated once we have written all the data to
1065 			 * disk and before we process the buffer on BJ_Forget
1066 			 * list.
1067 			 */
1068 			if (!jh->b_next_transaction)
1069 				try_to_free = 1;
1070 		}
1071 		JBUFFER_TRACE(jh, "refile or unfile buffer");
1072 		drop_ref = __jbd2_journal_refile_buffer(jh);
1073 		spin_unlock(&jh->b_state_lock);
1074 		if (drop_ref)
1075 			jbd2_journal_put_journal_head(jh);
1076 		if (try_to_free)
1077 			release_buffer_page(bh);	/* Drops bh reference */
1078 		else
1079 			__brelse(bh);
1080 		cond_resched_lock(&journal->j_list_lock);
1081 	}
1082 	spin_unlock(&journal->j_list_lock);
1083 	/*
1084 	 * This is a bit sleazy.  We use j_list_lock to protect transition
1085 	 * of a transaction into T_FINISHED state and calling
1086 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1087 	 * other checkpointing code processing the transaction...
1088 	 */
1089 	write_lock(&journal->j_state_lock);
1090 	spin_lock(&journal->j_list_lock);
1091 	/*
1092 	 * Now recheck if some buffers did not get attached to the transaction
1093 	 * while the lock was dropped...
1094 	 */
1095 	if (commit_transaction->t_forget) {
1096 		spin_unlock(&journal->j_list_lock);
1097 		write_unlock(&journal->j_state_lock);
1098 		goto restart_loop;
1099 	}
1100 
1101 	/* Add the transaction to the checkpoint list.
1102 	 * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1103 	 * under us because it is not marked as T_FINISHED yet */
1104 	if (journal->j_checkpoint_transactions == NULL) {
1105 		journal->j_checkpoint_transactions = commit_transaction;
1106 		commit_transaction->t_cpnext = commit_transaction;
1107 		commit_transaction->t_cpprev = commit_transaction;
1108 	} else {
1109 		commit_transaction->t_cpnext =
1110 			journal->j_checkpoint_transactions;
1111 		commit_transaction->t_cpprev =
1112 			commit_transaction->t_cpnext->t_cpprev;
1113 		commit_transaction->t_cpnext->t_cpprev =
1114 			commit_transaction;
1115 		commit_transaction->t_cpprev->t_cpnext =
1116 				commit_transaction;
1117 	}
1118 	spin_unlock(&journal->j_list_lock);
1119 
1120 	/* Done with this transaction! */
1121 
1122 	jbd2_debug(3, "JBD2: commit phase 7\n");
1123 
1124 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1125 
1126 	commit_transaction->t_start = jiffies;
1127 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1128 					      commit_transaction->t_start);
1129 
1130 	/*
1131 	 * File the transaction statistics
1132 	 */
1133 	stats.ts_tid = commit_transaction->t_tid;
1134 	stats.run.rs_handle_count =
1135 		atomic_read(&commit_transaction->t_handle_count);
1136 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1137 			     commit_transaction->t_tid, &stats.run);
1138 	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1139 
1140 	commit_transaction->t_state = T_COMMIT_CALLBACK;
1141 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1142 	journal->j_commit_sequence = commit_transaction->t_tid;
1143 	journal->j_committing_transaction = NULL;
1144 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1145 
1146 	/*
1147 	 * weight the commit time higher than the average time so we don't
1148 	 * react too strongly to vast changes in the commit time
1149 	 */
1150 	if (likely(journal->j_average_commit_time))
1151 		journal->j_average_commit_time = (commit_time +
1152 				journal->j_average_commit_time*3) / 4;
1153 	else
1154 		journal->j_average_commit_time = commit_time;
1155 
1156 	write_unlock(&journal->j_state_lock);
1157 
1158 	if (journal->j_commit_callback)
1159 		journal->j_commit_callback(journal, commit_transaction);
1160 	if (journal->j_fc_cleanup_callback)
1161 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
1162 
1163 	trace_jbd2_end_commit(journal, commit_transaction);
1164 	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
1165 		  journal->j_commit_sequence, journal->j_tail_sequence);
1166 
1167 	write_lock(&journal->j_state_lock);
1168 	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1169 	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
1170 	spin_lock(&journal->j_list_lock);
1171 	commit_transaction->t_state = T_FINISHED;
1172 	/* Check if the transaction can be dropped now that we are finished */
1173 	if (commit_transaction->t_checkpoint_list == NULL &&
1174 	    commit_transaction->t_checkpoint_io_list == NULL) {
1175 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1176 		jbd2_journal_free_transaction(commit_transaction);
1177 	}
1178 	spin_unlock(&journal->j_list_lock);
1179 	write_unlock(&journal->j_state_lock);
1180 	wake_up(&journal->j_wait_done_commit);
1181 	wake_up(&journal->j_fc_wait);
1182 
1183 	/*
1184 	 * Calculate overall stats
1185 	 */
1186 	spin_lock(&journal->j_history_lock);
1187 	journal->j_stats.ts_tid++;
1188 	journal->j_stats.ts_requested += stats.ts_requested;
1189 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1190 	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1191 	journal->j_stats.run.rs_running += stats.run.rs_running;
1192 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1193 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1194 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1195 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1196 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1197 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1198 	spin_unlock(&journal->j_history_lock);
1199 }
1200