/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>
#include <asm/system.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}
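
/*
 * Typical submission pairing for the handler above (a sketch; the same
 * pattern appears in the commit paths below):
 *
 *	lock_buffer(bh);
 *	clear_buffer_dirty(bh);
 *	set_buffer_uptodate(bh);
 *	bh->b_end_io = journal_end_buffer_io_sync;
 *	submit_bh(WRITE_SYNC, bh);
 *
 * On completion the handler records the IO result in BH_Uptodate and
 * unlocks the buffer, which is what wait_on_buffer() callers test.
 */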

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

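	/*
	 * Pin the page, then drop the buffer reference we were given:
	 * try_to_free_buffers() can only strip buffers whose b_count has
	 * dropped to zero.
	 */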
	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

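	/*
	 * With barriers enabled and synchronous commits, issue the commit
	 * block with flush/FUA so that it reaches stable storage only after
	 * all previously submitted journal IO.  Async-commit mode instead
	 * relies on the checksum above to detect an incomplete commit on
	 * replay.
	 */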
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(),
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
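
/*
 * Usage sketch: with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT, the commit path
 * calls journal_submit_commit_record() early, finishes the rest of the
 * journal IO, and only then reaps the commit block here, so the two
 * writes overlap.
 */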

/*
 * Write the filemap data using the writepage() address_space operation.
 * We don't do block allocation here, even for delalloc: we don't use
 * writepages() because, with delayed allocation, writepages() may end
 * up doing block allocation.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

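	/*
	 * WB_SYNC_ALL asks for data-integrity writeback; the range is capped
	 * at i_size, and nr_to_write is presumably sized at twice nrpages to
	 * leave writeback ample headroom to cover the whole mapping.
	 */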
	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction, so no new inode can be added to our
 * inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out its
 * pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages, because writepages can do
		 * block allocation with delalloc and we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that the user process can get -EIO from
			 * fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
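
/*
 * Checksum chaining sketch: the commit code below seeds crc32_sum with
 * ~0 and folds in each journal block as it is submitted:
 *
 *	__u32 crc = ~0;
 *	crc = jbd2_checksum_data(crc, bh);	(once per wbuf[] entry)
 *
 * The final value is written into the commit block's h_chksum[0].
 */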

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
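
/*
 * Worked example: with 64-bit tags, block 0x123456789 is stored as
 * t_blocknr = 0x23456789 and t_blocknr_high = 0x1.  The two-step
 * (block >> 31) >> 1 avoids a single shift by 32, which would be
 * undefined if the block number were ever a 32-bit type.
 */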

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
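
	/*
	 * The loop above is the standard waitqueue idiom: prepare_to_wait()
	 * queues us on j_wait_updates before t_updates is re-checked, so the
	 * wakeup from the final jbd2_journal_stop() cannot be lost between
	 * the check and schedule().
	 */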

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction, which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
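
		/*
		 * Descriptor block layout so far (sketch):
		 *
		 *	journal_header_t | tag0 | 16-byte UUID | tag1 | tag2 ...
		 *
		 * Only the first tag is followed by the UUID; later tags set
		 * JBD2_FLAG_SAME_UUID instead, which is why the space check
		 * below must keep room for tag_bytes + 16.
		 */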

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
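	/*
	 * Example of the wrap-around arithmetic above: with j_first == 1,
	 * j_last == 8193, j_tail == 8000 and first_block == 100, freed is
	 * (100 - 8000) + (8193 - 1) = 292 blocks.
	 */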
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
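
	/*
	 * Each temporary BJ_IO buffer reaped above was paired with one
	 * shadowed metadata buffer, and the two lists were drained in
	 * lockstep, so the shadow list must now be empty as well.
	 */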

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a
		 * performance gain, it also stops aliasing problems if
		 * the buffer is left behind for writeback and gets
		 * reallocated for another use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
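	/*
	 * i.e. an exponentially weighted moving average, avg' = (new +
	 * 3 * avg) / 4: a single unusually slow commit moves the estimate
	 * by only a quarter of the difference.
	 */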
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		jbd2_journal_free_transaction(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}
1098