xref: /linux/fs/jbd2/commit.c (revision bd7dd77c2a05c530684eea2e3af16449ae9c5d52)
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/marker.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}
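
	/*
	 * The commit block built above is, roughly: a journal header
	 * (magic, JBD2_COMMIT_BLOCK, tid), the optional crc32 of every
	 * block written to the log for this transaction, and the commit
	 * timestamp.  On recovery the checksum lets a whole transaction
	 * be verified from its commit block alone.
	 */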

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
		!JBD2_HAS_INCOMPAT_FEATURE(journal,
					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE_SYNC, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		printk(KERN_WARNING
		       "JBD: barrier-based sync failed on %s - "
		       "disabling barriers\n", journal->j_devname);
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		clear_buffer_dirty(bh);
		ret = submit_bh(WRITE_SYNC, bh);
	}
	*cbh = bh;
	return ret;
}
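
/*
 * Note that *cbh is handed back to the caller even on the barrier
 * fallback path: jbd2_journal_commit_transaction() later waits on it via
 * journal_wait_on_commit_record().  With ASYNC_COMMIT the record is
 * submitted before the inode data is waited on; otherwise it is
 * submitted only after all of the data and metadata IO has completed.
 */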

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

retry:
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
		printk(KERN_WARNING
		       "JBD2: wait_on_commit_record: sync failed on %s - "
		       "disabling barriers\n", journal->j_devname);
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;

		ret = submit_bh(WRITE_SYNC, bh);
		if (ret) {
			unlock_buffer(bh);
			return ret;
		}
		goto retry;
	}

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
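
/*
 * The retry loop above handles a barrier write that the device rejected
 * asynchronously: the bio completion path sets BH_Eopnotsupp when the
 * write fails with -EOPNOTSUPP, so we switch barriers off for the whole
 * journal and resubmit the commit block as a plain write.
 */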

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We must not do block allocation here, even for delalloc; we avoid
 * writepages() because, with delayed allocation, it may be doing
 * block allocation.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
		.for_writepages = 1,
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
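
/*
 * A note on the writeback_control above: the range stops at i_size, and
 * nr_to_write is twice the number of cached pages, presumably to leave
 * slack for pages that are dirtied while the scan runs.
 */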

/*
 * Submit the data buffers of all inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction, so no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the
 * inode we are currently operating on from being released while we
 * write out its pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode's data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to write
		 * only already-allocated blocks here.
		 */
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
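
/*
 * The lock-juggling above follows a common jbd2 pattern: j_list_lock
 * pins the inode list while we pick an entry, JI_COMMIT_RUNNING keeps
 * that entry alive once the lock is dropped for the (sleeping) writeout,
 * and the wake_up_bit() pairs with the wait in
 * jbd2_journal_release_jbd_inode().
 */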

/*
 * Wait for the data submitted for writeout and refile inodes to the
 * proper transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * wait_on_page_writeback_range(), set it again so
			 * that the user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
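
/*
 * After the refiling above, an inode either follows its data into the
 * next transaction (i_next_transaction was set because the inode was
 * dirtied again while we were committing) or drops out of the journal's
 * view entirely (i_transaction = NULL).
 */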

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
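
/*
 * The crc32 is chained: the commit path seeds it with ~0, folds in each
 * journal block in submission order via this helper, and finally stores
 * the result in the commit header.  In sketch form:
 *
 *	__u32 csum = ~0;
 *	for (each bh submitted to the log)
 *		csum = jbd2_checksum_data(csum, bh);
 *	commit_header->h_chksum[0] = cpu_to_be32(csum);
 */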

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
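
/*
 * A tag records where a journalled block lives in the filesystem.  The
 * low 32 bits always go in t_blocknr; when the journal uses 64-bit
 * block numbers (tag_bytes > JBD2_TAG_SIZE32) the high bits go in
 * t_blocknr_high.  The (block >> 31) >> 1 idiom is "block >> 32"
 * written so that it stays well-defined even if the type of "block" is
 * ever only 32 bits wide.
 */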

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
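/*
 * In outline (following the jbd_debug phase markers in the body):
 *
 *	lock down the transaction and wait for its updates to drain
 *	phase 1:  clean the checkpoint lists, switch the revoke table
 *	phase 2a: submit the data buffers and revoke records
 *	phase 2b: copy metadata into the log behind descriptor blocks
 *	phase 3:  wait for the temporary metadata IO buffers
 *	phase 4:  wait for the descriptor/revoke control buffers
 *	phase 5:  write and/or wait on the commit record
 *	phase 6:  move committed buffers onto the checkpoint lists
 *	phase 7:  update statistics and retire the transaction
 */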
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_mark(jbd2_start_commit, "dev %s transaction %d",
		   journal->j_devname, commit_transaction->t_tid);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	stats.u.run.rs_wait = commit_transaction->t_max_wait;
	stats.u.run.rs_locked = jiffies;
	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
						stats.u.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	stats.u.run.rs_flushing = jiffies;
	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
					       stats.u.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	jbd_debug(3, "JBD: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	stats.u.run.rs_logging = jiffies;
	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
						 stats.u.run.rs_logging);
	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.u.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to
		 * determine the free space in the log, but this counter is
		 * also changed by jbd2_journal_next_log_block().
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO. */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
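
		/*
		 * The descriptor block being filled thus looks like:
		 *
		 *   journal_header_t | tag0 | uuid (16 bytes) | tag1 | ...
		 *
		 * Only the first tag carries the journal UUID, and the
		 * final tag gets JBD2_FLAG_LAST_TAG below.  The
		 * "tag_bytes + 16" test keeps room for one more tag plus
		 * a UUID, which is slightly conservative after the first
		 * tag.
		 */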

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();
			stats.u.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Done it all: now write the commit record asynchronously. */

	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	/*
	 * This is the right place to wait for data buffers both for ASYNC
	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
	 * the commit block went to disk (which happens above). If commit is
	 * SYNC, we need to wait for data buffers before we start writing the
	 * commit block, which happens below in that case.
	 */
	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
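
	/*
	 * t_iobuf_list and t_shadow_list are kept in lock-step by
	 * jbd2_journal_write_metadata_buffer(), which is why popping
	 * b_tprev from each list inside the loop above yields matching
	 * IO-buffer/shadow-buffer pairs.
	 */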

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from the checkpoint lists they
           were on before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* A buffer on the BJ_Forget list that is not
			 * jbddirty has been freed by this transaction and
			 * hence could not have been reallocated until this
			 * transaction committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on the
			 * BJ_Forget list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
						commit_transaction->t_start);

	/*
	 * File the transaction for history
	 */
	stats.ts_type = JBD2_STATS_RUN;
	stats.ts_tid = commit_transaction->t_tid;
	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
	spin_lock(&journal->j_history_lock);
	memcpy(journal->j_history + journal->j_history_cur, &stats,
			sizeof(stats));
	if (++journal->j_history_cur == journal->j_history_max)
		journal->j_history_cur = 0;

	/*
	 * Calculate overall stats
	 */
	journal->j_stats.ts_tid++;
	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the existing average higher than the latest commit time
	 * so we don't react too strongly to vast changes in the commit
	 * time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
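
	/*
	 * The update above is avg_new = (commit_time + 3 * avg_old) / 4:
	 * for example, an average of 100ms followed by a 500ms commit
	 * moves the average to (500 + 300) / 4 = 200ms rather than
	 * jumping straight to 500ms.
	 */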
	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
		   journal->j_devname, commit_transaction->t_tid,
		   journal->j_tail_sequence);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}
1058