xref: /linux/fs/jbd2/commit.c (revision 44e668c6faa9a6c477a32788e7e88f0754c54a4e)
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}
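
/*
 * Note that this handler does not drop any reference or touch a wait
 * queue itself: completion is signalled purely through the buffer lock,
 * which is why the commit code below can reap finished IO with plain
 * buffer_locked() checks and wait_on_buffer().
 */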

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
		if (ret == -EOPNOTSUPP) {
			printk(KERN_WARNING
			       "JBD2: Disabling barriers on %s, "
			       "not supported by device\n", journal->j_devname);
			write_lock(&journal->j_state_lock);
			journal->j_flags &= ~JBD2_BARRIER;
			write_unlock(&journal->j_state_lock);

			/* And try again, without the barrier */
			lock_buffer(bh);
			set_buffer_uptodate(bh);
			clear_buffer_dirty(bh);
			ret = submit_bh(WRITE_SYNC_PLUG, bh);
		}
	} else {
		ret = submit_bh(WRITE_SYNC_PLUG, bh);
	}
	*cbh = bh;
	return ret;
}
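
/*
 * For reference, the commit block built above carries, in struct
 * commit_header order (all on-disk fields big-endian, per the
 * cpu_to_be*() conversions): h_magic = JBD2_MAGIC_NUMBER, h_blocktype =
 * JBD2_COMMIT_BLOCK, h_sequence = the committing tid, the optional
 * checksum type/size/value when JBD2_FEATURE_COMPAT_CHECKSUM is set,
 * and the commit timestamp.  Recovery treats a transaction as durable
 * only once it finds this block with the matching sequence number.
 */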

/*
 * This function, together with journal_submit_commit_record(),
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

retry:
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
		printk(KERN_WARNING
		       "JBD2: %s: disabling barriers on %s - not supported "
		       "by device\n", __func__, journal->j_devname);
		write_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		write_unlock(&journal->j_state_lock);

		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;

		ret = submit_bh(WRITE_SYNC_PLUG, bh);
		if (ret) {
			unlock_buffer(bh);
			return ret;
		}
		goto retry;
	}

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
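
/*
 * Ordering note: with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT the commit
 * record is submitted while the metadata blocks may still be in
 * flight, and is only waited on here once they have completed, so its
 * IO overlaps the bulk of the log writes.  Without that feature the
 * record is not even submitted until all metadata IO has finished
 * (see the main commit path below).
 */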

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
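
/*
 * A note on the writeback_control above: WB_SYNC_ALL makes
 * generic_writepages() wait on pages already under IO instead of
 * skipping them, and range_end = i_size keeps us away from blocks
 * beyond EOF.  nr_to_write is set to twice nrpages, presumably as
 * headroom so that one pass can clean the whole mapping even if pages
 * are redirtied while we run.
 */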

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		commit_transaction->t_flushed_data_blocks = 1;
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
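
/*
 * The JI_COMMIT_RUNNING handshake above is what lets us drop
 * j_list_lock around the (possibly sleeping) writeback call: anyone
 * wanting to tear down a jbd2_inode we are using is expected to block
 * until the bit clears, along the lines of this sketch (the action
 * callback name is illustrative only; the real waiter lives in
 * jbd2_journal_release_jbd_inode()):
 *
 *	wait_on_bit(&jinode->i_flags, __JI_COMMIT_RUNNING,
 *		    jbd2_inode_bit_wait, TASK_UNINTERRUPTIBLE);
 *
 * which pairs with the wake_up_bit() calls in this file.
 */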

/*
 * Wait for data submitted for writeout, and refile inodes to the
 * proper transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that the user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
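
/*
 * The checksum is chained across the whole transaction: the commit code
 * seeds crc32_sum with ~0 and folds in each journal block in submission
 * order, roughly
 *
 *	crc = ~0;
 *	for each log buffer bh:
 *		crc = jbd2_checksum_data(crc, bh);
 *
 * and journal_submit_commit_record() then stores the final value in
 * h_chksum[0], so recovery can verify the entire transaction against a
 * single value in the commit block.
 */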

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
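
/*
 * Example: for block 0x123456789 this stores t_blocknr = 0x23456789
 * and, when the journal uses 64-bit tags (tag_bytes > JBD2_TAG_SIZE32),
 * t_blocknr_high = 0x1.  The high half is extracted as
 * (block >> 31) >> 1 rather than block >> 32, presumably a defensive
 * idiom to keep the shift well-defined even if "block" were ever a
 * 32-bit type.
 */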

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	/*
	 * Use plugged writes here, since we want to submit several before
	 * we unplug the device. We don't do explicit unplugging in here;
	 * instead we rely on sync_buffer() doing the unplug for us.
	 */
	if (commit_transaction->t_synchronous_commit)
		write_op = WRITE_SYNC_PLUG;
	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
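
	/*
	 * The loop above is the classic prepare_to_wait()/recheck/
	 * schedule() pattern: t_updates is re-tested after queueing on
	 * j_wait_updates, so a handle that completes (and wakes the
	 * queue) between the test and the sleep cannot be missed, and
	 * both locks are dropped across schedule() so that the running
	 * handles are able to finish.
	 */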

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  write_op);

	jbd_debug(3, "JBD: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(write_op, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}
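
	/*
	 * At this point every metadata buffer in the transaction has
	 * been submitted to the log, each run preceded by a descriptor
	 * block mapping log blocks back to their home locations: a
	 * journal_header_t followed by a packed array of
	 * journal_block_tag_t entries (plus the 16-byte uuid after the
	 * first tag), with JBD2_FLAG_LAST_TAG marking the final entry.
	 */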

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_flushed_data_blocks &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
			BLKDEV_IFL_WAIT);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
		if (journal->j_flags & JBD2_BARRIER)
			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
				BLKDEV_IFL_WAIT);
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
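
	/*
	 * Each pass above reaped one temporary BJ_IO buffer together
	 * with its paired BJ_Shadow buffer: the dummy bh that actually
	 * hit the log is freed, while the real metadata bh is refiled
	 * to BJ_Forget so that checkpointing can later write it home,
	 * and the BH_Unshadow wakeup unblocks any handle stalled in
	 * do_get_write_access() on the shadowed buffer.
	 */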

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
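
	/*
	 * i.e. a 1/4-weight exponential moving average:
	 * avg' = (commit_time + 3 * avg) / 4, so one unusually slow
	 * commit only moves the estimate a quarter of the way.  The
	 * value is consulted (together with j_min_batch_time and
	 * j_max_batch_time) when jbd2_journal_stop() decides how long a
	 * closing handle should wait for others to join the
	 * transaction.
	 */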
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
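
	/*
	 * The t_cpnext/t_cpprev juggling above inserts the transaction
	 * into the circular checkpoint list just in front of
	 * j_checkpoint_transactions, i.e. at the tail of the ring, so
	 * checkpointing visits transactions in commit order.
	 */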
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}