xref: /linux/fs/ext4/super.c (revision a5766f11cfd3a0c03450d99c8fe548c2940be884)
1 /*
2  *  linux/fs/ext4/super.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Big-endian to little-endian byte-swapping/bitmaps by
16  *        David S. Miller (davem@caip.rutgers.edu), 1995
17  */
18 
19 #include <linux/module.h>
20 #include <linux/string.h>
21 #include <linux/fs.h>
22 #include <linux/time.h>
23 #include <linux/jbd2.h>
24 #include <linux/slab.h>
25 #include <linux/init.h>
26 #include <linux/blkdev.h>
27 #include <linux/parser.h>
28 #include <linux/smp_lock.h>
29 #include <linux/buffer_head.h>
30 #include <linux/exportfs.h>
31 #include <linux/vfs.h>
32 #include <linux/random.h>
33 #include <linux/mount.h>
34 #include <linux/namei.h>
35 #include <linux/quotaops.h>
36 #include <linux/seq_file.h>
37 #include <linux/proc_fs.h>
38 #include <linux/marker.h>
39 #include <linux/log2.h>
40 #include <linux/crc16.h>
41 #include <asm/uaccess.h>
42 
43 #include "ext4.h"
44 #include "ext4_jbd2.h"
45 #include "xattr.h"
46 #include "acl.h"
47 #include "namei.h"
48 #include "group.h"
49 
/*
 * Parent /proc directory for ext4.  Per-filesystem entries, named after
 * sb->s_id, are removed from underneath it in ext4_put_super().
 */
struct proc_dir_entry *ext4_proc_root;

/*
 * Forward declarations of file-local functions that are used before
 * they are defined.
 */

/* Error reporting helper (defined further down in this file). */
static const char *ext4_decode_error(struct super_block *sb, int errno,
				     char nbuf[16]);

/* Journal and superblock helpers. */
static int ext4_load_journal(struct super_block *,
			     struct ext4_super_block *,
			     unsigned long journal_devnum);
static int ext4_create_journal(struct super_block *,
			       struct ext4_super_block *,
			       unsigned int);
static void ext4_commit_super(struct super_block *sb,
			      struct ext4_super_block *es,
			      int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
					struct ext4_super_block *es);
static void ext4_clear_journal_err(struct super_block *sb,
				   struct ext4_super_block *es);

/* Callbacks wired into ext4_sops. */
static void ext4_write_super(struct super_block *sb);
static void ext4_write_super_lockfs(struct super_block *sb);
static void ext4_unlockfs(struct super_block *sb);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
70 
71 
72 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
73 			       struct ext4_group_desc *bg)
74 {
75 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
76 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
77 		(ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
78 }
79 
80 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
81 			       struct ext4_group_desc *bg)
82 {
83 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
84 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
85 		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
86 }
87 
88 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
89 			      struct ext4_group_desc *bg)
90 {
91 	return le32_to_cpu(bg->bg_inode_table_lo) |
92 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
93 		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94 }
95 
96 void ext4_block_bitmap_set(struct super_block *sb,
97 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
98 {
99 	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
100 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
101 		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
102 }
103 
104 void ext4_inode_bitmap_set(struct super_block *sb,
105 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
106 {
107 	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
108 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
109 		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
110 }
111 
112 void ext4_inode_table_set(struct super_block *sb,
113 			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
114 {
115 	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
116 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
117 		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118 }
119 
120 /*
121  * Wrappers for jbd2_journal_start/end.
122  *
123  * The only special thing we need to do here is to make sure that all
124  * journal_end calls result in the superblock being marked dirty, so
125  * that sync() will call the filesystem's write_super callback if
126  * appropriate.
127  */
128 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
129 {
130 	journal_t *journal;
131 
132 	if (sb->s_flags & MS_RDONLY)
133 		return ERR_PTR(-EROFS);
134 
135 	/* Special case here: if the journal has aborted behind our
136 	 * backs (eg. EIO in the commit thread), then we still need to
137 	 * take the FS itself readonly cleanly. */
138 	journal = EXT4_SB(sb)->s_journal;
139 	if (is_journal_aborted(journal)) {
140 		ext4_abort(sb, __func__,
141 			   "Detected aborted journal");
142 		return ERR_PTR(-EROFS);
143 	}
144 
145 	return jbd2_journal_start(journal, nblocks);
146 }
147 
148 /*
149  * The only special thing we need to do here is to make sure that all
150  * jbd2_journal_stop calls result in the superblock being marked dirty, so
151  * that sync() will call the filesystem's write_super callback if
152  * appropriate.
153  */
154 int __ext4_journal_stop(const char *where, handle_t *handle)
155 {
156 	struct super_block *sb;
157 	int err;
158 	int rc;
159 
160 	sb = handle->h_transaction->t_journal->j_private;
161 	err = handle->h_err;
162 	rc = jbd2_journal_stop(handle);
163 
164 	if (!err)
165 		err = rc;
166 	if (err)
167 		__ext4_std_error(sb, where, err);
168 	return err;
169 }
170 
171 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
172 		struct buffer_head *bh, handle_t *handle, int err)
173 {
174 	char nbuf[16];
175 	const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 
177 	if (bh)
178 		BUFFER_TRACE(bh, "abort");
179 
180 	if (!handle->h_err)
181 		handle->h_err = err;
182 
183 	if (is_handle_aborted(handle))
184 		return;
185 
186 	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
187 	       caller, errstr, err_fn);
188 
189 	jbd2_journal_abort_handle(handle);
190 }
191 
192 /* Deal with the reporting of failure conditions on a filesystem such as
193  * inconsistencies detected or read IO failures.
194  *
195  * On ext2, we can store the error state of the filesystem in the
196  * superblock.  That is not possible on ext4, because we may have other
197  * write ordering constraints on the superblock which prevent us from
198  * writing it out straight away; and given that the journal is about to
199  * be aborted, we can't rely on the current, or future, transactions to
200  * write out the superblock safely.
201  *
202  * We'll just use the jbd2_journal_abort() error code to record an error in
203  * the journal instead.  On recovery, the journal will compain about
204  * that error until we've noted it down and cleared it.
205  */
206 
207 static void ext4_handle_error(struct super_block *sb)
208 {
209 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
210 
211 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
212 	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
213 
214 	if (sb->s_flags & MS_RDONLY)
215 		return;
216 
217 	if (!test_opt(sb, ERRORS_CONT)) {
218 		journal_t *journal = EXT4_SB(sb)->s_journal;
219 
220 		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
221 		if (journal)
222 			jbd2_journal_abort(journal, -EIO);
223 	}
224 	if (test_opt(sb, ERRORS_RO)) {
225 		printk(KERN_CRIT "Remounting filesystem read-only\n");
226 		sb->s_flags |= MS_RDONLY;
227 	}
228 	ext4_commit_super(sb, es, 1);
229 	if (test_opt(sb, ERRORS_PANIC))
230 		panic("EXT4-fs (device %s): panic forced after error\n",
231 			sb->s_id);
232 }
233 
234 void ext4_error(struct super_block *sb, const char *function,
235 		const char *fmt, ...)
236 {
237 	va_list args;
238 
239 	va_start(args, fmt);
240 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
241 	vprintk(fmt, args);
242 	printk("\n");
243 	va_end(args);
244 
245 	ext4_handle_error(sb);
246 }
247 
248 static const char *ext4_decode_error(struct super_block *sb, int errno,
249 				     char nbuf[16])
250 {
251 	char *errstr = NULL;
252 
253 	switch (errno) {
254 	case -EIO:
255 		errstr = "IO failure";
256 		break;
257 	case -ENOMEM:
258 		errstr = "Out of memory";
259 		break;
260 	case -EROFS:
261 		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
262 			errstr = "Journal has aborted";
263 		else
264 			errstr = "Readonly filesystem";
265 		break;
266 	default:
267 		/* If the caller passed in an extra buffer for unknown
268 		 * errors, textualise them now.  Else we just return
269 		 * NULL. */
270 		if (nbuf) {
271 			/* Check for truncated error codes... */
272 			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
273 				errstr = nbuf;
274 		}
275 		break;
276 	}
277 
278 	return errstr;
279 }
280 
281 /* __ext4_std_error decodes expected errors from journaling functions
282  * automatically and invokes the appropriate error response.  */
283 
284 void __ext4_std_error(struct super_block *sb, const char *function, int errno)
285 {
286 	char nbuf[16];
287 	const char *errstr;
288 
289 	/* Special case: if the error is EROFS, and we're not already
290 	 * inside a transaction, then there's really no point in logging
291 	 * an error. */
292 	if (errno == -EROFS && journal_current_handle() == NULL &&
293 	    (sb->s_flags & MS_RDONLY))
294 		return;
295 
296 	errstr = ext4_decode_error(sb, errno, nbuf);
297 	printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
298 	       sb->s_id, function, errstr);
299 
300 	ext4_handle_error(sb);
301 }
302 
303 /*
304  * ext4_abort is a much stronger failure handler than ext4_error.  The
305  * abort function may be used to deal with unrecoverable failures such
306  * as journal IO errors or ENOMEM at a critical moment in log management.
307  *
308  * We unconditionally force the filesystem into an ABORT|READONLY state,
309  * unless the error response on the fs has been set to panic in which
310  * case we take the easy way out and panic immediately.
311  */
312 
313 void ext4_abort(struct super_block *sb, const char *function,
314 		const char *fmt, ...)
315 {
316 	va_list args;
317 
318 	printk(KERN_CRIT "ext4_abort called.\n");
319 
320 	va_start(args, fmt);
321 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
322 	vprintk(fmt, args);
323 	printk("\n");
324 	va_end(args);
325 
326 	if (test_opt(sb, ERRORS_PANIC))
327 		panic("EXT4-fs panic from previous error\n");
328 
329 	if (sb->s_flags & MS_RDONLY)
330 		return;
331 
332 	printk(KERN_CRIT "Remounting filesystem read-only\n");
333 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
334 	sb->s_flags |= MS_RDONLY;
335 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
336 	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
337 }
338 
339 void ext4_warning(struct super_block *sb, const char *function,
340 		  const char *fmt, ...)
341 {
342 	va_list args;
343 
344 	va_start(args, fmt);
345 	printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
346 	       sb->s_id, function);
347 	vprintk(fmt, args);
348 	printk("\n");
349 	va_end(args);
350 }
351 
352 void ext4_update_dynamic_rev(struct super_block *sb)
353 {
354 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
355 
356 	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
357 		return;
358 
359 	ext4_warning(sb, __func__,
360 		     "updating to rev %d because of new feature flag, "
361 		     "running e2fsck is recommended",
362 		     EXT4_DYNAMIC_REV);
363 
364 	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
365 	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
366 	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
367 	/* leave es->s_feature_*compat flags alone */
368 	/* es->s_uuid will be set by e2fsck if empty */
369 
370 	/*
371 	 * The rest of the superblock fields should be zero, and if not it
372 	 * means they are likely already in use, so leave them alone.  We
373 	 * can leave it up to e2fsck to clean up any inconsistencies there.
374 	 */
375 }
376 
377 int ext4_update_compat_feature(handle_t *handle,
378 					struct super_block *sb, __u32 compat)
379 {
380 	int err = 0;
381 	if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
382 		err = ext4_journal_get_write_access(handle,
383 				EXT4_SB(sb)->s_sbh);
384 		if (err)
385 			return err;
386 		EXT4_SET_COMPAT_FEATURE(sb, compat);
387 		sb->s_dirt = 1;
388 		handle->h_sync = 1;
389 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
390 					"call ext4_journal_dirty_met adata");
391 		err = ext4_journal_dirty_metadata(handle,
392 				EXT4_SB(sb)->s_sbh);
393 	}
394 	return err;
395 }
396 
397 int ext4_update_rocompat_feature(handle_t *handle,
398 					struct super_block *sb, __u32 rocompat)
399 {
400 	int err = 0;
401 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
402 		err = ext4_journal_get_write_access(handle,
403 				EXT4_SB(sb)->s_sbh);
404 		if (err)
405 			return err;
406 		EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
407 		sb->s_dirt = 1;
408 		handle->h_sync = 1;
409 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
410 					"call ext4_journal_dirty_met adata");
411 		err = ext4_journal_dirty_metadata(handle,
412 				EXT4_SB(sb)->s_sbh);
413 	}
414 	return err;
415 }
416 
417 int ext4_update_incompat_feature(handle_t *handle,
418 					struct super_block *sb, __u32 incompat)
419 {
420 	int err = 0;
421 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
422 		err = ext4_journal_get_write_access(handle,
423 				EXT4_SB(sb)->s_sbh);
424 		if (err)
425 			return err;
426 		EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
427 		sb->s_dirt = 1;
428 		handle->h_sync = 1;
429 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
430 					"call ext4_journal_dirty_met adata");
431 		err = ext4_journal_dirty_metadata(handle,
432 				EXT4_SB(sb)->s_sbh);
433 	}
434 	return err;
435 }
436 
437 /*
438  * Open the external journal device
439  */
440 static struct block_device *ext4_blkdev_get(dev_t dev)
441 {
442 	struct block_device *bdev;
443 	char b[BDEVNAME_SIZE];
444 
445 	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
446 	if (IS_ERR(bdev))
447 		goto fail;
448 	return bdev;
449 
450 fail:
451 	printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
452 			__bdevname(dev, b), PTR_ERR(bdev));
453 	return NULL;
454 }
455 
/*
 * Release the journal device: bd_release() followed by blkdev_put().
 * Returns the result of blkdev_put().
 */
static int ext4_blkdev_put(struct block_device *bdev)
{
	int rc;

	bd_release(bdev);
	rc = blkdev_put(bdev);

	return rc;
}
464 
465 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
466 {
467 	struct block_device *bdev;
468 	int ret = -ENODEV;
469 
470 	bdev = sbi->journal_bdev;
471 	if (bdev) {
472 		ret = ext4_blkdev_put(bdev);
473 		sbi->journal_bdev = NULL;
474 	}
475 	return ret;
476 }
477 
478 static inline struct inode *orphan_list_entry(struct list_head *l)
479 {
480 	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
481 }
482 
483 static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
484 {
485 	struct list_head *l;
486 
487 	printk(KERN_ERR "sb orphan head is %d\n",
488 	       le32_to_cpu(sbi->s_es->s_last_orphan));
489 
490 	printk(KERN_ERR "sb_info orphan list:\n");
491 	list_for_each(l, &sbi->s_orphan) {
492 		struct inode *inode = orphan_list_entry(l);
493 		printk(KERN_ERR "  "
494 		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
495 		       inode->i_sb->s_id, inode->i_ino, inode,
496 		       inode->i_mode, inode->i_nlink,
497 		       NEXT_ORPHAN(inode));
498 	}
499 }
500 
501 static void ext4_put_super(struct super_block *sb)
502 {
503 	struct ext4_sb_info *sbi = EXT4_SB(sb);
504 	struct ext4_super_block *es = sbi->s_es;
505 	int i;
506 
507 	ext4_mb_release(sb);
508 	ext4_ext_release(sb);
509 	ext4_xattr_put_super(sb);
510 	if (jbd2_journal_destroy(sbi->s_journal) < 0)
511 		ext4_abort(sb, __func__, "Couldn't clean up the journal");
512 	sbi->s_journal = NULL;
513 	if (!(sb->s_flags & MS_RDONLY)) {
514 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
515 		es->s_state = cpu_to_le16(sbi->s_mount_state);
516 		ext4_commit_super(sb, es, 1);
517 	}
518 	if (sbi->s_proc) {
519 		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
520 		remove_proc_entry(sb->s_id, ext4_proc_root);
521 	}
522 
523 	for (i = 0; i < sbi->s_gdb_count; i++)
524 		brelse(sbi->s_group_desc[i]);
525 	kfree(sbi->s_group_desc);
526 	kfree(sbi->s_flex_groups);
527 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
528 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
529 	percpu_counter_destroy(&sbi->s_dirs_counter);
530 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
531 	brelse(sbi->s_sbh);
532 #ifdef CONFIG_QUOTA
533 	for (i = 0; i < MAXQUOTAS; i++)
534 		kfree(sbi->s_qf_names[i]);
535 #endif
536 
537 	/* Debugging code just in case the in-memory inode orphan list
538 	 * isn't empty.  The on-disk one can be non-empty if we've
539 	 * detected an error and taken the fs readonly, but the
540 	 * in-memory list had better be clean by this point. */
541 	if (!list_empty(&sbi->s_orphan))
542 		dump_orphan_list(sb, sbi);
543 	J_ASSERT(list_empty(&sbi->s_orphan));
544 
545 	invalidate_bdev(sb->s_bdev);
546 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
547 		/*
548 		 * Invalidate the journal device's buffers.  We don't want them
549 		 * floating about in memory - the physical journal device may
550 		 * hotswapped, and it breaks the `ro-after' testing code.
551 		 */
552 		sync_blockdev(sbi->journal_bdev);
553 		invalidate_bdev(sbi->journal_bdev);
554 		ext4_blkdev_remove(sbi);
555 	}
556 	sb->s_fs_info = NULL;
557 	kfree(sbi);
558 	return;
559 }
560 
561 static struct kmem_cache *ext4_inode_cachep;
562 
563 /*
564  * Called inside transaction, so use GFP_NOFS
565  */
566 static struct inode *ext4_alloc_inode(struct super_block *sb)
567 {
568 	struct ext4_inode_info *ei;
569 
570 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
571 	if (!ei)
572 		return NULL;
573 #ifdef CONFIG_EXT4_FS_POSIX_ACL
574 	ei->i_acl = EXT4_ACL_NOT_CACHED;
575 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
576 #endif
577 	ei->vfs_inode.i_version = 1;
578 	ei->vfs_inode.i_data.writeback_index = 0;
579 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
580 	INIT_LIST_HEAD(&ei->i_prealloc_list);
581 	spin_lock_init(&ei->i_prealloc_lock);
582 	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
583 	ei->i_reserved_data_blocks = 0;
584 	ei->i_reserved_meta_blocks = 0;
585 	ei->i_allocated_meta_blocks = 0;
586 	ei->i_delalloc_reserved_flag = 0;
587 	spin_lock_init(&(ei->i_block_reservation_lock));
588 	return &ei->vfs_inode;
589 }
590 
591 static void ext4_destroy_inode(struct inode *inode)
592 {
593 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
594 		printk("EXT4 Inode %p: orphan list check failed!\n",
595 			EXT4_I(inode));
596 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
597 				EXT4_I(inode), sizeof(struct ext4_inode_info),
598 				true);
599 		dump_stack();
600 	}
601 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
602 }
603 
604 static void init_once(void *foo)
605 {
606 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
607 
608 	INIT_LIST_HEAD(&ei->i_orphan);
609 #ifdef CONFIG_EXT4_FS_XATTR
610 	init_rwsem(&ei->xattr_sem);
611 #endif
612 	init_rwsem(&ei->i_data_sem);
613 	inode_init_once(&ei->vfs_inode);
614 }
615 
616 static int init_inodecache(void)
617 {
618 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
619 					     sizeof(struct ext4_inode_info),
620 					     0, (SLAB_RECLAIM_ACCOUNT|
621 						SLAB_MEM_SPREAD),
622 					     init_once);
623 	if (ext4_inode_cachep == NULL)
624 		return -ENOMEM;
625 	return 0;
626 }
627 
628 static void destroy_inodecache(void)
629 {
630 	kmem_cache_destroy(ext4_inode_cachep);
631 }
632 
633 static void ext4_clear_inode(struct inode *inode)
634 {
635 #ifdef CONFIG_EXT4_FS_POSIX_ACL
636 	if (EXT4_I(inode)->i_acl &&
637 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
638 		posix_acl_release(EXT4_I(inode)->i_acl);
639 		EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
640 	}
641 	if (EXT4_I(inode)->i_default_acl &&
642 			EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
643 		posix_acl_release(EXT4_I(inode)->i_default_acl);
644 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
645 	}
646 #endif
647 	ext4_discard_preallocations(inode);
648 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
649 				       &EXT4_I(inode)->jinode);
650 }
651 
/*
 * Emit the quota-related mount options of @sb into @seq.
 * Compiles to an empty function when CONFIG_QUOTA is not set.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (sbi->s_jquota_fmt) {
		const char *fmt_name;

		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			fmt_name = "vfsold";
		else
			fmt_name = "vfsv0";
		seq_printf(seq, ",jqfmt=%s", fmt_name);
	}

	/* Journaled quota file names, if configured. */
	if (sbi->s_qf_names[USRQUOTA])
		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
	if (sbi->s_qf_names[GRPQUOTA])
		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);

	/* Plain quota mount flags. */
	if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
		seq_puts(seq, ",usrquota");
	if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
		seq_puts(seq, ",grpquota");
#endif
}
675 
676 /*
677  * Show an option if
678  *  - it's set to a non-default value OR
679  *  - if the per-sb default is different from the global default
680  */
681 static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
682 {
683 	int def_errors;
684 	unsigned long def_mount_opts;
685 	struct super_block *sb = vfs->mnt_sb;
686 	struct ext4_sb_info *sbi = EXT4_SB(sb);
687 	struct ext4_super_block *es = sbi->s_es;
688 
689 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
690 	def_errors     = le16_to_cpu(es->s_errors);
691 
692 	if (sbi->s_sb_block != 1)
693 		seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
694 	if (test_opt(sb, MINIX_DF))
695 		seq_puts(seq, ",minixdf");
696 	if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
697 		seq_puts(seq, ",grpid");
698 	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
699 		seq_puts(seq, ",nogrpid");
700 	if (sbi->s_resuid != EXT4_DEF_RESUID ||
701 	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
702 		seq_printf(seq, ",resuid=%u", sbi->s_resuid);
703 	}
704 	if (sbi->s_resgid != EXT4_DEF_RESGID ||
705 	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
706 		seq_printf(seq, ",resgid=%u", sbi->s_resgid);
707 	}
708 	if (test_opt(sb, ERRORS_RO)) {
709 		if (def_errors == EXT4_ERRORS_PANIC ||
710 		    def_errors == EXT4_ERRORS_CONTINUE) {
711 			seq_puts(seq, ",errors=remount-ro");
712 		}
713 	}
714 	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
715 		seq_puts(seq, ",errors=continue");
716 	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
717 		seq_puts(seq, ",errors=panic");
718 	if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
719 		seq_puts(seq, ",nouid32");
720 	if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
721 		seq_puts(seq, ",debug");
722 	if (test_opt(sb, OLDALLOC))
723 		seq_puts(seq, ",oldalloc");
724 #ifdef CONFIG_EXT4_FS_XATTR
725 	if (test_opt(sb, XATTR_USER) &&
726 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
727 		seq_puts(seq, ",user_xattr");
728 	if (!test_opt(sb, XATTR_USER) &&
729 	    (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
730 		seq_puts(seq, ",nouser_xattr");
731 	}
732 #endif
733 #ifdef CONFIG_EXT4_FS_POSIX_ACL
734 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
735 		seq_puts(seq, ",acl");
736 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
737 		seq_puts(seq, ",noacl");
738 #endif
739 	if (!test_opt(sb, RESERVATION))
740 		seq_puts(seq, ",noreservation");
741 	if (sbi->s_commit_interval) {
742 		seq_printf(seq, ",commit=%u",
743 			   (unsigned) (sbi->s_commit_interval / HZ));
744 	}
745 	/*
746 	 * We're changing the default of barrier mount option, so
747 	 * let's always display its mount state so it's clear what its
748 	 * status is.
749 	 */
750 	seq_puts(seq, ",barrier=");
751 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
752 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
753 		seq_puts(seq, ",journal_async_commit");
754 	if (test_opt(sb, NOBH))
755 		seq_puts(seq, ",nobh");
756 	if (!test_opt(sb, EXTENTS))
757 		seq_puts(seq, ",noextents");
758 	if (test_opt(sb, I_VERSION))
759 		seq_puts(seq, ",i_version");
760 	if (!test_opt(sb, DELALLOC))
761 		seq_puts(seq, ",nodelalloc");
762 
763 
764 	if (sbi->s_stripe)
765 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
766 	/*
767 	 * journal mode get enabled in different ways
768 	 * So just print the value even if we didn't specify it
769 	 */
770 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
771 		seq_puts(seq, ",data=journal");
772 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
773 		seq_puts(seq, ",data=ordered");
774 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
775 		seq_puts(seq, ",data=writeback");
776 
777 	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
778 		seq_printf(seq, ",inode_readahead_blks=%u",
779 			   sbi->s_inode_readahead_blks);
780 
781 	if (test_opt(sb, DATA_ERR_ABORT))
782 		seq_puts(seq, ",data_err=abort");
783 
784 	ext4_show_quota_options(seq, sb);
785 	return 0;
786 }
787 
788 
789 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
790 		u64 ino, u32 generation)
791 {
792 	struct inode *inode;
793 
794 	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
795 		return ERR_PTR(-ESTALE);
796 	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
797 		return ERR_PTR(-ESTALE);
798 
799 	/* iget isn't really right if the inode is currently unallocated!!
800 	 *
801 	 * ext4_read_inode will return a bad_inode if the inode had been
802 	 * deleted, so we should be safe.
803 	 *
804 	 * Currently we don't know the generation for parent directory, so
805 	 * a generation of 0 means "accept any"
806 	 */
807 	inode = ext4_iget(sb, ino);
808 	if (IS_ERR(inode))
809 		return ERR_CAST(inode);
810 	if (generation && inode->i_generation != generation) {
811 		iput(inode);
812 		return ERR_PTR(-ESTALE);
813 	}
814 
815 	return inode;
816 }
817 
818 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
819 		int fh_len, int fh_type)
820 {
821 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
822 				    ext4_nfs_get_inode);
823 }
824 
825 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
826 		int fh_len, int fh_type)
827 {
828 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
829 				    ext4_nfs_get_inode);
830 }
831 
832 #ifdef CONFIG_QUOTA
/* Human-readable name of quota type @t: "user" for USRQUOTA, else "group". */
#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
/*
 * Select <on>USRJQUOTA or <on>GRPJQUOTA depending on quota type @t.
 * @on is an identifier prefix and must be pasted bare: wrapping it in
 * parentheses would paste ')' onto the identifier, which is not a valid
 * preprocessing token.
 */
#define QTYPE2MOPT(on, t) \
	((t) == USRQUOTA ? (on##USRJQUOTA) : (on##GRPJQUOTA))
835 
836 static int ext4_dquot_initialize(struct inode *inode, int type);
837 static int ext4_dquot_drop(struct inode *inode);
838 static int ext4_write_dquot(struct dquot *dquot);
839 static int ext4_acquire_dquot(struct dquot *dquot);
840 static int ext4_release_dquot(struct dquot *dquot);
841 static int ext4_mark_dquot_dirty(struct dquot *dquot);
842 static int ext4_write_info(struct super_block *sb, int type);
843 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
844 				char *path, int remount);
845 static int ext4_quota_on_mount(struct super_block *sb, int type);
846 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
847 			       size_t len, loff_t off);
848 static ssize_t ext4_quota_write(struct super_block *sb, int type,
849 				const char *data, size_t len, loff_t off);
850 
851 static struct dquot_operations ext4_quota_operations = {
852 	.initialize	= ext4_dquot_initialize,
853 	.drop		= ext4_dquot_drop,
854 	.alloc_space	= dquot_alloc_space,
855 	.alloc_inode	= dquot_alloc_inode,
856 	.free_space	= dquot_free_space,
857 	.free_inode	= dquot_free_inode,
858 	.transfer	= dquot_transfer,
859 	.write_dquot	= ext4_write_dquot,
860 	.acquire_dquot	= ext4_acquire_dquot,
861 	.release_dquot	= ext4_release_dquot,
862 	.mark_dirty	= ext4_mark_dquot_dirty,
863 	.write_info	= ext4_write_info
864 };
865 
866 static struct quotactl_ops ext4_qctl_operations = {
867 	.quota_on	= ext4_quota_on,
868 	.quota_off	= vfs_quota_off,
869 	.quota_sync	= vfs_quota_sync,
870 	.get_info	= vfs_get_dqinfo,
871 	.set_info	= vfs_set_dqinfo,
872 	.get_dqblk	= vfs_get_dqblk,
873 	.set_dqblk	= vfs_set_dqblk
874 };
875 #endif
876 
877 static const struct super_operations ext4_sops = {
878 	.alloc_inode	= ext4_alloc_inode,
879 	.destroy_inode	= ext4_destroy_inode,
880 	.write_inode	= ext4_write_inode,
881 	.dirty_inode	= ext4_dirty_inode,
882 	.delete_inode	= ext4_delete_inode,
883 	.put_super	= ext4_put_super,
884 	.write_super	= ext4_write_super,
885 	.sync_fs	= ext4_sync_fs,
886 	.write_super_lockfs = ext4_write_super_lockfs,
887 	.unlockfs	= ext4_unlockfs,
888 	.statfs		= ext4_statfs,
889 	.remount_fs	= ext4_remount,
890 	.clear_inode	= ext4_clear_inode,
891 	.show_options	= ext4_show_options,
892 #ifdef CONFIG_QUOTA
893 	.quota_read	= ext4_quota_read,
894 	.quota_write	= ext4_quota_write,
895 #endif
896 };
897 
898 static const struct export_operations ext4_export_ops = {
899 	.fh_to_dentry = ext4_fh_to_dentry,
900 	.fh_to_parent = ext4_fh_to_parent,
901 	.get_parent = ext4_get_parent,
902 };
903 
/*
 * Mount option identifiers used by the `tokens' match table.
 * The enumerator order is kept exactly as before.
 */
enum {
	Opt_bsd_df, Opt_minix_df,
	Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid,
	Opt_sb,
	Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_nocheck, Opt_debug,
	Opt_oldalloc, Opt_orlov,
	Opt_user_xattr, Opt_nouser_xattr,
	Opt_acl, Opt_noacl,
	Opt_reservation, Opt_noreservation,
	Opt_noload,
	Opt_nobh, Opt_bh,
	Opt_commit,
	Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
	Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort,
	Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore,
	Opt_usrjquota, Opt_grpjquota,
	Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
	Opt_quota, Opt_noquota,
	Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
	Opt_usrquota, Opt_grpquota,
	Opt_extents, Opt_noextents,
	Opt_i_version,
	Opt_mballoc, Opt_nomballoc,
	Opt_stripe,
	Opt_delalloc, Opt_nodelalloc,
	Opt_inode_readahead_blks
};
921 
922 static const match_table_t tokens = {
923 	{Opt_bsd_df, "bsddf"},
924 	{Opt_minix_df, "minixdf"},
925 	{Opt_grpid, "grpid"},
926 	{Opt_grpid, "bsdgroups"},
927 	{Opt_nogrpid, "nogrpid"},
928 	{Opt_nogrpid, "sysvgroups"},
929 	{Opt_resgid, "resgid=%u"},
930 	{Opt_resuid, "resuid=%u"},
931 	{Opt_sb, "sb=%u"},
932 	{Opt_err_cont, "errors=continue"},
933 	{Opt_err_panic, "errors=panic"},
934 	{Opt_err_ro, "errors=remount-ro"},
935 	{Opt_nouid32, "nouid32"},
936 	{Opt_nocheck, "nocheck"},
937 	{Opt_nocheck, "check=none"},
938 	{Opt_debug, "debug"},
939 	{Opt_oldalloc, "oldalloc"},
940 	{Opt_orlov, "orlov"},
941 	{Opt_user_xattr, "user_xattr"},
942 	{Opt_nouser_xattr, "nouser_xattr"},
943 	{Opt_acl, "acl"},
944 	{Opt_noacl, "noacl"},
945 	{Opt_reservation, "reservation"},
946 	{Opt_noreservation, "noreservation"},
947 	{Opt_noload, "noload"},
948 	{Opt_nobh, "nobh"},
949 	{Opt_bh, "bh"},
950 	{Opt_commit, "commit=%u"},
951 	{Opt_journal_update, "journal=update"},
952 	{Opt_journal_inum, "journal=%u"},
953 	{Opt_journal_dev, "journal_dev=%u"},
954 	{Opt_journal_checksum, "journal_checksum"},
955 	{Opt_journal_async_commit, "journal_async_commit"},
956 	{Opt_abort, "abort"},
957 	{Opt_data_journal, "data=journal"},
958 	{Opt_data_ordered, "data=ordered"},
959 	{Opt_data_writeback, "data=writeback"},
960 	{Opt_data_err_abort, "data_err=abort"},
961 	{Opt_data_err_ignore, "data_err=ignore"},
962 	{Opt_offusrjquota, "usrjquota="},
963 	{Opt_usrjquota, "usrjquota=%s"},
964 	{Opt_offgrpjquota, "grpjquota="},
965 	{Opt_grpjquota, "grpjquota=%s"},
966 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
967 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
968 	{Opt_grpquota, "grpquota"},
969 	{Opt_noquota, "noquota"},
970 	{Opt_quota, "quota"},
971 	{Opt_usrquota, "usrquota"},
972 	{Opt_barrier, "barrier=%u"},
973 	{Opt_extents, "extents"},
974 	{Opt_noextents, "noextents"},
975 	{Opt_i_version, "i_version"},
976 	{Opt_mballoc, "mballoc"},
977 	{Opt_nomballoc, "nomballoc"},
978 	{Opt_stripe, "stripe=%u"},
979 	{Opt_resize, "resize"},
980 	{Opt_delalloc, "delalloc"},
981 	{Opt_nodelalloc, "nodelalloc"},
982 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
983 	{Opt_err, NULL},
984 };
985 
986 static ext4_fsblk_t get_sb_block(void **data)
987 {
988 	ext4_fsblk_t	sb_block;
989 	char		*options = (char *) *data;
990 
991 	if (!options || strncmp(options, "sb=", 3) != 0)
992 		return 1;	/* Default location */
993 	options += 3;
994 	/*todo: use simple_strtoll with >32bit ext4 */
995 	sb_block = simple_strtoul(options, &options, 0);
996 	if (*options && *options != ',') {
997 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
998 		       (char *) *data);
999 		return 1;
1000 	}
1001 	if (*options == ',')
1002 		options++;
1003 	*data = (void *) options;
1004 	return sb_block;
1005 }
1006 
1007 static int parse_options(char *options, struct super_block *sb,
1008 			 unsigned int *inum, unsigned long *journal_devnum,
1009 			 ext4_fsblk_t *n_blocks_count, int is_remount)
1010 {
1011 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1012 	char *p;
1013 	substring_t args[MAX_OPT_ARGS];
1014 	int data_opt = 0;
1015 	int option;
1016 #ifdef CONFIG_QUOTA
1017 	int qtype, qfmt;
1018 	char *qname;
1019 #endif
1020 	ext4_fsblk_t last_block;
1021 
1022 	if (!options)
1023 		return 1;
1024 
1025 	while ((p = strsep(&options, ",")) != NULL) {
1026 		int token;
1027 		if (!*p)
1028 			continue;
1029 
1030 		token = match_token(p, tokens, args);
1031 		switch (token) {
1032 		case Opt_bsd_df:
1033 			clear_opt(sbi->s_mount_opt, MINIX_DF);
1034 			break;
1035 		case Opt_minix_df:
1036 			set_opt(sbi->s_mount_opt, MINIX_DF);
1037 			break;
1038 		case Opt_grpid:
1039 			set_opt(sbi->s_mount_opt, GRPID);
1040 			break;
1041 		case Opt_nogrpid:
1042 			clear_opt(sbi->s_mount_opt, GRPID);
1043 			break;
1044 		case Opt_resuid:
1045 			if (match_int(&args[0], &option))
1046 				return 0;
1047 			sbi->s_resuid = option;
1048 			break;
1049 		case Opt_resgid:
1050 			if (match_int(&args[0], &option))
1051 				return 0;
1052 			sbi->s_resgid = option;
1053 			break;
1054 		case Opt_sb:
1055 			/* handled by get_sb_block() instead of here */
1056 			/* *sb_block = match_int(&args[0]); */
1057 			break;
1058 		case Opt_err_panic:
1059 			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
1060 			clear_opt(sbi->s_mount_opt, ERRORS_RO);
1061 			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1062 			break;
1063 		case Opt_err_ro:
1064 			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
1065 			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
1066 			set_opt(sbi->s_mount_opt, ERRORS_RO);
1067 			break;
1068 		case Opt_err_cont:
1069 			clear_opt(sbi->s_mount_opt, ERRORS_RO);
1070 			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
1071 			set_opt(sbi->s_mount_opt, ERRORS_CONT);
1072 			break;
1073 		case Opt_nouid32:
1074 			set_opt(sbi->s_mount_opt, NO_UID32);
1075 			break;
1076 		case Opt_nocheck:
1077 			clear_opt(sbi->s_mount_opt, CHECK);
1078 			break;
1079 		case Opt_debug:
1080 			set_opt(sbi->s_mount_opt, DEBUG);
1081 			break;
1082 		case Opt_oldalloc:
1083 			set_opt(sbi->s_mount_opt, OLDALLOC);
1084 			break;
1085 		case Opt_orlov:
1086 			clear_opt(sbi->s_mount_opt, OLDALLOC);
1087 			break;
1088 #ifdef CONFIG_EXT4_FS_XATTR
1089 		case Opt_user_xattr:
1090 			set_opt(sbi->s_mount_opt, XATTR_USER);
1091 			break;
1092 		case Opt_nouser_xattr:
1093 			clear_opt(sbi->s_mount_opt, XATTR_USER);
1094 			break;
1095 #else
1096 		case Opt_user_xattr:
1097 		case Opt_nouser_xattr:
1098 			printk(KERN_ERR "EXT4 (no)user_xattr options "
1099 			       "not supported\n");
1100 			break;
1101 #endif
1102 #ifdef CONFIG_EXT4_FS_POSIX_ACL
1103 		case Opt_acl:
1104 			set_opt(sbi->s_mount_opt, POSIX_ACL);
1105 			break;
1106 		case Opt_noacl:
1107 			clear_opt(sbi->s_mount_opt, POSIX_ACL);
1108 			break;
1109 #else
1110 		case Opt_acl:
1111 		case Opt_noacl:
1112 			printk(KERN_ERR "EXT4 (no)acl options "
1113 			       "not supported\n");
1114 			break;
1115 #endif
1116 		case Opt_reservation:
1117 			set_opt(sbi->s_mount_opt, RESERVATION);
1118 			break;
1119 		case Opt_noreservation:
1120 			clear_opt(sbi->s_mount_opt, RESERVATION);
1121 			break;
1122 		case Opt_journal_update:
1123 			/* @@@ FIXME */
1124 			/* Eventually we will want to be able to create
1125 			   a journal file here.  For now, only allow the
1126 			   user to specify an existing inode to be the
1127 			   journal file. */
1128 			if (is_remount) {
1129 				printk(KERN_ERR "EXT4-fs: cannot specify "
1130 				       "journal on remount\n");
1131 				return 0;
1132 			}
1133 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1134 			break;
1135 		case Opt_journal_inum:
1136 			if (is_remount) {
1137 				printk(KERN_ERR "EXT4-fs: cannot specify "
1138 				       "journal on remount\n");
1139 				return 0;
1140 			}
1141 			if (match_int(&args[0], &option))
1142 				return 0;
1143 			*inum = option;
1144 			break;
1145 		case Opt_journal_dev:
1146 			if (is_remount) {
1147 				printk(KERN_ERR "EXT4-fs: cannot specify "
1148 				       "journal on remount\n");
1149 				return 0;
1150 			}
1151 			if (match_int(&args[0], &option))
1152 				return 0;
1153 			*journal_devnum = option;
1154 			break;
1155 		case Opt_journal_checksum:
1156 			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1157 			break;
1158 		case Opt_journal_async_commit:
1159 			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1160 			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1161 			break;
1162 		case Opt_noload:
1163 			set_opt(sbi->s_mount_opt, NOLOAD);
1164 			break;
1165 		case Opt_commit:
1166 			if (match_int(&args[0], &option))
1167 				return 0;
1168 			if (option < 0)
1169 				return 0;
1170 			if (option == 0)
1171 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1172 			sbi->s_commit_interval = HZ * option;
1173 			break;
1174 		case Opt_data_journal:
1175 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
1176 			goto datacheck;
1177 		case Opt_data_ordered:
1178 			data_opt = EXT4_MOUNT_ORDERED_DATA;
1179 			goto datacheck;
1180 		case Opt_data_writeback:
1181 			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1182 		datacheck:
1183 			if (is_remount) {
1184 				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
1185 						!= data_opt) {
1186 					printk(KERN_ERR
1187 						"EXT4-fs: cannot change data "
1188 						"mode on remount\n");
1189 					return 0;
1190 				}
1191 			} else {
1192 				sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
1193 				sbi->s_mount_opt |= data_opt;
1194 			}
1195 			break;
1196 		case Opt_data_err_abort:
1197 			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1198 			break;
1199 		case Opt_data_err_ignore:
1200 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1201 			break;
1202 #ifdef CONFIG_QUOTA
1203 		case Opt_usrjquota:
1204 			qtype = USRQUOTA;
1205 			goto set_qf_name;
1206 		case Opt_grpjquota:
1207 			qtype = GRPQUOTA;
1208 set_qf_name:
1209 			if ((sb_any_quota_enabled(sb) ||
1210 			     sb_any_quota_suspended(sb)) &&
1211 			    !sbi->s_qf_names[qtype]) {
1212 				printk(KERN_ERR
1213 				       "EXT4-fs: Cannot change journaled "
1214 				       "quota options when quota turned on.\n");
1215 				return 0;
1216 			}
1217 			qname = match_strdup(&args[0]);
1218 			if (!qname) {
1219 				printk(KERN_ERR
1220 					"EXT4-fs: not enough memory for "
1221 					"storing quotafile name.\n");
1222 				return 0;
1223 			}
1224 			if (sbi->s_qf_names[qtype] &&
1225 			    strcmp(sbi->s_qf_names[qtype], qname)) {
1226 				printk(KERN_ERR
1227 					"EXT4-fs: %s quota file already "
1228 					"specified.\n", QTYPE2NAME(qtype));
1229 				kfree(qname);
1230 				return 0;
1231 			}
1232 			sbi->s_qf_names[qtype] = qname;
1233 			if (strchr(sbi->s_qf_names[qtype], '/')) {
1234 				printk(KERN_ERR
1235 					"EXT4-fs: quotafile must be on "
1236 					"filesystem root.\n");
1237 				kfree(sbi->s_qf_names[qtype]);
1238 				sbi->s_qf_names[qtype] = NULL;
1239 				return 0;
1240 			}
1241 			set_opt(sbi->s_mount_opt, QUOTA);
1242 			break;
1243 		case Opt_offusrjquota:
1244 			qtype = USRQUOTA;
1245 			goto clear_qf_name;
1246 		case Opt_offgrpjquota:
1247 			qtype = GRPQUOTA;
1248 clear_qf_name:
1249 			if ((sb_any_quota_enabled(sb) ||
1250 			     sb_any_quota_suspended(sb)) &&
1251 			    sbi->s_qf_names[qtype]) {
1252 				printk(KERN_ERR "EXT4-fs: Cannot change "
1253 					"journaled quota options when "
1254 					"quota turned on.\n");
1255 				return 0;
1256 			}
1257 			/*
1258 			 * The space will be released later when all options
1259 			 * are confirmed to be correct
1260 			 */
1261 			sbi->s_qf_names[qtype] = NULL;
1262 			break;
1263 		case Opt_jqfmt_vfsold:
1264 			qfmt = QFMT_VFS_OLD;
1265 			goto set_qf_format;
1266 		case Opt_jqfmt_vfsv0:
1267 			qfmt = QFMT_VFS_V0;
1268 set_qf_format:
1269 			if ((sb_any_quota_enabled(sb) ||
1270 			     sb_any_quota_suspended(sb)) &&
1271 			    sbi->s_jquota_fmt != qfmt) {
1272 				printk(KERN_ERR "EXT4-fs: Cannot change "
1273 					"journaled quota options when "
1274 					"quota turned on.\n");
1275 				return 0;
1276 			}
1277 			sbi->s_jquota_fmt = qfmt;
1278 			break;
1279 		case Opt_quota:
1280 		case Opt_usrquota:
1281 			set_opt(sbi->s_mount_opt, QUOTA);
1282 			set_opt(sbi->s_mount_opt, USRQUOTA);
1283 			break;
1284 		case Opt_grpquota:
1285 			set_opt(sbi->s_mount_opt, QUOTA);
1286 			set_opt(sbi->s_mount_opt, GRPQUOTA);
1287 			break;
1288 		case Opt_noquota:
1289 			if (sb_any_quota_enabled(sb)) {
1290 				printk(KERN_ERR "EXT4-fs: Cannot change quota "
1291 					"options when quota turned on.\n");
1292 				return 0;
1293 			}
1294 			clear_opt(sbi->s_mount_opt, QUOTA);
1295 			clear_opt(sbi->s_mount_opt, USRQUOTA);
1296 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
1297 			break;
1298 #else
1299 		case Opt_quota:
1300 		case Opt_usrquota:
1301 		case Opt_grpquota:
1302 			printk(KERN_ERR
1303 				"EXT4-fs: quota options not supported.\n");
1304 			break;
1305 		case Opt_usrjquota:
1306 		case Opt_grpjquota:
1307 		case Opt_offusrjquota:
1308 		case Opt_offgrpjquota:
1309 		case Opt_jqfmt_vfsold:
1310 		case Opt_jqfmt_vfsv0:
1311 			printk(KERN_ERR
1312 				"EXT4-fs: journaled quota options not "
1313 				"supported.\n");
1314 			break;
1315 		case Opt_noquota:
1316 			break;
1317 #endif
1318 		case Opt_abort:
1319 			set_opt(sbi->s_mount_opt, ABORT);
1320 			break;
1321 		case Opt_barrier:
1322 			if (match_int(&args[0], &option))
1323 				return 0;
1324 			if (option)
1325 				set_opt(sbi->s_mount_opt, BARRIER);
1326 			else
1327 				clear_opt(sbi->s_mount_opt, BARRIER);
1328 			break;
1329 		case Opt_ignore:
1330 			break;
1331 		case Opt_resize:
1332 			if (!is_remount) {
1333 				printk("EXT4-fs: resize option only available "
1334 					"for remount\n");
1335 				return 0;
1336 			}
1337 			if (match_int(&args[0], &option) != 0)
1338 				return 0;
1339 			*n_blocks_count = option;
1340 			break;
1341 		case Opt_nobh:
1342 			set_opt(sbi->s_mount_opt, NOBH);
1343 			break;
1344 		case Opt_bh:
1345 			clear_opt(sbi->s_mount_opt, NOBH);
1346 			break;
1347 		case Opt_extents:
1348 			if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1349 					EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1350 				ext4_warning(sb, __func__,
1351 					"extents feature not enabled "
1352 					"on this filesystem, use tune2fs\n");
1353 				return 0;
1354 			}
1355 			set_opt(sbi->s_mount_opt, EXTENTS);
1356 			break;
1357 		case Opt_noextents:
1358 			/*
1359 			 * When e2fsprogs support resizing an already existing
1360 			 * ext3 file system to greater than 2**32 we need to
1361 			 * add support to block allocator to handle growing
1362 			 * already existing block  mapped inode so that blocks
1363 			 * allocated for them fall within 2**32
1364 			 */
1365 			last_block = ext4_blocks_count(sbi->s_es) - 1;
1366 			if (last_block  > 0xffffffffULL) {
1367 				printk(KERN_ERR "EXT4-fs: Filesystem too "
1368 						"large to mount with "
1369 						"-o noextents options\n");
1370 				return 0;
1371 			}
1372 			clear_opt(sbi->s_mount_opt, EXTENTS);
1373 			break;
1374 		case Opt_i_version:
1375 			set_opt(sbi->s_mount_opt, I_VERSION);
1376 			sb->s_flags |= MS_I_VERSION;
1377 			break;
1378 		case Opt_nodelalloc:
1379 			clear_opt(sbi->s_mount_opt, DELALLOC);
1380 			break;
1381 		case Opt_stripe:
1382 			if (match_int(&args[0], &option))
1383 				return 0;
1384 			if (option < 0)
1385 				return 0;
1386 			sbi->s_stripe = option;
1387 			break;
1388 		case Opt_delalloc:
1389 			set_opt(sbi->s_mount_opt, DELALLOC);
1390 			break;
1391 		case Opt_inode_readahead_blks:
1392 			if (match_int(&args[0], &option))
1393 				return 0;
1394 			if (option < 0 || option > (1 << 30))
1395 				return 0;
1396 			sbi->s_inode_readahead_blks = option;
1397 			break;
1398 		default:
1399 			printk(KERN_ERR
1400 			       "EXT4-fs: Unrecognized mount option \"%s\" "
1401 			       "or missing value\n", p);
1402 			return 0;
1403 		}
1404 	}
1405 #ifdef CONFIG_QUOTA
1406 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1407 		if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
1408 		     sbi->s_qf_names[USRQUOTA])
1409 			clear_opt(sbi->s_mount_opt, USRQUOTA);
1410 
1411 		if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
1412 		     sbi->s_qf_names[GRPQUOTA])
1413 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
1414 
1415 		if ((sbi->s_qf_names[USRQUOTA] &&
1416 				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1417 		    (sbi->s_qf_names[GRPQUOTA] &&
1418 				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1419 			printk(KERN_ERR "EXT4-fs: old and new quota "
1420 					"format mixing.\n");
1421 			return 0;
1422 		}
1423 
1424 		if (!sbi->s_jquota_fmt) {
1425 			printk(KERN_ERR "EXT4-fs: journaled quota format "
1426 					"not specified.\n");
1427 			return 0;
1428 		}
1429 	} else {
1430 		if (sbi->s_jquota_fmt) {
1431 			printk(KERN_ERR "EXT4-fs: journaled quota format "
1432 					"specified with no journaling "
1433 					"enabled.\n");
1434 			return 0;
1435 		}
1436 	}
1437 #endif
1438 	return 1;
1439 }
1440 
1441 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1442 			    int read_only)
1443 {
1444 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1445 	int res = 0;
1446 
1447 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1448 		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
1449 		       "forcing read-only mode\n");
1450 		res = MS_RDONLY;
1451 	}
1452 	if (read_only)
1453 		return res;
1454 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
1455 		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
1456 		       "running e2fsck is recommended\n");
1457 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1458 		printk(KERN_WARNING
1459 		       "EXT4-fs warning: mounting fs with errors, "
1460 		       "running e2fsck is recommended\n");
1461 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1462 		 le16_to_cpu(es->s_mnt_count) >=
1463 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1464 		printk(KERN_WARNING
1465 		       "EXT4-fs warning: maximal mount count reached, "
1466 		       "running e2fsck is recommended\n");
1467 	else if (le32_to_cpu(es->s_checkinterval) &&
1468 		(le32_to_cpu(es->s_lastcheck) +
1469 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1470 		printk(KERN_WARNING
1471 		       "EXT4-fs warning: checktime reached, "
1472 		       "running e2fsck is recommended\n");
1473 #if 0
1474 		/* @@@ We _will_ want to clear the valid bit if we find
1475 		 * inconsistencies, to force a fsck at reboot.  But for
1476 		 * a plain journaled filesystem we can keep it set as
1477 		 * valid forever! :)
1478 		 */
1479 	es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1480 #endif
1481 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1482 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1483 	le16_add_cpu(&es->s_mnt_count, 1);
1484 	es->s_mtime = cpu_to_le32(get_seconds());
1485 	ext4_update_dynamic_rev(sb);
1486 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1487 
1488 	ext4_commit_super(sb, es, 1);
1489 	if (test_opt(sb, DEBUG))
1490 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
1491 				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
1492 			sb->s_blocksize,
1493 			sbi->s_groups_count,
1494 			EXT4_BLOCKS_PER_GROUP(sb),
1495 			EXT4_INODES_PER_GROUP(sb),
1496 			sbi->s_mount_opt);
1497 
1498 	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1499 	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1500 	       "external", EXT4_SB(sb)->s_journal->j_devname);
1501 	return res;
1502 }
1503 
1504 static int ext4_fill_flex_info(struct super_block *sb)
1505 {
1506 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1507 	struct ext4_group_desc *gdp = NULL;
1508 	struct buffer_head *bh;
1509 	ext4_group_t flex_group_count;
1510 	ext4_group_t flex_group;
1511 	int groups_per_flex = 0;
1512 	__u64 block_bitmap = 0;
1513 	int i;
1514 
1515 	if (!sbi->s_es->s_log_groups_per_flex) {
1516 		sbi->s_log_groups_per_flex = 0;
1517 		return 1;
1518 	}
1519 
1520 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1521 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1522 
1523 	/* We allocate both existing and potentially added groups */
1524 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1525 			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
1526 			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
1527 			   groups_per_flex;
1528 	sbi->s_flex_groups = kzalloc(flex_group_count *
1529 				     sizeof(struct flex_groups), GFP_KERNEL);
1530 	if (sbi->s_flex_groups == NULL) {
1531 		printk(KERN_ERR "EXT4-fs: not enough memory for "
1532 				"%lu flex groups\n", flex_group_count);
1533 		goto failed;
1534 	}
1535 
1536 	gdp = ext4_get_group_desc(sb, 1, &bh);
1537 	block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1538 
1539 	for (i = 0; i < sbi->s_groups_count; i++) {
1540 		gdp = ext4_get_group_desc(sb, i, &bh);
1541 
1542 		flex_group = ext4_flex_group(sbi, i);
1543 		sbi->s_flex_groups[flex_group].free_inodes +=
1544 			le16_to_cpu(gdp->bg_free_inodes_count);
1545 		sbi->s_flex_groups[flex_group].free_blocks +=
1546 			le16_to_cpu(gdp->bg_free_blocks_count);
1547 	}
1548 
1549 	return 1;
1550 failed:
1551 	return 0;
1552 }
1553 
1554 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1555 			    struct ext4_group_desc *gdp)
1556 {
1557 	__u16 crc = 0;
1558 
1559 	if (sbi->s_es->s_feature_ro_compat &
1560 	    cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
1561 		int offset = offsetof(struct ext4_group_desc, bg_checksum);
1562 		__le32 le_group = cpu_to_le32(block_group);
1563 
1564 		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
1565 		crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
1566 		crc = crc16(crc, (__u8 *)gdp, offset);
1567 		offset += sizeof(gdp->bg_checksum); /* skip checksum */
1568 		/* for checksum of struct ext4_group_desc do the rest...*/
1569 		if ((sbi->s_es->s_feature_incompat &
1570 		     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
1571 		    offset < le16_to_cpu(sbi->s_es->s_desc_size))
1572 			crc = crc16(crc, (__u8 *)gdp + offset,
1573 				    le16_to_cpu(sbi->s_es->s_desc_size) -
1574 					offset);
1575 	}
1576 
1577 	return cpu_to_le16(crc);
1578 }
1579 
1580 int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1581 				struct ext4_group_desc *gdp)
1582 {
1583 	if ((sbi->s_es->s_feature_ro_compat &
1584 	     cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
1585 	    (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
1586 		return 0;
1587 
1588 	return 1;
1589 }
1590 
1591 /* Called at mount-time, super-block is locked */
1592 static int ext4_check_descriptors(struct super_block *sb)
1593 {
1594 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1595 	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1596 	ext4_fsblk_t last_block;
1597 	ext4_fsblk_t block_bitmap;
1598 	ext4_fsblk_t inode_bitmap;
1599 	ext4_fsblk_t inode_table;
1600 	int flexbg_flag = 0;
1601 	ext4_group_t i;
1602 
1603 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1604 		flexbg_flag = 1;
1605 
1606 	ext4_debug("Checking group descriptors");
1607 
1608 	for (i = 0; i < sbi->s_groups_count; i++) {
1609 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1610 
1611 		if (i == sbi->s_groups_count - 1 || flexbg_flag)
1612 			last_block = ext4_blocks_count(sbi->s_es) - 1;
1613 		else
1614 			last_block = first_block +
1615 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
1616 
1617 		block_bitmap = ext4_block_bitmap(sb, gdp);
1618 		if (block_bitmap < first_block || block_bitmap > last_block) {
1619 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1620 			       "Block bitmap for group %lu not in group "
1621 			       "(block %llu)!", i, block_bitmap);
1622 			return 0;
1623 		}
1624 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
1625 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
1626 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1627 			       "Inode bitmap for group %lu not in group "
1628 			       "(block %llu)!", i, inode_bitmap);
1629 			return 0;
1630 		}
1631 		inode_table = ext4_inode_table(sb, gdp);
1632 		if (inode_table < first_block ||
1633 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
1634 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1635 			       "Inode table for group %lu not in group "
1636 			       "(block %llu)!", i, inode_table);
1637 			return 0;
1638 		}
1639 		spin_lock(sb_bgl_lock(sbi, i));
1640 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1641 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1642 			       "Checksum for group %lu failed (%u!=%u)\n",
1643 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1644 			       gdp)), le16_to_cpu(gdp->bg_checksum));
1645 			if (!(sb->s_flags & MS_RDONLY)) {
1646 				spin_unlock(sb_bgl_lock(sbi, i));
1647 				return 0;
1648 			}
1649 		}
1650 		spin_unlock(sb_bgl_lock(sbi, i));
1651 		if (!flexbg_flag)
1652 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
1653 	}
1654 
1655 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1656 	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
1657 	return 1;
1658 }
1659 
1660 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1661  * the superblock) which were deleted from all directories, but held open by
1662  * a process at the time of a crash.  We walk the list and try to delete these
1663  * inodes at recovery time (only with a read-write filesystem).
1664  *
1665  * In order to keep the orphan inode chain consistent during traversal (in
1666  * case of crash during recovery), we link each inode into the superblock
1667  * orphan list_head and handle it the same way as an inode deletion during
1668  * normal operation (which journals the operations for us).
1669  *
1670  * We only do an iget() and an iput() on each inode, which is very safe if we
1671  * accidentally point at an in-use or already deleted inode.  The worst that
1672  * can happen in this case is that we get a "bit already cleared" message from
1673  * ext4_free_inode().  The only reason we would point at a wrong inode is if
1674  * e2fsck was run on this filesystem, and it must have already done the orphan
1675  * inode cleanup for us, so we can safely abort without any further action.
1676  */
1677 static void ext4_orphan_cleanup(struct super_block *sb,
1678 				struct ext4_super_block *es)
1679 {
1680 	unsigned int s_flags = sb->s_flags;
1681 	int nr_orphans = 0, nr_truncates = 0;
1682 #ifdef CONFIG_QUOTA
1683 	int i;
1684 #endif
1685 	if (!es->s_last_orphan) {
1686 		jbd_debug(4, "no orphan inodes to clean up\n");
1687 		return;
1688 	}
1689 
1690 	if (bdev_read_only(sb->s_bdev)) {
1691 		printk(KERN_ERR "EXT4-fs: write access "
1692 			"unavailable, skipping orphan cleanup.\n");
1693 		return;
1694 	}
1695 
1696 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
1697 		if (es->s_last_orphan)
1698 			jbd_debug(1, "Errors on filesystem, "
1699 				  "clearing orphan list.\n");
1700 		es->s_last_orphan = 0;
1701 		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1702 		return;
1703 	}
1704 
1705 	if (s_flags & MS_RDONLY) {
1706 		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
1707 		       sb->s_id);
1708 		sb->s_flags &= ~MS_RDONLY;
1709 	}
1710 #ifdef CONFIG_QUOTA
1711 	/* Needed for iput() to work correctly and not trash data */
1712 	sb->s_flags |= MS_ACTIVE;
1713 	/* Turn on quotas so that they are updated correctly */
1714 	for (i = 0; i < MAXQUOTAS; i++) {
1715 		if (EXT4_SB(sb)->s_qf_names[i]) {
1716 			int ret = ext4_quota_on_mount(sb, i);
1717 			if (ret < 0)
1718 				printk(KERN_ERR
1719 					"EXT4-fs: Cannot turn on journaled "
1720 					"quota: error %d\n", ret);
1721 		}
1722 	}
1723 #endif
1724 
1725 	while (es->s_last_orphan) {
1726 		struct inode *inode;
1727 
1728 		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
1729 		if (IS_ERR(inode)) {
1730 			es->s_last_orphan = 0;
1731 			break;
1732 		}
1733 
1734 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1735 		DQUOT_INIT(inode);
1736 		if (inode->i_nlink) {
1737 			printk(KERN_DEBUG
1738 				"%s: truncating inode %lu to %lld bytes\n",
1739 				__func__, inode->i_ino, inode->i_size);
1740 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
1741 				  inode->i_ino, inode->i_size);
1742 			ext4_truncate(inode);
1743 			nr_truncates++;
1744 		} else {
1745 			printk(KERN_DEBUG
1746 				"%s: deleting unreferenced inode %lu\n",
1747 				__func__, inode->i_ino);
1748 			jbd_debug(2, "deleting unreferenced inode %lu\n",
1749 				  inode->i_ino);
1750 			nr_orphans++;
1751 		}
1752 		iput(inode);  /* The delete magic happens here! */
1753 	}
1754 
1755 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
1756 
1757 	if (nr_orphans)
1758 		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
1759 		       sb->s_id, PLURAL(nr_orphans));
1760 	if (nr_truncates)
1761 		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
1762 		       sb->s_id, PLURAL(nr_truncates));
1763 #ifdef CONFIG_QUOTA
1764 	/* Turn quotas off */
1765 	for (i = 0; i < MAXQUOTAS; i++) {
1766 		if (sb_dqopt(sb)->files[i])
1767 			vfs_quota_off(sb, i, 0);
1768 	}
1769 #endif
1770 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1771 }
1772 /*
1773  * Maximal extent format file size.
1774  * Resulting logical blkno at s_maxbytes must fit in our on-disk
1775  * extent format containers, within a sector_t, and within i_blocks
1776  * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
1777  * so that won't be a limiting factor.
1778  *
1779  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
1780  */
1781 static loff_t ext4_max_size(int blkbits)
1782 {
1783 	loff_t res;
1784 	loff_t upper_limit = MAX_LFS_FILESIZE;
1785 
1786 	/* small i_blocks in vfs inode? */
1787 	if (sizeof(blkcnt_t) < sizeof(u64)) {
1788 		/*
1789 		 * CONFIG_LSF is not enabled implies the inode
1790 		 * i_block represent total blocks in 512 bytes
1791 		 * 32 == size of vfs inode i_blocks * 8
1792 		 */
1793 		upper_limit = (1LL << 32) - 1;
1794 
1795 		/* total blocks in file system block size */
1796 		upper_limit >>= (blkbits - 9);
1797 		upper_limit <<= blkbits;
1798 	}
1799 
1800 	/* 32-bit extent-start container, ee_block */
1801 	res = 1LL << 32;
1802 	res <<= blkbits;
1803 	res -= 1;
1804 
1805 	/* Sanity check against vm- & vfs- imposed limits */
1806 	if (res > upper_limit)
1807 		res = upper_limit;
1808 
1809 	return res;
1810 }
1811 
1812 /*
1813  * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
1814  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
1815  * We need to be 1 filesystem block less than the 2^48 sector limit.
1816  */
1817 static loff_t ext4_max_bitmap_size(int bits)
1818 {
1819 	loff_t res = EXT4_NDIR_BLOCKS;
1820 	int meta_blocks;
1821 	loff_t upper_limit;
1822 	/* This is calculated to be the largest file size for a
1823 	 * dense, bitmapped file such that the total number of
1824 	 * sectors in the file, including data and all indirect blocks,
1825 	 * does not exceed 2^48 -1
1826 	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
1827 	 * total number of  512 bytes blocks of the file
1828 	 */
1829 
1830 	if (sizeof(blkcnt_t) < sizeof(u64)) {
1831 		/*
1832 		 * CONFIG_LSF is not enabled implies the inode
1833 		 * i_block represent total blocks in 512 bytes
1834 		 * 32 == size of vfs inode i_blocks * 8
1835 		 */
1836 		upper_limit = (1LL << 32) - 1;
1837 
1838 		/* total blocks in file system block size */
1839 		upper_limit >>= (bits - 9);
1840 
1841 	} else {
1842 		/*
1843 		 * We use 48 bit ext4_inode i_blocks
1844 		 * With EXT4_HUGE_FILE_FL set the i_blocks
1845 		 * represent total number of blocks in
1846 		 * file system block size
1847 		 */
1848 		upper_limit = (1LL << 48) - 1;
1849 
1850 	}
1851 
1852 	/* indirect blocks */
1853 	meta_blocks = 1;
1854 	/* double indirect blocks */
1855 	meta_blocks += 1 + (1LL << (bits-2));
1856 	/* tripple indirect blocks */
1857 	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1858 
1859 	upper_limit -= meta_blocks;
1860 	upper_limit <<= bits;
1861 
1862 	res += 1LL << (bits-2);
1863 	res += 1LL << (2*(bits-2));
1864 	res += 1LL << (3*(bits-2));
1865 	res <<= bits;
1866 	if (res > upper_limit)
1867 		res = upper_limit;
1868 
1869 	if (res > MAX_LFS_FILESIZE)
1870 		res = MAX_LFS_FILESIZE;
1871 
1872 	return res;
1873 }
1874 
1875 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1876 				ext4_fsblk_t logical_sb_block, int nr)
1877 {
1878 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1879 	ext4_group_t bg, first_meta_bg;
1880 	int has_super = 0;
1881 
1882 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1883 
1884 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
1885 	    nr < first_meta_bg)
1886 		return logical_sb_block + nr + 1;
1887 	bg = sbi->s_desc_per_block * nr;
1888 	if (ext4_bg_has_super(sb, bg))
1889 		has_super = 1;
1890 	return (has_super + ext4_group_first_block_no(sb, bg));
1891 }
1892 
1893 /**
1894  * ext4_get_stripe_size: Get the stripe size.
1895  * @sbi: In memory super block info
1896  *
1897  * If we have specified it via mount option, then
1898  * use the mount option value. If the value specified at mount time is
1899  * greater than the blocks per group use the super block value.
1900  * If the super block value is greater than blocks per group return 0.
1901  * Allocator needs it be less than blocks per group.
1902  *
1903  */
1904 static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1905 {
1906 	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
1907 	unsigned long stripe_width =
1908 			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
1909 
1910 	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
1911 		return sbi->s_stripe;
1912 
1913 	if (stripe_width <= sbi->s_blocks_per_group)
1914 		return stripe_width;
1915 
1916 	if (stride <= sbi->s_blocks_per_group)
1917 		return stride;
1918 
1919 	return 0;
1920 }
1921 
1922 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1923 				__releases(kernel_lock)
1924 				__acquires(kernel_lock)
1925 
1926 {
1927 	struct buffer_head *bh;
1928 	struct ext4_super_block *es = NULL;
1929 	struct ext4_sb_info *sbi;
1930 	ext4_fsblk_t block;
1931 	ext4_fsblk_t sb_block = get_sb_block(&data);
1932 	ext4_fsblk_t logical_sb_block;
1933 	unsigned long offset = 0;
1934 	unsigned int journal_inum = 0;
1935 	unsigned long journal_devnum = 0;
1936 	unsigned long def_mount_opts;
1937 	struct inode *root;
1938 	char *cp;
1939 	int ret = -EINVAL;
1940 	int blocksize;
1941 	int db_count;
1942 	int i;
1943 	int needs_recovery;
1944 	__le32 features;
1945 	__u64 blocks_count;
1946 	int err;
1947 
1948 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1949 	if (!sbi)
1950 		return -ENOMEM;
1951 	sb->s_fs_info = sbi;
1952 	sbi->s_mount_opt = 0;
1953 	sbi->s_resuid = EXT4_DEF_RESUID;
1954 	sbi->s_resgid = EXT4_DEF_RESGID;
1955 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
1956 	sbi->s_sb_block = sb_block;
1957 
1958 	unlock_kernel();
1959 
1960 	/* Cleanup superblock name */
1961 	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
1962 		*cp = '!';
1963 
1964 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1965 	if (!blocksize) {
1966 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
1967 		goto out_fail;
1968 	}
1969 
1970 	/*
1971 	 * The ext4 superblock will not be buffer aligned for other than 1kB
1972 	 * block sizes.  We need to calculate the offset from buffer start.
1973 	 */
1974 	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
1975 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1976 		offset = do_div(logical_sb_block, blocksize);
1977 	} else {
1978 		logical_sb_block = sb_block;
1979 	}
1980 
1981 	if (!(bh = sb_bread(sb, logical_sb_block))) {
1982 		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
1983 		goto out_fail;
1984 	}
1985 	/*
1986 	 * Note: s_es must be initialized as soon as possible because
1987 	 *       some ext4 macro-instructions depend on its value
1988 	 */
1989 	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
1990 	sbi->s_es = es;
1991 	sb->s_magic = le16_to_cpu(es->s_magic);
1992 	if (sb->s_magic != EXT4_SUPER_MAGIC)
1993 		goto cantfind_ext4;
1994 
1995 	/* Set defaults before we parse the mount options */
1996 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1997 	if (def_mount_opts & EXT4_DEFM_DEBUG)
1998 		set_opt(sbi->s_mount_opt, DEBUG);
1999 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
2000 		set_opt(sbi->s_mount_opt, GRPID);
2001 	if (def_mount_opts & EXT4_DEFM_UID16)
2002 		set_opt(sbi->s_mount_opt, NO_UID32);
2003 #ifdef CONFIG_EXT4_FS_XATTR
2004 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
2005 		set_opt(sbi->s_mount_opt, XATTR_USER);
2006 #endif
2007 #ifdef CONFIG_EXT4_FS_POSIX_ACL
2008 	if (def_mount_opts & EXT4_DEFM_ACL)
2009 		set_opt(sbi->s_mount_opt, POSIX_ACL);
2010 #endif
2011 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2012 		sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
2013 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2014 		sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
2015 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2016 		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
2017 
2018 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2019 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
2020 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
2021 		set_opt(sbi->s_mount_opt, ERRORS_CONT);
2022 	else
2023 		set_opt(sbi->s_mount_opt, ERRORS_RO);
2024 
2025 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2026 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2027 
2028 	set_opt(sbi->s_mount_opt, RESERVATION);
2029 	set_opt(sbi->s_mount_opt, BARRIER);
2030 
2031 	/*
2032 	 * turn on extents feature by default in ext4 filesystem
2033 	 * only if feature flag already set by mkfs or tune2fs.
2034 	 * Use -o noextents to turn it off
2035 	 */
2036 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2037 		set_opt(sbi->s_mount_opt, EXTENTS);
2038 	else
2039 		ext4_warning(sb, __func__,
2040 			"extents feature not enabled on this filesystem, "
2041 			"use tune2fs.\n");
2042 
2043 	/*
2044 	 * enable delayed allocation by default
2045 	 * Use -o nodelalloc to turn it off
2046 	 */
2047 	set_opt(sbi->s_mount_opt, DELALLOC);
2048 
2049 
2050 	if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
2051 			   NULL, 0))
2052 		goto failed_mount;
2053 
2054 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2055 		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2056 
2057 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2058 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
2059 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
2060 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
2061 		printk(KERN_WARNING
2062 		       "EXT4-fs warning: feature flags set on rev 0 fs, "
2063 		       "running e2fsck is recommended\n");
2064 
2065 	/*
2066 	 * Check feature flags regardless of the revision level, since we
2067 	 * previously didn't change the revision level when setting the flags,
2068 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
2069 	 */
2070 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2071 	if (features) {
2072 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2073 		       "unsupported optional features (%x).\n",
2074 		       sb->s_id, le32_to_cpu(features));
2075 		goto failed_mount;
2076 	}
2077 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2078 	if (!(sb->s_flags & MS_RDONLY) && features) {
2079 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2080 		       "unsupported optional features (%x).\n",
2081 		       sb->s_id, le32_to_cpu(features));
2082 		goto failed_mount;
2083 	}
2084 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2085 		/*
2086 		 * Large file size enabled file system can only be
2087 		 * mount if kernel is build with CONFIG_LSF
2088 		 */
2089 		if (sizeof(root->i_blocks) < sizeof(u64) &&
2090 				!(sb->s_flags & MS_RDONLY)) {
2091 			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
2092 					"files cannot be mounted read-write "
2093 					"without CONFIG_LSF.\n", sb->s_id);
2094 			goto failed_mount;
2095 		}
2096 	}
2097 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
2098 
2099 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
2100 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
2101 		printk(KERN_ERR
2102 		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
2103 		       blocksize, sb->s_id);
2104 		goto failed_mount;
2105 	}
2106 
2107 	if (sb->s_blocksize != blocksize) {
2108 
2109 		/* Validate the filesystem blocksize */
2110 		if (!sb_set_blocksize(sb, blocksize)) {
2111 			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
2112 					blocksize);
2113 			goto failed_mount;
2114 		}
2115 
2116 		brelse(bh);
2117 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
2118 		offset = do_div(logical_sb_block, blocksize);
2119 		bh = sb_bread(sb, logical_sb_block);
2120 		if (!bh) {
2121 			printk(KERN_ERR
2122 			       "EXT4-fs: Can't read superblock on 2nd try.\n");
2123 			goto failed_mount;
2124 		}
2125 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
2126 		sbi->s_es = es;
2127 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
2128 			printk(KERN_ERR
2129 			       "EXT4-fs: Magic mismatch, very weird !\n");
2130 			goto failed_mount;
2131 		}
2132 	}
2133 
2134 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
2135 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
2136 
2137 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
2138 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
2139 		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
2140 	} else {
2141 		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
2142 		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
2143 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
2144 		    (!is_power_of_2(sbi->s_inode_size)) ||
2145 		    (sbi->s_inode_size > blocksize)) {
2146 			printk(KERN_ERR
2147 			       "EXT4-fs: unsupported inode size: %d\n",
2148 			       sbi->s_inode_size);
2149 			goto failed_mount;
2150 		}
2151 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
2152 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
2153 	}
2154 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
2155 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
2156 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
2157 		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
2158 		    !is_power_of_2(sbi->s_desc_size)) {
2159 			printk(KERN_ERR
2160 			       "EXT4-fs: unsupported descriptor size %lu\n",
2161 			       sbi->s_desc_size);
2162 			goto failed_mount;
2163 		}
2164 	} else
2165 		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
2166 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
2167 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
2168 	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
2169 		goto cantfind_ext4;
2170 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
2171 	if (sbi->s_inodes_per_block == 0)
2172 		goto cantfind_ext4;
2173 	sbi->s_itb_per_group = sbi->s_inodes_per_group /
2174 					sbi->s_inodes_per_block;
2175 	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
2176 	sbi->s_sbh = bh;
2177 	sbi->s_mount_state = le16_to_cpu(es->s_state);
2178 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
2179 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
2180 	for (i = 0; i < 4; i++)
2181 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2182 	sbi->s_def_hash_version = es->s_def_hash_version;
2183 
2184 	if (sbi->s_blocks_per_group > blocksize * 8) {
2185 		printk(KERN_ERR
2186 		       "EXT4-fs: #blocks per group too big: %lu\n",
2187 		       sbi->s_blocks_per_group);
2188 		goto failed_mount;
2189 	}
2190 	if (sbi->s_inodes_per_group > blocksize * 8) {
2191 		printk(KERN_ERR
2192 		       "EXT4-fs: #inodes per group too big: %lu\n",
2193 		       sbi->s_inodes_per_group);
2194 		goto failed_mount;
2195 	}
2196 
2197 	if (ext4_blocks_count(es) >
2198 		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
2199 		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
2200 			" too large to mount safely\n", sb->s_id);
2201 		if (sizeof(sector_t) < 8)
2202 			printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
2203 					"enabled\n");
2204 		goto failed_mount;
2205 	}
2206 
2207 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2208 		goto cantfind_ext4;
2209 
2210 	/* ensure blocks_count calculation below doesn't sign-extend */
2211 	if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2212 	    le32_to_cpu(es->s_first_data_block) + 1) {
2213 		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2214 		       "first data block %u, blocks per group %lu\n",
2215 			ext4_blocks_count(es),
2216 			le32_to_cpu(es->s_first_data_block),
2217 			EXT4_BLOCKS_PER_GROUP(sb));
2218 		goto failed_mount;
2219 	}
2220 	blocks_count = (ext4_blocks_count(es) -
2221 			le32_to_cpu(es->s_first_data_block) +
2222 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
2223 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2224 	sbi->s_groups_count = blocks_count;
2225 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2226 		   EXT4_DESC_PER_BLOCK(sb);
2227 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
2228 				    GFP_KERNEL);
2229 	if (sbi->s_group_desc == NULL) {
2230 		printk(KERN_ERR "EXT4-fs: not enough memory\n");
2231 		goto failed_mount;
2232 	}
2233 
2234 #ifdef CONFIG_PROC_FS
2235 	if (ext4_proc_root)
2236 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2237 
2238 	if (sbi->s_proc)
2239 		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2240 				 &ext4_ui_proc_fops,
2241 				 &sbi->s_inode_readahead_blks);
2242 #endif
2243 
2244 	bgl_lock_init(&sbi->s_blockgroup_lock);
2245 
2246 	for (i = 0; i < db_count; i++) {
2247 		block = descriptor_loc(sb, logical_sb_block, i);
2248 		sbi->s_group_desc[i] = sb_bread(sb, block);
2249 		if (!sbi->s_group_desc[i]) {
2250 			printk(KERN_ERR "EXT4-fs: "
2251 			       "can't read group descriptor %d\n", i);
2252 			db_count = i;
2253 			goto failed_mount2;
2254 		}
2255 	}
2256 	if (!ext4_check_descriptors(sb)) {
2257 		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
2258 		goto failed_mount2;
2259 	}
2260 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2261 		if (!ext4_fill_flex_info(sb)) {
2262 			printk(KERN_ERR
2263 			       "EXT4-fs: unable to initialize "
2264 			       "flex_bg meta info!\n");
2265 			goto failed_mount2;
2266 		}
2267 
2268 	sbi->s_gdb_count = db_count;
2269 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2270 	spin_lock_init(&sbi->s_next_gen_lock);
2271 
2272 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
2273 			ext4_count_free_blocks(sb));
2274 	if (!err) {
2275 		err = percpu_counter_init(&sbi->s_freeinodes_counter,
2276 				ext4_count_free_inodes(sb));
2277 	}
2278 	if (!err) {
2279 		err = percpu_counter_init(&sbi->s_dirs_counter,
2280 				ext4_count_dirs(sb));
2281 	}
2282 	if (!err) {
2283 		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2284 	}
2285 	if (err) {
2286 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
2287 		goto failed_mount3;
2288 	}
2289 
2290 	sbi->s_stripe = ext4_get_stripe_size(sbi);
2291 
2292 	/*
2293 	 * set up enough so that it can read an inode
2294 	 */
2295 	sb->s_op = &ext4_sops;
2296 	sb->s_export_op = &ext4_export_ops;
2297 	sb->s_xattr = ext4_xattr_handlers;
2298 #ifdef CONFIG_QUOTA
2299 	sb->s_qcop = &ext4_qctl_operations;
2300 	sb->dq_op = &ext4_quota_operations;
2301 #endif
2302 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2303 
2304 	sb->s_root = NULL;
2305 
2306 	needs_recovery = (es->s_last_orphan != 0 ||
2307 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
2308 				    EXT4_FEATURE_INCOMPAT_RECOVER));
2309 
2310 	/*
2311 	 * The first inode we look at is the journal inode.  Don't try
2312 	 * root first: it may be modified in the journal!
2313 	 */
2314 	if (!test_opt(sb, NOLOAD) &&
2315 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2316 		if (ext4_load_journal(sb, es, journal_devnum))
2317 			goto failed_mount3;
2318 		if (!(sb->s_flags & MS_RDONLY) &&
2319 		    EXT4_SB(sb)->s_journal->j_failed_commit) {
2320 			printk(KERN_CRIT "EXT4-fs error (device %s): "
2321 			       "ext4_fill_super: Journal transaction "
2322 			       "%u is corrupt\n", sb->s_id,
2323 			       EXT4_SB(sb)->s_journal->j_failed_commit);
2324 			if (test_opt(sb, ERRORS_RO)) {
2325 				printk(KERN_CRIT
2326 				       "Mounting filesystem read-only\n");
2327 				sb->s_flags |= MS_RDONLY;
2328 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2329 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2330 			}
2331 			if (test_opt(sb, ERRORS_PANIC)) {
2332 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2333 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2334 				ext4_commit_super(sb, es, 1);
2335 				printk(KERN_CRIT
2336 				       "EXT4-fs (device %s): mount failed\n",
2337 				      sb->s_id);
2338 				goto failed_mount4;
2339 			}
2340 		}
2341 	} else if (journal_inum) {
2342 		if (ext4_create_journal(sb, es, journal_inum))
2343 			goto failed_mount3;
2344 	} else {
2345 		if (!silent)
2346 			printk(KERN_ERR
2347 			       "ext4: No journal on filesystem on %s\n",
2348 			       sb->s_id);
2349 		goto failed_mount3;
2350 	}
2351 
2352 	if (ext4_blocks_count(es) > 0xffffffffULL &&
2353 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2354 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
2355 		printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
2356 		goto failed_mount4;
2357 	}
2358 
2359 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2360 		jbd2_journal_set_features(sbi->s_journal,
2361 				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2362 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2363 	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2364 		jbd2_journal_set_features(sbi->s_journal,
2365 				JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2366 		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2367 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2368 	} else {
2369 		jbd2_journal_clear_features(sbi->s_journal,
2370 				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2371 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2372 	}
2373 
2374 	/* We have now updated the journal if required, so we can
2375 	 * validate the data journaling mode. */
2376 	switch (test_opt(sb, DATA_FLAGS)) {
2377 	case 0:
2378 		/* No mode set, assume a default based on the journal
2379 		 * capabilities: ORDERED_DATA if the journal can
2380 		 * cope, else JOURNAL_DATA
2381 		 */
2382 		if (jbd2_journal_check_available_features
2383 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
2384 			set_opt(sbi->s_mount_opt, ORDERED_DATA);
2385 		else
2386 			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2387 		break;
2388 
2389 	case EXT4_MOUNT_ORDERED_DATA:
2390 	case EXT4_MOUNT_WRITEBACK_DATA:
2391 		if (!jbd2_journal_check_available_features
2392 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2393 			printk(KERN_ERR "EXT4-fs: Journal does not support "
2394 			       "requested data journaling mode\n");
2395 			goto failed_mount4;
2396 		}
2397 	default:
2398 		break;
2399 	}
2400 
2401 	if (test_opt(sb, NOBH)) {
2402 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2403 			printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
2404 				"its supported only with writeback mode\n");
2405 			clear_opt(sbi->s_mount_opt, NOBH);
2406 		}
2407 	}
2408 	/*
2409 	 * The jbd2_journal_load will have done any necessary log recovery,
2410 	 * so we can safely mount the rest of the filesystem now.
2411 	 */
2412 
2413 	root = ext4_iget(sb, EXT4_ROOT_INO);
2414 	if (IS_ERR(root)) {
2415 		printk(KERN_ERR "EXT4-fs: get root inode failed\n");
2416 		ret = PTR_ERR(root);
2417 		goto failed_mount4;
2418 	}
2419 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2420 		iput(root);
2421 		printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
2422 		goto failed_mount4;
2423 	}
2424 	sb->s_root = d_alloc_root(root);
2425 	if (!sb->s_root) {
2426 		printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
2427 		iput(root);
2428 		ret = -ENOMEM;
2429 		goto failed_mount4;
2430 	}
2431 
2432 	ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
2433 
2434 	/* determine the minimum size of new large inodes, if present */
2435 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
2436 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
2437 						     EXT4_GOOD_OLD_INODE_SIZE;
2438 		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
2439 				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
2440 			if (sbi->s_want_extra_isize <
2441 			    le16_to_cpu(es->s_want_extra_isize))
2442 				sbi->s_want_extra_isize =
2443 					le16_to_cpu(es->s_want_extra_isize);
2444 			if (sbi->s_want_extra_isize <
2445 			    le16_to_cpu(es->s_min_extra_isize))
2446 				sbi->s_want_extra_isize =
2447 					le16_to_cpu(es->s_min_extra_isize);
2448 		}
2449 	}
2450 	/* Check if enough inode space is available */
2451 	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
2452 							sbi->s_inode_size) {
2453 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
2454 						       EXT4_GOOD_OLD_INODE_SIZE;
2455 		printk(KERN_INFO "EXT4-fs: required extra inode space not"
2456 			"available.\n");
2457 	}
2458 
2459 	/*
2460 	 * akpm: core read_super() calls in here with the superblock locked.
2461 	 * That deadlocks, because orphan cleanup needs to lock the superblock
2462 	 * in numerous places.  Here we just pop the lock - it's relatively
2463 	 * harmless, because we are now ready to accept write_super() requests,
2464 	 * and aviro says that's the only reason for hanging onto the
2465 	 * superblock lock.
2466 	 */
2467 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2468 	ext4_orphan_cleanup(sb, es);
2469 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2470 	if (needs_recovery)
2471 		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2472 	ext4_mark_recovery_complete(sb, es);
2473 	printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
2474 	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
2475 	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
2476 	       "writeback");
2477 
2478 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2479 		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2480 				"requested data journaling mode\n");
2481 		clear_opt(sbi->s_mount_opt, DELALLOC);
2482 	} else if (test_opt(sb, DELALLOC))
2483 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2484 
2485 	ext4_ext_init(sb);
2486 	err = ext4_mb_init(sb, needs_recovery);
2487 	if (err) {
2488 		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
2489 		       err);
2490 		goto failed_mount4;
2491 	}
2492 
2493 	lock_kernel();
2494 	return 0;
2495 
2496 cantfind_ext4:
2497 	if (!silent)
2498 		printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
2499 		       sb->s_id);
2500 	goto failed_mount;
2501 
2502 failed_mount4:
2503 	jbd2_journal_destroy(sbi->s_journal);
2504 	sbi->s_journal = NULL;
2505 failed_mount3:
2506 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
2507 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
2508 	percpu_counter_destroy(&sbi->s_dirs_counter);
2509 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2510 failed_mount2:
2511 	for (i = 0; i < db_count; i++)
2512 		brelse(sbi->s_group_desc[i]);
2513 	kfree(sbi->s_group_desc);
2514 failed_mount:
2515 	if (sbi->s_proc) {
2516 		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2517 		remove_proc_entry(sb->s_id, ext4_proc_root);
2518 	}
2519 #ifdef CONFIG_QUOTA
2520 	for (i = 0; i < MAXQUOTAS; i++)
2521 		kfree(sbi->s_qf_names[i]);
2522 #endif
2523 	ext4_blkdev_remove(sbi);
2524 	brelse(bh);
2525 out_fail:
2526 	sb->s_fs_info = NULL;
2527 	kfree(sbi);
2528 	lock_kernel();
2529 	return ret;
2530 }
2531 
2532 /*
2533  * Setup any per-fs journal parameters now.  We'll do this both on
2534  * initial mount, once the journal has been initialised but before we've
2535  * done any recovery; and again on any subsequent remount.
2536  */
2537 static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2538 {
2539 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2540 
2541 	if (sbi->s_commit_interval)
2542 		journal->j_commit_interval = sbi->s_commit_interval;
2543 	/* We could also set up an ext4-specific default for the commit
2544 	 * interval here, but for now we'll just fall back to the jbd
2545 	 * default. */
2546 
2547 	spin_lock(&journal->j_state_lock);
2548 	if (test_opt(sb, BARRIER))
2549 		journal->j_flags |= JBD2_BARRIER;
2550 	else
2551 		journal->j_flags &= ~JBD2_BARRIER;
2552 	if (test_opt(sb, DATA_ERR_ABORT))
2553 		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
2554 	else
2555 		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
2556 	spin_unlock(&journal->j_state_lock);
2557 }
2558 
2559 static journal_t *ext4_get_journal(struct super_block *sb,
2560 				   unsigned int journal_inum)
2561 {
2562 	struct inode *journal_inode;
2563 	journal_t *journal;
2564 
2565 	/* First, test for the existence of a valid inode on disk.  Bad
2566 	 * things happen if we iget() an unused inode, as the subsequent
2567 	 * iput() will try to delete it. */
2568 
2569 	journal_inode = ext4_iget(sb, journal_inum);
2570 	if (IS_ERR(journal_inode)) {
2571 		printk(KERN_ERR "EXT4-fs: no journal found.\n");
2572 		return NULL;
2573 	}
2574 	if (!journal_inode->i_nlink) {
2575 		make_bad_inode(journal_inode);
2576 		iput(journal_inode);
2577 		printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
2578 		return NULL;
2579 	}
2580 
2581 	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2582 		  journal_inode, journal_inode->i_size);
2583 	if (!S_ISREG(journal_inode->i_mode)) {
2584 		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
2585 		iput(journal_inode);
2586 		return NULL;
2587 	}
2588 
2589 	journal = jbd2_journal_init_inode(journal_inode);
2590 	if (!journal) {
2591 		printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
2592 		iput(journal_inode);
2593 		return NULL;
2594 	}
2595 	journal->j_private = sb;
2596 	ext4_init_journal_params(sb, journal);
2597 	return journal;
2598 }
2599 
2600 static journal_t *ext4_get_dev_journal(struct super_block *sb,
2601 				       dev_t j_dev)
2602 {
2603 	struct buffer_head *bh;
2604 	journal_t *journal;
2605 	ext4_fsblk_t start;
2606 	ext4_fsblk_t len;
2607 	int hblock, blocksize;
2608 	ext4_fsblk_t sb_block;
2609 	unsigned long offset;
2610 	struct ext4_super_block *es;
2611 	struct block_device *bdev;
2612 
2613 	bdev = ext4_blkdev_get(j_dev);
2614 	if (bdev == NULL)
2615 		return NULL;
2616 
2617 	if (bd_claim(bdev, sb)) {
2618 		printk(KERN_ERR
2619 			"EXT4: failed to claim external journal device.\n");
2620 		blkdev_put(bdev);
2621 		return NULL;
2622 	}
2623 
2624 	blocksize = sb->s_blocksize;
2625 	hblock = bdev_hardsect_size(bdev);
2626 	if (blocksize < hblock) {
2627 		printk(KERN_ERR
2628 			"EXT4-fs: blocksize too small for journal device.\n");
2629 		goto out_bdev;
2630 	}
2631 
2632 	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
2633 	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
2634 	set_blocksize(bdev, blocksize);
2635 	if (!(bh = __bread(bdev, sb_block, blocksize))) {
2636 		printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
2637 		       "external journal\n");
2638 		goto out_bdev;
2639 	}
2640 
2641 	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
2642 	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2643 	    !(le32_to_cpu(es->s_feature_incompat) &
2644 	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2645 		printk(KERN_ERR "EXT4-fs: external journal has "
2646 					"bad superblock\n");
2647 		brelse(bh);
2648 		goto out_bdev;
2649 	}
2650 
2651 	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2652 		printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
2653 		brelse(bh);
2654 		goto out_bdev;
2655 	}
2656 
2657 	len = ext4_blocks_count(es);
2658 	start = sb_block + 1;
2659 	brelse(bh);	/* we're done with the superblock */
2660 
2661 	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
2662 					start, len, blocksize);
2663 	if (!journal) {
2664 		printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
2665 		goto out_bdev;
2666 	}
2667 	journal->j_private = sb;
2668 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
2669 	wait_on_buffer(journal->j_sb_buffer);
2670 	if (!buffer_uptodate(journal->j_sb_buffer)) {
2671 		printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
2672 		goto out_journal;
2673 	}
2674 	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2675 		printk(KERN_ERR "EXT4-fs: External journal has more than one "
2676 					"user (unsupported) - %d\n",
2677 			be32_to_cpu(journal->j_superblock->s_nr_users));
2678 		goto out_journal;
2679 	}
2680 	EXT4_SB(sb)->journal_bdev = bdev;
2681 	ext4_init_journal_params(sb, journal);
2682 	return journal;
2683 out_journal:
2684 	jbd2_journal_destroy(journal);
2685 out_bdev:
2686 	ext4_blkdev_put(bdev);
2687 	return NULL;
2688 }
2689 
2690 static int ext4_load_journal(struct super_block *sb,
2691 			     struct ext4_super_block *es,
2692 			     unsigned long journal_devnum)
2693 {
2694 	journal_t *journal;
2695 	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2696 	dev_t journal_dev;
2697 	int err = 0;
2698 	int really_read_only;
2699 
2700 	if (journal_devnum &&
2701 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2702 		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
2703 			"numbers have changed\n");
2704 		journal_dev = new_decode_dev(journal_devnum);
2705 	} else
2706 		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2707 
2708 	really_read_only = bdev_read_only(sb->s_bdev);
2709 
2710 	/*
2711 	 * Are we loading a blank journal or performing recovery after a
2712 	 * crash?  For recovery, we need to check in advance whether we
2713 	 * can get read-write access to the device.
2714 	 */
2715 
2716 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2717 		if (sb->s_flags & MS_RDONLY) {
2718 			printk(KERN_INFO "EXT4-fs: INFO: recovery "
2719 					"required on readonly filesystem.\n");
2720 			if (really_read_only) {
2721 				printk(KERN_ERR "EXT4-fs: write access "
2722 					"unavailable, cannot proceed.\n");
2723 				return -EROFS;
2724 			}
2725 			printk(KERN_INFO "EXT4-fs: write access will "
2726 			       "be enabled during recovery.\n");
2727 		}
2728 	}
2729 
2730 	if (journal_inum && journal_dev) {
2731 		printk(KERN_ERR "EXT4-fs: filesystem has both journal "
2732 		       "and inode journals!\n");
2733 		return -EINVAL;
2734 	}
2735 
2736 	if (journal_inum) {
2737 		if (!(journal = ext4_get_journal(sb, journal_inum)))
2738 			return -EINVAL;
2739 	} else {
2740 		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
2741 			return -EINVAL;
2742 	}
2743 
2744 	if (journal->j_flags & JBD2_BARRIER)
2745 		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2746 	else
2747 		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2748 
2749 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2750 		err = jbd2_journal_update_format(journal);
2751 		if (err)  {
2752 			printk(KERN_ERR "EXT4-fs: error updating journal.\n");
2753 			jbd2_journal_destroy(journal);
2754 			return err;
2755 		}
2756 	}
2757 
2758 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
2759 		err = jbd2_journal_wipe(journal, !really_read_only);
2760 	if (!err)
2761 		err = jbd2_journal_load(journal);
2762 
2763 	if (err) {
2764 		printk(KERN_ERR "EXT4-fs: error loading journal.\n");
2765 		jbd2_journal_destroy(journal);
2766 		return err;
2767 	}
2768 
2769 	EXT4_SB(sb)->s_journal = journal;
2770 	ext4_clear_journal_err(sb, es);
2771 
2772 	if (journal_devnum &&
2773 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2774 		es->s_journal_dev = cpu_to_le32(journal_devnum);
2775 		sb->s_dirt = 1;
2776 
2777 		/* Make sure we flush the recovery flag to disk. */
2778 		ext4_commit_super(sb, es, 1);
2779 	}
2780 
2781 	return 0;
2782 }
2783 
2784 static int ext4_create_journal(struct super_block *sb,
2785 			       struct ext4_super_block *es,
2786 			       unsigned int journal_inum)
2787 {
2788 	journal_t *journal;
2789 	int err;
2790 
2791 	if (sb->s_flags & MS_RDONLY) {
2792 		printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2793 				"create journal.\n");
2794 		return -EROFS;
2795 	}
2796 
2797 	journal = ext4_get_journal(sb, journal_inum);
2798 	if (!journal)
2799 		return -EINVAL;
2800 
2801 	printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2802 	       journal_inum);
2803 
2804 	err = jbd2_journal_create(journal);
2805 	if (err) {
2806 		printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2807 		jbd2_journal_destroy(journal);
2808 		return -EIO;
2809 	}
2810 
2811 	EXT4_SB(sb)->s_journal = journal;
2812 
2813 	ext4_update_dynamic_rev(sb);
2814 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2815 	EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2816 
2817 	es->s_journal_inum = cpu_to_le32(journal_inum);
2818 	sb->s_dirt = 1;
2819 
2820 	/* Make sure we flush the recovery flag to disk. */
2821 	ext4_commit_super(sb, es, 1);
2822 
2823 	return 0;
2824 }
2825 
2826 static void ext4_commit_super(struct super_block *sb,
2827 			      struct ext4_super_block *es, int sync)
2828 {
2829 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2830 
2831 	if (!sbh)
2832 		return;
2833 	if (buffer_write_io_error(sbh)) {
2834 		/*
2835 		 * Oh, dear.  A previous attempt to write the
2836 		 * superblock failed.  This could happen because the
2837 		 * USB device was yanked out.  Or it could happen to
2838 		 * be a transient write error and maybe the block will
2839 		 * be remapped.  Nothing we can do but to retry the
2840 		 * write and hope for the best.
2841 		 */
2842 		printk(KERN_ERR "ext4: previous I/O error to "
2843 		       "superblock detected for %s.\n", sb->s_id);
2844 		clear_buffer_write_io_error(sbh);
2845 		set_buffer_uptodate(sbh);
2846 	}
2847 	es->s_wtime = cpu_to_le32(get_seconds());
2848 	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2849 	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2850 	BUFFER_TRACE(sbh, "marking dirty");
2851 	mark_buffer_dirty(sbh);
2852 	if (sync) {
2853 		sync_dirty_buffer(sbh);
2854 		if (buffer_write_io_error(sbh)) {
2855 			printk(KERN_ERR "ext4: I/O error while writing "
2856 			       "superblock for %s.\n", sb->s_id);
2857 			clear_buffer_write_io_error(sbh);
2858 			set_buffer_uptodate(sbh);
2859 		}
2860 	}
2861 }
2862 
2863 
2864 /*
2865  * Have we just finished recovery?  If so, and if we are mounting (or
2866  * remounting) the filesystem readonly, then we will end up with a
2867  * consistent fs on disk.  Record that fact.
2868  */
2869 static void ext4_mark_recovery_complete(struct super_block *sb,
2870 					struct ext4_super_block *es)
2871 {
2872 	journal_t *journal = EXT4_SB(sb)->s_journal;
2873 
2874 	jbd2_journal_lock_updates(journal);
2875 	if (jbd2_journal_flush(journal) < 0)
2876 		goto out;
2877 
2878 	lock_super(sb);
2879 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2880 	    sb->s_flags & MS_RDONLY) {
2881 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2882 		sb->s_dirt = 0;
2883 		ext4_commit_super(sb, es, 1);
2884 	}
2885 	unlock_super(sb);
2886 
2887 out:
2888 	jbd2_journal_unlock_updates(journal);
2889 }
2890 
2891 /*
2892  * If we are mounting (or read-write remounting) a filesystem whose journal
2893  * has recorded an error from a previous lifetime, move that error to the
2894  * main filesystem now.
2895  */
2896 static void ext4_clear_journal_err(struct super_block *sb,
2897 				   struct ext4_super_block *es)
2898 {
2899 	journal_t *journal;
2900 	int j_errno;
2901 	const char *errstr;
2902 
2903 	journal = EXT4_SB(sb)->s_journal;
2904 
2905 	/*
2906 	 * Now check for any error status which may have been recorded in the
2907 	 * journal by a prior ext4_error() or ext4_abort()
2908 	 */
2909 
2910 	j_errno = jbd2_journal_errno(journal);
2911 	if (j_errno) {
2912 		char nbuf[16];
2913 
2914 		errstr = ext4_decode_error(sb, j_errno, nbuf);
2915 		ext4_warning(sb, __func__, "Filesystem error recorded "
2916 			     "from previous mount: %s", errstr);
2917 		ext4_warning(sb, __func__, "Marking fs in need of "
2918 			     "filesystem check.");
2919 
2920 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2921 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2922 		ext4_commit_super(sb, es, 1);
2923 
2924 		jbd2_journal_clear_err(journal);
2925 	}
2926 }
2927 
2928 /*
2929  * Force the running and committing transactions to commit,
2930  * and wait on the commit.
2931  */
2932 int ext4_force_commit(struct super_block *sb)
2933 {
2934 	journal_t *journal;
2935 	int ret;
2936 
2937 	if (sb->s_flags & MS_RDONLY)
2938 		return 0;
2939 
2940 	journal = EXT4_SB(sb)->s_journal;
2941 	sb->s_dirt = 0;
2942 	ret = ext4_journal_force_commit(journal);
2943 	return ret;
2944 }
2945 
2946 /*
2947  * Ext4 always journals updates to the superblock itself, so we don't
2948  * have to propagate any other updates to the superblock on disk at this
2949  * point.  Just start an async writeback to get the buffers on their way
2950  * to the disk.
2951  *
2952  * This implicitly triggers the writebehind on sync().
2953  */
2954 
2955 static void ext4_write_super(struct super_block *sb)
2956 {
2957 	if (mutex_trylock(&sb->s_lock) != 0)
2958 		BUG();
2959 	sb->s_dirt = 0;
2960 }
2961 
2962 static int ext4_sync_fs(struct super_block *sb, int wait)
2963 {
2964 	tid_t target;
2965 
2966 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2967 	sb->s_dirt = 0;
2968 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2969 		if (wait)
2970 			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2971 	}
2972 	return 0;
2973 }
2974 
2975 /*
2976  * LVM calls this function before a (read-only) snapshot is created.  This
2977  * gives us a chance to flush the journal completely and mark the fs clean.
2978  */
2979 static void ext4_write_super_lockfs(struct super_block *sb)
2980 {
2981 	sb->s_dirt = 0;
2982 
2983 	if (!(sb->s_flags & MS_RDONLY)) {
2984 		journal_t *journal = EXT4_SB(sb)->s_journal;
2985 
2986 		/* Now we set up the journal barrier. */
2987 		jbd2_journal_lock_updates(journal);
2988 
2989 		/*
2990 		 * We don't want to clear needs_recovery flag when we failed
2991 		 * to flush the journal.
2992 		 */
2993 		if (jbd2_journal_flush(journal) < 0)
2994 			return;
2995 
2996 		/* Journal blocked and flushed, clear needs_recovery flag. */
2997 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2998 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2999 	}
3000 }
3001 
3002 /*
3003  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
3004  * flag here, even though the filesystem is not technically dirty yet.
3005  */
3006 static void ext4_unlockfs(struct super_block *sb)
3007 {
3008 	if (!(sb->s_flags & MS_RDONLY)) {
3009 		lock_super(sb);
3010 		/* Reser the needs_recovery flag before the fs is unlocked. */
3011 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3012 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3013 		unlock_super(sb);
3014 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3015 	}
3016 }
3017 
3018 static int ext4_remount(struct super_block *sb, int *flags, char *data)
3019 {
3020 	struct ext4_super_block *es;
3021 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3022 	ext4_fsblk_t n_blocks_count = 0;
3023 	unsigned long old_sb_flags;
3024 	struct ext4_mount_options old_opts;
3025 	ext4_group_t g;
3026 	int err;
3027 #ifdef CONFIG_QUOTA
3028 	int i;
3029 #endif
3030 
3031 	/* Store the original options */
3032 	old_sb_flags = sb->s_flags;
3033 	old_opts.s_mount_opt = sbi->s_mount_opt;
3034 	old_opts.s_resuid = sbi->s_resuid;
3035 	old_opts.s_resgid = sbi->s_resgid;
3036 	old_opts.s_commit_interval = sbi->s_commit_interval;
3037 #ifdef CONFIG_QUOTA
3038 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
3039 	for (i = 0; i < MAXQUOTAS; i++)
3040 		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
3041 #endif
3042 
3043 	/*
3044 	 * Allow the "check" option to be passed as a remount option.
3045 	 */
3046 	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
3047 		err = -EINVAL;
3048 		goto restore_opts;
3049 	}
3050 
3051 	if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
3052 		ext4_abort(sb, __func__, "Abort forced by user");
3053 
3054 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3055 		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
3056 
3057 	es = sbi->s_es;
3058 
3059 	ext4_init_journal_params(sb, sbi->s_journal);
3060 
3061 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
3062 		n_blocks_count > ext4_blocks_count(es)) {
3063 		if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
3064 			err = -EROFS;
3065 			goto restore_opts;
3066 		}
3067 
3068 		if (*flags & MS_RDONLY) {
3069 			/*
3070 			 * First of all, the unconditional stuff we have to do
3071 			 * to disable replay of the journal when we next remount
3072 			 */
3073 			sb->s_flags |= MS_RDONLY;
3074 
3075 			/*
3076 			 * OK, test if we are remounting a valid rw partition
3077 			 * readonly, and if so set the rdonly flag and then
3078 			 * mark the partition as valid again.
3079 			 */
3080 			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
3081 			    (sbi->s_mount_state & EXT4_VALID_FS))
3082 				es->s_state = cpu_to_le16(sbi->s_mount_state);
3083 
3084 			/*
3085 			 * We have to unlock super so that we can wait for
3086 			 * transactions.
3087 			 */
3088 			unlock_super(sb);
3089 			ext4_mark_recovery_complete(sb, es);
3090 			lock_super(sb);
3091 		} else {
3092 			__le32 ret;
3093 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3094 					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3095 				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3096 				       "remount RDWR because of unsupported "
3097 				       "optional features (%x).\n",
3098 				       sb->s_id, le32_to_cpu(ret));
3099 				err = -EROFS;
3100 				goto restore_opts;
3101 			}
3102 
3103 			/*
3104 			 * Make sure the group descriptor checksums
3105 			 * are sane.  If they aren't, refuse to
3106 			 * remount r/w.
3107 			 */
3108 			for (g = 0; g < sbi->s_groups_count; g++) {
3109 				struct ext4_group_desc *gdp =
3110 					ext4_get_group_desc(sb, g, NULL);
3111 
3112 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3113 					printk(KERN_ERR
3114 	       "EXT4-fs: ext4_remount: "
3115 		"Checksum for group %lu failed (%u!=%u)\n",
3116 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3117 					       le16_to_cpu(gdp->bg_checksum));
3118 					err = -EINVAL;
3119 					goto restore_opts;
3120 				}
3121 			}
3122 
3123 			/*
3124 			 * If we have an unprocessed orphan list hanging
3125 			 * around from a previously readonly bdev mount,
3126 			 * require a full umount/remount for now.
3127 			 */
3128 			if (es->s_last_orphan) {
3129 				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3130 				       "remount RDWR because of unprocessed "
3131 				       "orphan inode list.  Please "
3132 				       "umount/remount instead.\n",
3133 				       sb->s_id);
3134 				err = -EINVAL;
3135 				goto restore_opts;
3136 			}
3137 
3138 			/*
3139 			 * Mounting a RDONLY partition read-write, so reread
3140 			 * and store the current valid flag.  (It may have
3141 			 * been changed by e2fsck since we originally mounted
3142 			 * the partition.)
3143 			 */
3144 			ext4_clear_journal_err(sb, es);
3145 			sbi->s_mount_state = le16_to_cpu(es->s_state);
3146 			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3147 				goto restore_opts;
3148 			if (!ext4_setup_super(sb, es, 0))
3149 				sb->s_flags &= ~MS_RDONLY;
3150 		}
3151 	}
3152 #ifdef CONFIG_QUOTA
3153 	/* Release old quota file names */
3154 	for (i = 0; i < MAXQUOTAS; i++)
3155 		if (old_opts.s_qf_names[i] &&
3156 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
3157 			kfree(old_opts.s_qf_names[i]);
3158 #endif
3159 	return 0;
3160 restore_opts:
3161 	sb->s_flags = old_sb_flags;
3162 	sbi->s_mount_opt = old_opts.s_mount_opt;
3163 	sbi->s_resuid = old_opts.s_resuid;
3164 	sbi->s_resgid = old_opts.s_resgid;
3165 	sbi->s_commit_interval = old_opts.s_commit_interval;
3166 #ifdef CONFIG_QUOTA
3167 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3168 	for (i = 0; i < MAXQUOTAS; i++) {
3169 		if (sbi->s_qf_names[i] &&
3170 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
3171 			kfree(sbi->s_qf_names[i]);
3172 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
3173 	}
3174 #endif
3175 	return err;
3176 }
3177 
3178 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3179 {
3180 	struct super_block *sb = dentry->d_sb;
3181 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3182 	struct ext4_super_block *es = sbi->s_es;
3183 	u64 fsid;
3184 
3185 	if (test_opt(sb, MINIX_DF)) {
3186 		sbi->s_overhead_last = 0;
3187 	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
3188 		ext4_group_t ngroups = sbi->s_groups_count, i;
3189 		ext4_fsblk_t overhead = 0;
3190 		smp_rmb();
3191 
3192 		/*
3193 		 * Compute the overhead (FS structures).  This is constant
3194 		 * for a given filesystem unless the number of block groups
3195 		 * changes so we cache the previous value until it does.
3196 		 */
3197 
3198 		/*
3199 		 * All of the blocks before first_data_block are
3200 		 * overhead
3201 		 */
3202 		overhead = le32_to_cpu(es->s_first_data_block);
3203 
3204 		/*
3205 		 * Add the overhead attributed to the superblock and
3206 		 * block group descriptors.  If the sparse superblocks
3207 		 * feature is turned on, then not all groups have this.
3208 		 */
3209 		for (i = 0; i < ngroups; i++) {
3210 			overhead += ext4_bg_has_super(sb, i) +
3211 				ext4_bg_num_gdb(sb, i);
3212 			cond_resched();
3213 		}
3214 
3215 		/*
3216 		 * Every block group has an inode bitmap, a block
3217 		 * bitmap, and an inode table.
3218 		 */
3219 		overhead += ngroups * (2 + sbi->s_itb_per_group);
3220 		sbi->s_overhead_last = overhead;
3221 		smp_wmb();
3222 		sbi->s_blocks_last = ext4_blocks_count(es);
3223 	}
3224 
3225 	buf->f_type = EXT4_SUPER_MAGIC;
3226 	buf->f_bsize = sb->s_blocksize;
3227 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3228 	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3229 		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3230 	ext4_free_blocks_count_set(es, buf->f_bfree);
3231 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3232 	if (buf->f_bfree < ext4_r_blocks_count(es))
3233 		buf->f_bavail = 0;
3234 	buf->f_files = le32_to_cpu(es->s_inodes_count);
3235 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
3236 	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
3237 	buf->f_namelen = EXT4_NAME_LEN;
3238 	fsid = le64_to_cpup((void *)es->s_uuid) ^
3239 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
3240 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
3241 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
3242 	return 0;
3243 }
3244 
/*
 * Helper functions for writing quotas on sync - we need to start a
 * transaction before the quota file is locked for write.  Otherwise
 * there are possible deadlocks:
 * Process 1                         Process 2
 * ext4_create()                     quota_sync()
 *   jbd2_journal_start()                   write_dquot()
 *   DQUOT_INIT()                        down(dqio_mutex)
 *     down(dqio_mutex)                    jbd2_journal_start()
 *
 */
3254 
3255 #ifdef CONFIG_QUOTA
3256 
3257 static inline struct inode *dquot_to_inode(struct dquot *dquot)
3258 {
3259 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
3260 }
3261 
3262 static int ext4_dquot_initialize(struct inode *inode, int type)
3263 {
3264 	handle_t *handle;
3265 	int ret, err;
3266 
3267 	/* We may create quota structure so we need to reserve enough blocks */
3268 	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
3269 	if (IS_ERR(handle))
3270 		return PTR_ERR(handle);
3271 	ret = dquot_initialize(inode, type);
3272 	err = ext4_journal_stop(handle);
3273 	if (!ret)
3274 		ret = err;
3275 	return ret;
3276 }
3277 
3278 static int ext4_dquot_drop(struct inode *inode)
3279 {
3280 	handle_t *handle;
3281 	int ret, err;
3282 
3283 	/* We may delete quota structure so we need to reserve enough blocks */
3284 	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
3285 	if (IS_ERR(handle)) {
3286 		/*
3287 		 * We call dquot_drop() anyway to at least release references
3288 		 * to quota structures so that umount does not hang.
3289 		 */
3290 		dquot_drop(inode);
3291 		return PTR_ERR(handle);
3292 	}
3293 	ret = dquot_drop(inode);
3294 	err = ext4_journal_stop(handle);
3295 	if (!ret)
3296 		ret = err;
3297 	return ret;
3298 }
3299 
3300 static int ext4_write_dquot(struct dquot *dquot)
3301 {
3302 	int ret, err;
3303 	handle_t *handle;
3304 	struct inode *inode;
3305 
3306 	inode = dquot_to_inode(dquot);
3307 	handle = ext4_journal_start(inode,
3308 					EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
3309 	if (IS_ERR(handle))
3310 		return PTR_ERR(handle);
3311 	ret = dquot_commit(dquot);
3312 	err = ext4_journal_stop(handle);
3313 	if (!ret)
3314 		ret = err;
3315 	return ret;
3316 }
3317 
3318 static int ext4_acquire_dquot(struct dquot *dquot)
3319 {
3320 	int ret, err;
3321 	handle_t *handle;
3322 
3323 	handle = ext4_journal_start(dquot_to_inode(dquot),
3324 					EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
3325 	if (IS_ERR(handle))
3326 		return PTR_ERR(handle);
3327 	ret = dquot_acquire(dquot);
3328 	err = ext4_journal_stop(handle);
3329 	if (!ret)
3330 		ret = err;
3331 	return ret;
3332 }
3333 
3334 static int ext4_release_dquot(struct dquot *dquot)
3335 {
3336 	int ret, err;
3337 	handle_t *handle;
3338 
3339 	handle = ext4_journal_start(dquot_to_inode(dquot),
3340 					EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
3341 	if (IS_ERR(handle)) {
3342 		/* Release dquot anyway to avoid endless cycle in dqput() */
3343 		dquot_release(dquot);
3344 		return PTR_ERR(handle);
3345 	}
3346 	ret = dquot_release(dquot);
3347 	err = ext4_journal_stop(handle);
3348 	if (!ret)
3349 		ret = err;
3350 	return ret;
3351 }
3352 
3353 static int ext4_mark_dquot_dirty(struct dquot *dquot)
3354 {
3355 	/* Are we journaling quotas? */
3356 	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
3357 	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
3358 		dquot_mark_dquot_dirty(dquot);
3359 		return ext4_write_dquot(dquot);
3360 	} else {
3361 		return dquot_mark_dquot_dirty(dquot);
3362 	}
3363 }
3364 
3365 static int ext4_write_info(struct super_block *sb, int type)
3366 {
3367 	int ret, err;
3368 	handle_t *handle;
3369 
3370 	/* Data block + inode block */
3371 	handle = ext4_journal_start(sb->s_root->d_inode, 2);
3372 	if (IS_ERR(handle))
3373 		return PTR_ERR(handle);
3374 	ret = dquot_commit_info(sb, type);
3375 	err = ext4_journal_stop(handle);
3376 	if (!ret)
3377 		ret = err;
3378 	return ret;
3379 }
3380 
3381 /*
3382  * Turn on quotas during mount time - we need to find
3383  * the quota file and such...
3384  */
3385 static int ext4_quota_on_mount(struct super_block *sb, int type)
3386 {
3387 	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3388 			EXT4_SB(sb)->s_jquota_fmt, type);
3389 }
3390 
3391 /*
3392  * Standard function to be called on quota_on
3393  */
3394 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3395 			 char *path, int remount)
3396 {
3397 	int err;
3398 	struct nameidata nd;
3399 
3400 	if (!test_opt(sb, QUOTA))
3401 		return -EINVAL;
3402 	/* When remounting, no checks are needed and in fact, path is NULL */
3403 	if (remount)
3404 		return vfs_quota_on(sb, type, format_id, path, remount);
3405 
3406 	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
3407 	if (err)
3408 		return err;
3409 
3410 	/* Quotafile not on the same filesystem? */
3411 	if (nd.path.mnt->mnt_sb != sb) {
3412 		path_put(&nd.path);
3413 		return -EXDEV;
3414 	}
3415 	/* Journaling quota? */
3416 	if (EXT4_SB(sb)->s_qf_names[type]) {
3417 		/* Quotafile not in fs root? */
3418 		if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
3419 			printk(KERN_WARNING
3420 				"EXT4-fs: Quota file not on filesystem root. "
3421 				"Journaled quota will not work.\n");
3422 	}
3423 
3424 	/*
3425 	 * When we journal data on quota file, we have to flush journal to see
3426 	 * all updates to the file when we bypass pagecache...
3427 	 */
3428 	if (ext4_should_journal_data(nd.path.dentry->d_inode)) {
3429 		/*
3430 		 * We don't need to lock updates but journal_flush() could
3431 		 * otherwise be livelocked...
3432 		 */
3433 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
3434 		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
3435 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3436 		if (err) {
3437 			path_put(&nd.path);
3438 			return err;
3439 		}
3440 	}
3441 
3442 	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
3443 	path_put(&nd.path);
3444 	return err;
3445 }
3446 
3447 /* Read data from quotafile - avoid pagecache and such because we cannot afford
3448  * acquiring the locks... As quota files are never truncated and quota code
3449  * itself serializes the operations (and noone else should touch the files)
3450  * we don't have to be afraid of races */
3451 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
3452 			       size_t len, loff_t off)
3453 {
3454 	struct inode *inode = sb_dqopt(sb)->files[type];
3455 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3456 	int err = 0;
3457 	int offset = off & (sb->s_blocksize - 1);
3458 	int tocopy;
3459 	size_t toread;
3460 	struct buffer_head *bh;
3461 	loff_t i_size = i_size_read(inode);
3462 
3463 	if (off > i_size)
3464 		return 0;
3465 	if (off+len > i_size)
3466 		len = i_size-off;
3467 	toread = len;
3468 	while (toread > 0) {
3469 		tocopy = sb->s_blocksize - offset < toread ?
3470 				sb->s_blocksize - offset : toread;
3471 		bh = ext4_bread(NULL, inode, blk, 0, &err);
3472 		if (err)
3473 			return err;
3474 		if (!bh)	/* A hole? */
3475 			memset(data, 0, tocopy);
3476 		else
3477 			memcpy(data, bh->b_data+offset, tocopy);
3478 		brelse(bh);
3479 		offset = 0;
3480 		toread -= tocopy;
3481 		data += tocopy;
3482 		blk++;
3483 	}
3484 	return len;
3485 }
3486 
3487 /* Write to quotafile (we know the transaction is already started and has
3488  * enough credits) */
3489 static ssize_t ext4_quota_write(struct super_block *sb, int type,
3490 				const char *data, size_t len, loff_t off)
3491 {
3492 	struct inode *inode = sb_dqopt(sb)->files[type];
3493 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3494 	int err = 0;
3495 	int offset = off & (sb->s_blocksize - 1);
3496 	int tocopy;
3497 	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3498 	size_t towrite = len;
3499 	struct buffer_head *bh;
3500 	handle_t *handle = journal_current_handle();
3501 
3502 	if (!handle) {
3503 		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3504 			" cancelled because transaction is not started.\n",
3505 			(unsigned long long)off, (unsigned long long)len);
3506 		return -EIO;
3507 	}
3508 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3509 	while (towrite > 0) {
3510 		tocopy = sb->s_blocksize - offset < towrite ?
3511 				sb->s_blocksize - offset : towrite;
3512 		bh = ext4_bread(handle, inode, blk, 1, &err);
3513 		if (!bh)
3514 			goto out;
3515 		if (journal_quota) {
3516 			err = ext4_journal_get_write_access(handle, bh);
3517 			if (err) {
3518 				brelse(bh);
3519 				goto out;
3520 			}
3521 		}
3522 		lock_buffer(bh);
3523 		memcpy(bh->b_data+offset, data, tocopy);
3524 		flush_dcache_page(bh->b_page);
3525 		unlock_buffer(bh);
3526 		if (journal_quota)
3527 			err = ext4_journal_dirty_metadata(handle, bh);
3528 		else {
3529 			/* Always do at least ordered writes for quotas */
3530 			err = ext4_jbd2_file_inode(handle, inode);
3531 			mark_buffer_dirty(bh);
3532 		}
3533 		brelse(bh);
3534 		if (err)
3535 			goto out;
3536 		offset = 0;
3537 		towrite -= tocopy;
3538 		data += tocopy;
3539 		blk++;
3540 	}
3541 out:
3542 	if (len == towrite) {
3543 		mutex_unlock(&inode->i_mutex);
3544 		return err;
3545 	}
3546 	if (inode->i_size < off+len-towrite) {
3547 		i_size_write(inode, off+len-towrite);
3548 		EXT4_I(inode)->i_disksize = inode->i_size;
3549 	}
3550 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3551 	ext4_mark_inode_dirty(handle, inode);
3552 	mutex_unlock(&inode->i_mutex);
3553 	return len - towrite;
3554 }
3555 
3556 #endif
3557 
3558 static int ext4_get_sb(struct file_system_type *fs_type,
3559 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3560 {
3561 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3562 }
3563 
3564 #ifdef CONFIG_PROC_FS
3565 static int ext4_ui_proc_show(struct seq_file *m, void *v)
3566 {
3567 	unsigned int *p = m->private;
3568 
3569 	seq_printf(m, "%u\n", *p);
3570 	return 0;
3571 }
3572 
3573 static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3574 {
3575 	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3576 }
3577 
3578 static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3579 			       size_t cnt, loff_t *ppos)
3580 {
3581 	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3582 	char str[32];
3583 	unsigned long value;
3584 
3585 	if (cnt >= sizeof(str))
3586 		return -EINVAL;
3587 	if (copy_from_user(str, buf, cnt))
3588 		return -EFAULT;
3589 	value = simple_strtol(str, NULL, 0);
3590 	if (value < 0)
3591 		return -ERANGE;
3592 	*p = value;
3593 	return cnt;
3594 }
3595 
3596 const struct file_operations ext4_ui_proc_fops = {
3597 	.owner		= THIS_MODULE,
3598 	.open		= ext4_ui_proc_open,
3599 	.read		= seq_read,
3600 	.llseek		= seq_lseek,
3601 	.release	= single_release,
3602 	.write		= ext4_ui_proc_write,
3603 };
3604 #endif
3605 
3606 static struct file_system_type ext4_fs_type = {
3607 	.owner		= THIS_MODULE,
3608 	.name		= "ext4",
3609 	.get_sb		= ext4_get_sb,
3610 	.kill_sb	= kill_block_super,
3611 	.fs_flags	= FS_REQUIRES_DEV,
3612 };
3613 
3614 #ifdef CONFIG_EXT4DEV_COMPAT
3615 static int ext4dev_get_sb(struct file_system_type *fs_type,
3616 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3617 {
3618 	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3619 	       "to mount using ext4\n");
3620 	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3621 	       "will go away by 2.6.31\n");
3622 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3623 }
3624 
3625 static struct file_system_type ext4dev_fs_type = {
3626 	.owner		= THIS_MODULE,
3627 	.name		= "ext4dev",
3628 	.get_sb		= ext4dev_get_sb,
3629 	.kill_sb	= kill_block_super,
3630 	.fs_flags	= FS_REQUIRES_DEV,
3631 };
3632 MODULE_ALIAS("ext4dev");
3633 #endif
3634 
3635 static int __init init_ext4_fs(void)
3636 {
3637 	int err;
3638 
3639 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3640 	err = init_ext4_mballoc();
3641 	if (err)
3642 		return err;
3643 
3644 	err = init_ext4_xattr();
3645 	if (err)
3646 		goto out2;
3647 	err = init_inodecache();
3648 	if (err)
3649 		goto out1;
3650 	err = register_filesystem(&ext4_fs_type);
3651 	if (err)
3652 		goto out;
3653 #ifdef CONFIG_EXT4DEV_COMPAT
3654 	err = register_filesystem(&ext4dev_fs_type);
3655 	if (err) {
3656 		unregister_filesystem(&ext4_fs_type);
3657 		goto out;
3658 	}
3659 #endif
3660 	return 0;
3661 out:
3662 	destroy_inodecache();
3663 out1:
3664 	exit_ext4_xattr();
3665 out2:
3666 	exit_ext4_mballoc();
3667 	return err;
3668 }
3669 
3670 static void __exit exit_ext4_fs(void)
3671 {
3672 	unregister_filesystem(&ext4_fs_type);
3673 #ifdef CONFIG_EXT4DEV_COMPAT
3674 	unregister_filesystem(&ext4dev_fs_type);
3675 #endif
3676 	destroy_inodecache();
3677 	exit_ext4_xattr();
3678 	exit_ext4_mballoc();
3679 	remove_proc_entry("fs/ext4", NULL);
3680 }
3681 
3682 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3683 MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
3684 MODULE_LICENSE("GPL");
3685 module_init(init_ext4_fs)
3686 module_exit(exit_ext4_fs)
3687