1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * linux/fs/ext4/super.c
4 *
5 * Copyright (C) 1992, 1993, 1994, 1995
6 * Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 *
10 * from
11 *
12 * linux/fs/minix/inode.c
13 *
14 * Copyright (C) 1991, 1992 Linus Torvalds
15 *
16 * Big-endian to little-endian byte-swapping/bitmaps by
17 * David S. Miller (davem@caip.rutgers.edu), 1995
18 */
19
20 #include <linux/module.h>
21 #include <linux/string.h>
22 #include <linux/fs.h>
23 #include <linux/time.h>
24 #include <linux/vmalloc.h>
25 #include <linux/slab.h>
26 #include <linux/init.h>
27 #include <linux/blkdev.h>
28 #include <linux/backing-dev.h>
29 #include <linux/parser.h>
30 #include <linux/buffer_head.h>
31 #include <linux/exportfs.h>
32 #include <linux/vfs.h>
33 #include <linux/random.h>
34 #include <linux/mount.h>
35 #include <linux/namei.h>
36 #include <linux/quotaops.h>
37 #include <linux/seq_file.h>
38 #include <linux/ctype.h>
39 #include <linux/log2.h>
40 #include <linux/crc16.h>
41 #include <linux/dax.h>
42 #include <linux/uaccess.h>
43 #include <linux/iversion.h>
44 #include <linux/unicode.h>
45 #include <linux/part_stat.h>
46 #include <linux/kthread.h>
47 #include <linux/freezer.h>
48 #include <linux/fsnotify.h>
49 #include <linux/fs_context.h>
50 #include <linux/fs_parser.h>
51 #include <linux/fserror.h>
52
53 #include "ext4.h"
54 #include "ext4_extents.h" /* Needed for trace points definition */
55 #include "ext4_jbd2.h"
56 #include "xattr.h"
57 #include "acl.h"
58 #include "mballoc.h"
59 #include "fsmap.h"
60
61 #define CREATE_TRACE_POINTS
62 #include <trace/events/ext4.h>
63
64 static struct ext4_lazy_init *ext4_li_info;
65 static DEFINE_MUTEX(ext4_li_mtx);
66 static struct ratelimit_state ext4_mount_msg_ratelimit;
67
68 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
69 unsigned long journal_devnum);
70 static int ext4_show_options(struct seq_file *seq, struct dentry *root);
71 static void ext4_update_super(struct super_block *sb);
72 static int ext4_commit_super(struct super_block *sb);
73 static int ext4_mark_recovery_complete(struct super_block *sb,
74 struct ext4_super_block *es);
75 static int ext4_clear_journal_err(struct super_block *sb,
76 struct ext4_super_block *es);
77 static int ext4_sync_fs(struct super_block *sb, int wait);
78 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
79 static int ext4_unfreeze(struct super_block *sb);
80 static int ext4_freeze(struct super_block *sb);
81 static inline int ext2_feature_set_ok(struct super_block *sb);
82 static inline int ext3_feature_set_ok(struct super_block *sb);
83 static void ext4_unregister_li_request(struct super_block *sb);
84 static void ext4_clear_request_list(void);
85 static struct inode *ext4_get_journal_inode(struct super_block *sb,
86 unsigned int journal_inum);
87 static int ext4_validate_options(struct fs_context *fc);
88 static int ext4_check_opt_consistency(struct fs_context *fc,
89 struct super_block *sb);
90 static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
91 static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
92 static int ext4_get_tree(struct fs_context *fc);
93 static int ext4_reconfigure(struct fs_context *fc);
94 static void ext4_fc_free(struct fs_context *fc);
95 static int ext4_init_fs_context(struct fs_context *fc);
96 static void ext4_kill_sb(struct super_block *sb);
97 static const struct fs_parameter_spec ext4_param_specs[];
98
99 /*
100 * Lock ordering
101 *
102 * page fault path:
103 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
104 * -> page lock -> i_data_sem (rw)
105 *
106 * buffered write path:
107 * sb_start_write -> i_mutex -> mmap_lock
108 * sb_start_write -> i_mutex -> transaction start -> page lock ->
109 * i_data_sem (rw)
110 *
111 * truncate:
112 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
113 * page lock
114 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
115 * i_data_sem (rw)
116 *
117 * direct IO:
118 * sb_start_write -> i_mutex -> mmap_lock
119 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
120 *
121 * writepages:
122 * transaction start -> page lock(s) -> i_data_sem (rw)
123 */
124
125 static const struct fs_context_operations ext4_context_ops = {
126 .parse_param = ext4_parse_param,
127 .get_tree = ext4_get_tree,
128 .reconfigure = ext4_reconfigure,
129 .free = ext4_fc_free,
130 };
131
132
/*
 * Register an "ext2" filesystem type backed by this driver, but only when
 * no ext2 driver is configured (built-in or modular) and
 * CONFIG_EXT4_USE_FOR_EXT2 is set.  Otherwise IS_EXT2_SB() is constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
	.owner			= THIS_MODULE,
	.name			= "ext2",
	.init_fs_context	= ext4_init_fs_context,
	.parameters		= ext4_param_specs,
	.kill_sb		= ext4_kill_sb,
	.fs_flags		= FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
148
149
150 static struct file_system_type ext3_fs_type = {
151 .owner = THIS_MODULE,
152 .name = "ext3",
153 .init_fs_context = ext4_init_fs_context,
154 .parameters = ext4_param_specs,
155 .kill_sb = ext4_kill_sb,
156 .fs_flags = FS_REQUIRES_DEV,
157 };
158 MODULE_ALIAS_FS("ext3");
159 MODULE_ALIAS("ext3");
160 #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
161
162
/*
 * Submit a read for @bh, or simulate a failed one.
 *
 * @bh:        buffer to read
 * @op_flags:  extra request flags OR-ed into REQ_OP_READ
 * @end_io:    completion handler; end_buffer_read_sync is used when NULL
 * @simu_fail: when true, no I/O is issued: the buffer is marked not
 *             uptodate and unlocked, as if the read had failed
 */
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
				  bh_end_io_t *end_io, bool simu_fail)
{
	if (simu_fail) {
		clear_buffer_uptodate(bh);
		unlock_buffer(bh);
		return;
	}

	/*
	 * The verified bit does not survive a re-read from disk (which
	 * happens after a write-out error), so drop it here to force the
	 * buffer contents to be checked again.
	 */
	clear_buffer_verified(bh);

	if (!end_io)
		end_io = end_buffer_read_sync;
	bh->b_end_io = end_io;

	get_bh(bh);
	submit_bh(REQ_OP_READ | op_flags, bh);
}
183
/*
 * Start reading a locked buffer without waiting for completion.
 *
 * The caller must hold the buffer lock (BUG otherwise).  If the buffer is
 * already uptodate it is simply unlocked; otherwise the read is handed to
 * __ext4_read_bh().
 */
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
			 bh_end_io_t *end_io, bool simu_fail)
{
	BUG_ON(!buffer_locked(bh));

	if (!ext4_buffer_uptodate(bh)) {
		__ext4_read_bh(bh, op_flags, end_io, simu_fail);
		return;
	}

	unlock_buffer(bh);
}
195
/*
 * Read a locked buffer and wait for the result.
 *
 * The caller must hold the buffer lock (BUG otherwise).
 *
 * Returns 0 if the buffer is uptodate (either already, or after the read
 * completed) and -EIO otherwise.
 */
int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
		 bh_end_io_t *end_io, bool simu_fail)
{
	BUG_ON(!buffer_locked(bh));

	if (ext4_buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	__ext4_read_bh(bh, op_flags, end_io, simu_fail);
	wait_on_buffer(bh);

	return buffer_uptodate(bh) ? 0 : -EIO;
}
213
/*
 * Lock @bh and read it.
 *
 * With @wait set, the result of the synchronous ext4_read_bh() is
 * returned.  Without it, the read is only started and 0 is returned.
 */
int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
	lock_buffer(bh);

	if (wait)
		return ext4_read_bh(bh, op_flags, NULL, false);

	ext4_read_bh_nowait(bh, op_flags, NULL, false);
	return 0;
}
223
224 /*
225 * This works like __bread_gfp() except it uses ERR_PTR for error
226 * returns. Currently with sb_bread it's impossible to distinguish
227 * between ENOMEM and EIO situations (since both result in a NULL
228 * return.
229 */
__ext4_sb_bread_gfp(struct super_block * sb,sector_t block,blk_opf_t op_flags,gfp_t gfp)230 static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
231 sector_t block,
232 blk_opf_t op_flags, gfp_t gfp)
233 {
234 struct buffer_head *bh;
235 int ret;
236
237 bh = sb_getblk_gfp(sb, block, gfp);
238 if (bh == NULL)
239 return ERR_PTR(-ENOMEM);
240 if (ext4_buffer_uptodate(bh))
241 return bh;
242
243 ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
244 if (ret) {
245 put_bh(bh);
246 return ERR_PTR(ret);
247 }
248 return bh;
249 }
250
/*
 * Read metadata block @block of @sb.  The allocation mask is the block
 * device mapping's mask constrained by ~__GFP_FS, with __GFP_MOVABLE added.
 * Returns a buffer head or an ERR_PTR (see __ext4_sb_bread_gfp()).
 */
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
				  blk_opf_t op_flags)
{
	gfp_t gfp;

	gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS);
	gfp |= __GFP_MOVABLE;

	return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}
259
/*
 * Like ext4_sb_bread() with no extra request flags, but without
 * __GFP_MOVABLE in the allocation mask.
 */
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
					    sector_t block)
{
	gfp_t gfp;

	gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS);

	return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
268
/*
 * Like ext4_sb_bread() with no extra request flags, but with __GFP_NOFAIL
 * added to the allocation mask.
 */
struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
					 sector_t block)
{
	gfp_t gfp;

	gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS);
	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;

	return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
277
/*
 * Best-effort readahead of @block: the buffer is looked up with GFP_NOWAIT
 * and a read is started only if the buffer lock can be taken without
 * blocking.  Nothing is reported to the caller.
 */
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh;

	bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT);
	if (unlikely(!bh))
		return;

	if (trylock_buffer(bh))
		ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false);
	brelse(bh);
}
289
ext4_verify_csum_type(struct super_block * sb,struct ext4_super_block * es)290 static int ext4_verify_csum_type(struct super_block *sb,
291 struct ext4_super_block *es)
292 {
293 if (!ext4_has_feature_metadata_csum(sb))
294 return 1;
295
296 return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
297 }
298
ext4_superblock_csum(struct ext4_super_block * es)299 __le32 ext4_superblock_csum(struct ext4_super_block *es)
300 {
301 int offset = offsetof(struct ext4_super_block, s_checksum);
302 __u32 csum;
303
304 csum = ext4_chksum(~0, (char *)es, offset);
305
306 return cpu_to_le32(csum);
307 }
308
ext4_superblock_csum_verify(struct super_block * sb,struct ext4_super_block * es)309 static int ext4_superblock_csum_verify(struct super_block *sb,
310 struct ext4_super_block *es)
311 {
312 if (!ext4_has_feature_metadata_csum(sb))
313 return 1;
314
315 return es->s_checksum == ext4_superblock_csum(es);
316 }
317
ext4_superblock_csum_set(struct super_block * sb)318 void ext4_superblock_csum_set(struct super_block *sb)
319 {
320 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
321
322 if (!ext4_has_feature_metadata_csum(sb))
323 return;
324
325 es->s_checksum = ext4_superblock_csum(es);
326 }
327
ext4_block_bitmap(struct super_block * sb,struct ext4_group_desc * bg)328 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
329 struct ext4_group_desc *bg)
330 {
331 return le32_to_cpu(bg->bg_block_bitmap_lo) |
332 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
333 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
334 }
335
ext4_inode_bitmap(struct super_block * sb,struct ext4_group_desc * bg)336 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
337 struct ext4_group_desc *bg)
338 {
339 return le32_to_cpu(bg->bg_inode_bitmap_lo) |
340 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
341 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
342 }
343
ext4_inode_table(struct super_block * sb,struct ext4_group_desc * bg)344 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
345 struct ext4_group_desc *bg)
346 {
347 return le32_to_cpu(bg->bg_inode_table_lo) |
348 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
349 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
350 }
351
ext4_free_group_clusters(struct super_block * sb,struct ext4_group_desc * bg)352 __u32 ext4_free_group_clusters(struct super_block *sb,
353 struct ext4_group_desc *bg)
354 {
355 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
356 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
357 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
358 }
359
ext4_free_inodes_count(struct super_block * sb,struct ext4_group_desc * bg)360 __u32 ext4_free_inodes_count(struct super_block *sb,
361 struct ext4_group_desc *bg)
362 {
363 return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
364 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
365 (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
366 }
367
ext4_used_dirs_count(struct super_block * sb,struct ext4_group_desc * bg)368 __u32 ext4_used_dirs_count(struct super_block *sb,
369 struct ext4_group_desc *bg)
370 {
371 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
372 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
373 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
374 }
375
ext4_itable_unused_count(struct super_block * sb,struct ext4_group_desc * bg)376 __u32 ext4_itable_unused_count(struct super_block *sb,
377 struct ext4_group_desc *bg)
378 {
379 return le16_to_cpu(bg->bg_itable_unused_lo) |
380 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
381 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
382 }
383
/*
 * Store @blk as the block bitmap location in @bg.  The high half is
 * written only when the descriptor is large enough to hold it.
 */
void ext4_block_bitmap_set(struct super_block *sb,
			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
	if (has_hi)
		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
}
391
/*
 * Store @blk as the inode bitmap location in @bg.  The high half is
 * written only when the descriptor is large enough to hold it.
 */
void ext4_inode_bitmap_set(struct super_block *sb,
			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);
	if (has_hi)
		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
}
399
/*
 * Store @blk as the inode table location in @bg.  The high half is
 * written only when the descriptor is large enough to hold it.
 */
void ext4_inode_table_set(struct super_block *sb,
			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
	if (has_hi)
		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}
407
/*
 * Store @count in the bg_free_blocks_count_{lo,hi} fields of @bg.  The
 * high 16 bits are written only with 64-bit sized descriptors.
 */
void ext4_free_group_clusters_set(struct super_block *sb,
				  struct ext4_group_desc *bg, __u32 count)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
	if (has_hi)
		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}
415
/*
 * Store @count as the free inode count of @bg using WRITE_ONCE() for both
 * halves.  The high 16 bits are written only with 64-bit sized
 * descriptors.
 */
void ext4_free_inodes_set(struct super_block *sb,
			  struct ext4_group_desc *bg, __u32 count)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));
	if (has_hi)
		WRITE_ONCE(bg->bg_free_inodes_count_hi,
			   cpu_to_le16(count >> 16));
}
423
/*
 * Store @count as the used-directories count of @bg.  The high 16 bits
 * are written only with 64-bit sized descriptors.
 */
void ext4_used_dirs_set(struct super_block *sb,
			struct ext4_group_desc *bg, __u32 count)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
	if (has_hi)
		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
}
431
/*
 * Store @count in the bg_itable_unused_{lo,hi} fields of @bg.  The high
 * 16 bits are written only with 64-bit sized descriptors.
 */
void ext4_itable_unused_set(struct super_block *sb,
			    struct ext4_group_desc *bg, __u32 count)
{
	const bool has_hi = EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT;

	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
	if (has_hi)
		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
439
/*
 * Store @now as a split timestamp: the low 32 bits go to *@lo in
 * little-endian form and the next 8 bits to *@hi.  @now is first clamped
 * to the range [0, 2^40 - 1] so that it fits.
 */
static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
	const time64_t stamp = clamp_val(now, 0, (1ull << 40) - 1);

	*lo = cpu_to_le32(lower_32_bits(stamp));
	*hi = upper_32_bits(stamp);
}
447
__ext4_get_tstamp(__le32 * lo,__u8 * hi)448 static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
449 {
450 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
451 }
/*
 * Field-name based wrappers: @tstamp names the 32-bit field of @es and the
 * companion high byte is the field of the same name with an "_hi" suffix.
 * ext4_update_tstamp() stores the current real time.
 */
#define ext4_update_tstamp(es, tstamp)					\
	__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi,	\
			     ktime_get_real_seconds())
#define ext4_get_tstamp(es, tstamp)					\
	__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
457
458 /*
459 * The ext4_maybe_update_superblock() function checks and updates the
460 * superblock if needed.
461 *
462 * This function is designed to update the on-disk superblock only under
463 * certain conditions to prevent excessive disk writes and unnecessary
464 * waking of the disk from sleep. The superblock will be updated if:
465 * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last
466 * superblock update
467 * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the
468 * last superblock update.
469 *
470 * @sb: The superblock
471 */
ext4_maybe_update_superblock(struct super_block * sb)472 static void ext4_maybe_update_superblock(struct super_block *sb)
473 {
474 struct ext4_sb_info *sbi = EXT4_SB(sb);
475 struct ext4_super_block *es = sbi->s_es;
476 journal_t *journal = sbi->s_journal;
477 time64_t now;
478 __u64 last_update;
479 __u64 lifetime_write_kbytes;
480 __u64 diff_size;
481
482 if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
483 !(sb->s_flags & SB_ACTIVE) || !journal ||
484 journal->j_flags & JBD2_UNMOUNT)
485 return;
486
487 now = ktime_get_real_seconds();
488 last_update = ext4_get_tstamp(es, s_wtime);
489
490 if (likely(now - last_update < sbi->s_sb_update_sec))
491 return;
492
493 lifetime_write_kbytes = sbi->s_kbytes_written +
494 ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
495 sbi->s_sectors_written_start) >> 1);
496
497 /* Get the number of kilobytes not written to disk to account
498 * for statistics and compare with a multiple of 16 MB. This
499 * is used to determine when the next superblock commit should
500 * occur (i.e. not more often than once per 16MB if there was
501 * less written in an hour).
502 */
503 diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
504
505 if (diff_size > sbi->s_sb_update_kb)
506 schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
507 }
508
/*
 * Per-transaction commit callback: process the data freed by transaction
 * @txn and give the superblock a chance to be updated.  The transaction
 * must not be in the T_FINISHED state yet.
 */
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
	struct super_block *sb;

	BUG_ON(txn->t_state == T_FINISHED);

	sb = journal->j_private;
	ext4_process_freed_data(sb, txn->t_tid);
	ext4_maybe_update_superblock(sb);
}
518
ext4_journalled_writepage_needs_redirty(struct jbd2_inode * jinode,struct folio * folio)519 static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
520 struct folio *folio)
521 {
522 struct buffer_head *bh, *head;
523 struct journal_head *jh;
524
525 bh = head = folio_buffers(folio);
526 do {
527 /*
528 * We have to redirty a page in these cases:
529 * 1) If buffer is dirty, it means the page was dirty because it
530 * contains a buffer that needs checkpointing. So the dirty bit
531 * needs to be preserved so that checkpointing writes the buffer
532 * properly.
533 * 2) If buffer is not part of the committing transaction
534 * (we may have just accidentally come across this buffer because
535 * inode range tracking is not exact) or if the currently running
536 * transaction already contains this buffer as well, dirty bit
537 * needs to be preserved so that the buffer gets writeprotected
538 * properly on running transaction's commit.
539 */
540 jh = bh2jh(bh);
541 if (buffer_dirty(bh) ||
542 (jh && (jh->b_transaction != jinode->i_transaction ||
543 jh->b_next_transaction)))
544 return true;
545 } while ((bh = bh->b_this_page) != head);
546
547 return false;
548 }
549
ext4_journalled_submit_inode_data_buffers(struct jbd2_inode * jinode)550 static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
551 {
552 struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
553 struct writeback_control wbc = {
554 .sync_mode = WB_SYNC_ALL,
555 .nr_to_write = LONG_MAX,
556 .range_start = jinode->i_dirty_start,
557 .range_end = jinode->i_dirty_end,
558 };
559 struct folio *folio = NULL;
560 int error;
561
562 /*
563 * writeback_iter() already checks for dirty pages and calls
564 * folio_clear_dirty_for_io(), which we want to write protect the
565 * folios.
566 *
567 * However, we may have to redirty a folio sometimes.
568 */
569 while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
570 if (ext4_journalled_writepage_needs_redirty(jinode, folio))
571 folio_redirty_for_writepage(&wbc, folio);
572 folio_unlock(folio);
573 }
574
575 return error;
576 }
577
ext4_journal_submit_inode_data_buffers(struct jbd2_inode * jinode)578 static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
579 {
580 int ret;
581
582 if (ext4_should_journal_data(jinode->i_vfs_inode))
583 ret = ext4_journalled_submit_inode_data_buffers(jinode);
584 else
585 ret = ext4_normal_submit_inode_data_buffers(jinode);
586 return ret;
587 }
588
ext4_journal_finish_inode_data_buffers(struct jbd2_inode * jinode)589 static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
590 {
591 int ret = 0;
592
593 if (!ext4_should_journal_data(jinode->i_vfs_inode))
594 ret = jbd2_journal_finish_inode_data_buffers(jinode);
595
596 return ret;
597 }
598
system_going_down(void)599 static bool system_going_down(void)
600 {
601 return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
602 || system_state == SYSTEM_RESTART;
603 }
604
605 struct ext4_err_translation {
606 int code;
607 int errno;
608 };
609
610 #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
611
612 static struct ext4_err_translation err_translation[] = {
613 EXT4_ERR_TRANSLATE(EIO),
614 EXT4_ERR_TRANSLATE(ENOMEM),
615 EXT4_ERR_TRANSLATE(EFSBADCRC),
616 EXT4_ERR_TRANSLATE(EFSCORRUPTED),
617 EXT4_ERR_TRANSLATE(ENOSPC),
618 EXT4_ERR_TRANSLATE(ENOKEY),
619 EXT4_ERR_TRANSLATE(EROFS),
620 EXT4_ERR_TRANSLATE(EFBIG),
621 EXT4_ERR_TRANSLATE(EEXIST),
622 EXT4_ERR_TRANSLATE(ERANGE),
623 EXT4_ERR_TRANSLATE(EOVERFLOW),
624 EXT4_ERR_TRANSLATE(EBUSY),
625 EXT4_ERR_TRANSLATE(ENOTDIR),
626 EXT4_ERR_TRANSLATE(ENOTEMPTY),
627 EXT4_ERR_TRANSLATE(ESHUTDOWN),
628 EXT4_ERR_TRANSLATE(EFAULT),
629 };
630
ext4_errno_to_code(int errno)631 static int ext4_errno_to_code(int errno)
632 {
633 int i;
634
635 for (i = 0; i < ARRAY_SIZE(err_translation); i++)
636 if (err_translation[i].errno == errno)
637 return err_translation[i].code;
638 return EXT4_ERR_UNKNOWN;
639 }
640
/*
 * Record an error in the in-memory superblock info of @sb.
 *
 * Under s_error_lock, the pending error count is bumped and the
 * "last error" fields are overwritten.  The "first error" fields are
 * filled in only while s_first_error_time is still zero.  An @error of 0
 * is recorded as EFSCORRUPTED.
 */
static void save_error_info(struct super_block *sb, int error,
			    __u32 ino, __u64 block,
			    const char *func, unsigned int line)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	/* We default to EFSCORRUPTED error... */
	const int code = error ? error : EFSCORRUPTED;

	spin_lock(&sbi->s_error_lock);

	sbi->s_add_error_count++;

	sbi->s_last_error_code = code;
	sbi->s_last_error_line = line;
	sbi->s_last_error_ino = ino;
	sbi->s_last_error_block = block;
	sbi->s_last_error_func = func;
	sbi->s_last_error_time = ktime_get_real_seconds();

	if (!sbi->s_first_error_time) {
		sbi->s_first_error_code = code;
		sbi->s_first_error_line = line;
		sbi->s_first_error_ino = ino;
		sbi->s_first_error_block = block;
		sbi->s_first_error_func = func;
		sbi->s_first_error_time = sbi->s_last_error_time;
	}

	spin_unlock(&sbi->s_error_lock);
}
669
670 /* Deal with the reporting of failure conditions on a filesystem such as
671 * inconsistencies detected or read IO failures.
672 *
673 * On ext2, we can store the error state of the filesystem in the
674 * superblock. That is not possible on ext4, because we may have other
675 * write ordering constraints on the superblock which prevent us from
676 * writing it out straight away; and given that the journal is about to
677 * be aborted, we can't rely on the current, or future, transactions to
678 * write out the superblock safely.
679 *
680 * We'll just use the jbd2_journal_abort() error code to record an error in
681 * the journal instead. On recovery, the journal will complain about
682 * that error until we've noted it down and cleared it.
683 *
684 * If force_ro is set, we unconditionally force the filesystem into an
685 * ABORT|READONLY state, unless the error response on the fs has been set to
686 * panic in which case we take the easy way out and panic immediately. This is
687 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
688 * at a critical moment in log management.
689 */
ext4_handle_error(struct super_block * sb,bool force_ro,int error,__u32 ino,__u64 block,const char * func,unsigned int line)690 static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
691 __u32 ino, __u64 block,
692 const char *func, unsigned int line)
693 {
694 journal_t *journal = EXT4_SB(sb)->s_journal;
695 bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);
696
697 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
698 if (test_opt(sb, WARN_ON_ERROR))
699 WARN_ON_ONCE(1);
700
701 if (!continue_fs && !ext4_emergency_ro(sb) && journal)
702 jbd2_journal_abort(journal, -error);
703
704 if (!bdev_read_only(sb->s_bdev)) {
705 save_error_info(sb, error, ino, block, func, line);
706 /*
707 * In case the fs should keep running, we need to writeout
708 * superblock through the journal. Due to lock ordering
709 * constraints, it may not be safe to do it right here so we
710 * defer superblock flushing to a workqueue. We just need to be
711 * careful when the journal is already shutting down. If we get
712 * here in that case, just update the sb directly as the last
713 * transaction won't commit anyway.
714 */
715 if (continue_fs && journal &&
716 !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
717 schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
718 else
719 ext4_commit_super(sb);
720 }
721
722 /*
723 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
724 * could panic during 'reboot -f' as the underlying device got already
725 * disabled.
726 */
727 if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
728 panic("EXT4-fs (device %s): panic forced after error\n",
729 sb->s_id);
730 }
731
732 if (ext4_emergency_ro(sb) || continue_fs)
733 return;
734
735 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
736 /*
737 * We don't set SB_RDONLY because that requires sb->s_umount
738 * semaphore and setting it without proper remount procedure is
739 * confusing code such as freeze_super() leading to deadlocks
740 * and other problems.
741 */
742 set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
743 }
744
update_super_work(struct work_struct * work)745 static void update_super_work(struct work_struct *work)
746 {
747 struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
748 s_sb_upd_work);
749 journal_t *journal = sbi->s_journal;
750 handle_t *handle;
751
752 /*
753 * If the journal is still running, we have to write out superblock
754 * through the journal to avoid collisions of other journalled sb
755 * updates.
756 *
757 * We use directly jbd2 functions here to avoid recursing back into
758 * ext4 error handling code during handling of previous errors.
759 */
760 if (!ext4_emergency_state(sbi->s_sb) &&
761 !sb_rdonly(sbi->s_sb) && journal) {
762 struct buffer_head *sbh = sbi->s_sbh;
763 bool call_notify_err = false;
764
765 handle = jbd2_journal_start(journal, 1);
766 if (IS_ERR(handle))
767 goto write_directly;
768 if (jbd2_journal_get_write_access(handle, sbh)) {
769 jbd2_journal_stop(handle);
770 goto write_directly;
771 }
772
773 if (sbi->s_add_error_count > 0)
774 call_notify_err = true;
775
776 ext4_update_super(sbi->s_sb);
777 if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
778 ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
779 "superblock detected");
780 clear_buffer_write_io_error(sbh);
781 set_buffer_uptodate(sbh);
782 }
783
784 if (jbd2_journal_dirty_metadata(handle, sbh)) {
785 jbd2_journal_stop(handle);
786 goto write_directly;
787 }
788 jbd2_journal_stop(handle);
789
790 if (call_notify_err)
791 ext4_notify_error_sysfs(sbi);
792
793 return;
794 }
795 write_directly:
796 /*
797 * Write through journal failed. Write sb directly to get error info
798 * out and hope for the best.
799 */
800 ext4_commit_super(sbi->s_sb);
801 ext4_notify_error_sysfs(sbi);
802 }
803
/* Rate-limit check for error messages, using the per-sb ratelimit state. */
#define ext4_error_ratelimit(sb)					\
	___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),		\
		     "EXT4-fs error")
807
__ext4_error(struct super_block * sb,const char * function,unsigned int line,bool force_ro,int error,__u64 block,const char * fmt,...)808 void __ext4_error(struct super_block *sb, const char *function,
809 unsigned int line, bool force_ro, int error, __u64 block,
810 const char *fmt, ...)
811 {
812 struct va_format vaf;
813 va_list args;
814
815 if (unlikely(ext4_emergency_state(sb)))
816 return;
817
818 trace_ext4_error(sb, function, line);
819 if (ext4_error_ratelimit(sb)) {
820 va_start(args, fmt);
821 vaf.fmt = fmt;
822 vaf.va = &args;
823 printk(KERN_CRIT
824 "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
825 sb->s_id, function, line, current->comm, &vaf);
826 va_end(args);
827 }
828 fserror_report_metadata(sb, error ? -abs(error) : -EFSCORRUPTED,
829 GFP_ATOMIC);
830
831 ext4_handle_error(sb, force_ro, error, 0, block, function, line);
832 }
833
__ext4_error_inode(struct inode * inode,const char * function,unsigned int line,ext4_fsblk_t block,int error,const char * fmt,...)834 void __ext4_error_inode(struct inode *inode, const char *function,
835 unsigned int line, ext4_fsblk_t block, int error,
836 const char *fmt, ...)
837 {
838 va_list args;
839 struct va_format vaf;
840
841 if (unlikely(ext4_emergency_state(inode->i_sb)))
842 return;
843
844 trace_ext4_error(inode->i_sb, function, line);
845 if (ext4_error_ratelimit(inode->i_sb)) {
846 va_start(args, fmt);
847 vaf.fmt = fmt;
848 vaf.va = &args;
849 if (block)
850 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
851 "inode #%lu: block %llu: comm %s: %pV\n",
852 inode->i_sb->s_id, function, line, inode->i_ino,
853 block, current->comm, &vaf);
854 else
855 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
856 "inode #%lu: comm %s: %pV\n",
857 inode->i_sb->s_id, function, line, inode->i_ino,
858 current->comm, &vaf);
859 va_end(args);
860 }
861 fserror_report_file_metadata(inode,
862 error ? -abs(error) : -EFSCORRUPTED,
863 GFP_ATOMIC);
864
865 ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
866 function, line);
867 }
868
/*
 * Report an error tied to an open @file.  Same flow as
 * __ext4_error_inode(), except that the message also carries the file's
 * path (resolved into an 80 byte buffer, "(unknown)" if that fails) and
 * the error is always EFSCORRUPTED.
 */
void __ext4_error_file(struct file *file, const char *function,
		       unsigned int line, ext4_fsblk_t block,
		       const char *fmt, ...)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;

	if (unlikely(ext4_emergency_state(sb)))
		return;

	trace_ext4_error(sb, function, line);

	if (ext4_error_ratelimit(sb)) {
		char pathname[80], *path;
		struct va_format vaf;
		va_list args;

		path = file_path(file, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		if (block)
			printk(KERN_CRIT
			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
			       "block %llu: comm %s: path %s: %pV\n",
			       sb->s_id, function, line, inode->i_ino,
			       block, current->comm, path, &vaf);
		else
			printk(KERN_CRIT
			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
			       "comm %s: path %s: %pV\n",
			       sb->s_id, function, line, inode->i_ino,
			       current->comm, path, &vaf);
		va_end(args);
	}

	fserror_report_file_metadata(inode, -EFSCORRUPTED, GFP_ATOMIC);

	ext4_handle_error(sb, false, EFSCORRUPTED, inode->i_ino, block,
			  function, line);
}
908
ext4_decode_error(struct super_block * sb,int errno,char nbuf[16])909 const char *ext4_decode_error(struct super_block *sb, int errno,
910 char nbuf[16])
911 {
912 char *errstr = NULL;
913
914 switch (errno) {
915 case -EFSCORRUPTED:
916 errstr = "Corrupt filesystem";
917 break;
918 case -EFSBADCRC:
919 errstr = "Filesystem failed CRC";
920 break;
921 case -EIO:
922 errstr = "IO failure";
923 break;
924 case -ENOMEM:
925 errstr = "Out of memory";
926 break;
927 case -EROFS:
928 if (!sb || (EXT4_SB(sb)->s_journal &&
929 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
930 errstr = "Journal has aborted";
931 else
932 errstr = "Readonly filesystem";
933 break;
934 default:
935 /* If the caller passed in an extra buffer for unknown
936 * errors, textualise them now. Else we just return
937 * NULL. */
938 if (nbuf) {
939 /* Check for truncated error codes... */
940 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
941 errstr = nbuf;
942 }
943 break;
944 }
945
946 return errstr;
947 }
948
949 /* __ext4_std_error decodes expected errors from journaling functions
950 * automatically and invokes the appropriate error response. */
951
__ext4_std_error(struct super_block * sb,const char * function,unsigned int line,int errno)952 void __ext4_std_error(struct super_block *sb, const char *function,
953 unsigned int line, int errno)
954 {
955 char nbuf[16];
956 const char *errstr;
957
958 if (unlikely(ext4_emergency_state(sb)))
959 return;
960
961 /* Special case: if the error is EROFS, and we're not already
962 * inside a transaction, then there's really no point in logging
963 * an error. */
964 if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
965 return;
966
967 if (ext4_error_ratelimit(sb)) {
968 errstr = ext4_decode_error(sb, errno, nbuf);
969 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
970 sb->s_id, function, line, errstr);
971 }
972 fserror_report_metadata(sb, errno ? -abs(errno) : -EFSCORRUPTED,
973 GFP_ATOMIC);
974
975 ext4_handle_error(sb, false, -errno, 0, 0, function, line);
976 }
977
__ext4_msg(struct super_block * sb,const char * prefix,const char * fmt,...)978 void __ext4_msg(struct super_block *sb,
979 const char *prefix, const char *fmt, ...)
980 {
981 struct va_format vaf;
982 va_list args;
983
984 if (sb) {
985 atomic_inc(&EXT4_SB(sb)->s_msg_count);
986 if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
987 "EXT4-fs"))
988 return;
989 }
990
991 va_start(args, fmt);
992 vaf.fmt = fmt;
993 vaf.va = &args;
994 if (sb)
995 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
996 else
997 printk("%sEXT4-fs: %pV\n", prefix, &vaf);
998 va_end(args);
999 }
1000
ext4_warning_ratelimit(struct super_block * sb)1001 static int ext4_warning_ratelimit(struct super_block *sb)
1002 {
1003 atomic_inc(&EXT4_SB(sb)->s_warning_count);
1004 return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
1005 "EXT4-fs warning");
1006 }
1007
__ext4_warning(struct super_block * sb,const char * function,unsigned int line,const char * fmt,...)1008 void __ext4_warning(struct super_block *sb, const char *function,
1009 unsigned int line, const char *fmt, ...)
1010 {
1011 struct va_format vaf;
1012 va_list args;
1013
1014 if (!ext4_warning_ratelimit(sb))
1015 return;
1016
1017 va_start(args, fmt);
1018 vaf.fmt = fmt;
1019 vaf.va = &args;
1020 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
1021 sb->s_id, function, line, &vaf);
1022 va_end(args);
1023 }
1024
__ext4_warning_inode(const struct inode * inode,const char * function,unsigned int line,const char * fmt,...)1025 void __ext4_warning_inode(const struct inode *inode, const char *function,
1026 unsigned int line, const char *fmt, ...)
1027 {
1028 struct va_format vaf;
1029 va_list args;
1030
1031 if (!ext4_warning_ratelimit(inode->i_sb))
1032 return;
1033
1034 va_start(args, fmt);
1035 vaf.fmt = fmt;
1036 vaf.va = &args;
1037 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
1038 "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
1039 function, line, inode->i_ino, current->comm, &vaf);
1040 va_end(args);
1041 }
1042
/*
 * Report an error found while the lock of block group @grp is held.
 *
 * Does nothing once the filesystem is in an emergency state.  After the
 * (rate-limited) message, two paths exist:
 *  - errors=continue: mark the in-memory state with EXT4_ERROR_FS and,
 *    unless the block device is read-only, record the error and schedule
 *    the superblock update work.  The group lock is never dropped.
 *  - otherwise: drop the group lock, run ext4_handle_error(), and take
 *    the group lock again before returning.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
			     struct super_block *sb, ext4_group_t grp,
			     unsigned long ino, ext4_fsblk_t block,
			     const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
	if (unlikely(ext4_emergency_state(sb)))
		return;

	trace_ext4_error(sb, function, line);
	if (ext4_error_ratelimit(sb)) {
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* The message is assembled from several KERN_CONT pieces. */
		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
		       sb->s_id, function, line, grp);
		if (ino)
			printk(KERN_CONT "inode %lu: ", ino);
		if (block)
			printk(KERN_CONT "block %llu:",
			       (unsigned long long) block);
		printk(KERN_CONT "%pV\n", &vaf);
		va_end(args);
	}

	if (!test_opt(sb, ERRORS_CONT)) {
		ext4_unlock_group(sb, grp);
		ext4_handle_error(sb, false, EFSCORRUPTED, ino, block,
				  function, line);
		/*
		 * We only get here in the ERRORS_RO case; relocking the
		 * group may be dangerous, but nothing bad will happen since
		 * the filesystem will have already been marked read/only
		 * and the journal has been aborted.  The lock is re-taken
		 * so the function returns with the group locked, as the
		 * __acquires annotation states.
		 */
		ext4_lock_group(sb, grp);
		return;
	}

	if (test_opt(sb, WARN_ON_ERROR))
		WARN_ON_ONCE(1);
	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
	if (bdev_read_only(sb->s_bdev))
		return;

	save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
	schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
1099
/*
 * Flag the block and/or inode bitmap of @group as corrupted.
 *
 * @flags selects which bitmaps (EXT4_GROUP_INFO_BBITMAP_CORRUPT and/or
 * EXT4_GROUP_INFO_IBITMAP_CORRUPT).  The first time a given corruption bit
 * is set, the group's free clusters / free inodes are subtracted from the
 * filesystem-wide free counters; later calls for the same bit change
 * nothing.  Silently returns when the group info or descriptor cannot be
 * obtained.
 */
void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
				     ext4_group_t group,
				     unsigned int flags)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);

	if (!grp || !gdp)
		return;

	if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
	    !ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
				   &grp->bb_state))
		percpu_counter_sub(&sbi->s_freeclusters_counter,
				   grp->bb_free);

	if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
	    !ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
				   &grp->bb_state)) {
		/* gdp is known to be non-NULL here, see the check above. */
		int count = ext4_free_inodes_count(sb, gdp);

		percpu_counter_sub(&sbi->s_freeinodes_counter, count);
	}
}
1131
ext4_update_dynamic_rev(struct super_block * sb)1132 void ext4_update_dynamic_rev(struct super_block *sb)
1133 {
1134 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1135
1136 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
1137 return;
1138
1139 ext4_warning(sb,
1140 "updating to rev %d because of new feature flag, "
1141 "running e2fsck is recommended",
1142 EXT4_DYNAMIC_REV);
1143
1144 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
1145 es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
1146 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
1147 /* leave es->s_feature_*compat flags alone */
1148 /* es->s_uuid will be set by e2fsck if empty */
1149
1150 /*
1151 * The rest of the superblock fields should be zero, and if not it
1152 * means they are likely already in use, so leave them alone. We
1153 * can leave it up to e2fsck to clean up any inconsistencies there.
1154 */
1155 }
1156
orphan_list_entry(struct list_head * l)1157 static inline struct inode *orphan_list_entry(struct list_head *l)
1158 {
1159 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
1160 }
1161
dump_orphan_list(struct super_block * sb,struct ext4_sb_info * sbi)1162 static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
1163 {
1164 struct list_head *l;
1165
1166 ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
1167 le32_to_cpu(sbi->s_es->s_last_orphan));
1168
1169 printk(KERN_ERR "sb_info orphan list:\n");
1170 list_for_each(l, &sbi->s_orphan) {
1171 struct inode *inode = orphan_list_entry(l);
1172 printk(KERN_ERR " "
1173 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
1174 inode->i_sb->s_id, inode->i_ino, inode,
1175 inode->i_mode, inode->i_nlink,
1176 NEXT_ORPHAN(inode));
1177 }
1178 }
1179
1180 #ifdef CONFIG_QUOTA
1181 static int ext4_quota_off(struct super_block *sb, int type);
1182
ext4_quotas_off(struct super_block * sb,int type)1183 static inline void ext4_quotas_off(struct super_block *sb, int type)
1184 {
1185 BUG_ON(type > EXT4_MAXQUOTAS);
1186
1187 /* Use our quota_off function to clear inode flags etc. */
1188 for (type--; type >= 0; type--)
1189 ext4_quota_off(sb, type);
1190 }
1191
1192 /*
1193 * This is a helper function which is used in the mount/remount
1194 * codepaths (which holds s_umount) to fetch the quota file name.
1195 */
get_qf_name(struct super_block * sb,struct ext4_sb_info * sbi,int type)1196 static inline char *get_qf_name(struct super_block *sb,
1197 struct ext4_sb_info *sbi,
1198 int type)
1199 {
1200 return rcu_dereference_protected(sbi->s_qf_names[type],
1201 lockdep_is_held(&sb->s_umount));
1202 }
1203 #else
/* Variant used when CONFIG_QUOTA is not set: intentionally does nothing. */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
1207 #endif
1208
ext4_percpu_param_init(struct ext4_sb_info * sbi)1209 static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
1210 {
1211 ext4_fsblk_t block;
1212 int err;
1213
1214 block = ext4_count_free_clusters(sbi->s_sb);
1215 ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
1216 err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
1217 GFP_KERNEL);
1218 if (!err) {
1219 unsigned long freei = ext4_count_free_inodes(sbi->s_sb);
1220 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
1221 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
1222 GFP_KERNEL);
1223 }
1224 if (!err)
1225 err = percpu_counter_init(&sbi->s_dirs_counter,
1226 ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
1227 if (!err)
1228 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
1229 GFP_KERNEL);
1230 if (!err)
1231 err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
1232 GFP_KERNEL);
1233 if (!err)
1234 err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
1235
1236 if (err)
1237 ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");
1238
1239 return err;
1240 }
1241
ext4_percpu_param_destroy(struct ext4_sb_info * sbi)1242 static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
1243 {
1244 percpu_counter_destroy(&sbi->s_freeclusters_counter);
1245 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1246 percpu_counter_destroy(&sbi->s_dirs_counter);
1247 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
1248 percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
1249 percpu_free_rwsem(&sbi->s_writepages_rwsem);
1250 }
1251
ext4_group_desc_free(struct ext4_sb_info * sbi)1252 static void ext4_group_desc_free(struct ext4_sb_info *sbi)
1253 {
1254 struct buffer_head **group_desc;
1255 int i;
1256
1257 group_desc = rcu_access_pointer(sbi->s_group_desc);
1258 for (i = 0; i < sbi->s_gdb_count; i++)
1259 brelse(group_desc[i]);
1260 kvfree(group_desc);
1261 }
1262
ext4_flex_groups_free(struct ext4_sb_info * sbi)1263 static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
1264 {
1265 struct flex_groups **flex_groups;
1266 int i;
1267
1268 flex_groups = rcu_access_pointer(sbi->s_flex_groups);
1269 if (flex_groups) {
1270 for (i = 0; i < sbi->s_flex_groups_allocated; i++)
1271 kvfree(flex_groups[i]);
1272 kvfree(flex_groups);
1273 }
1274 }
1275
/*
 * ->put_super callback: tear down all per-superblock ext4 state.
 *
 * The steps run in a fixed order: sysfs is unregistered first (see the
 * comment below), background users (lazy-init request, quotas, the
 * reserved-conversion workqueue, orphan info) are stopped, the journal is
 * destroyed (or, without a journal, pending superblock update work is
 * flushed), and only then are the remaining in-memory structures released.
 * If the filesystem is writable and not in an emergency state, the
 * superblock is committed one last time.  Finally the block devices are
 * synced and invalidated and the ext4_sb_info is freed.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int aborted = 0;
	int err;

	/*
	 * Unregister sysfs before destroying jbd2 journal.
	 * Since we could still access attr_journal_task attribute via sysfs
	 * path which could have sbi->s_journal->j_task as NULL
	 * Unregister sysfs before flush sbi->s_sb_upd_work.
	 * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
	 * read metadata verify failed then will queue error work.
	 * update_super_work will call start_this_handle may trigger
	 * BUG_ON.
	 */
	ext4_unregister_sysfs(sb);

	/* The unmount notice shares a global rate limit across filesystems. */
	if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
		ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
			 &sb->s_uuid);

	ext4_unregister_li_request(sb);
	ext4_quotas_off(sb, EXT4_MAXQUOTAS);

	destroy_workqueue(sbi->rsv_conversion_wq);
	ext4_release_orphan_info(sb);

	if (sbi->s_journal) {
		/* Sample the abort state before the journal goes away. */
		aborted = is_journal_aborted(sbi->s_journal);
		err = ext4_journal_destroy(sbi, sbi->s_journal);
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	} else
		flush_work(&sbi->s_sb_upd_work);

	ext4_es_unregister_shrinker(sbi);
	timer_shutdown_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	/*
	 * Final superblock write.  The recovery/orphan feature flags and
	 * s_state are only updated when the journal was not aborted.
	 */
	if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
		if (!aborted) {
			ext4_clear_feature_journal_needs_recovery(sb);
			ext4_clear_feature_orphan_present(sb);
			es->s_state = cpu_to_le16(sbi->s_mount_state);
		}
		ext4_commit_super(sb);
	}

	ext4_group_desc_free(sbi);
	ext4_flex_groups_free(sbi);

	/* Dirty clusters left at unmount are only tolerated on an errored fs. */
	WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
		     percpu_counter_sum(&sbi->s_dirtyclusters_counter));
	ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty. The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev_file) {
		/*
		 * Invalidate the journal device's buffers. We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
		invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	kobject_put(&sbi->s_kobj);
	/* Wait for the kobject release to signal before freeing sbi. */
	wait_for_completion(&sbi->s_kobj_unregister);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev, NULL);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}
1384
1385 static struct kmem_cache *ext4_inode_cachep;
1386
/*
 * ->alloc_inode callback: allocate an ext4_inode_info from
 * ext4_inode_cachep and initialise the fields that must be reset on every
 * allocation.  Returns the embedded VFS inode, or NULL if the allocation
 * fails.
 *
 * Called inside transaction, so use GFP_NOFS
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
	struct ext4_inode_info *ei;

	ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;

	inode_set_iversion(&ei->vfs_inode, 1);
	ei->i_flags = 0;
	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
	spin_lock_init(&ei->i_raw_lock);
	/* Preallocation tracking starts out empty. */
	ei->i_prealloc_node = RB_ROOT;
	atomic_set(&ei->i_prealloc_active, 0);
	rwlock_init(&ei->i_prealloc_lock);
	/* Extent status tree and its bookkeeping counters. */
	ext4_es_init_tree(&ei->i_es_tree);
	rwlock_init(&ei->i_es_lock);
	INIT_LIST_HEAD(&ei->i_es_list);
	ei->i_es_all_nr = 0;
	ei->i_es_shk_nr = 0;
	ei->i_es_shrink_lblk = 0;
	ei->i_es_seq = 0;
	ei->i_reserved_data_blocks = 0;
	spin_lock_init(&(ei->i_block_reservation_lock));
	ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
	ei->i_reserved_quota = 0;
	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
	ei->jinode = NULL;
	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
	spin_lock_init(&ei->i_completed_io_lock);
	ei->i_sync_tid = 0;
	ei->i_datasync_tid = 0;
	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
	ext4_fc_init_inode(&ei->vfs_inode);
	spin_lock_init(&ei->i_fc_lock);
	return &ei->vfs_inode;
}
1429
/*
 * ->drop_inode callback.  The generic decision is taken first; only when
 * it says "keep" (0) is fscrypt consulted.  The final verdict is traced
 * and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int verdict;

	verdict = inode_generic_drop(inode);
	if (verdict == 0)
		verdict = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, verdict);
	return verdict;
}
1440
ext4_free_in_core_inode(struct inode * inode)1441 static void ext4_free_in_core_inode(struct inode *inode)
1442 {
1443 fscrypt_free_inode(inode);
1444 if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
1445 pr_warn("%s: inode %ld still in fc list",
1446 __func__, inode->i_ino);
1447 }
1448 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1449 }
1450
ext4_destroy_inode(struct inode * inode)1451 static void ext4_destroy_inode(struct inode *inode)
1452 {
1453 if (ext4_inode_orphan_tracked(inode)) {
1454 ext4_msg(inode->i_sb, KERN_ERR,
1455 "Inode %lu (%p): inode tracked as orphan!",
1456 inode->i_ino, EXT4_I(inode));
1457 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1458 EXT4_I(inode), sizeof(struct ext4_inode_info),
1459 true);
1460 dump_stack();
1461 }
1462
1463 if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
1464 WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
1465 ext4_msg(inode->i_sb, KERN_ERR,
1466 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
1467 inode->i_ino, EXT4_I(inode),
1468 EXT4_I(inode)->i_reserved_data_blocks);
1469 }
1470
/*
 * ->shutdown callback: force the filesystem down using the
 * EXT4_GOING_FLAGS_NOLOGFLUSH mode.  The result of ext4_force_shutdown()
 * is not checked.
 */
static void ext4_shutdown(struct super_block *sb)
{
	ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
}
1475
/*
 * Slab constructor for ext4_inode_cachep (installed via .ctor in
 * init_inodecache()).  @foo is the ext4_inode_info being constructed.
 * Sets up the members initialised here rather than in ext4_alloc_inode():
 * the orphan list node, the two rwsems and the VFS inode's one-time state.
 */
static void init_once(void *foo)
{
	struct ext4_inode_info *ei = foo;

	INIT_LIST_HEAD(&ei->i_orphan);
	init_rwsem(&ei->xattr_sem);
	init_rwsem(&ei->i_data_sem);
	inode_init_once(&ei->vfs_inode);
	ext4_fc_init_inode(&ei->vfs_inode);
#ifdef CONFIG_FS_ENCRYPTION
	ei->i_crypt_info = NULL;
#endif
}
1489
init_inodecache(void)1490 static int __init init_inodecache(void)
1491 {
1492 struct kmem_cache_args args = {
1493 .useroffset = offsetof(struct ext4_inode_info, i_data),
1494 .usersize = sizeof_field(struct ext4_inode_info, i_data),
1495 .use_freeptr_offset = true,
1496 .freeptr_offset = offsetof(struct ext4_inode_info, i_flags),
1497 .ctor = init_once,
1498 };
1499
1500 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
1501 sizeof(struct ext4_inode_info),
1502 &args,
1503 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);
1504
1505 if (ext4_inode_cachep == NULL)
1506 return -ENOMEM;
1507 return 0;
1508 }
1509
/* Destroy the ext4 inode slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(ext4_inode_cachep);
}
1519
/*
 * Release the ext4-private state attached to @inode while it is being
 * torn down: the fast-commit entry, inode buffers, preallocations, the
 * inode hash entry, cached extent status, quota references, the jbd2
 * inode (if any) and the fscrypt info.  The order of the calls matters;
 * see the comment on remove_inode_hash() below.
 */
void ext4_clear_inode(struct inode *inode)
{
	ext4_fc_del(inode);
	invalidate_inode_buffers(inode);
	clear_inode(inode);
	ext4_discard_preallocations(inode);
	/*
	 * We must remove the inode from the hash before ext4_free_inode()
	 * clears the bit in inode bitmap as otherwise another process reusing
	 * the inode will block in insert_inode_hash() waiting for inode
	 * eviction to complete while holding transaction handle open, but
	 * ext4_evict_inode() still running for that inode could block waiting
	 * for transaction commit if the inode is marked as IS_SYNC => deadlock.
	 *
	 * Removing the inode from the hash here is safe. There are two cases
	 * to consider:
	 * 1) The inode still has references to it (i_nlink > 0). In that case
	 * we are keeping the inode and once we remove the inode from the hash,
	 * iget() can create the new inode structure for the same inode number
	 * and we are fine with that as all IO on behalf of the inode is
	 * finished.
	 * 2) We are deleting the inode (i_nlink == 0). In that case inode
	 * number cannot be reused until ext4_free_inode() clears the bit in
	 * the inode bitmap, at which point all IO is done and reuse is fine
	 * again.
	 */
	remove_inode_hash(inode);
	/* Drop every cached extent status entry of the inode. */
	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
	dquot_drop(inode);
	if (EXT4_I(inode)->jinode) {
		/* Detach the jbd2 inode from the journal before freeing it. */
		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
					       EXT4_I(inode)->jinode);
		jbd2_free_inode(EXT4_I(inode)->jinode);
		EXT4_I(inode)->jinode = NULL;
	}
	fscrypt_put_encryption_info(inode);
}
1557
/*
 * Look up inode @ino for the file-handle decoding helpers.
 *
 * Returns the inode, the error pointer from ext4_iget() if the lookup
 * fails, or ERR_PTR(-ESTALE) when a non-zero @generation does not match
 * the inode's generation.
 */
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
					u64 ino, u32 generation)
{
	struct inode *inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);

	if (IS_ERR(inode))
		return inode;

	/*
	 * Currently we don't know the generation for parent directory, so
	 * a generation of 0 means "accept any"
	 */
	if (!generation || inode->i_generation == generation)
		return inode;

	iput(inode);
	return ERR_PTR(-ESTALE);
}
1577
/*
 * ->fh_to_dentry export operation: delegate to generic_fh_to_dentry(),
 * using ext4_nfs_get_inode() to look up the inode.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1584
/*
 * ->fh_to_parent export operation: delegate to generic_fh_to_parent(),
 * using ext4_nfs_get_inode() to look up the inode.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    ext4_nfs_get_inode);
}
1591
ext4_nfs_commit_metadata(struct inode * inode)1592 static int ext4_nfs_commit_metadata(struct inode *inode)
1593 {
1594 struct writeback_control wbc = {
1595 .sync_mode = WB_SYNC_ALL
1596 };
1597
1598 trace_ext4_nfs_commit_metadata(inode);
1599 return ext4_write_inode(inode, &wbc);
1600 }
1601
1602 #ifdef CONFIG_QUOTA
/* Quota type names, taken from INITQFNAMES; QTYPE2NAME() indexes by type. */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
1605
1606 static int ext4_write_dquot(struct dquot *dquot);
1607 static int ext4_acquire_dquot(struct dquot *dquot);
1608 static int ext4_release_dquot(struct dquot *dquot);
1609 static int ext4_mark_dquot_dirty(struct dquot *dquot);
1610 static int ext4_write_info(struct super_block *sb, int type);
1611 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1612 const struct path *path);
1613 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1614 size_t len, loff_t off);
1615 static ssize_t ext4_quota_write(struct super_block *sb, int type,
1616 const char *data, size_t len, loff_t off);
1617 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
1618 unsigned int flags);
1619
/* ->get_dquots callback: return the i_dquot array of the ext4 inode. */
static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
	return EXT4_I(inode)->i_dquot;
}
1624
/*
 * Quota callbacks for ext4: a mix of ext4-specific handlers (ext4_*) and
 * generic dquot helpers (dquot_*).
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space = ext4_get_reserved_space,
	.write_dquot = ext4_write_dquot,
	.acquire_dquot = ext4_acquire_dquot,
	.release_dquot = ext4_release_dquot,
	.mark_dirty = ext4_mark_dquot_dirty,
	.write_info = ext4_write_info,
	.alloc_dquot = dquot_alloc,
	.destroy_dquot = dquot_destroy,
	.get_projid = ext4_get_projid,
	.get_inode_usage = ext4_get_inode_usage,
	.get_next_id = dquot_get_next_id,
};
1638
/*
 * quotactl entry points: only quota_on/quota_off are ext4-specific, the
 * remaining operations use the generic dquot_* implementations.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on = ext4_quota_on,
	.quota_off = ext4_quota_off,
	.quota_sync = dquot_quota_sync,
	.get_state = dquot_get_state,
	.set_info = dquot_set_dqinfo,
	.get_dqblk = dquot_get_dqblk,
	.set_dqblk = dquot_set_dqblk,
	.get_nextdqblk = dquot_get_next_dqblk,
};
1649 #endif
1650
/*
 * Superblock operations table for ext4.  The quota read/write/get_dquots
 * hooks are only present when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode = ext4_alloc_inode,
	.free_inode = ext4_free_in_core_inode,
	.destroy_inode = ext4_destroy_inode,
	.write_inode = ext4_write_inode,
	.dirty_inode = ext4_dirty_inode,
	.drop_inode = ext4_drop_inode,
	.evict_inode = ext4_evict_inode,
	.put_super = ext4_put_super,
	.sync_fs = ext4_sync_fs,
	.freeze_fs = ext4_freeze,
	.unfreeze_fs = ext4_unfreeze,
	.statfs = ext4_statfs,
	.show_options = ext4_show_options,
	.shutdown = ext4_shutdown,
#ifdef CONFIG_QUOTA
	.quota_read = ext4_quota_read,
	.quota_write = ext4_quota_write,
	.get_dquots = ext4_get_dquots,
#endif
};
1672
/*
 * Export (file handle) operations.  Encoding uses the generic 32-bit inode
 * handle helper; decoding goes through the ext4 wrappers above.
 */
static const struct export_operations ext4_export_ops = {
	.encode_fh = generic_encode_ino32_fh,
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
	.commit_metadata = ext4_nfs_commit_metadata,
};
1680
/*
 * Mount option identifiers.  They are used as the option value in
 * ext4_param_specs; a few (Opt_data_err_*, Opt_dax_*) also serve as the
 * values of the enum-typed options' constant tables.  The two fast-commit
 * debug identifiers only exist with CONFIG_EXT4_DEBUG.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_acl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
1709
/* Accepted values of the "errors=" option, mapped to EXT4_MOUNT_ERRORS_* flags. */
static const struct constant_table ext4_param_errors[] = {
	{"continue", EXT4_MOUNT_ERRORS_CONT},
	{"panic", EXT4_MOUNT_ERRORS_PANIC},
	{"remount-ro", EXT4_MOUNT_ERRORS_RO},
	{}
};
1716
/* Accepted values of the "data=" option, mapped to EXT4_MOUNT_*_DATA flags. */
static const struct constant_table ext4_param_data[] = {
	{"journal", EXT4_MOUNT_JOURNAL_DATA},
	{"ordered", EXT4_MOUNT_ORDERED_DATA},
	{"writeback", EXT4_MOUNT_WRITEBACK_DATA},
	{}
};
1723
/* Accepted values of the "data_err=" option, mapped to Opt_data_err_* identifiers. */
static const struct constant_table ext4_param_data_err[] = {
	{"abort", Opt_data_err_abort},
	{"ignore", Opt_data_err_ignore},
	{}
};
1729
/* Accepted values of the "jqfmt=" option, mapped to QFMT_VFS_* quota format ids. */
static const struct constant_table ext4_param_jqfmt[] = {
	{"vfsold", QFMT_VFS_OLD},
	{"vfsv0", QFMT_VFS_V0},
	{"vfsv1", QFMT_VFS_V1},
	{}
};
1736
/* Accepted values of the "dax=" option, mapped to Opt_dax_* identifiers. */
static const struct constant_table ext4_param_dax[] = {
	{"always", Opt_dax_always},
	{"inode", Opt_dax_inode},
	{"never", Opt_dax_never},
	{}
};
1743
/*
 * Mount option specification
 * We don't use fsparam_flag_no because of the way we set the
 * options and the way we show them in _ext4_show_options(). To
 * keep the changes to a minimum, let's keep the negative options
 * separate for now.
 *
 * Notes on the table:
 *  - Some option names are listed twice with different parameter types
 *    ("barrier", "dax", "auto_da_alloc", "init_itable",
 *    "test_dummy_encryption"), presumably so that both the bare and the
 *    valued spelling are accepted.
 *  - Several names map to the same identifier (aliases such as
 *    "grpid"/"bsdgroups" or "norecovery"/"noload").
 *  - Obsolete options all map to Opt_removed.
 */
static const struct fs_parameter_spec ext4_param_specs[] = {
	fsparam_flag	("bsddf",		Opt_bsd_df),
	fsparam_flag	("minixdf",		Opt_minix_df),
	fsparam_flag	("grpid",		Opt_grpid),
	fsparam_flag	("bsdgroups",		Opt_grpid),
	fsparam_flag	("nogrpid",		Opt_nogrpid),
	fsparam_flag	("sysvgroups",		Opt_nogrpid),
	fsparam_gid	("resgid",		Opt_resgid),
	fsparam_uid	("resuid",		Opt_resuid),
	fsparam_u32	("sb",			Opt_sb),
	fsparam_enum	("errors",		Opt_errors, ext4_param_errors),
	fsparam_flag	("nouid32",		Opt_nouid32),
	fsparam_flag	("debug",		Opt_debug),
	fsparam_flag	("oldalloc",		Opt_removed),
	fsparam_flag	("orlov",		Opt_removed),
	fsparam_flag	("user_xattr",		Opt_user_xattr),
	fsparam_flag	("acl",			Opt_acl),
	fsparam_flag	("norecovery",		Opt_noload),
	fsparam_flag	("noload",		Opt_noload),
	fsparam_flag	("bh",			Opt_removed),
	fsparam_flag	("nobh",		Opt_removed),
	fsparam_u32	("commit",		Opt_commit),
	fsparam_u32	("min_batch_time",	Opt_min_batch_time),
	fsparam_u32	("max_batch_time",	Opt_max_batch_time),
	fsparam_u32	("journal_dev",		Opt_journal_dev),
	fsparam_bdev	("journal_path",	Opt_journal_path),
	fsparam_flag	("journal_checksum",	Opt_journal_checksum),
	fsparam_flag	("nojournal_checksum",	Opt_nojournal_checksum),
	fsparam_flag	("journal_async_commit",Opt_journal_async_commit),
	fsparam_flag	("abort",		Opt_abort),
	fsparam_enum	("data",		Opt_data, ext4_param_data),
	fsparam_enum	("data_err",		Opt_data_err,
						ext4_param_data_err),
	fsparam_string_empty
			("usrjquota",		Opt_usrjquota),
	fsparam_string_empty
			("grpjquota",		Opt_grpjquota),
	fsparam_enum	("jqfmt",		Opt_jqfmt, ext4_param_jqfmt),
	fsparam_flag	("grpquota",		Opt_grpquota),
	fsparam_flag	("quota",		Opt_quota),
	fsparam_flag	("noquota",		Opt_noquota),
	fsparam_flag	("usrquota",		Opt_usrquota),
	fsparam_flag	("prjquota",		Opt_prjquota),
	fsparam_flag	("barrier",		Opt_barrier),
	fsparam_u32	("barrier",		Opt_barrier),
	fsparam_flag	("nobarrier",		Opt_nobarrier),
	fsparam_flag	("i_version",		Opt_removed),
	fsparam_flag	("dax",			Opt_dax),
	fsparam_enum	("dax",			Opt_dax_type, ext4_param_dax),
	fsparam_u32	("stripe",		Opt_stripe),
	fsparam_flag	("delalloc",		Opt_delalloc),
	fsparam_flag	("nodelalloc",		Opt_nodelalloc),
	fsparam_flag	("warn_on_error",	Opt_warn_on_error),
	fsparam_flag	("nowarn_on_error",	Opt_nowarn_on_error),
	fsparam_u32	("debug_want_extra_isize",
						Opt_debug_want_extra_isize),
	fsparam_flag	("mblk_io_submit",	Opt_removed),
	fsparam_flag	("nomblk_io_submit",	Opt_removed),
	fsparam_flag	("block_validity",	Opt_block_validity),
	fsparam_flag	("noblock_validity",	Opt_noblock_validity),
	fsparam_u32	("inode_readahead_blks",
						Opt_inode_readahead_blks),
	fsparam_u32	("journal_ioprio",	Opt_journal_ioprio),
	fsparam_u32	("auto_da_alloc",	Opt_auto_da_alloc),
	fsparam_flag	("auto_da_alloc",	Opt_auto_da_alloc),
	fsparam_flag	("noauto_da_alloc",	Opt_noauto_da_alloc),
	fsparam_flag	("dioread_nolock",	Opt_dioread_nolock),
	fsparam_flag	("nodioread_nolock",	Opt_dioread_lock),
	fsparam_flag	("dioread_lock",	Opt_dioread_lock),
	fsparam_flag	("discard",		Opt_discard),
	fsparam_flag	("nodiscard",		Opt_nodiscard),
	fsparam_u32	("init_itable",		Opt_init_itable),
	fsparam_flag	("init_itable",		Opt_init_itable),
	fsparam_flag	("noinit_itable",	Opt_noinit_itable),
#ifdef CONFIG_EXT4_DEBUG
	fsparam_flag	("fc_debug_force",	Opt_fc_debug_force),
	fsparam_u32	("fc_debug_max_replay",	Opt_fc_debug_max_replay),
#endif
	fsparam_u32	("max_dir_size_kb",	Opt_max_dir_size_kb),
	fsparam_flag	("test_dummy_encryption",
						Opt_test_dummy_encryption),
	fsparam_string	("test_dummy_encryption",
						Opt_test_dummy_encryption),
	fsparam_flag	("inlinecrypt",		Opt_inlinecrypt),
	fsparam_flag	("nombcache",		Opt_nombcache),
	fsparam_flag	("no_mbcache",		Opt_nombcache), /* for backward compatibility */
	fsparam_flag	("prefetch_block_bitmaps",
						Opt_removed),
	fsparam_flag	("no_prefetch_block_bitmaps",
						Opt_no_prefetch_block_bitmaps),
	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
	fsparam_string	("check",		Opt_removed), /* mount option from ext2/3 */
	fsparam_flag	("nocheck",		Opt_removed), /* mount option from ext2/3 */
	fsparam_flag	("reservation",		Opt_removed), /* mount option from ext2/3 */
	fsparam_flag	("noreservation",	Opt_removed), /* mount option from ext2/3 */
	fsparam_u32	("journal",		Opt_removed), /* mount option from ext2/3 */
	{}
};
1849
1850
/*
 * Behaviour flags for entries of ext4_mount_opts[] (the 'flags' member of
 * struct mount_opts).  ext4_parse_param() ORs the flags of every option it
 * sees into ext4_fs_context::opt_flags.
 */
#define MOPT_SET	0x0001	/* option sets the mount_opt bit(s) */
#define MOPT_CLEAR	0x0002	/* option clears the mount_opt bit(s) */
#define MOPT_NOSUPPORT	0x0004	/* reported as "option not supported" */
#define MOPT_EXPLICIT	0x0008	/* also record the matching EXPLICIT_* bit */
#ifdef CONFIG_QUOTA
#define MOPT_Q		0
#define MOPT_QFMT	0x0010
#else
/* Without CONFIG_QUOTA all quota options collapse to "not supported". */
#define MOPT_Q		MOPT_NOSUPPORT
#define MOPT_QFMT	MOPT_NOSUPPORT
#endif
#define MOPT_NO_EXT2	0x0020	/* rejected when the sb is IS_EXT2_SB() */
#define MOPT_NO_EXT3	0x0040	/* rejected when the sb is IS_EXT3_SB() */
#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_SKIP	0x0080
#define MOPT_2		0x0100	/* mount_opt refers to the mount_opt2 word */
1867
/*
 * Table mapping parsed option tokens to the mount option bit(s) they affect
 * and to the MOPT_* flags describing how they are handled.
 *
 * ext4_parse_param() scans this table linearly for the token it got from
 * fs_parse(); the terminating {Opt_err, 0, 0} entry doubles as the
 * "not found" result (flags == 0, mount_opt == 0).
 */
static const struct mount_opts {
	int	token;		/* Opt_* token */
	int	mount_opt;	/* EXT4_MOUNT_* (or EXT4_MOUNT2_* with MOPT_2) bits */
	int	flags;		/* MOPT_* flags */
} ext4_mount_opts[] = {
	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_SET},
	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
	{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
	/* Entries with mount_opt == 0 exist only for their flags. */
	{Opt_commit, 0, MOPT_NO_EXT2},
	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_CLEAR},
	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
				    EXT4_MOUNT_JOURNAL_CHECKSUM),
	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
	{Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
	/* The stored bit is the negative form, hence SET/CLEAR look swapped. */
	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
	{Opt_dax_type, 0, MOPT_EXT4_ONLY},
	{Opt_journal_dev, 0, MOPT_NO_EXT2},
	{Opt_journal_path, 0, MOPT_NO_EXT2},
	{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
	{Opt_data, 0, MOPT_NO_EXT2},
	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
#else
	{Opt_acl, 0, MOPT_NOSUPPORT},
#endif
	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
	/* MOPT_Q turns into MOPT_NOSUPPORT when CONFIG_QUOTA is off. */
	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
							MOPT_SET | MOPT_Q},
	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
							MOPT_CLEAR | MOPT_Q},
	{Opt_usrjquota, 0, MOPT_Q},
	{Opt_grpjquota, 0, MOPT_Q},
	{Opt_jqfmt, 0, MOPT_QFMT},
	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
	 MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
	{Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
	/* Sentinel: must stay last. */
	{Opt_err, 0, 0}
};
1942
1943 #if IS_ENABLED(CONFIG_UNICODE)
1944 static const struct ext4_sb_encodings {
1945 __u16 magic;
1946 char *name;
1947 unsigned int version;
1948 } ext4_sb_encoding_map[] = {
1949 {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
1950 };
1951
1952 static const struct ext4_sb_encodings *
ext4_sb_read_encoding(const struct ext4_super_block * es)1953 ext4_sb_read_encoding(const struct ext4_super_block *es)
1954 {
1955 __u16 magic = le16_to_cpu(es->s_encoding);
1956 int i;
1957
1958 for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
1959 if (magic == ext4_sb_encoding_map[i].magic)
1960 return &ext4_sb_encoding_map[i];
1961
1962 return NULL;
1963 }
1964 #endif
1965
/*
 * Bits for ext4_fs_context::spec, recording which values were explicitly
 * given while parsing.  The lower-case names deliberately match the field
 * names so that a token-pasting helper (APPLY(X) in ext4_apply_options())
 * can derive the bit from the field.  Bit 6 is not assigned here.
 */
#define EXT4_SPEC_JQUOTA			(1 <<  0)
#define EXT4_SPEC_JQFMT				(1 <<  1)
#define EXT4_SPEC_DATAJ				(1 <<  2)
#define EXT4_SPEC_SB_BLOCK			(1 <<  3)
#define EXT4_SPEC_JOURNAL_DEV			(1 <<  4)
#define EXT4_SPEC_JOURNAL_IOPRIO		(1 <<  5)
#define EXT4_SPEC_s_want_extra_isize		(1 <<  7)
#define EXT4_SPEC_s_max_batch_time		(1 <<  8)
#define EXT4_SPEC_s_min_batch_time		(1 <<  9)
#define EXT4_SPEC_s_inode_readahead_blks	(1 << 10)
#define EXT4_SPEC_s_li_wait_mult		(1 << 11)
#define EXT4_SPEC_s_max_dir_size_kb		(1 << 12)
#define EXT4_SPEC_s_stripe			(1 << 13)
#define EXT4_SPEC_s_resuid			(1 << 14)
#define EXT4_SPEC_s_resgid			(1 << 15)
#define EXT4_SPEC_s_commit_interval		(1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay		(1 << 17)
#define EXT4_SPEC_s_sb_block			(1 << 18)
#define EXT4_SPEC_mb_optimize_scan		(1 << 19)
1985
/*
 * Per-fs_context parsing state (fc->fs_private).
 *
 * Options are collected here while parsing and only later checked and
 * copied into the superblock info.  Bit-flag options are kept as
 * (mask, vals) pairs: 'mask' records which bits were touched and 'vals'
 * holds their requested state.  Scalar options are paired with an
 * EXT4_SPEC_* bit in 'spec' that says whether the value was given.
 */
struct ext4_fs_context {
	char		*s_qf_names[EXT4_MAXQUOTAS];	/* kmalloc'ed journaled quota file names */
	struct fscrypt_dummy_policy dummy_enc_policy;
	int		s_jquota_fmt;	/* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
	int s_fc_debug_max_replay;
#endif
	unsigned short	qname_spec;	/* bit (1 << qtype) set when that name was set or cleared */
	unsigned long	vals_s_flags;	/* Bits to set in s_flags */
	unsigned long	mask_s_flags;	/* Bits changed in s_flags */
	unsigned long	journal_devnum;
	unsigned long	s_commit_interval;	/* stored as HZ * seconds */
	unsigned long	s_stripe;
	unsigned int	s_inode_readahead_blks;
	unsigned int	s_want_extra_isize;
	unsigned int	s_li_wait_mult;
	unsigned int	s_max_dir_size_kb;
	unsigned int	journal_ioprio;
	unsigned int	vals_s_mount_opt;
	unsigned int	mask_s_mount_opt;
	unsigned int	vals_s_mount_opt2;
	unsigned int	mask_s_mount_opt2;
	unsigned int	opt_flags;	/* MOPT flags */
	unsigned int	spec;		/* EXT4_SPEC_* bits */
	u32		s_max_batch_time;
	u32		s_min_batch_time;
	kuid_t		s_resuid;
	kgid_t		s_resgid;
	ext4_fsblk_t	s_sb_block;
};
2016
ext4_fc_free(struct fs_context * fc)2017 static void ext4_fc_free(struct fs_context *fc)
2018 {
2019 struct ext4_fs_context *ctx = fc->fs_private;
2020 int i;
2021
2022 if (!ctx)
2023 return;
2024
2025 for (i = 0; i < EXT4_MAXQUOTAS; i++)
2026 kfree(ctx->s_qf_names[i]);
2027
2028 fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
2029 kfree(ctx);
2030 }
2031
ext4_init_fs_context(struct fs_context * fc)2032 int ext4_init_fs_context(struct fs_context *fc)
2033 {
2034 struct ext4_fs_context *ctx;
2035
2036 ctx = kzalloc_obj(struct ext4_fs_context);
2037 if (!ctx)
2038 return -ENOMEM;
2039
2040 fc->fs_private = ctx;
2041 fc->ops = &ext4_context_ops;
2042
2043 /* i_version is always enabled now */
2044 fc->sb_flags |= SB_I_VERSION;
2045
2046 return 0;
2047 }
2048
2049 #ifdef CONFIG_QUOTA
2050 /*
2051 * Note the name of the specified quota file.
2052 */
note_qf_name(struct fs_context * fc,int qtype,struct fs_parameter * param)2053 static int note_qf_name(struct fs_context *fc, int qtype,
2054 struct fs_parameter *param)
2055 {
2056 struct ext4_fs_context *ctx = fc->fs_private;
2057 char *qname;
2058
2059 if (param->size < 1) {
2060 ext4_msg(NULL, KERN_ERR, "Missing quota name");
2061 return -EINVAL;
2062 }
2063 if (strchr(param->string, '/')) {
2064 ext4_msg(NULL, KERN_ERR,
2065 "quotafile must be on filesystem root");
2066 return -EINVAL;
2067 }
2068 if (ctx->s_qf_names[qtype]) {
2069 if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) {
2070 ext4_msg(NULL, KERN_ERR,
2071 "%s quota file already specified",
2072 QTYPE2NAME(qtype));
2073 return -EINVAL;
2074 }
2075 return 0;
2076 }
2077
2078 qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
2079 if (!qname) {
2080 ext4_msg(NULL, KERN_ERR,
2081 "Not enough memory for storing quotafile name");
2082 return -ENOMEM;
2083 }
2084 ctx->s_qf_names[qtype] = qname;
2085 ctx->qname_spec |= 1 << qtype;
2086 ctx->spec |= EXT4_SPEC_JQUOTA;
2087 return 0;
2088 }
2089
2090 /*
2091 * Clear the name of the specified quota file.
2092 */
unnote_qf_name(struct fs_context * fc,int qtype)2093 static int unnote_qf_name(struct fs_context *fc, int qtype)
2094 {
2095 struct ext4_fs_context *ctx = fc->fs_private;
2096
2097 kfree(ctx->s_qf_names[qtype]);
2098
2099 ctx->s_qf_names[qtype] = NULL;
2100 ctx->qname_spec |= 1 << qtype;
2101 ctx->spec |= EXT4_SPEC_JQUOTA;
2102 return 0;
2103 }
2104 #endif
2105
ext4_parse_test_dummy_encryption(const struct fs_parameter * param,struct ext4_fs_context * ctx)2106 static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
2107 struct ext4_fs_context *ctx)
2108 {
2109 int err;
2110
2111 if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
2112 ext4_msg(NULL, KERN_WARNING,
2113 "test_dummy_encryption option not supported");
2114 return -EINVAL;
2115 }
2116 err = fscrypt_parse_test_dummy_encryption(param,
2117 &ctx->dummy_enc_policy);
2118 if (err == -EINVAL) {
2119 ext4_msg(NULL, KERN_WARNING,
2120 "Value of option \"%s\" is unrecognized", param->key);
2121 } else if (err == -EEXIST) {
2122 ext4_msg(NULL, KERN_WARNING,
2123 "Conflicting test_dummy_encryption options");
2124 return -EINVAL;
2125 }
2126 return err;
2127 }
2128
/*
 * Generators for the ctx_{set,clear,test}_<name>() helpers that operate on
 * the (mask_s_<name>, vals_s_<name>) pairs of struct ext4_fs_context.
 */

/* ctx_set_<name>(): mark @flag as changed and request it to be set. */
#define EXT4_SET_CTX(name)						\
static inline __maybe_unused						\
void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
{									\
	ctx->mask_s_##name |= flag;					\
	ctx->vals_s_##name |= flag;					\
}

/* ctx_clear_<name>(): mark @flag as changed and request it to be cleared. */
#define EXT4_CLEAR_CTX(name)						\
static inline __maybe_unused						\
void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
{									\
	ctx->mask_s_##name |= flag;					\
	ctx->vals_s_##name &= ~flag;					\
}

/*
 * ctx_test_<name>(): return the requested state of @flag.  Only the 'vals'
 * word is consulted, so an untouched bit reads as 0.
 */
#define EXT4_TEST_CTX(name)						\
static inline unsigned long						\
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
{									\
	return (ctx->vals_s_##name & flag);				\
}

EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
EXT4_CLEAR_CTX(mount_opt);
EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);
2159
/*
 * Parse a single mount parameter into the ext4 parsing context.
 *
 * The parameter is tokenised with fs_parse() against ext4_param_specs and
 * then handled in two stages: options that carry a value or need special
 * treatment are dealt with in the switch below, and everything else falls
 * through to the generic MOPT_SET/MOPT_CLEAR handling driven by
 * ext4_mount_opts[].  Nothing is applied to a superblock here; results are
 * only recorded in fc->fs_private.
 *
 * Returns 0 on success (including options that are merely ignored with a
 * message), the negative value from fs_parse() if tokenising fails, or a
 * negative errno for an invalid option value.
 */
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct ext4_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	const struct mount_opts *m;
	int is_remount;
	int token;

	token = fs_parse(fc, ext4_param_specs, param, &result);
	if (token < 0)
		return token;
	is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;

	/*
	 * Find the table entry for this token.  If there is none, m ends up
	 * on the Opt_err sentinel, whose flags and mount_opt are both 0.
	 */
	for (m = ext4_mount_opts; m->token != Opt_err; m++)
		if (token == m->token)
			break;

	ctx->opt_flags |= m->flags;

	if (m->flags & MOPT_EXPLICIT) {
		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
			ctx_set_mount_opt2(ctx,
				       EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
		} else
			return -EINVAL;
	}

	/* Unsupported options are logged and then ignored, not rejected. */
	if (m->flags & MOPT_NOSUPPORT) {
		ext4_msg(NULL, KERN_ERR, "%s option not supported",
			 param->key);
		return 0;
	}

	switch (token) {
#ifdef CONFIG_QUOTA
	/* An empty value clears the name, anything else records it. */
	case Opt_usrjquota:
		if (!*param->string)
			return unnote_qf_name(fc, USRQUOTA);
		else
			return note_qf_name(fc, USRQUOTA, param);
	case Opt_grpjquota:
		if (!*param->string)
			return unnote_qf_name(fc, GRPQUOTA);
		else
			return note_qf_name(fc, GRPQUOTA, param);
#endif
	case Opt_sb:
		if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
			ext4_msg(NULL, KERN_WARNING,
				 "Ignoring %s option on remount", param->key);
		} else {
			ctx->s_sb_block = result.uint_32;
			ctx->spec |= EXT4_SPEC_s_sb_block;
		}
		return 0;
	case Opt_removed:
		ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
			 param->key);
		return 0;
	case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
		ctx_set_flags(ctx, SB_INLINECRYPT);
#else
		ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
#endif
		return 0;
	case Opt_errors:
		/* Replace whatever errors= behaviour was requested before. */
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
		ctx_set_mount_opt(ctx, result.uint_32);
		return 0;
#ifdef CONFIG_QUOTA
	case Opt_jqfmt:
		ctx->s_jquota_fmt = result.uint_32;
		ctx->spec |= EXT4_SPEC_JQFMT;
		return 0;
#endif
	case Opt_data:
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
		ctx_set_mount_opt(ctx, result.uint_32);
		ctx->spec |= EXT4_SPEC_DATAJ;
		return 0;
	case Opt_commit:
		/* commit=0 selects the default; the upper bound keeps HZ * value within INT_MAX. */
		if (result.uint_32 == 0)
			result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
		else if (result.uint_32 > INT_MAX / HZ) {
			ext4_msg(NULL, KERN_ERR,
				 "Invalid commit interval %d, "
				 "must be smaller than %d",
				 result.uint_32, INT_MAX / HZ);
			return -EINVAL;
		}
		ctx->s_commit_interval = HZ * result.uint_32;
		ctx->spec |= EXT4_SPEC_s_commit_interval;
		return 0;
	case Opt_debug_want_extra_isize:
		/* Must be even and at least 4. */
		if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
			ext4_msg(NULL, KERN_ERR,
				 "Invalid want_extra_isize %d", result.uint_32);
			return -EINVAL;
		}
		ctx->s_want_extra_isize = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_want_extra_isize;
		return 0;
	case Opt_max_batch_time:
		ctx->s_max_batch_time = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_max_batch_time;
		return 0;
	case Opt_min_batch_time:
		ctx->s_min_batch_time = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_min_batch_time;
		return 0;
	case Opt_inode_readahead_blks:
		/* 0 is allowed; otherwise a power of two no larger than 1 << 30. */
		if (result.uint_32 &&
		    (result.uint_32 > (1 << 30) ||
		     !is_power_of_2(result.uint_32))) {
			ext4_msg(NULL, KERN_ERR,
				 "EXT4-fs: inode_readahead_blks must be "
				 "0 or a power of 2 smaller than 2^31");
			return -EINVAL;
		}
		ctx->s_inode_readahead_blks = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
		return 0;
	case Opt_init_itable:
		/* Bare "init_itable" uses the default multiplier, "init_itable=N" uses N. */
		ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
		ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
		if (param->type == fs_value_is_string)
			ctx->s_li_wait_mult = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_li_wait_mult;
		return 0;
	case Opt_max_dir_size_kb:
		ctx->s_max_dir_size_kb = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
		return 0;
#ifdef CONFIG_EXT4_DEBUG
	case Opt_fc_debug_max_replay:
		ctx->s_fc_debug_max_replay = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
		return 0;
#endif
	case Opt_stripe:
		ctx->s_stripe = result.uint_32;
		ctx->spec |= EXT4_SPEC_s_stripe;
		return 0;
	case Opt_resuid:
		ctx->s_resuid = result.uid;
		ctx->spec |= EXT4_SPEC_s_resuid;
		return 0;
	case Opt_resgid:
		ctx->s_resgid = result.gid;
		ctx->spec |= EXT4_SPEC_s_resgid;
		return 0;
	case Opt_journal_dev:
		if (is_remount) {
			ext4_msg(NULL, KERN_ERR,
				 "Cannot specify journal on remount");
			return -EINVAL;
		}
		ctx->journal_devnum = result.uint_32;
		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
		return 0;
	case Opt_journal_path:
	{
		struct inode *journal_inode;
		struct path path;
		int error;

		if (is_remount) {
			ext4_msg(NULL, KERN_ERR,
				 "Cannot specify journal on remount");
			return -EINVAL;
		}

		error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
		if (error) {
			ext4_msg(NULL, KERN_ERR, "error: could not find "
				 "journal device path");
			return -EINVAL;
		}

		/* Only the device number is kept; the path reference is dropped again. */
		journal_inode = d_inode(path.dentry);
		ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
		ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
		path_put(&path);
		return 0;
	}
	case Opt_journal_ioprio:
		if (result.uint_32 > 7) {
			ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
				 " (must be 0-7)");
			return -EINVAL;
		}
		ctx->journal_ioprio =
			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
		ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
		return 0;
	case Opt_test_dummy_encryption:
		return ext4_parse_test_dummy_encryption(param, ctx);
	case Opt_dax:
	case Opt_dax_type:
#ifdef CONFIG_FS_DAX
	{
		/* Bare "dax" is handled like "dax=always". */
		int type = (token == Opt_dax) ?
			   Opt_dax : result.uint_32;

		switch (type) {
		case Opt_dax:
		case Opt_dax_always:
			ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
			break;
		case Opt_dax_never:
			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
			break;
		case Opt_dax_inode:
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
			/* Strictly for printing options */
			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
			break;
		}
		return 0;
	}
#else
		ext4_msg(NULL, KERN_INFO, "dax option not supported");
		return -EINVAL;
#endif
	case Opt_data_err:
		if (result.uint_32 == Opt_data_err_abort)
			ctx_set_mount_opt(ctx, m->mount_opt);
		else if (result.uint_32 == Opt_data_err_ignore)
			ctx_clear_mount_opt(ctx, m->mount_opt);
		return 0;
	case Opt_mb_optimize_scan:
		if (result.int_32 == 1) {
			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
		} else if (result.int_32 == 0) {
			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
		} else {
			ext4_msg(NULL, KERN_WARNING,
				 "mb_optimize_scan should be set to 0 or 1.");
			return -EINVAL;
		}
		return 0;
	}

	/*
	 * At this point we should only be getting options requiring MOPT_SET,
	 * or MOPT_CLEAR. Anything else is a bug
	 */
	if (m->token == Opt_err) {
		ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
			 param->key);
		WARN_ON(1);
		return -EINVAL;
	}

	else {
		unsigned int set = 0;

		/*
		 * A bare flag means "on"; for the valued form (e.g.
		 * "barrier=0") any non-zero value means "on".
		 */
		if ((param->type == fs_value_is_flag) ||
		    result.uint_32 > 0)
			set = 1;

		/* MOPT_CLEAR options have the opposite sense. */
		if (m->flags & MOPT_CLEAR)
			set = !set;
		else if (unlikely(!(m->flags & MOPT_SET))) {
			ext4_msg(NULL, KERN_WARNING,
				 "buggy handling of option %s",
				 param->key);
			WARN_ON(1);
			return -EINVAL;
		}
		if (m->flags & MOPT_2) {
			if (set != 0)
				ctx_set_mount_opt2(ctx, m->mount_opt);
			else
				ctx_clear_mount_opt2(ctx, m->mount_opt);
		} else {
			if (set != 0)
				ctx_set_mount_opt(ctx, m->mount_opt);
			else
				ctx_clear_mount_opt(ctx, m->mount_opt);
		}
	}

	return 0;
}
2453
parse_options(struct fs_context * fc,char * options)2454 static int parse_options(struct fs_context *fc, char *options)
2455 {
2456 struct fs_parameter param;
2457 int ret;
2458 char *key;
2459
2460 if (!options)
2461 return 0;
2462
2463 while ((key = strsep(&options, ",")) != NULL) {
2464 if (*key) {
2465 size_t v_len = 0;
2466 char *value = strchr(key, '=');
2467
2468 param.type = fs_value_is_flag;
2469 param.string = NULL;
2470
2471 if (value) {
2472 if (value == key)
2473 continue;
2474
2475 *value++ = 0;
2476 v_len = strlen(value);
2477 param.string = kmemdup_nul(value, v_len,
2478 GFP_KERNEL);
2479 if (!param.string)
2480 return -ENOMEM;
2481 param.type = fs_value_is_string;
2482 }
2483
2484 param.key = key;
2485 param.size = v_len;
2486
2487 ret = ext4_parse_param(fc, ¶m);
2488 kfree(param.string);
2489 if (ret < 0)
2490 return ret;
2491 }
2492 }
2493
2494 ret = ext4_validate_options(fc);
2495 if (ret < 0)
2496 return ret;
2497
2498 return 0;
2499 }
2500
parse_apply_sb_mount_options(struct super_block * sb,struct ext4_fs_context * m_ctx)2501 static int parse_apply_sb_mount_options(struct super_block *sb,
2502 struct ext4_fs_context *m_ctx)
2503 {
2504 struct ext4_sb_info *sbi = EXT4_SB(sb);
2505 char s_mount_opts[64];
2506 struct ext4_fs_context *s_ctx = NULL;
2507 struct fs_context *fc = NULL;
2508 int ret = -ENOMEM;
2509
2510 if (!sbi->s_es->s_mount_opts[0])
2511 return 0;
2512
2513 if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0)
2514 return -E2BIG;
2515
2516 fc = kzalloc_obj(struct fs_context);
2517 if (!fc)
2518 return -ENOMEM;
2519
2520 s_ctx = kzalloc_obj(struct ext4_fs_context);
2521 if (!s_ctx)
2522 goto out_free;
2523
2524 fc->fs_private = s_ctx;
2525 fc->s_fs_info = sbi;
2526
2527 ret = parse_options(fc, s_mount_opts);
2528 if (ret < 0)
2529 goto parse_failed;
2530
2531 ret = ext4_check_opt_consistency(fc, sb);
2532 if (ret < 0) {
2533 parse_failed:
2534 ext4_msg(sb, KERN_WARNING,
2535 "failed to parse options in superblock: %s",
2536 s_mount_opts);
2537 ret = 0;
2538 goto out_free;
2539 }
2540
2541 if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
2542 m_ctx->journal_devnum = s_ctx->journal_devnum;
2543 if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
2544 m_ctx->journal_ioprio = s_ctx->journal_ioprio;
2545
2546 ext4_apply_options(fc, sb);
2547 ret = 0;
2548
2549 out_free:
2550 ext4_fc_free(fc);
2551 kfree(fc);
2552 return ret;
2553 }
2554
/*
 * Transfer the parsed journaled-quota settings from @fc to @sb.
 *
 * Does nothing when the filesystem has the quota feature or when the
 * kernel is built without CONFIG_QUOTA.  For each quota type whose name
 * was given or cleared, ownership of the name moves from the context to
 * the sb info and the previously installed name, if any, is freed via
 * RCU.  The quota format is copied if it was specified.
 */
static void ext4_apply_quota_options(struct fs_context *fc,
				     struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int type;

	if (ext4_has_feature_quota(sb))
		return;

	if (ctx->spec & EXT4_SPEC_JQUOTA) {
		for (type = 0; type < EXT4_MAXQUOTAS; type++) {
			char *new_name, *old_name;

			if ((ctx->qname_spec & (1 << type)) == 0)
				continue;

			/* Take the name away from the context; may be NULL. */
			new_name = ctx->s_qf_names[type];
			ctx->s_qf_names[type] = NULL;
			if (new_name)
				set_opt(sb, QUOTA);

			old_name = rcu_replace_pointer(sbi->s_qf_names[type],
						new_name,
						lockdep_is_held(&sb->s_umount));
			if (old_name)
				kfree_rcu_mightsleep(old_name);
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT)
		sbi->s_jquota_fmt = ctx->s_jquota_fmt;
#endif
}
2588
/*
 * Check quota settings consistency.
 *
 * Validates the quota-related options collected in @fc against the current
 * state of @sb: project quota needs the project feature, quota options and
 * journaled quota names/format must not change while quotas are loaded,
 * and old-style (usrquota/grpquota) and journaled (usrjquota/grpjquota)
 * quota must not be mixed.
 *
 * Returns 0 if the options are acceptable and -EINVAL otherwise.  Always
 * returns 0 when the kernel is built without CONFIG_QUOTA.
 */
static int ext4_check_quota_consistency(struct fs_context *fc,
					struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool quota_feature = ext4_has_feature_quota(sb);
	bool quota_loaded = sb_any_quota_loaded(sb);
	bool usr_qf_name, grp_qf_name, usrquota, grpquota;
	int quota_flags, i;

	/*
	 * We do the test below only for project quotas. 'usrquota' and
	 * 'grpquota' mount options are allowed even without quota feature
	 * to support legacy quotas in quota files.
	 */
	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
	    !ext4_has_feature_project(sb)) {
		ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
			 "Cannot enable project quota enforcement.");
		return -EINVAL;
	}

	/*
	 * With quotas loaded, refuse a request that touched the quota bits
	 * but left none of them set (i.e. tried to turn quota off).
	 */
	quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
		      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
	if (quota_loaded &&
	    ctx->mask_s_mount_opt & quota_flags &&
	    !ctx_test_mount_opt(ctx, quota_flags))
		goto err_quota_change;

	if (ctx->spec & EXT4_SPEC_JQUOTA) {

		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
			/* Only look at types whose name was set or cleared. */
			if (!(ctx->qname_spec & (1 << i)))
				continue;

			/* Adding or removing a name is not allowed while loaded. */
			if (quota_loaded &&
			    !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
				goto err_jquota_change;

			/* An already installed name may only be repeated. */
			if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
			    strcmp(get_qf_name(sb, sbi, i),
				   ctx->s_qf_names[i]) != 0)
				goto err_jquota_specified;
		}

		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO,
				 "Journaled quota options ignored when "
				 "QUOTA feature is enabled");
			return 0;
		}
	}

	if (ctx->spec & EXT4_SPEC_JQFMT) {
		if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
			goto err_jquota_change;
		if (quota_feature) {
			ext4_msg(NULL, KERN_INFO, "Quota format mount options "
				 "ignored when QUOTA feature is enabled");
			return 0;
		}
	}

	/* Make sure we don't mix old and new quota format */
	usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
		       ctx->s_qf_names[USRQUOTA]);
	grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
		       ctx->s_qf_names[GRPQUOTA]);

	usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
		    test_opt(sb, USRQUOTA));

	grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
		    test_opt(sb, GRPQUOTA));

	/*
	 * A journaled quota file name for a type overrides the old-style
	 * option for that same type, so only a mix across types is flagged
	 * below.
	 */
	if (usr_qf_name) {
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
		usrquota = false;
	}
	if (grp_qf_name) {
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
		grpquota = false;
	}

	if (usr_qf_name || grp_qf_name) {
		if (usrquota || grpquota) {
			ext4_msg(NULL, KERN_ERR, "old and new quota "
				 "format mixing");
			return -EINVAL;
		}

		/* Journaled quota needs a format, either new or already set. */
		if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
			ext4_msg(NULL, KERN_ERR, "journaled quota format "
				 "not specified");
			return -EINVAL;
		}
	}

	return 0;

err_quota_change:
	ext4_msg(NULL, KERN_ERR,
		 "Cannot change quota options when quota turned on");
	return -EINVAL;
err_jquota_change:
	ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
		 "options when quota turned on");
	return -EINVAL;
err_jquota_specified:
	ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
		 QTYPE2NAME(i));
	return -EINVAL;
#else
	return 0;
#endif
}
2709
ext4_check_test_dummy_encryption(const struct fs_context * fc,struct super_block * sb)2710 static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
2711 struct super_block *sb)
2712 {
2713 const struct ext4_fs_context *ctx = fc->fs_private;
2714 const struct ext4_sb_info *sbi = EXT4_SB(sb);
2715
2716 if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
2717 return 0;
2718
2719 if (!ext4_has_feature_encrypt(sb)) {
2720 ext4_msg(NULL, KERN_WARNING,
2721 "test_dummy_encryption requires encrypt feature");
2722 return -EINVAL;
2723 }
2724 /*
2725 * This mount option is just for testing, and it's not worthwhile to
2726 * implement the extra complexity (e.g. RCU protection) that would be
2727 * needed to allow it to be set or changed during remount. We do allow
2728 * it to be specified during remount, but only if there is no change.
2729 */
2730 if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
2731 if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2732 &ctx->dummy_enc_policy))
2733 return 0;
2734 ext4_msg(NULL, KERN_WARNING,
2735 "Can't set or change test_dummy_encryption on remount");
2736 return -EINVAL;
2737 }
2738 /* Also make sure s_mount_opts didn't contain a conflicting value. */
2739 if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
2740 if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
2741 &ctx->dummy_enc_policy))
2742 return 0;
2743 ext4_msg(NULL, KERN_WARNING,
2744 "Conflicting test_dummy_encryption options");
2745 return -EINVAL;
2746 }
2747 return 0;
2748 }
2749
ext4_apply_test_dummy_encryption(struct ext4_fs_context * ctx,struct super_block * sb)2750 static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
2751 struct super_block *sb)
2752 {
2753 if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
2754 /* if already set, it was already verified to be the same */
2755 fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
2756 return;
2757 EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
2758 memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
2759 ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
2760 }
2761
/*
 * Check the parsed options in @fc for consistency with @sb.
 *
 * Rejects options that are flagged as unusable on an ext2 or ext3
 * superblock, an over-large want_extra_isize, an invalid
 * test_dummy_encryption request, and (on remount) attempts to change the
 * data journalling mode or the dax mode.  Some options are silently
 * dropped from the context on remount when there is no journal.  The quota
 * options are checked last.
 *
 * Returns 0 if the options are acceptable, a negative errno otherwise.
 */
static int ext4_check_opt_consistency(struct fs_context *fc,
				      struct super_block *sb)
{
	struct ext4_fs_context *ctx = fc->fs_private;
	struct ext4_sb_info *sbi = fc->s_fs_info;
	int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
	int err;

	/* opt_flags is the union of the MOPT_* flags of all parsed options. */
	if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
		ext4_msg(NULL, KERN_ERR,
			 "Mount option(s) incompatible with ext2");
		return -EINVAL;
	}
	if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
		ext4_msg(NULL, KERN_ERR,
			 "Mount option(s) incompatible with ext3");
		return -EINVAL;
	}

	if (ctx->s_want_extra_isize >
	    (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
		ext4_msg(NULL, KERN_ERR,
			 "Invalid want_extra_isize %d",
			 ctx->s_want_extra_isize);
		return -EINVAL;
	}

	err = ext4_check_test_dummy_encryption(fc, sb);
	if (err)
		return err;

	if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
		if (!sbi->s_journal) {
			ext4_msg(NULL, KERN_WARNING,
				 "Remounting file system with no journal "
				 "so ignoring journalled data option");
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
		} else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
			   test_opt(sb, DATA_FLAGS)) {
			ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
				 "on remount");
			return -EINVAL;
		}
	}

	if (is_remount) {
		if (!sbi->s_journal &&
		    ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
			ext4_msg(NULL, KERN_WARNING,
				 "Remounting fs w/o journal so ignoring data_err option");
			ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
		}

		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
		    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
			ext4_msg(NULL, KERN_ERR, "can't mount with "
				 "both data=journal and dax");
			return -EINVAL;
		}

		/*
		 * A dax mode may be named on remount only if it matches the
		 * mode currently in effect.  All three mismatch cases share
		 * the error exit labelled inside the first branch.
		 */
		if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
		    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
		     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
fail_dax_change_remount:
			ext4_msg(NULL, KERN_ERR, "can't change "
				 "dax mount option while remounting");
			return -EINVAL;
		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
			 (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			  (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
			goto fail_dax_change_remount;
		} else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
			   ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			    (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
			goto fail_dax_change_remount;
		}
	}

	return ext4_check_quota_consistency(fc, sb);
}
2843
ext4_apply_options(struct fs_context * fc,struct super_block * sb)2844 static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
2845 {
2846 struct ext4_fs_context *ctx = fc->fs_private;
2847 struct ext4_sb_info *sbi = fc->s_fs_info;
2848
2849 sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
2850 sbi->s_mount_opt |= ctx->vals_s_mount_opt;
2851 sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
2852 sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
2853 sb->s_flags &= ~ctx->mask_s_flags;
2854 sb->s_flags |= ctx->vals_s_flags;
2855
2856 #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
2857 APPLY(s_commit_interval);
2858 APPLY(s_stripe);
2859 APPLY(s_max_batch_time);
2860 APPLY(s_min_batch_time);
2861 APPLY(s_want_extra_isize);
2862 APPLY(s_inode_readahead_blks);
2863 APPLY(s_max_dir_size_kb);
2864 APPLY(s_li_wait_mult);
2865 APPLY(s_resgid);
2866 APPLY(s_resuid);
2867
2868 #ifdef CONFIG_EXT4_DEBUG
2869 APPLY(s_fc_debug_max_replay);
2870 #endif
2871
2872 ext4_apply_quota_options(fc, sb);
2873 ext4_apply_test_dummy_encryption(ctx, sb);
2874 }
2875
2876
/*
 * Reject a mix of journaled quota file names and plain usrquota/grpquota.
 *
 * When a journaled quota file name is given for a quota type, the plain
 * mount option for that same type is cleared from the context.  If a plain
 * quota option is still set afterwards, -EINVAL is returned.  In every
 * other case (including kernels built without CONFIG_QUOTA) the function
 * returns 1.
 */
static int ext4_validate_options(struct fs_context *fc)
{
#ifdef CONFIG_QUOTA
	struct ext4_fs_context *ctx = fc->fs_private;
	char *usr_name = ctx->s_qf_names[USRQUOTA];
	char *grp_name = ctx->s_qf_names[GRPQUOTA];

	/* Nothing to reconcile unless a journaled quota file was named */
	if (!usr_name && !grp_name)
		return 1;

	if (usr_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);

	if (grp_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
		ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);

	if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
	    ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
		ext4_msg(NULL, KERN_ERR, "old and new quota "
			 "format mixing");
		return -EINVAL;
	}
#endif
	return 1;
}
2903
/*
 * Append the journaled quota options of @sb to @seq: the quota format
 * (",jqfmt=...") when one is set, followed by the user and group quota
 * file names when present.  Compiles to an empty function without
 * CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *qf_name;

	if (sbi->s_jquota_fmt) {
		char *fmtname = "";

		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			fmtname = "vfsold";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
			fmtname = "vfsv0";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
			fmtname = "vfsv1";

		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	/* The name pointers are read via rcu_dereference() under the RCU read lock */
	rcu_read_lock();
	qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	if (qf_name)
		seq_show_option(seq, "usrjquota", qf_name);
	qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (qf_name)
		seq_show_option(seq, "grpjquota", qf_name);
	rcu_read_unlock();
#endif
}
2938
token2str(int token)2939 static const char *token2str(int token)
2940 {
2941 const struct fs_parameter_spec *spec;
2942
2943 for (spec = ext4_param_specs; spec->name != NULL; spec++)
2944 if (spec->opt == token && !spec->type)
2945 break;
2946 return spec->name;
2947 }
2948
/*
 * Emit the mount options of @sb into @seq.
 *
 * With @nodefs == 0 the entries are separated by ',' and an option is
 * shown if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * With @nodefs != 0 the entries are separated by '\n' and most options are
 * printed whether or not they match the default.
 *
 * Always returns 0.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
			      int nodefs)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int def_errors;
	const struct mount_opts *m;
	char sep = nodefs ? '\n' : ',';

/* Both helpers prefix the option text with the separator chosen above */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

	if (sbi->s_sb_block != 1)
		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

	/* Simple flag options, driven by the ext4_mount_opts table */
	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
		int want_set = m->flags & MOPT_SET;
		int opt_2 = m->flags & MOPT_2;
		unsigned int mount_opt, def_mount_opt;

		/* Only plain set/clear entries not marked MOPT_SKIP are shown */
		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
		    m->flags & MOPT_SKIP)
			continue;

		/* MOPT_2 entries live in the second mount option word */
		if (opt_2) {
			mount_opt = sbi->s_mount_opt2;
			def_mount_opt = sbi->s_def_mount_opt2;
		} else {
			mount_opt = sbi->s_mount_opt;
			def_mount_opt = sbi->s_def_mount_opt;
		}
		/* skip if same as the default */
		if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
			continue;
		/* select Opt_noFoo vs Opt_Foo */
		if ((want_set &&
		     (mount_opt & m->mount_opt) != m->mount_opt) ||
		    (!want_set && (mount_opt & m->mount_opt)))
			continue;
		SEQ_OPTS_PRINT("%s", token2str(m->token));
	}

	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
	    ext4_get_resuid(es) != EXT4_DEF_RESUID)
		SEQ_OPTS_PRINT("resuid=%u",
				from_kuid_munged(&init_user_ns, sbi->s_resuid));
	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
	    ext4_get_resgid(es) != EXT4_DEF_RESGID)
		SEQ_OPTS_PRINT("resgid=%u",
				from_kgid_munged(&init_user_ns, sbi->s_resgid));
	/*
	 * With nodefs, -1 is used as the default so that it never equals an
	 * EXT4_ERRORS_* value and the active errors= mode is always printed.
	 */
	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
		SEQ_OPTS_PUTS("errors=remount-ro");
	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
		SEQ_OPTS_PUTS("errors=continue");
	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
		SEQ_OPTS_PUTS("errors=panic");
	/* s_commit_interval is compared in units of HZ and printed divided by HZ */
	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
	if (nodefs && sb->s_flags & SB_I_VERSION)
		SEQ_OPTS_PUTS("i_version");
	if (nodefs || sbi->s_stripe)
		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
	/* data= is shown when the data mode bits differ from the per-sb default */
	if (nodefs || EXT4_MOUNT_DATA_FLAGS &
			(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
			SEQ_OPTS_PUTS("data=journal");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
			SEQ_OPTS_PUTS("data=ordered");
		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			SEQ_OPTS_PUTS("data=writeback");
	}
	if (nodefs ||
	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
			       sbi->s_inode_readahead_blks);

	if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
	if (nodefs || sbi->s_max_dir_size_kb)
		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
	if (test_opt(sb, DATA_ERR_ABORT))
		SEQ_OPTS_PUTS("data_err=abort");

	fscrypt_show_test_dummy_encryption(seq, sep, sb);

	if (sb->s_flags & SB_INLINECRYPT)
		SEQ_OPTS_PUTS("inlinecrypt");

	/* For an ext2 superblock the always-on mode is printed as plain "dax" */
	if (test_opt(sb, DAX_ALWAYS)) {
		if (IS_EXT2_SB(sb))
			SEQ_OPTS_PUTS("dax");
		else
			SEQ_OPTS_PUTS("dax=always");
	} else if (test_opt2(sb, DAX_NEVER)) {
		SEQ_OPTS_PUTS("dax=never");
	} else if (test_opt2(sb, DAX_INODE)) {
		SEQ_OPTS_PUTS("dax=inode");
	}

	/*
	 * mb_optimize_scan is printed only when its state is the opposite
	 * of what the group count would select: off at or above
	 * MB_DEFAULT_LINEAR_SCAN_THRESHOLD groups, or on below it.
	 */
	if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
			!test_opt2(sb, MB_OPTIMIZE_SCAN)) {
		SEQ_OPTS_PUTS("mb_optimize_scan=0");
	} else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
			test_opt2(sb, MB_OPTIMIZE_SCAN)) {
		SEQ_OPTS_PUTS("mb_optimize_scan=1");
	}

	if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
		SEQ_OPTS_PUTS("prefetch_block_bitmaps");

	if (ext4_emergency_ro(sb))
		SEQ_OPTS_PUTS("emergency_ro");

	if (ext4_forced_shutdown(sb))
		SEQ_OPTS_PUTS("shutdown");

	ext4_show_quota_options(seq, sb);
	return 0;
}
3079
ext4_show_options(struct seq_file * seq,struct dentry * root)3080 static int ext4_show_options(struct seq_file *seq, struct dentry *root)
3081 {
3082 return _ext4_show_options(seq, root->d_sb, 0);
3083 }
3084
ext4_seq_options_show(struct seq_file * seq,void * offset)3085 int ext4_seq_options_show(struct seq_file *seq, void *offset)
3086 {
3087 struct super_block *sb = seq->private;
3088 int rc;
3089
3090 seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
3091 rc = _ext4_show_options(seq, sb, 1);
3092 seq_putc(seq, '\n');
3093 return rc;
3094 }
3095
ext4_setup_super(struct super_block * sb,struct ext4_super_block * es,int read_only)3096 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
3097 int read_only)
3098 {
3099 struct ext4_sb_info *sbi = EXT4_SB(sb);
3100 int err = 0;
3101
3102 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
3103 ext4_msg(sb, KERN_ERR, "revision level too high, "
3104 "forcing read-only mode");
3105 err = -EROFS;
3106 goto done;
3107 }
3108 if (read_only)
3109 goto done;
3110 if (!(sbi->s_mount_state & EXT4_VALID_FS))
3111 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
3112 "running e2fsck is recommended");
3113 else if (sbi->s_mount_state & EXT4_ERROR_FS)
3114 ext4_msg(sb, KERN_WARNING,
3115 "warning: mounting fs with errors, "
3116 "running e2fsck is recommended");
3117 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
3118 le16_to_cpu(es->s_mnt_count) >=
3119 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
3120 ext4_msg(sb, KERN_WARNING,
3121 "warning: maximal mount count reached, "
3122 "running e2fsck is recommended");
3123 else if (le32_to_cpu(es->s_checkinterval) &&
3124 (ext4_get_tstamp(es, s_lastcheck) +
3125 le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
3126 ext4_msg(sb, KERN_WARNING,
3127 "warning: checktime reached, "
3128 "running e2fsck is recommended");
3129 if (!sbi->s_journal)
3130 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
3131 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
3132 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
3133 le16_add_cpu(&es->s_mnt_count, 1);
3134 ext4_update_tstamp(es, s_mtime);
3135 if (sbi->s_journal) {
3136 ext4_set_feature_journal_needs_recovery(sb);
3137 if (ext4_has_feature_orphan_file(sb))
3138 ext4_set_feature_orphan_present(sb);
3139 }
3140
3141 err = ext4_commit_super(sb);
3142 done:
3143 if (test_opt(sb, DEBUG))
3144 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
3145 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
3146 sb->s_blocksize,
3147 sbi->s_groups_count,
3148 EXT4_BLOCKS_PER_GROUP(sb),
3149 EXT4_INODES_PER_GROUP(sb),
3150 sbi->s_mount_opt, sbi->s_mount_opt2);
3151 return err;
3152 }
3153
/*
 * Make sure the flex group pointer array can describe @ngroup block groups.
 *
 * Nothing is done when flex groups are not in use or the array is already
 * large enough.  Otherwise a new pointer array is allocated, the existing
 * pointers are copied into it, new entries are allocated for the added
 * slots, and the new array is published with rcu_assign_pointer().  The old
 * array is released through ext4_kvfree_array_rcu().
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct flex_groups **prev, **grown;
	int wanted, slot, undo;

	if (!sbi->s_log_groups_per_flex)
		return 0;

	wanted = ext4_flex_group(sbi, ngroup - 1) + 1;
	if (wanted <= sbi->s_flex_groups_allocated)
		return 0;

	grown = kvzalloc(roundup_pow_of_two(wanted *
			 sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
	if (!grown) {
		ext4_msg(sb, KERN_ERR,
			 "not enough memory for %d flex group pointers",
			 wanted);
		return -ENOMEM;
	}

	/* Only the slots beyond the currently allocated ones need new entries */
	for (slot = sbi->s_flex_groups_allocated; slot < wanted; slot++) {
		grown[slot] = kvzalloc(roundup_pow_of_two(
					sizeof(struct flex_groups)),
				       GFP_KERNEL);
		if (!grown[slot])
			goto free_new_entries;
	}

	rcu_read_lock();
	prev = rcu_dereference(sbi->s_flex_groups);
	if (prev)
		memcpy(grown, prev,
		       (sbi->s_flex_groups_allocated *
			sizeof(struct flex_groups *)));
	rcu_read_unlock();

	rcu_assign_pointer(sbi->s_flex_groups, grown);
	sbi->s_flex_groups_allocated = wanted;
	if (prev)
		ext4_kvfree_array_rcu(prev);
	return 0;

free_new_entries:
	/* Drop the entries allocated by this call, then the array itself */
	for (undo = sbi->s_flex_groups_allocated; undo < slot; undo++)
		kvfree(grown[undo]);
	kvfree(grown);
	ext4_msg(sb, KERN_ERR,
		 "not enough memory for %d flex groups", wanted);
	return -ENOMEM;
}
3200
ext4_fill_flex_info(struct super_block * sb)3201 static int ext4_fill_flex_info(struct super_block *sb)
3202 {
3203 struct ext4_sb_info *sbi = EXT4_SB(sb);
3204 struct ext4_group_desc *gdp = NULL;
3205 struct flex_groups *fg;
3206 ext4_group_t flex_group;
3207 int i, err;
3208
3209 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
3210 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
3211 sbi->s_log_groups_per_flex = 0;
3212 return 1;
3213 }
3214
3215 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
3216 if (err)
3217 goto failed;
3218
3219 for (i = 0; i < sbi->s_groups_count; i++) {
3220 gdp = ext4_get_group_desc(sb, i, NULL);
3221
3222 flex_group = ext4_flex_group(sbi, i);
3223 fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
3224 atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
3225 atomic64_add(ext4_free_group_clusters(sb, gdp),
3226 &fg->free_clusters);
3227 atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
3228 }
3229
3230 return 1;
3231 failed:
3232 return 0;
3233 }
3234
ext4_group_desc_csum(struct super_block * sb,__u32 block_group,struct ext4_group_desc * gdp)3235 static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
3236 struct ext4_group_desc *gdp)
3237 {
3238 int offset = offsetof(struct ext4_group_desc, bg_checksum);
3239 __u16 crc = 0;
3240 __le32 le_group = cpu_to_le32(block_group);
3241 struct ext4_sb_info *sbi = EXT4_SB(sb);
3242
3243 if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
3244 /* Use new metadata_csum algorithm */
3245 __u32 csum32;
3246 __u16 dummy_csum = 0;
3247
3248 csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
3249 sizeof(le_group));
3250 csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
3251 csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
3252 sizeof(dummy_csum));
3253 offset += sizeof(dummy_csum);
3254 if (offset < sbi->s_desc_size)
3255 csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
3256 sbi->s_desc_size - offset);
3257
3258 crc = csum32 & 0xFFFF;
3259 goto out;
3260 }
3261
3262 /* old crc16 code */
3263 if (!ext4_has_feature_gdt_csum(sb))
3264 return 0;
3265
3266 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
3267 crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
3268 crc = crc16(crc, (__u8 *)gdp, offset);
3269 offset += sizeof(gdp->bg_checksum); /* skip checksum */
3270 /* for checksum of struct ext4_group_desc do the rest...*/
3271 if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
3272 crc = crc16(crc, (__u8 *)gdp + offset,
3273 sbi->s_desc_size - offset);
3274
3275 out:
3276 return cpu_to_le16(crc);
3277 }
3278
/*
 * Check the stored checksum of group descriptor @gdp.
 *
 * Returns 1 when group descriptor checksums are not in use or the stored
 * value matches the computed one, 0 on mismatch.
 */
int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
				struct ext4_group_desc *gdp)
{
	if (!ext4_has_group_desc_csum(sb))
		return 1;

	if (gdp->bg_checksum == ext4_group_desc_csum(sb, block_group, gdp))
		return 1;

	return 0;
}
3288
/*
 * Recompute and store the checksum of group descriptor @gdp.  Does nothing
 * when group descriptor checksums are not in use.
 */
void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
			      struct ext4_group_desc *gdp)
{
	if (ext4_has_group_desc_csum(sb))
		gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}
3296
/*
 * Called at mount-time, super-block is locked
 *
 * Walk all group descriptors and check that each group's block bitmap,
 * inode bitmap and inode table are placed sanely, then verify the
 * descriptor checksum.
 *
 * A location outside the permitted block range always fails the check.
 * An overlap with the superblock or with the group descriptor blocks, and
 * a checksum mismatch, are logged but fail the check only when the
 * filesystem is not read-only.
 *
 * Returns 1 if the descriptors are acceptable and 0 otherwise.  On success,
 * if @first_not_zeroed is not NULL it receives the number of the first
 * group without EXT4_BG_INODE_ZEROED, or s_groups_count if there is none.
 */
static int ext4_check_descriptors(struct super_block *sb,
				  ext4_fsblk_t sb_block,
				  ext4_group_t *first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
	ext4_fsblk_t last_block;
	/* Last block of the descriptor area that follows sb_block */
	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
	ext4_fsblk_t block_bitmap;
	ext4_fsblk_t inode_bitmap;
	ext4_fsblk_t inode_table;
	int flexbg_flag = 0;
	/* grp == s_groups_count means "no un-zeroed group seen yet" */
	ext4_group_t i, grp = sbi->s_groups_count;

	if (ext4_has_feature_flex_bg(sb))
		flexbg_flag = 1;

	ext4_debug("Checking group descriptors");

	for (i = 0; i < sbi->s_groups_count; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

		/*
		 * With flex_bg (and for the last group) metadata may lie
		 * anywhere up to the end of the filesystem; otherwise it
		 * must lie within the group's own block range.
		 */
		if (i == sbi->s_groups_count - 1 || flexbg_flag)
			last_block = ext4_blocks_count(sbi->s_es) - 1;
		else
			last_block = first_block +
				(EXT4_BLOCKS_PER_GROUP(sb) - 1);

		/* Remember the first group whose inode table is not zeroed */
		if ((grp == sbi->s_groups_count) &&
		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			grp = i;

		block_bitmap = ext4_block_bitmap(sb, gdp);
		if (block_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap >= sb_block + 1 &&
		    block_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap < first_block || block_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Block bitmap for group %u not in group "
			       "(block %llu)!", i, block_bitmap);
			return 0;
		}
		inode_bitmap = ext4_inode_bitmap(sb, gdp);
		if (inode_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap >= sb_block + 1 &&
		    inode_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap < first_block || inode_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode bitmap for group %u not in group "
			       "(block %llu)!", i, inode_bitmap);
			return 0;
		}
		inode_table = ext4_inode_table(sb, gdp);
		if (inode_table == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_table >= sb_block + 1 &&
		    inode_table <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		/* The whole inode table, not just its first block, must fit */
		if (inode_table < first_block ||
		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode table for group %u not in group "
			       "(block %llu)!", i, inode_table);
			return 0;
		}
		/* The checksum is verified with the group lock held */
		ext4_lock_group(sb, i);
		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Checksum for group %u failed (%u!=%u)",
				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
				     gdp)), le16_to_cpu(gdp->bg_checksum));
			if (!sb_rdonly(sb)) {
				ext4_unlock_group(sb, i);
				return 0;
			}
		}
		ext4_unlock_group(sb, i);
		/* Without flex_bg, advance to the next group's first block */
		if (!flexbg_flag)
			first_block += EXT4_BLOCKS_PER_GROUP(sb);
	}
	if (NULL != first_not_zeroed)
		*first_not_zeroed = grp;
	return 1;
}
3416
3417 /*
3418 * Maximal extent format file size.
3419 * Resulting logical blkno at s_maxbytes must fit in our on-disk
3420 * extent format containers, within a sector_t, and within i_blocks
3421 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
3422 * so that won't be a limiting factor.
3423 *
 * However, there is another limiting factor. We store extents in the form
 * of starting block and length, hence the resulting length of the extent
 * covering maximum file size must fit into on-disk format containers as
 * well. Given that the length is always 1 unit bigger than the max unit
 * (because we count 0 as well) we have to lower s_maxbytes by one fs block.
3429 *
3430 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
3431 */
ext4_max_size(int blkbits,int has_huge_files)3432 static loff_t ext4_max_size(int blkbits, int has_huge_files)
3433 {
3434 loff_t res;
3435 loff_t upper_limit = MAX_LFS_FILESIZE;
3436
3437 BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
3438
3439 if (!has_huge_files) {
3440 upper_limit = (1LL << 32) - 1;
3441
3442 /* total blocks in file system block size */
3443 upper_limit >>= (blkbits - 9);
3444 upper_limit <<= blkbits;
3445 }
3446
3447 /*
3448 * 32-bit extent-start container, ee_block. We lower the maxbytes
3449 * by one fs block, so ee_len can cover the extent of maximum file
3450 * size
3451 */
3452 res = (1LL << 32) - 1;
3453 res <<= blkbits;
3454
3455 /* Sanity check against vm- & vfs- imposed limits */
3456 if (res > upper_limit)
3457 res = upper_limit;
3458
3459 return res;
3460 }
3461
/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 *
 * @bits is the log2 of the filesystem block size.  The result is in bytes
 * and is capped at MAX_LFS_FILESIZE.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
	loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
	int meta_blocks;
	/* Block pointers per block: block size divided by 4 */
	unsigned int ppb = 1 << (bits - 2);

	/*
	 * This is calculated to be the largest file size for a dense, block
	 * mapped file such that the file's total number of 512-byte sectors,
	 * including data and all indirect blocks, does not exceed (2^48 - 1).
	 *
	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
	 * number of 512-byte sectors of the file.
	 */
	if (!has_huge_files) {
		/*
		 * !has_huge_files implies that the inode i_blocks field
		 * counts the file's blocks in 512-byte sectors and is
		 * limited to 32 bits.
		 */
		upper_limit = (1LL << 32) - 1;

		/* total blocks in file system block size */
		upper_limit >>= (bits - 9);

	} else {
		/*
		 * We use 48 bit ext4_inode i_blocks
		 * With EXT4_HUGE_FILE_FL set the i_blocks
		 * represent total number of blocks in
		 * file system block size
		 */
		upper_limit = (1LL << 48) - 1;

	}

	/* Compute how many blocks we can address by block tree */
	res += ppb;
	res += ppb * ppb;
	res += ((loff_t)ppb) * ppb * ppb;
	/* Compute how many metadata blocks are needed */
	meta_blocks = 1;
	meta_blocks += 1 + ppb;
	meta_blocks += 1 + ppb + ppb * ppb;
	/* Does block tree limit file size? */
	if (res + meta_blocks <= upper_limit)
		goto check_lfs;

	/*
	 * The i_blocks limit is the tighter one: start from it and subtract
	 * the metadata blocks needed to map that many blocks.
	 */
	res = upper_limit;
	/* How many metadata blocks are needed for addressing upper_limit? */
	upper_limit -= EXT4_NDIR_BLOCKS;
	/* indirect blocks */
	meta_blocks = 1;
	upper_limit -= ppb;
	/* double indirect blocks */
	if (upper_limit < ppb * ppb) {
		meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
		res -= meta_blocks;
		goto check_lfs;
	}
	meta_blocks += 1 + ppb;
	upper_limit -= ppb * ppb;
	/* triple indirect blocks for the rest */
	meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
		DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
	res -= meta_blocks;
check_lfs:
	/* Convert from blocks to bytes and apply the MAX_LFS_FILESIZE cap */
	res <<= bits;
	if (res > MAX_LFS_FILESIZE)
		res = MAX_LFS_FILESIZE;

	return res;
}
3540
/*
 * Return the block number of group descriptor block @nr.
 *
 * Without the meta_bg feature, or for descriptor blocks below
 * s_first_meta_bg, the descriptor blocks directly follow
 * @logical_sb_block.  Otherwise the block is located in the first group
 * that descriptor block @nr describes, after that group's superblock copy
 * if it has one.
 */
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
				   ext4_fsblk_t logical_sb_block, int nr)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
	ext4_group_t group;
	int skip;

	if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
		return logical_sb_block + nr + 1;

	group = sbi->s_desc_per_block * nr;
	skip = ext4_bg_has_super(sb, group) ? 1 : 0;

	/*
	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
	 * compensate.
	 */
	if (sb->s_blocksize == 1024 && nr == 0 &&
	    le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
		skip++;

	return skip + ext4_group_first_block_no(sb, group);
}
3568
3569 /**
3570 * ext4_get_stripe_size: Get the stripe size.
3571 * @sbi: In memory super block info
3572 *
3573 * If we have specified it via mount option, then
3574 * use the mount option value. If the value specified at mount time is
3575 * greater than the blocks per group use the super block value.
3576 * If the super block value is greater than blocks per group return 0.
3577 * Allocator needs it be less than blocks per group.
3578 *
3579 */
ext4_get_stripe_size(struct ext4_sb_info * sbi)3580 static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3581 {
3582 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3583 unsigned long stripe_width =
3584 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3585 int ret;
3586
3587 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3588 ret = sbi->s_stripe;
3589 else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3590 ret = stripe_width;
3591 else if (stride && stride <= sbi->s_blocks_per_group)
3592 ret = stride;
3593 else
3594 ret = 0;
3595
3596 /*
3597 * If the stripe width is 1, this makes no sense and
3598 * we set it to 0 to turn off stripe handling code.
3599 */
3600 if (ret <= 1)
3601 ret = 0;
3602
3603 return ret;
3604 }
3605
3606 /*
3607 * Check whether this filesystem can be mounted based on
3608 * the features present and the RDONLY/RDWR mount requested.
3609 * Returns 1 if this filesystem can be mounted as requested,
3610 * 0 if it cannot be.
3611 */
ext4_feature_set_ok(struct super_block * sb,int readonly)3612 int ext4_feature_set_ok(struct super_block *sb, int readonly)
3613 {
3614 if (ext4_has_unknown_ext4_incompat_features(sb)) {
3615 ext4_msg(sb, KERN_ERR,
3616 "Couldn't mount because of "
3617 "unsupported optional features (%x)",
3618 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3619 ~EXT4_FEATURE_INCOMPAT_SUPP));
3620 return 0;
3621 }
3622
3623 if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) {
3624 ext4_msg(sb, KERN_ERR,
3625 "Filesystem with casefold feature cannot be "
3626 "mounted without CONFIG_UNICODE");
3627 return 0;
3628 }
3629
3630 if (readonly)
3631 return 1;
3632
3633 if (ext4_has_feature_readonly(sb)) {
3634 ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3635 sb->s_flags |= SB_RDONLY;
3636 return 1;
3637 }
3638
3639 /* Check that feature set is OK for a read-write mount */
3640 if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3641 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3642 "unsupported optional features (%x)",
3643 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3644 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3645 return 0;
3646 }
3647 if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3648 ext4_msg(sb, KERN_ERR,
3649 "Can't support bigalloc feature without "
3650 "extents feature\n");
3651 return 0;
3652 }
3653 if (ext4_has_feature_bigalloc(sb) &&
3654 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
3655 ext4_msg(sb, KERN_WARNING,
3656 "bad geometry: bigalloc file system with non-zero "
3657 "first_data_block\n");
3658 return 0;
3659 }
3660
3661 #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3662 if (!readonly && (ext4_has_feature_quota(sb) ||
3663 ext4_has_feature_project(sb))) {
3664 ext4_msg(sb, KERN_ERR,
3665 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3666 return 0;
3667 }
3668 #endif /* CONFIG_QUOTA */
3669 return 1;
3670 }
3671
3672 /*
3673 * This function is called once a day by default if we have errors logged
3674 * on the file system.
3675 * Use the err_report_sec sysfs attribute to disable or adjust its call
 * frequency.
3677 */
print_daily_error_info(struct timer_list * t)3678 void print_daily_error_info(struct timer_list *t)
3679 {
3680 struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
3681 struct super_block *sb = sbi->s_sb;
3682 struct ext4_super_block *es = sbi->s_es;
3683
3684 if (es->s_error_count)
3685 /* fsck newer than v1.41.13 is needed to clean this condition. */
3686 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
3687 le32_to_cpu(es->s_error_count));
3688 if (es->s_first_error_time) {
3689 printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
3690 sb->s_id,
3691 ext4_get_tstamp(es, s_first_error_time),
3692 (int) sizeof(es->s_first_error_func),
3693 es->s_first_error_func,
3694 le32_to_cpu(es->s_first_error_line));
3695 if (es->s_first_error_ino)
3696 printk(KERN_CONT ": inode %u",
3697 le32_to_cpu(es->s_first_error_ino));
3698 if (es->s_first_error_block)
3699 printk(KERN_CONT ": block %llu", (unsigned long long)
3700 le64_to_cpu(es->s_first_error_block));
3701 printk(KERN_CONT "\n");
3702 }
3703 if (es->s_last_error_time) {
3704 printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
3705 sb->s_id,
3706 ext4_get_tstamp(es, s_last_error_time),
3707 (int) sizeof(es->s_last_error_func),
3708 es->s_last_error_func,
3709 le32_to_cpu(es->s_last_error_line));
3710 if (es->s_last_error_ino)
3711 printk(KERN_CONT ": inode %u",
3712 le32_to_cpu(es->s_last_error_ino));
3713 if (es->s_last_error_block)
3714 printk(KERN_CONT ": block %llu", (unsigned long long)
3715 le64_to_cpu(es->s_last_error_block));
3716 printk(KERN_CONT "\n");
3717 }
3718
3719 if (sbi->s_err_report_sec)
3720 mod_timer(&sbi->s_err_report, jiffies + secs_to_jiffies(sbi->s_err_report_sec));
3721 }
3722
3723 /* Find next suitable group and run ext4_init_inode_table */
ext4_run_li_request(struct ext4_li_request * elr)3724 static int ext4_run_li_request(struct ext4_li_request *elr)
3725 {
3726 struct ext4_group_desc *gdp = NULL;
3727 struct super_block *sb = elr->lr_super;
3728 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3729 ext4_group_t group = elr->lr_next_group;
3730 unsigned int prefetch_ios = 0;
3731 int ret = 0;
3732 int nr = EXT4_SB(sb)->s_mb_prefetch;
3733 u64 start_time;
3734
3735 if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
3736 elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
3737 ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
3738 trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
3739 if (group >= elr->lr_next_group) {
3740 ret = 1;
3741 if (elr->lr_first_not_zeroed != ngroups &&
3742 !ext4_emergency_state(sb) && !sb_rdonly(sb) &&
3743 test_opt(sb, INIT_INODE_TABLE)) {
3744 elr->lr_next_group = elr->lr_first_not_zeroed;
3745 elr->lr_mode = EXT4_LI_MODE_ITABLE;
3746 ret = 0;
3747 }
3748 }
3749 return ret;
3750 }
3751
3752 for (; group < ngroups; group++) {
3753 gdp = ext4_get_group_desc(sb, group, NULL);
3754 if (!gdp) {
3755 ret = 1;
3756 break;
3757 }
3758
3759 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3760 break;
3761 }
3762
3763 if (group >= ngroups)
3764 ret = 1;
3765
3766 if (!ret) {
3767 start_time = ktime_get_ns();
3768 ret = ext4_init_inode_table(sb, group,
3769 elr->lr_timeout ? 0 : 1);
3770 trace_ext4_lazy_itable_init(sb, group);
3771 if (elr->lr_timeout == 0) {
3772 elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) *
3773 EXT4_SB(elr->lr_super)->s_li_wait_mult);
3774 }
3775 elr->lr_next_sched = jiffies + elr->lr_timeout;
3776 elr->lr_next_group = group + 1;
3777 }
3778 return ret;
3779 }
3780
3781 /*
3782 * Remove lr_request from the list_request and free the
3783 * request structure. Should be called with li_list_mtx held
3784 */
ext4_remove_li_request(struct ext4_li_request * elr)3785 static void ext4_remove_li_request(struct ext4_li_request *elr)
3786 {
3787 if (!elr)
3788 return;
3789
3790 list_del(&elr->lr_request);
3791 EXT4_SB(elr->lr_super)->s_li_request = NULL;
3792 kfree(elr);
3793 }
3794
ext4_unregister_li_request(struct super_block * sb)3795 static void ext4_unregister_li_request(struct super_block *sb)
3796 {
3797 mutex_lock(&ext4_li_mtx);
3798 if (!ext4_li_info) {
3799 mutex_unlock(&ext4_li_mtx);
3800 return;
3801 }
3802
3803 mutex_lock(&ext4_li_info->li_list_mtx);
3804 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3805 mutex_unlock(&ext4_li_info->li_list_mtx);
3806 mutex_unlock(&ext4_li_mtx);
3807 }
3808
/*
 * Task handle of the lazy-init kthread.  Assigned from kthread_run() in
 * ext4_run_lazyinit_thread() and passed to kthread_stop() in
 * ext4_destroy_lazyinit_thread().
 */
static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where the ext4lazyinit thread lives. It walks
 * through the request list searching for the next scheduled filesystem.
 * When such a fs is found, it runs the lazy initialization request
 * (ext4_run_li_request) and keeps track of the time spent in this
 * function. Based on that time we compute the next schedule time of
 * the request. When walking through the list is complete, compute
 * the next waking time and put itself to sleep.
 *
 * @arg is the struct ext4_lazy_init whose request list is serviced.
 * Always returns 0; before returning, ext4_li_info is freed and reset
 * to NULL under ext4_li_mtx.
 */
static int ext4_lazyinit_thread(void *arg)
{
	struct ext4_lazy_init *eli = arg;
	struct list_head *pos, *n;
	struct ext4_li_request *elr;
	unsigned long next_wakeup, cur;

	BUG_ON(NULL == eli);
	set_freezable();

cont_thread:
	while (true) {
		/* next_wakeup is only meaningful once this flag is set */
		bool next_wakeup_initialized = false;

		next_wakeup = 0;
		mutex_lock(&eli->li_list_mtx);
		if (list_empty(&eli->li_request_list)) {
			mutex_unlock(&eli->li_list_mtx);
			goto exit_thread;
		}
		list_for_each_safe(pos, n, &eli->li_request_list) {
			int err = 0;
			int progress = 0;
			elr = list_entry(pos, struct ext4_li_request,
					 lr_request);

			/*
			 * Not due yet: only fold its deadline into the
			 * earliest wakeup time and move on.
			 */
			if (time_before(jiffies, elr->lr_next_sched)) {
				if (!next_wakeup_initialized ||
				    time_before(elr->lr_next_sched, next_wakeup)) {
					next_wakeup = elr->lr_next_sched;
					next_wakeup_initialized = true;
				}
				continue;
			}
			if (down_read_trylock(&elr->lr_super->s_umount)) {
				if (sb_start_write_trylock(elr->lr_super)) {
					progress = 1;
					/*
					 * We hold sb->s_umount, sb can not
					 * be removed from the list, it is
					 * now safe to drop li_list_mtx
					 */
					mutex_unlock(&eli->li_list_mtx);
					err = ext4_run_li_request(elr);
					sb_end_write(elr->lr_super);
					mutex_lock(&eli->li_list_mtx);
					/*
					 * li_list_mtx was dropped above, so
					 * re-read the successor of pos.
					 */
					n = pos->next;
				}
				up_read((&elr->lr_super->s_umount));
			}
			/* error, remove the lazy_init job */
			if (err) {
				ext4_remove_li_request(elr);
				continue;
			}
			/*
			 * One of the trylocks failed, so the request did not
			 * run: retry after a random delay.
			 */
			if (!progress) {
				elr->lr_next_sched = jiffies +
					get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
			}
			if (!next_wakeup_initialized ||
			    time_before(elr->lr_next_sched, next_wakeup)) {
				next_wakeup = elr->lr_next_sched;
				next_wakeup_initialized = true;
			}
		}
		mutex_unlock(&eli->li_list_mtx);

		try_to_freeze();

		cur = jiffies;
		/*
		 * No deadline recorded, or the earliest one has already
		 * passed: rescan right away instead of sleeping.
		 */
		if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) {
			cond_resched();
			continue;
		}

		schedule_timeout_interruptible(next_wakeup - cur);

		if (kthread_should_stop()) {
			ext4_clear_request_list();
			goto exit_thread;
		}
	}

exit_thread:
	/*
	 * It looks like the request list is empty, but we need
	 * to check it under the li_list_mtx lock, to prevent any
	 * additions into it, and of course we should lock ext4_li_mtx
	 * to atomically free the list and ext4_li_info, because at
	 * this point another ext4 filesystem could be registering
	 * new one.
	 */
	mutex_lock(&ext4_li_mtx);
	mutex_lock(&eli->li_list_mtx);
	if (!list_empty(&eli->li_request_list)) {
		/* A request was queued in the meantime: keep running. */
		mutex_unlock(&eli->li_list_mtx);
		mutex_unlock(&ext4_li_mtx);
		goto cont_thread;
	}
	mutex_unlock(&eli->li_list_mtx);
	kfree(ext4_li_info);
	ext4_li_info = NULL;
	mutex_unlock(&ext4_li_mtx);

	return 0;
}
3926
ext4_clear_request_list(void)3927 static void ext4_clear_request_list(void)
3928 {
3929 struct list_head *pos, *n;
3930 struct ext4_li_request *elr;
3931
3932 mutex_lock(&ext4_li_info->li_list_mtx);
3933 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3934 elr = list_entry(pos, struct ext4_li_request,
3935 lr_request);
3936 ext4_remove_li_request(elr);
3937 }
3938 mutex_unlock(&ext4_li_info->li_list_mtx);
3939 }
3940
ext4_run_lazyinit_thread(void)3941 static int ext4_run_lazyinit_thread(void)
3942 {
3943 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3944 ext4_li_info, "ext4lazyinit");
3945 if (IS_ERR(ext4_lazyinit_task)) {
3946 int err = PTR_ERR(ext4_lazyinit_task);
3947 ext4_clear_request_list();
3948 kfree(ext4_li_info);
3949 ext4_li_info = NULL;
3950 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3951 "initialization thread\n",
3952 err);
3953 return err;
3954 }
3955 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3956 return 0;
3957 }
3958
3959 /*
3960 * Check whether it make sense to run itable init. thread or not.
3961 * If there is at least one uninitialized inode table, return
3962 * corresponding group number, else the loop goes through all
3963 * groups and return total number of groups.
3964 */
ext4_has_uninit_itable(struct super_block * sb)3965 static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3966 {
3967 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3968 struct ext4_group_desc *gdp = NULL;
3969
3970 if (!ext4_has_group_desc_csum(sb))
3971 return ngroups;
3972
3973 for (group = 0; group < ngroups; group++) {
3974 gdp = ext4_get_group_desc(sb, group, NULL);
3975 if (!gdp)
3976 continue;
3977
3978 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3979 break;
3980 }
3981
3982 return group;
3983 }
3984
ext4_li_info_new(void)3985 static int ext4_li_info_new(void)
3986 {
3987 struct ext4_lazy_init *eli = NULL;
3988
3989 eli = kzalloc_obj(*eli);
3990 if (!eli)
3991 return -ENOMEM;
3992
3993 INIT_LIST_HEAD(&eli->li_request_list);
3994 mutex_init(&eli->li_list_mtx);
3995
3996 eli->li_state |= EXT4_LAZYINIT_QUIT;
3997
3998 ext4_li_info = eli;
3999
4000 return 0;
4001 }
4002
/*
 * Allocate a lazy-init request for @sb.
 *
 * @start is recorded as lr_first_not_zeroed.  With
 * NO_PREFETCH_BLOCK_BITMAPS set the request begins directly in inode
 * table mode at group @start; otherwise it begins in block bitmap
 * prefetch mode.
 *
 * Returns the new request, or NULL if the allocation fails.
 */
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
						   ext4_group_t start)
{
	struct ext4_li_request *elr = kzalloc_obj(*elr);

	if (!elr)
		return NULL;

	elr->lr_super = sb;
	elr->lr_first_not_zeroed = start;
	elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
	if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
		elr->lr_mode = EXT4_LI_MODE_ITABLE;
		elr->lr_next_group = start;
	}

	/*
	 * Pick a random first schedule time so that requests from
	 * different filesystems are spread out instead of all firing
	 * at once.
	 */
	elr->lr_next_sched = jiffies +
		get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
	return elr;
}
4029
/*
 * Queue a lazy-init request for @sb, creating the global ext4_li_info and
 * starting the lazy-init thread when they do not exist yet.
 *
 * @sb: filesystem to register
 * @first_not_zeroed: first group whose inode table still needs zeroing;
 *	compared against the group count below to detect "none".
 *
 * If @sb already has a request, only its timeout is reset.  Nothing is
 * queued for a read-only or emergency-state filesystem, or when block
 * bitmap prefetch is disabled and there is no inode table work to do.
 *
 * Returns 0 on success (including the "nothing to do" cases), -ENOMEM if
 * the request cannot be allocated, or the error returned by
 * ext4_li_info_new() / ext4_run_lazyinit_thread().
 */
int ext4_register_li_request(struct super_block *sb,
			     ext4_group_t first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_li_request *elr = NULL;
	ext4_group_t ngroups = sbi->s_groups_count;
	int ret = 0;

	mutex_lock(&ext4_li_mtx);
	if (sbi->s_li_request != NULL) {
		/*
		 * Reset timeout so it can be computed again, because
		 * s_li_wait_mult might have changed.
		 */
		sbi->s_li_request->lr_timeout = 0;
		goto out;
	}

	/*
	 * Without bitmap prefetch the only possible work is inode table
	 * zeroing, so bail out if that is finished or disabled.
	 */
	if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
	    (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
	     (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
		goto out;

	elr = ext4_li_request_new(sb, first_not_zeroed);
	if (!elr) {
		ret = -ENOMEM;
		goto out;
	}

	if (NULL == ext4_li_info) {
		ret = ext4_li_info_new();
		if (ret)
			goto out;
	}

	mutex_lock(&ext4_li_info->li_list_mtx);
	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
	mutex_unlock(&ext4_li_info->li_list_mtx);

	sbi->s_li_request = elr;
	/*
	 * set elr to NULL here since it has been inserted to
	 * the request_list and the removal and free of it is
	 * handled by ext4_clear_request_list from now on.
	 */
	elr = NULL;

	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
		ret = ext4_run_lazyinit_thread();
		if (ret)
			goto out;
	}
out:
	mutex_unlock(&ext4_li_mtx);
	/* elr is still non-NULL here only if it was never queued. */
	if (ret)
		kfree(elr);
	return ret;
}
4088
4089 /*
4090 * We do not need to lock anything since this is called on
4091 * module unload.
4092 */
ext4_destroy_lazyinit_thread(void)4093 static void ext4_destroy_lazyinit_thread(void)
4094 {
4095 /*
4096 * If thread exited earlier
4097 * there's nothing to be done.
4098 */
4099 if (!ext4_li_info || !ext4_lazyinit_task)
4100 return;
4101
4102 kthread_stop(ext4_lazyinit_task);
4103 }
4104
set_journal_csum_feature_set(struct super_block * sb)4105 static int set_journal_csum_feature_set(struct super_block *sb)
4106 {
4107 int ret = 1;
4108 int compat, incompat;
4109 struct ext4_sb_info *sbi = EXT4_SB(sb);
4110
4111 if (ext4_has_feature_metadata_csum(sb)) {
4112 /* journal checksum v3 */
4113 compat = 0;
4114 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
4115 } else {
4116 /* journal checksum v1 */
4117 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
4118 incompat = 0;
4119 }
4120
4121 jbd2_journal_clear_features(sbi->s_journal,
4122 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
4123 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
4124 JBD2_FEATURE_INCOMPAT_CSUM_V2);
4125 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4126 ret = jbd2_journal_set_features(sbi->s_journal,
4127 compat, 0,
4128 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
4129 incompat);
4130 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
4131 ret = jbd2_journal_set_features(sbi->s_journal,
4132 compat, 0,
4133 incompat);
4134 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4135 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4136 } else {
4137 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
4138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
4139 }
4140
4141 return ret;
4142 }
4143
/*
 * Note: calculating the overhead so we can be compatible with
 * historical BSD practice is quite difficult in the face of
 * clusters/bigalloc.  This is because multiple metadata blocks from
 * different block group can end up in the same allocation cluster.
 * Calculating the exact overhead in the face of clustered allocation
 * requires either O(all block bitmaps) in memory or O(number of block
 * groups**2) in time.  We will still calculate the overhead here for
 * older file systems --- and if we come across a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * count_overhead() returns the overhead of block group @grp.
 *
 * @sb:  filesystem
 * @grp: block group to examine
 * @buf: scratch bitmap in which one bit per overhead cluster is set;
 *	 presumably expected to be zeroed by the caller (the caller in
 *	 this file clears it after every non-zero result).
 *
 * Without bigalloc the result is a block count computed arithmetically
 * and @buf is not touched.  With bigalloc the result is the number of
 * bits set in the first EXT4_CLUSTERS_PER_GROUP(sb) bits of @buf, or 0
 * if no metadata block was found in the group.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
			  char *buf)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp;
	ext4_fsblk_t first_block, last_block, b;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
	int s, j, count = 0;
	int has_super = ext4_bg_has_super(sb, grp);

	/*
	 * Non-bigalloc: superblock copy, group descriptor blocks, reserved
	 * GDT blocks (only counted when the group has a superblock), the
	 * inode table, plus 2 (presumably the block and inode bitmaps).
	 */
	if (!ext4_has_feature_bigalloc(sb))
		return (has_super + ext4_bg_num_gdb(sb, grp) +
			(has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
			sbi->s_itb_per_group + 2);

	/* Block range [first_block, last_block] covered by group @grp. */
	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
		(grp * EXT4_BLOCKS_PER_GROUP(sb));
	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
	/*
	 * Scan every group's descriptor: metadata belonging to any group
	 * counts here if it is located inside @grp's block range.
	 */
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		b = ext4_block_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_table(sb, gdp);
		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
				int c = EXT4_B2C(sbi, b - first_block);
				ext4_set_bit(c, buf);
				count++;
			}
		/* Superblock and descriptor blocks are only counted for @grp. */
		if (i != grp)
			continue;
		/* s is the next block offset within the group to mark. */
		s = 0;
		if (ext4_bg_has_super(sb, grp)) {
			ext4_set_bit(s++, buf);
			count++;
		}
		j = ext4_bg_num_gdb(sb, grp);
		/* Clamp a descriptor block count that would overrun the group. */
		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
			ext4_error(sb, "Invalid number of block group "
				   "descriptor blocks: %d", j);
			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
		}
		count += j;
		for (; j > 0; j--)
			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
	}
	if (!count)
		return 0;
	/*
	 * Several blocks can map to the same cluster bit, so the result is
	 * taken from the bitmap rather than from count.
	 */
	return EXT4_CLUSTERS_PER_GROUP(sb) -
		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}
4218
4219 /*
4220 * Compute the overhead and stash it in sbi->s_overhead
4221 */
ext4_calculate_overhead(struct super_block * sb)4222 int ext4_calculate_overhead(struct super_block *sb)
4223 {
4224 struct ext4_sb_info *sbi = EXT4_SB(sb);
4225 struct ext4_super_block *es = sbi->s_es;
4226 struct inode *j_inode;
4227 unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
4228 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4229 ext4_fsblk_t overhead = 0;
4230 char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO);
4231
4232 if (!buf)
4233 return -ENOMEM;
4234
4235 /*
4236 * Compute the overhead (FS structures). This is constant
4237 * for a given filesystem unless the number of block groups
4238 * changes so we cache the previous value until it does.
4239 */
4240
4241 /*
4242 * All of the blocks before first_data_block are overhead
4243 */
4244 overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
4245
4246 /*
4247 * Add the overhead found in each block group
4248 */
4249 for (i = 0; i < ngroups; i++) {
4250 int blks;
4251
4252 blks = count_overhead(sb, i, buf);
4253 overhead += blks;
4254 if (blks)
4255 memset(buf, 0, sb->s_blocksize);
4256 cond_resched();
4257 }
4258
4259 /*
4260 * Add the internal journal blocks whether the journal has been
4261 * loaded or not
4262 */
4263 if (sbi->s_journal && !sbi->s_journal_bdev_file)
4264 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
4265 else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
4266 /* j_inum for internal journal is non-zero */
4267 j_inode = ext4_get_journal_inode(sb, j_inum);
4268 if (!IS_ERR(j_inode)) {
4269 j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
4270 overhead += EXT4_NUM_B2C(sbi, j_blocks);
4271 iput(j_inode);
4272 } else {
4273 ext4_msg(sb, KERN_ERR, "can't get journal size");
4274 }
4275 }
4276 sbi->s_overhead = overhead;
4277 smp_wmb();
4278 kvfree(buf);
4279 return 0;
4280 }
4281
ext4_set_resv_clusters(struct super_block * sb)4282 static void ext4_set_resv_clusters(struct super_block *sb)
4283 {
4284 ext4_fsblk_t resv_clusters;
4285 struct ext4_sb_info *sbi = EXT4_SB(sb);
4286
4287 /*
4288 * There's no need to reserve anything when we aren't using extents.
4289 * The space estimates are exact, there are no unwritten extents,
4290 * hole punching doesn't need new metadata... This is needed especially
4291 * to keep ext2/3 backward compatibility.
4292 */
4293 if (!ext4_has_feature_extents(sb))
4294 return;
4295 /*
4296 * By default we reserve 2% or 4096 clusters, whichever is smaller.
4297 * This should cover the situations where we can not afford to run
4298 * out of space like for example punch hole, or converting
4299 * unwritten extents in delalloc path. In most cases such
4300 * allocation would require 1, or 2 blocks, higher numbers are
4301 * very rare.
4302 */
4303 resv_clusters = (ext4_blocks_count(sbi->s_es) >>
4304 sbi->s_cluster_bits);
4305
4306 do_div(resv_clusters, 50);
4307 resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
4308
4309 atomic64_set(&sbi->s_resv_clusters, resv_clusters);
4310 }
4311
/*
 * Return a short string naming the quota mode of @sb: "disabled" when
 * the kernel is built without CONFIG_QUOTA, "none" when the filesystem
 * is not quota capable, "journalled" when it has a journal and quota is
 * journalled, and "writeback" otherwise.
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
	if (ext4_quota_capable(sb))
		return (EXT4_SB(sb)->s_journal &&
			ext4_is_quota_journalled(sb)) ?
				"journalled" : "writeback";

	return "none";
#else
	return "disabled";
#endif
}
4326
/*
 * Set up the journal trigger slot selected by @type: install @trigger as
 * its t_frozen callback and record @sb in the slot.
 */
static void ext4_setup_csum_trigger(struct super_block *sb,
				    enum ext4_journal_trigger_type type,
				    void (*trigger)(
					struct jbd2_buffer_trigger_type *type,
					struct buffer_head *bh,
					void *mapped_data,
					size_t size))
{
	EXT4_SB(sb)->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
	EXT4_SB(sb)->s_journal_triggers[type].sb = sb;
}
4340
ext4_free_sbi(struct ext4_sb_info * sbi)4341 static void ext4_free_sbi(struct ext4_sb_info *sbi)
4342 {
4343 if (!sbi)
4344 return;
4345
4346 kfree(sbi->s_blockgroup_lock);
4347 fs_put_dax(sbi->s_daxdev, NULL);
4348 kfree(sbi);
4349 }
4350
ext4_alloc_sbi(struct super_block * sb)4351 static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
4352 {
4353 struct ext4_sb_info *sbi;
4354
4355 sbi = kzalloc_obj(*sbi);
4356 if (!sbi)
4357 return NULL;
4358
4359 sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
4360 NULL, NULL);
4361
4362 sbi->s_blockgroup_lock =
4363 kzalloc_obj(struct blockgroup_lock);
4364
4365 if (!sbi->s_blockgroup_lock)
4366 goto err_out;
4367
4368 sb->s_fs_info = sbi;
4369 sbi->s_sb = sb;
4370 return sbi;
4371 err_out:
4372 fs_put_dax(sbi->s_daxdev, NULL);
4373 kfree(sbi);
4374 return NULL;
4375 }
4376
ext4_set_def_opts(struct super_block * sb,struct ext4_super_block * es)4377 static void ext4_set_def_opts(struct super_block *sb,
4378 struct ext4_super_block *es)
4379 {
4380 unsigned long def_mount_opts;
4381
4382 /* Set defaults before we parse the mount options */
4383 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
4384 set_opt(sb, INIT_INODE_TABLE);
4385 if (def_mount_opts & EXT4_DEFM_DEBUG)
4386 set_opt(sb, DEBUG);
4387 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
4388 set_opt(sb, GRPID);
4389 if (def_mount_opts & EXT4_DEFM_UID16)
4390 set_opt(sb, NO_UID32);
4391 /* xattr user namespace & acls are now defaulted on */
4392 set_opt(sb, XATTR_USER);
4393 #ifdef CONFIG_EXT4_FS_POSIX_ACL
4394 set_opt(sb, POSIX_ACL);
4395 #endif
4396 if (ext4_has_feature_fast_commit(sb))
4397 set_opt2(sb, JOURNAL_FAST_COMMIT);
4398 /* don't forget to enable journal_csum when metadata_csum is enabled. */
4399 if (ext4_has_feature_metadata_csum(sb))
4400 set_opt(sb, JOURNAL_CHECKSUM);
4401
4402 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
4403 set_opt(sb, JOURNAL_DATA);
4404 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
4405 set_opt(sb, ORDERED_DATA);
4406 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
4407 set_opt(sb, WRITEBACK_DATA);
4408
4409 if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
4410 set_opt(sb, ERRORS_PANIC);
4411 else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
4412 set_opt(sb, ERRORS_CONT);
4413 else
4414 set_opt(sb, ERRORS_RO);
4415 /* block_validity enabled by default; disable with noblock_validity */
4416 set_opt(sb, BLOCK_VALIDITY);
4417 if (def_mount_opts & EXT4_DEFM_DISCARD)
4418 set_opt(sb, DISCARD);
4419
4420 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
4421 set_opt(sb, BARRIER);
4422
4423 /*
4424 * enable delayed allocation by default
4425 * Use -o nodelalloc to turn it off
4426 */
4427 if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
4428 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
4429 set_opt(sb, DELALLOC);
4430
4431 set_opt(sb, DIOREAD_NOLOCK);
4432 }
4433
ext4_handle_clustersize(struct super_block * sb)4434 static int ext4_handle_clustersize(struct super_block *sb)
4435 {
4436 struct ext4_sb_info *sbi = EXT4_SB(sb);
4437 struct ext4_super_block *es = sbi->s_es;
4438 int clustersize;
4439
4440 /* Handle clustersize */
4441 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4442 if (ext4_has_feature_bigalloc(sb)) {
4443 if (clustersize < sb->s_blocksize) {
4444 ext4_msg(sb, KERN_ERR,
4445 "cluster size (%d) smaller than "
4446 "block size (%lu)", clustersize, sb->s_blocksize);
4447 return -EINVAL;
4448 }
4449 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4450 le32_to_cpu(es->s_log_block_size);
4451 } else {
4452 if (clustersize != sb->s_blocksize) {
4453 ext4_msg(sb, KERN_ERR,
4454 "fragment/cluster size (%d) != "
4455 "block size (%lu)", clustersize, sb->s_blocksize);
4456 return -EINVAL;
4457 }
4458 if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
4459 ext4_msg(sb, KERN_ERR,
4460 "#blocks per group too big: %lu",
4461 sbi->s_blocks_per_group);
4462 return -EINVAL;
4463 }
4464 sbi->s_cluster_bits = 0;
4465 }
4466 sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
4467 if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
4468 ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
4469 sbi->s_clusters_per_group);
4470 return -EINVAL;
4471 }
4472 if (sbi->s_blocks_per_group !=
4473 (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
4474 ext4_msg(sb, KERN_ERR,
4475 "blocks per group (%lu) and clusters per group (%lu) inconsistent",
4476 sbi->s_blocks_per_group, sbi->s_clusters_per_group);
4477 return -EINVAL;
4478 }
4479 sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
4480
4481 /* Do we have standard group size of clustersize * 8 blocks ? */
4482 if (sbi->s_blocks_per_group == clustersize << 3)
4483 set_opt2(sb, STD_GROUP_SIZE);
4484
4485 return 0;
4486 }
4487
4488 /*
4489 * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
4490 * With non-bigalloc filesystem awu will be based upon filesystem blocksize
4491 * & bdev awu units.
4492 * With bigalloc it will be based upon bigalloc cluster size & bdev awu units.
4493 * @sb: super block
4494 */
ext4_atomic_write_init(struct super_block * sb)4495 static void ext4_atomic_write_init(struct super_block *sb)
4496 {
4497 struct ext4_sb_info *sbi = EXT4_SB(sb);
4498 struct block_device *bdev = sb->s_bdev;
4499 unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
4500
4501 if (!bdev_can_atomic_write(bdev))
4502 return;
4503
4504 if (!ext4_has_feature_extents(sb))
4505 return;
4506
4507 sbi->s_awu_min = max(sb->s_blocksize,
4508 bdev_atomic_write_unit_min_bytes(bdev));
4509 sbi->s_awu_max = min(clustersize,
4510 bdev_atomic_write_unit_max_bytes(bdev));
4511 if (sbi->s_awu_min && sbi->s_awu_max &&
4512 sbi->s_awu_min <= sbi->s_awu_max) {
4513 ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
4514 sbi->s_awu_min, sbi->s_awu_max);
4515 } else {
4516 sbi->s_awu_min = 0;
4517 sbi->s_awu_max = 0;
4518 }
4519 }
4520
ext4_fast_commit_init(struct super_block * sb)4521 static void ext4_fast_commit_init(struct super_block *sb)
4522 {
4523 struct ext4_sb_info *sbi = EXT4_SB(sb);
4524
4525 /* Initialize fast commit stuff */
4526 atomic_set(&sbi->s_fc_subtid, 0);
4527 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
4528 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
4529 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
4530 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
4531 sbi->s_fc_bytes = 0;
4532 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
4533 sbi->s_fc_ineligible_tid = 0;
4534 mutex_init(&sbi->s_fc_lock);
4535 memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
4536 sbi->s_fc_replay_state.fc_regions = NULL;
4537 sbi->s_fc_replay_state.fc_regions_size = 0;
4538 sbi->s_fc_replay_state.fc_regions_used = 0;
4539 sbi->s_fc_replay_state.fc_regions_valid = 0;
4540 sbi->s_fc_replay_state.fc_modified_inodes = NULL;
4541 sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
4542 sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
4543 }
4544
ext4_inode_info_init(struct super_block * sb,struct ext4_super_block * es)4545 static int ext4_inode_info_init(struct super_block *sb,
4546 struct ext4_super_block *es)
4547 {
4548 struct ext4_sb_info *sbi = EXT4_SB(sb);
4549
4550 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
4551 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
4552 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
4553 } else {
4554 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
4555 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
4556 if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
4557 ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
4558 sbi->s_first_ino);
4559 return -EINVAL;
4560 }
4561 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
4562 (!is_power_of_2(sbi->s_inode_size)) ||
4563 (sbi->s_inode_size > sb->s_blocksize)) {
4564 ext4_msg(sb, KERN_ERR,
4565 "unsupported inode size: %d",
4566 sbi->s_inode_size);
4567 ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
4568 return -EINVAL;
4569 }
4570 /*
4571 * i_atime_extra is the last extra field available for
4572 * [acm]times in struct ext4_inode. Checking for that
4573 * field should suffice to ensure we have extra space
4574 * for all three.
4575 */
4576 if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
4577 sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
4578 sb->s_time_gran = 1;
4579 sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
4580 } else {
4581 sb->s_time_gran = NSEC_PER_SEC;
4582 sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
4583 }
4584 sb->s_time_min = EXT4_TIMESTAMP_MIN;
4585 }
4586
4587 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4588 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4589 EXT4_GOOD_OLD_INODE_SIZE;
4590 if (ext4_has_feature_extra_isize(sb)) {
4591 unsigned v, max = (sbi->s_inode_size -
4592 EXT4_GOOD_OLD_INODE_SIZE);
4593
4594 v = le16_to_cpu(es->s_want_extra_isize);
4595 if (v > max) {
4596 ext4_msg(sb, KERN_ERR,
4597 "bad s_want_extra_isize: %d", v);
4598 return -EINVAL;
4599 }
4600 if (sbi->s_want_extra_isize < v)
4601 sbi->s_want_extra_isize = v;
4602
4603 v = le16_to_cpu(es->s_min_extra_isize);
4604 if (v > max) {
4605 ext4_msg(sb, KERN_ERR,
4606 "bad s_min_extra_isize: %d", v);
4607 return -EINVAL;
4608 }
4609 if (sbi->s_want_extra_isize < v)
4610 sbi->s_want_extra_isize = v;
4611 }
4612 }
4613
4614 return 0;
4615 }
4616
4617 #if IS_ENABLED(CONFIG_UNICODE)
ext4_encoding_init(struct super_block * sb,struct ext4_super_block * es)4618 static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4619 {
4620 const struct ext4_sb_encodings *encoding_info;
4621 struct unicode_map *encoding;
4622 __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
4623
4624 if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
4625 return 0;
4626
4627 encoding_info = ext4_sb_read_encoding(es);
4628 if (!encoding_info) {
4629 ext4_msg(sb, KERN_ERR,
4630 "Encoding requested by superblock is unknown");
4631 return -EINVAL;
4632 }
4633
4634 encoding = utf8_load(encoding_info->version);
4635 if (IS_ERR(encoding)) {
4636 ext4_msg(sb, KERN_ERR,
4637 "can't mount with superblock charset: %s-%u.%u.%u "
4638 "not supported by the kernel. flags: 0x%x.",
4639 encoding_info->name,
4640 unicode_major(encoding_info->version),
4641 unicode_minor(encoding_info->version),
4642 unicode_rev(encoding_info->version),
4643 encoding_flags);
4644 return -EINVAL;
4645 }
4646 ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4647 "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
4648 unicode_major(encoding_info->version),
4649 unicode_minor(encoding_info->version),
4650 unicode_rev(encoding_info->version),
4651 encoding_flags);
4652
4653 sb->s_encoding = encoding;
4654 sb->s_encoding_flags = encoding_flags;
4655
4656 return 0;
4657 }
4658 #else
ext4_encoding_init(struct super_block * sb,struct ext4_super_block * es)4659 static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
4660 {
4661 return 0;
4662 }
4663 #endif
4664
ext4_init_metadata_csum(struct super_block * sb,struct ext4_super_block * es)4665 static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
4666 {
4667 struct ext4_sb_info *sbi = EXT4_SB(sb);
4668
4669 /* Warn if metadata_csum and gdt_csum are both set. */
4670 if (ext4_has_feature_metadata_csum(sb) &&
4671 ext4_has_feature_gdt_csum(sb))
4672 ext4_warning(sb, "metadata_csum and uninit_bg are "
4673 "redundant flags; please run fsck.");
4674
4675 /* Check for a known checksum algorithm */
4676 if (!ext4_verify_csum_type(sb, es)) {
4677 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4678 "unknown checksum algorithm.");
4679 return -EINVAL;
4680 }
4681 ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
4682 ext4_orphan_file_block_trigger);
4683
4684 /* Check superblock checksum */
4685 if (!ext4_superblock_csum_verify(sb, es)) {
4686 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
4687 "invalid superblock checksum. Run e2fsck?");
4688 return -EFSBADCRC;
4689 }
4690
4691 /* Precompute checksum seed for all metadata */
4692 if (ext4_has_feature_csum_seed(sb))
4693 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
4694 else if (ext4_has_feature_metadata_csum(sb) ||
4695 ext4_has_feature_ea_inode(sb))
4696 sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
4697 sizeof(es->s_uuid));
4698 return 0;
4699 }
4700
ext4_check_feature_compatibility(struct super_block * sb,struct ext4_super_block * es,int silent)4701 static int ext4_check_feature_compatibility(struct super_block *sb,
4702 struct ext4_super_block *es,
4703 int silent)
4704 {
4705 struct ext4_sb_info *sbi = EXT4_SB(sb);
4706
4707 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4708 (ext4_has_compat_features(sb) ||
4709 ext4_has_ro_compat_features(sb) ||
4710 ext4_has_incompat_features(sb)))
4711 ext4_msg(sb, KERN_WARNING,
4712 "feature flags set on rev 0 fs, "
4713 "running e2fsck is recommended");
4714
4715 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4716 set_opt2(sb, HURD_COMPAT);
4717 if (ext4_has_feature_64bit(sb)) {
4718 ext4_msg(sb, KERN_ERR,
4719 "The Hurd can't support 64-bit file systems");
4720 return -EINVAL;
4721 }
4722
4723 /*
4724 * ea_inode feature uses l_i_version field which is not
4725 * available in HURD_COMPAT mode.
4726 */
4727 if (ext4_has_feature_ea_inode(sb)) {
4728 ext4_msg(sb, KERN_ERR,
4729 "ea_inode feature is not supported for Hurd");
4730 return -EINVAL;
4731 }
4732 }
4733
4734 if (IS_EXT2_SB(sb)) {
4735 if (ext2_feature_set_ok(sb))
4736 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4737 "using the ext4 subsystem");
4738 else {
4739 /*
4740 * If we're probing be silent, if this looks like
4741 * it's actually an ext[34] filesystem.
4742 */
4743 if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4744 return -EINVAL;
4745 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4746 "to feature incompatibilities");
4747 return -EINVAL;
4748 }
4749 }
4750
4751 if (IS_EXT3_SB(sb)) {
4752 if (ext3_feature_set_ok(sb))
4753 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4754 "using the ext4 subsystem");
4755 else {
4756 /*
4757 * If we're probing be silent, if this looks like
4758 * it's actually an ext4 filesystem.
4759 */
4760 if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4761 return -EINVAL;
4762 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4763 "to feature incompatibilities");
4764 return -EINVAL;
4765 }
4766 }
4767
4768 /*
4769 * Check feature flags regardless of the revision level, since we
4770 * previously didn't change the revision level when setting the flags,
4771 * so there is a chance incompat flags are set on a rev 0 filesystem.
4772 */
4773 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4774 return -EINVAL;
4775
4776 if (sbi->s_daxdev) {
4777 if (sb->s_blocksize == PAGE_SIZE)
4778 set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4779 else
4780 ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
4781 }
4782
4783 if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4784 if (ext4_has_feature_inline_data(sb)) {
4785 ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4786 " that may contain inline data");
4787 return -EINVAL;
4788 }
4789 if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4790 ext4_msg(sb, KERN_ERR,
4791 "DAX unsupported by block device.");
4792 return -EINVAL;
4793 }
4794 }
4795
4796 if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4797 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4798 es->s_encryption_level);
4799 return -EINVAL;
4800 }
4801
4802 return 0;
4803 }
4804
ext4_check_geometry(struct super_block * sb,struct ext4_super_block * es)4805 static int ext4_check_geometry(struct super_block *sb,
4806 struct ext4_super_block *es)
4807 {
4808 struct ext4_sb_info *sbi = EXT4_SB(sb);
4809 __u64 blocks_count;
4810 int err;
4811
4812 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
4813 ext4_msg(sb, KERN_ERR,
4814 "Number of reserved GDT blocks insanely large: %d",
4815 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4816 return -EINVAL;
4817 }
4818 /*
4819 * Test whether we have more sectors than will fit in sector_t,
4820 * and whether the max offset is addressable by the page cache.
4821 */
4822 err = generic_check_addressable(sb->s_blocksize_bits,
4823 ext4_blocks_count(es));
4824 if (err) {
4825 ext4_msg(sb, KERN_ERR, "filesystem"
4826 " too large to mount safely on this system");
4827 return err;
4828 }
4829
4830 /* check blocks count against device size */
4831 blocks_count = sb_bdev_nr_blocks(sb);
4832 if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4833 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4834 "exceeds size of device (%llu blocks)",
4835 ext4_blocks_count(es), blocks_count);
4836 return -EINVAL;
4837 }
4838
4839 /*
4840 * It makes no sense for the first data block to be beyond the end
4841 * of the filesystem.
4842 */
4843 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4844 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4845 "block %u is beyond end of filesystem (%llu)",
4846 le32_to_cpu(es->s_first_data_block),
4847 ext4_blocks_count(es));
4848 return -EINVAL;
4849 }
4850 if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4851 (sbi->s_cluster_ratio == 1)) {
4852 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4853 "block is 0 with a 1k block and cluster size");
4854 return -EINVAL;
4855 }
4856
4857 blocks_count = (ext4_blocks_count(es) -
4858 le32_to_cpu(es->s_first_data_block) +
4859 EXT4_BLOCKS_PER_GROUP(sb) - 1);
4860 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4861 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4862 ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4863 "(block count %llu, first data block %u, "
4864 "blocks per group %lu)", blocks_count,
4865 ext4_blocks_count(es),
4866 le32_to_cpu(es->s_first_data_block),
4867 EXT4_BLOCKS_PER_GROUP(sb));
4868 return -EINVAL;
4869 }
4870 sbi->s_groups_count = blocks_count;
4871 sbi->s_blockfile_groups = min(sbi->s_groups_count,
4872 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4873 if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4874 le32_to_cpu(es->s_inodes_count)) {
4875 ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4876 le32_to_cpu(es->s_inodes_count),
4877 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4878 return -EINVAL;
4879 }
4880
4881 return 0;
4882 }
4883
/*
 * Allocate the group descriptor buffer array and read every descriptor
 * block into it.
 *
 * @sb:               superblock being mounted
 * @es:               on-disk superblock
 * @logical_sb_block: block holding the superblock, handed to
 *                    descriptor_loc() to locate each descriptor block
 * @first_not_zeroed: passed through to ext4_check_descriptors()
 *
 * sbi->s_gdb_count is set to the number of descriptor buffers actually
 * stored, including on a read failure part way through.
 *
 * Returns 0 on success, -EINVAL if s_first_meta_bg exceeds the descriptor
 * block count, -ENOMEM if the array cannot be allocated, the read error for
 * a descriptor block that cannot be read, or -EFSCORRUPTED when
 * ext4_check_descriptors() rejects the descriptors.
 */
static int ext4_group_desc_init(struct super_block *sb,
				struct ext4_super_block *es,
				ext4_fsblk_t logical_sb_block,
				ext4_group_t *first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned int nr_gdb;
	ext4_fsblk_t gdb_block;
	int idx;

	/* Descriptor blocks needed to cover every group, rounded up. */
	nr_gdb = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
		 EXT4_DESC_PER_BLOCK(sb);

	if (ext4_has_feature_meta_bg(sb) &&
	    le32_to_cpu(es->s_first_meta_bg) > nr_gdb) {
		ext4_msg(sb, KERN_WARNING,
			 "first meta block group too large: %u (group descriptor block count %u)",
			 le32_to_cpu(es->s_first_meta_bg), nr_gdb);
		return -EINVAL;
	}

	rcu_assign_pointer(sbi->s_group_desc,
			   kvmalloc_objs(struct buffer_head *, nr_gdb));
	if (sbi->s_group_desc == NULL) {
		ext4_msg(sb, KERN_ERR, "not enough memory");
		return -ENOMEM;
	}

	bgl_lock_init(sbi->s_blockgroup_lock);

	/* First pass: issue readahead for every descriptor block. */
	for (idx = 0; idx < nr_gdb; idx++) {
		gdb_block = descriptor_loc(sb, logical_sb_block, idx);
		ext4_sb_breadahead_unmovable(sb, gdb_block);
	}

	/* Second pass: read each block and store its buffer head. */
	idx = 0;
	while (idx < nr_gdb) {
		struct buffer_head *bh;

		gdb_block = descriptor_loc(sb, logical_sb_block, idx);
		bh = ext4_sb_bread_unmovable(sb, gdb_block);
		if (IS_ERR(bh)) {
			ext4_msg(sb, KERN_ERR,
				 "can't read group descriptor %d", idx);
			/* Record how many buffers were stored before failing. */
			sbi->s_gdb_count = idx;
			return PTR_ERR(bh);
		}

		rcu_read_lock();
		rcu_dereference(sbi->s_group_desc)[idx] = bh;
		rcu_read_unlock();

		idx++;
	}
	sbi->s_gdb_count = nr_gdb;

	if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
		return -EFSCORRUPTED;
	}

	return 0;
}
4943
ext4_load_and_init_journal(struct super_block * sb,struct ext4_super_block * es,struct ext4_fs_context * ctx)4944 static int ext4_load_and_init_journal(struct super_block *sb,
4945 struct ext4_super_block *es,
4946 struct ext4_fs_context *ctx)
4947 {
4948 struct ext4_sb_info *sbi = EXT4_SB(sb);
4949 int err;
4950
4951 err = ext4_load_journal(sb, es, ctx->journal_devnum);
4952 if (err)
4953 return err;
4954
4955 if (ext4_has_feature_64bit(sb) &&
4956 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4957 JBD2_FEATURE_INCOMPAT_64BIT)) {
4958 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4959 goto out;
4960 }
4961
4962 if (!set_journal_csum_feature_set(sb)) {
4963 ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4964 "feature set");
4965 goto out;
4966 }
4967
4968 if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
4969 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4970 JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
4971 ext4_msg(sb, KERN_ERR,
4972 "Failed to set fast commit journal feature");
4973 goto out;
4974 }
4975
4976 /* We have now updated the journal if required, so we can
4977 * validate the data journaling mode. */
4978 switch (test_opt(sb, DATA_FLAGS)) {
4979 case 0:
4980 /* No mode set, assume a default based on the journal
4981 * capabilities: ORDERED_DATA if the journal can
4982 * cope, else JOURNAL_DATA
4983 */
4984 if (jbd2_journal_check_available_features
4985 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4986 set_opt(sb, ORDERED_DATA);
4987 sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4988 } else {
4989 set_opt(sb, JOURNAL_DATA);
4990 sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4991 }
4992 break;
4993
4994 case EXT4_MOUNT_ORDERED_DATA:
4995 case EXT4_MOUNT_WRITEBACK_DATA:
4996 if (!jbd2_journal_check_available_features
4997 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4998 ext4_msg(sb, KERN_ERR, "Journal does not support "
4999 "requested data journaling mode");
5000 goto out;
5001 }
5002 break;
5003 default:
5004 break;
5005 }
5006
5007 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
5008 test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5009 ext4_msg(sb, KERN_ERR, "can't mount with "
5010 "journal_async_commit in data=ordered mode");
5011 goto out;
5012 }
5013
5014 set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
5015
5016 sbi->s_journal->j_submit_inode_data_buffers =
5017 ext4_journal_submit_inode_data_buffers;
5018 sbi->s_journal->j_finish_inode_data_buffers =
5019 ext4_journal_finish_inode_data_buffers;
5020
5021 return 0;
5022
5023 out:
5024 ext4_journal_destroy(sbi, sbi->s_journal);
5025 return -EINVAL;
5026 }
5027
ext4_check_journal_data_mode(struct super_block * sb)5028 static int ext4_check_journal_data_mode(struct super_block *sb)
5029 {
5030 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
5031 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
5032 "data=journal disables delayed allocation, "
5033 "dioread_nolock, O_DIRECT and fast_commit support!\n");
5034 /* can't mount with both data=journal and dioread_nolock. */
5035 clear_opt(sb, DIOREAD_NOLOCK);
5036 clear_opt2(sb, JOURNAL_FAST_COMMIT);
5037 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
5038 ext4_msg(sb, KERN_ERR, "can't mount with "
5039 "both data=journal and delalloc");
5040 return -EINVAL;
5041 }
5042 if (test_opt(sb, DAX_ALWAYS)) {
5043 ext4_msg(sb, KERN_ERR, "can't mount with "
5044 "both data=journal and dax");
5045 return -EINVAL;
5046 }
5047 if (ext4_has_feature_encrypt(sb)) {
5048 ext4_msg(sb, KERN_WARNING,
5049 "encrypted files will use data=ordered "
5050 "instead of data journaling mode");
5051 }
5052 if (test_opt(sb, DELALLOC))
5053 clear_opt(sb, DELALLOC);
5054 } else {
5055 sb->s_iflags |= SB_I_CGROUPWB;
5056 }
5057
5058 return 0;
5059 }
5060
ext4_has_journal_option(struct super_block * sb)5061 static const char *ext4_has_journal_option(struct super_block *sb)
5062 {
5063 struct ext4_sb_info *sbi = EXT4_SB(sb);
5064
5065 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
5066 return "journal_async_commit";
5067 if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
5068 return "journal_checksum";
5069 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
5070 return "commit=";
5071 if (EXT4_MOUNT_DATA_FLAGS &
5072 (sbi->s_mount_opt ^ sbi->s_def_mount_opt))
5073 return "data=";
5074 if (test_opt(sb, DATA_ERR_ABORT))
5075 return "data_err=abort";
5076 return NULL;
5077 }
5078
/*
 * Limit the maximum folio order to 2048 blocks to prevent overestimation
 * of reserve handle credits during the folio writeback in environments
 * where the PAGE_SIZE exceeds 4KB.
 *
 * The order counts pages, so 2048 (2^11) blocks corresponds to an order of
 * 11 + s_blocksize_bits - PAGE_SHIFT. The result is additionally capped at
 * MAX_PAGECACHE_ORDER.
 */
#define EXT4_MAX_PAGECACHE_ORDER(sb)		\
		umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT))
ext4_set_max_mapping_order(struct super_block * sb)5086 static void ext4_set_max_mapping_order(struct super_block *sb)
5087 {
5088 struct ext4_sb_info *sbi = EXT4_SB(sb);
5089
5090 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
5091 sbi->s_max_folio_order = sbi->s_min_folio_order;
5092 else
5093 sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb);
5094 }
5095
ext4_check_large_folio(struct super_block * sb)5096 static int ext4_check_large_folio(struct super_block *sb)
5097 {
5098 const char *err_str = NULL;
5099
5100 if (ext4_has_feature_encrypt(sb))
5101 err_str = "encrypt";
5102
5103 if (!err_str) {
5104 ext4_set_max_mapping_order(sb);
5105 } else if (sb->s_blocksize > PAGE_SIZE) {
5106 ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s",
5107 sb->s_blocksize, PAGE_SIZE, err_str);
5108 return -EINVAL;
5109 }
5110
5111 return 0;
5112 }
5113
/*
 * Read the on-disk ext4 superblock and publish it in the in-memory
 * superblock info.
 *
 * @sb:     VFS superblock being filled in
 * @lsb:    out parameter; on success receives the number of the block
 *          (in units of the final block size) that holds the superblock
 * @silent: when non-zero, suppresses the "Can't find ext4 filesystem"
 *          message on a magic mismatch
 *
 * The superblock is first read with the block size picked by
 * sb_min_blocksize(). If the block size recorded in the superblock differs
 * from sb->s_blocksize, the buffer is released, the block size is switched
 * with sb_set_blocksize(), and the superblock is read a second time.
 *
 * On success sets sb->s_magic, sbi->s_es, sbi->s_sbh and
 * sbi->s_min_folio_order, stores the block number in *lsb and returns 0.
 * On failure returns a negative errno: the read error if a superblock read
 * failed, -EINVAL otherwise.
 *
 * NOTE(review): on the "goto out" paths taken after sbi->s_es was assigned,
 * s_es is left pointing into a buffer whose reference has been dropped -
 * presumably callers must not use it after a failure; confirm.
 */
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
			   int silent)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es;
	ext4_fsblk_t logical_sb_block;
	unsigned long offset = 0;
	struct buffer_head *bh;
	int ret = -EINVAL;	/* default error for every "goto out" path */
	int blocksize;

	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
	if (!blocksize) {
		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
		return -EINVAL;
	}

	/*
	 * The ext4 superblock will not be buffer aligned for other than 1kB
	 * block sizes. We need to calculate the offset from buffer start.
	 */
	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
		/* do_div() leaves the quotient in place and returns the remainder. */
		logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
		offset = do_div(logical_sb_block, blocksize);
	} else {
		logical_sb_block = sbi->s_sb_block;
	}

	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
	if (IS_ERR(bh)) {
		ext4_msg(sb, KERN_ERR, "unable to read superblock");
		return PTR_ERR(bh);
	}
	/*
	 * Note: s_es must be initialized as soon as possible because
	 * some ext4 macro-instructions depend on its value
	 */
	es = (struct ext4_super_block *) (bh->b_data + offset);
	sbi->s_es = es;
	sb->s_magic = le16_to_cpu(es->s_magic);
	if (sb->s_magic != EXT4_SUPER_MAGIC) {
		if (!silent)
			ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
		goto out;
	}

	/* Bound both log sizes before they are used as shift counts. */
	if (le32_to_cpu(es->s_log_block_size) >
	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
		ext4_msg(sb, KERN_ERR,
			 "Invalid log block size: %u",
			 le32_to_cpu(es->s_log_block_size));
		goto out;
	}
	if (le32_to_cpu(es->s_log_cluster_size) >
	    (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
		ext4_msg(sb, KERN_ERR,
			 "Invalid log cluster size: %u",
			 le32_to_cpu(es->s_log_cluster_size));
		goto out;
	}

	/* From here on, blocksize is the size recorded in the superblock. */
	blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

	/*
	 * If the default block size is not the same as the real block size,
	 * we need to reload it.
	 */
	if (sb->s_blocksize == blocksize)
		goto success;

	/*
	 * bh must be released before kill_bdev(), otherwise
	 * it won't be freed and its page also. kill_bdev()
	 * is called by sb_set_blocksize().
	 */
	brelse(bh);
	/* Validate the filesystem blocksize */
	if (!sb_set_blocksize(sb, blocksize)) {
		ext4_msg(sb, KERN_ERR, "bad block size %d",
			 blocksize);
		/* bh was already released above; keep "out" from releasing it again. */
		bh = NULL;
		goto out;
	}

	/* Recompute the superblock location for the new block size. */
	logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
	offset = do_div(logical_sb_block, blocksize);
	bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
	if (IS_ERR(bh)) {
		ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
		ret = PTR_ERR(bh);
		/* bh holds an error pointer here; it must not reach brelse(). */
		bh = NULL;
		goto out;
	}
	es = (struct ext4_super_block *)(bh->b_data + offset);
	sbi->s_es = es;
	if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
		ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
		goto out;
	}

success:
	sbi->s_min_folio_order = get_order(blocksize);
	*lsb = logical_sb_block;
	sbi->s_sbh = bh;
	return 0;
out:
	brelse(bh);
	return ret;
}
5223
ext4_hash_info_init(struct super_block * sb)5224 static int ext4_hash_info_init(struct super_block *sb)
5225 {
5226 struct ext4_sb_info *sbi = EXT4_SB(sb);
5227 struct ext4_super_block *es = sbi->s_es;
5228 unsigned int i;
5229
5230 sbi->s_def_hash_version = es->s_def_hash_version;
5231
5232 if (sbi->s_def_hash_version > DX_HASH_LAST) {
5233 ext4_msg(sb, KERN_ERR,
5234 "Invalid default hash set in the superblock");
5235 return -EINVAL;
5236 } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) {
5237 ext4_msg(sb, KERN_ERR,
5238 "SIPHASH is not a valid default hash value");
5239 return -EINVAL;
5240 }
5241
5242 for (i = 0; i < 4; i++)
5243 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
5244
5245 if (ext4_has_feature_dir_index(sb)) {
5246 i = le32_to_cpu(es->s_flags);
5247 if (i & EXT2_FLAGS_UNSIGNED_HASH)
5248 sbi->s_hash_unsigned = 3;
5249 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
5250 #ifdef __CHAR_UNSIGNED__
5251 if (!sb_rdonly(sb))
5252 es->s_flags |=
5253 cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
5254 sbi->s_hash_unsigned = 3;
5255 #else
5256 if (!sb_rdonly(sb))
5257 es->s_flags |=
5258 cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
5259 #endif
5260 }
5261 }
5262 return 0;
5263 }
5264
ext4_block_group_meta_init(struct super_block * sb,int silent)5265 static int ext4_block_group_meta_init(struct super_block *sb, int silent)
5266 {
5267 struct ext4_sb_info *sbi = EXT4_SB(sb);
5268 struct ext4_super_block *es = sbi->s_es;
5269 int has_huge_files;
5270
5271 has_huge_files = ext4_has_feature_huge_file(sb);
5272 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
5273 has_huge_files);
5274 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
5275
5276 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
5277 if (ext4_has_feature_64bit(sb)) {
5278 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
5279 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
5280 !is_power_of_2(sbi->s_desc_size)) {
5281 ext4_msg(sb, KERN_ERR,
5282 "unsupported descriptor size %lu",
5283 sbi->s_desc_size);
5284 return -EINVAL;
5285 }
5286 } else
5287 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
5288
5289 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
5290 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
5291
5292 sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
5293 if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
5294 if (!silent)
5295 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
5296 return -EINVAL;
5297 }
5298 if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
5299 sbi->s_inodes_per_group > sb->s_blocksize * 8) {
5300 ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
5301 sbi->s_inodes_per_group);
5302 return -EINVAL;
5303 }
5304 sbi->s_itb_per_group = sbi->s_inodes_per_group /
5305 sbi->s_inodes_per_block;
5306 sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
5307 sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
5308 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
5309 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
5310
5311 return 0;
5312 }
5313
5314 /*
5315 * It's hard to get stripe aligned blocks if stripe is not aligned with
5316 * cluster, just disable stripe and alert user to simplify code and avoid
5317 * stripe aligned allocation which will rarely succeed.
5318 */
ext4_is_stripe_incompatible(struct super_block * sb,unsigned long stripe)5319 static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe)
5320 {
5321 struct ext4_sb_info *sbi = EXT4_SB(sb);
5322 return (stripe > 0 && sbi->s_cluster_ratio > 1 &&
5323 stripe % sbi->s_cluster_ratio != 0);
5324 }
5325
__ext4_fill_super(struct fs_context * fc,struct super_block * sb)5326 static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
5327 {
5328 struct ext4_super_block *es = NULL;
5329 struct ext4_sb_info *sbi = EXT4_SB(sb);
5330 ext4_fsblk_t logical_sb_block;
5331 struct inode *root;
5332 int needs_recovery;
5333 int err;
5334 ext4_group_t first_not_zeroed;
5335 struct ext4_fs_context *ctx = fc->fs_private;
5336 int silent = fc->sb_flags & SB_SILENT;
5337
5338 /* Set defaults for the variables that will be set during parsing */
5339 if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
5340 ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
5341
5342 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
5343 sbi->s_sectors_written_start =
5344 part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
5345
5346 err = ext4_load_super(sb, &logical_sb_block, silent);
5347 if (err)
5348 goto out_fail;
5349
5350 es = sbi->s_es;
5351 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
5352
5353 err = ext4_init_metadata_csum(sb, es);
5354 if (err)
5355 goto failed_mount;
5356
5357 ext4_set_def_opts(sb, es);
5358
5359 sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es));
5360 sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resuid(es));
5361 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
5362 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
5363 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
5364 sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
5365 sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
5366
5367 /*
5368 * set default s_li_wait_mult for lazyinit, for the case there is
5369 * no mount option specified.
5370 */
5371 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
5372
5373 err = ext4_inode_info_init(sb, es);
5374 if (err)
5375 goto failed_mount;
5376
5377 err = parse_apply_sb_mount_options(sb, ctx);
5378 if (err < 0)
5379 goto failed_mount;
5380
5381 sbi->s_def_mount_opt = sbi->s_mount_opt;
5382 sbi->s_def_mount_opt2 = sbi->s_mount_opt2;
5383
5384 err = ext4_check_opt_consistency(fc, sb);
5385 if (err < 0)
5386 goto failed_mount;
5387
5388 ext4_apply_options(fc, sb);
5389
5390 err = ext4_check_large_folio(sb);
5391 if (err < 0)
5392 goto failed_mount;
5393
5394 err = ext4_encoding_init(sb, es);
5395 if (err)
5396 goto failed_mount;
5397
5398 err = ext4_check_journal_data_mode(sb);
5399 if (err)
5400 goto failed_mount;
5401
5402 sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5403 (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5404
5405 /* HSM events are allowed by default. */
5406 sb->s_iflags |= SB_I_ALLOW_HSM;
5407
5408 err = ext4_check_feature_compatibility(sb, es, silent);
5409 if (err)
5410 goto failed_mount;
5411
5412 err = ext4_block_group_meta_init(sb, silent);
5413 if (err)
5414 goto failed_mount;
5415
5416 err = ext4_hash_info_init(sb);
5417 if (err)
5418 goto failed_mount;
5419
5420 err = ext4_handle_clustersize(sb);
5421 if (err)
5422 goto failed_mount;
5423
5424 err = ext4_check_geometry(sb, es);
5425 if (err)
5426 goto failed_mount;
5427
5428 timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
5429 spin_lock_init(&sbi->s_error_lock);
5430 mutex_init(&sbi->s_error_notify_mutex);
5431 INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
5432
5433 err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
5434 if (err)
5435 goto failed_mount3;
5436
5437 err = ext4_es_register_shrinker(sbi);
5438 if (err)
5439 goto failed_mount3;
5440
5441 sbi->s_stripe = ext4_get_stripe_size(sbi);
5442 if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) {
5443 ext4_msg(sb, KERN_WARNING,
5444 "stripe (%lu) is not aligned with cluster size (%u), "
5445 "stripe is disabled",
5446 sbi->s_stripe, sbi->s_cluster_ratio);
5447 sbi->s_stripe = 0;
5448 }
5449 sbi->s_extent_max_zeroout_kb = 32;
5450
5451 /*
5452 * set up enough so that it can read an inode
5453 */
5454 sb->s_op = &ext4_sops;
5455 sb->s_export_op = &ext4_export_ops;
5456 sb->s_xattr = ext4_xattr_handlers;
5457 #ifdef CONFIG_FS_ENCRYPTION
5458 sb->s_cop = &ext4_cryptops;
5459 #endif
5460 #ifdef CONFIG_FS_VERITY
5461 sb->s_vop = &ext4_verityops;
5462 #endif
5463 #ifdef CONFIG_QUOTA
5464 sb->dq_op = &ext4_quota_operations;
5465 if (ext4_has_feature_quota(sb))
5466 sb->s_qcop = &dquot_quotactl_sysfile_ops;
5467 else
5468 sb->s_qcop = &ext4_qctl_operations;
5469 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
5470 #endif
5471 super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
5472 super_set_sysfs_name_bdev(sb);
5473
5474 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
5475 mutex_init(&sbi->s_orphan_lock);
5476
5477 spin_lock_init(&sbi->s_bdev_wb_lock);
5478
5479 ext4_atomic_write_init(sb);
5480 ext4_fast_commit_init(sb);
5481
5482 sb->s_root = NULL;
5483
5484 needs_recovery = (es->s_last_orphan != 0 ||
5485 ext4_has_feature_orphan_present(sb) ||
5486 ext4_has_feature_journal_needs_recovery(sb));
5487
5488 if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
5489 err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
5490 if (err)
5491 goto failed_mount3a;
5492 }
5493
5494 err = -EINVAL;
5495 /*
5496 * The first inode we look at is the journal inode. Don't try
5497 * root first: it may be modified in the journal!
5498 */
5499 if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
5500 err = ext4_load_and_init_journal(sb, es, ctx);
5501 if (err)
5502 goto failed_mount3a;
5503 if (bdev_read_only(sb->s_bdev))
5504 needs_recovery = 0;
5505 } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
5506 ext4_has_feature_journal_needs_recovery(sb)) {
5507 ext4_msg(sb, KERN_ERR, "required journal recovery "
5508 "suppressed and not mounted read-only");
5509 goto failed_mount3a;
5510 } else {
5511 const char *journal_option;
5512
5513 /* Nojournal mode, all journal mount options are illegal */
5514 journal_option = ext4_has_journal_option(sb);
5515 if (journal_option != NULL) {
5516 ext4_msg(sb, KERN_ERR,
5517 "can't mount with %s, fs mounted w/o journal",
5518 journal_option);
5519 goto failed_mount3a;
5520 }
5521
5522 sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
5523 clear_opt(sb, JOURNAL_CHECKSUM);
5524 clear_opt(sb, DATA_FLAGS);
5525 clear_opt2(sb, JOURNAL_FAST_COMMIT);
5526 sbi->s_journal = NULL;
5527 needs_recovery = 0;
5528 }
5529
5530 if (!test_opt(sb, NO_MBCACHE)) {
5531 sbi->s_ea_block_cache = ext4_xattr_create_cache();
5532 if (!sbi->s_ea_block_cache) {
5533 ext4_msg(sb, KERN_ERR,
5534 "Failed to create ea_block_cache");
5535 err = -EINVAL;
5536 goto failed_mount_wq;
5537 }
5538
5539 if (ext4_has_feature_ea_inode(sb)) {
5540 sbi->s_ea_inode_cache = ext4_xattr_create_cache();
5541 if (!sbi->s_ea_inode_cache) {
5542 ext4_msg(sb, KERN_ERR,
5543 "Failed to create ea_inode_cache");
5544 err = -EINVAL;
5545 goto failed_mount_wq;
5546 }
5547 }
5548 }
5549
5550 /*
5551 * Get the # of file system overhead blocks from the
5552 * superblock if present.
5553 */
5554 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
5555 /* ignore the precalculated value if it is ridiculous */
5556 if (sbi->s_overhead > ext4_blocks_count(es))
5557 sbi->s_overhead = 0;
5558 /*
5559 * If the bigalloc feature is not enabled recalculating the
5560 * overhead doesn't take long, so we might as well just redo
5561 * it to make sure we are using the correct value.
5562 */
5563 if (!ext4_has_feature_bigalloc(sb))
5564 sbi->s_overhead = 0;
5565 if (sbi->s_overhead == 0) {
5566 err = ext4_calculate_overhead(sb);
5567 if (err)
5568 goto failed_mount_wq;
5569 }
5570
5571 /*
5572 * The maximum number of concurrent works can be high and
5573 * concurrency isn't really necessary. Limit it to 1.
5574 */
5575 EXT4_SB(sb)->rsv_conversion_wq =
5576 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
5577 if (!EXT4_SB(sb)->rsv_conversion_wq) {
5578 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
5579 err = -ENOMEM;
5580 goto failed_mount4;
5581 }
5582
5583 /*
5584 * The jbd2_journal_load will have done any necessary log recovery,
5585 * so we can safely mount the rest of the filesystem now.
5586 */
5587
5588 root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
5589 if (IS_ERR(root)) {
5590 ext4_msg(sb, KERN_ERR, "get root inode failed");
5591 err = PTR_ERR(root);
5592 root = NULL;
5593 goto failed_mount4;
5594 }
5595 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
5596 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
5597 iput(root);
5598 err = -EFSCORRUPTED;
5599 goto failed_mount4;
5600 }
5601
5602 generic_set_sb_d_ops(sb);
5603 sb->s_root = d_make_root(root);
5604 if (!sb->s_root) {
5605 ext4_msg(sb, KERN_ERR, "get root dentry failed");
5606 err = -ENOMEM;
5607 goto failed_mount4;
5608 }
5609
5610 err = ext4_setup_super(sb, es, sb_rdonly(sb));
5611 if (err == -EROFS) {
5612 sb->s_flags |= SB_RDONLY;
5613 } else if (err)
5614 goto failed_mount4a;
5615
5616 ext4_set_resv_clusters(sb);
5617
5618 if (test_opt(sb, BLOCK_VALIDITY)) {
5619 err = ext4_setup_system_zone(sb);
5620 if (err) {
5621 ext4_msg(sb, KERN_ERR, "failed to initialize system "
5622 "zone (%d)", err);
5623 goto failed_mount4a;
5624 }
5625 }
5626 ext4_fc_replay_cleanup(sb);
5627
5628 ext4_ext_init(sb);
5629
5630 /*
5631 * Enable optimize_scan if number of groups is > threshold. This can be
5632 * turned off by passing "mb_optimize_scan=0". This can also be
5633 * turned on forcefully by passing "mb_optimize_scan=1".
5634 */
5635 if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
5636 if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
5637 set_opt2(sb, MB_OPTIMIZE_SCAN);
5638 else
5639 clear_opt2(sb, MB_OPTIMIZE_SCAN);
5640 }
5641
5642 err = ext4_percpu_param_init(sbi);
5643 if (err)
5644 goto failed_mount5;
5645
5646 err = ext4_mb_init(sb);
5647 if (err) {
5648 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
5649 err);
5650 goto failed_mount5;
5651 }
5652
5653 /*
5654 * We can only set up the journal commit callback once
5655 * mballoc is initialized
5656 */
5657 if (sbi->s_journal)
5658 sbi->s_journal->j_commit_callback =
5659 ext4_journal_commit_callback;
5660
5661 if (ext4_has_feature_flex_bg(sb))
5662 if (!ext4_fill_flex_info(sb)) {
5663 ext4_msg(sb, KERN_ERR,
5664 "unable to initialize "
5665 "flex_bg meta info!");
5666 err = -ENOMEM;
5667 goto failed_mount6;
5668 }
5669
5670 err = ext4_register_li_request(sb, first_not_zeroed);
5671 if (err)
5672 goto failed_mount6;
5673
5674 err = ext4_init_orphan_info(sb);
5675 if (err)
5676 goto failed_mount7;
5677 #ifdef CONFIG_QUOTA
5678 /* Enable quota usage during mount. */
5679 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
5680 err = ext4_enable_quotas(sb);
5681 if (err)
5682 goto failed_mount8;
5683 }
5684 #endif /* CONFIG_QUOTA */
5685
5686 /*
5687 * Save the original bdev mapping's wb_err value which could be
5688 * used to detect the metadata async write error.
5689 */
5690 errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
5691 &sbi->s_bdev_wb_err);
5692 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
5693 ext4_orphan_cleanup(sb, es);
5694 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
5695 /*
5696 * Update the checksum after updating free space/inode counters and
5697 * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
5698 * checksum in the buffer cache until it is written out and
5699 * e2fsprogs programs trying to open a file system immediately
5700 * after it is mounted can fail.
5701 */
5702 ext4_superblock_csum_set(sb);
5703 if (needs_recovery) {
5704 ext4_msg(sb, KERN_INFO, "recovery complete");
5705 err = ext4_mark_recovery_complete(sb, es);
5706 if (err)
5707 goto failed_mount9;
5708 }
5709
5710 if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
5711 ext4_msg(sb, KERN_WARNING,
5712 "mounting with \"discard\" option, but the device does not support discard");
5713 clear_opt(sb, DISCARD);
5714 }
5715
5716 if (es->s_error_count) {
5717 sbi->s_err_report_sec = 5*60; /* first time 5 minutes */
5718 mod_timer(&sbi->s_err_report,
5719 jiffies + secs_to_jiffies(sbi->s_err_report_sec));
5720 }
5721 sbi->s_err_report_sec = 24*60*60; /* Once a day */
5722
5723 /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
5724 ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
5725 ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
5726 ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
5727 atomic_set(&sbi->s_warning_count, 0);
5728 atomic_set(&sbi->s_msg_count, 0);
5729
5730 /* Register sysfs after all initializations are complete. */
5731 err = ext4_register_sysfs(sb);
5732 if (err)
5733 goto failed_mount9;
5734
5735 return 0;
5736
5737 failed_mount9:
5738 ext4_quotas_off(sb, EXT4_MAXQUOTAS);
5739 failed_mount8: __maybe_unused
5740 ext4_release_orphan_info(sb);
5741 failed_mount7:
5742 ext4_unregister_li_request(sb);
5743 failed_mount6:
5744 ext4_mb_release(sb);
5745 ext4_flex_groups_free(sbi);
5746 failed_mount5:
5747 ext4_percpu_param_destroy(sbi);
5748 ext4_ext_release(sb);
5749 ext4_release_system_zone(sb);
5750 failed_mount4a:
5751 dput(sb->s_root);
5752 sb->s_root = NULL;
5753 failed_mount4:
5754 ext4_msg(sb, KERN_ERR, "mount failed");
5755 if (EXT4_SB(sb)->rsv_conversion_wq)
5756 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
5757 failed_mount_wq:
5758 ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
5759 sbi->s_ea_inode_cache = NULL;
5760
5761 ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
5762 sbi->s_ea_block_cache = NULL;
5763
5764 if (sbi->s_journal) {
5765 ext4_journal_destroy(sbi, sbi->s_journal);
5766 }
5767 failed_mount3a:
5768 ext4_es_unregister_shrinker(sbi);
5769 failed_mount3:
5770 /* flush s_sb_upd_work before sbi destroy */
5771 flush_work(&sbi->s_sb_upd_work);
5772 ext4_stop_mmpd(sbi);
5773 timer_delete_sync(&sbi->s_err_report);
5774 ext4_group_desc_free(sbi);
5775 failed_mount:
5776 #if IS_ENABLED(CONFIG_UNICODE)
5777 utf8_unload(sb->s_encoding);
5778 #endif
5779
5780 #ifdef CONFIG_QUOTA
5781 for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
5782 kfree(get_qf_name(sb, sbi, i));
5783 #endif
5784 fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
5785 brelse(sbi->s_sbh);
5786 if (sbi->s_journal_bdev_file) {
5787 invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
5788 bdev_fput(sbi->s_journal_bdev_file);
5789 }
5790 out_fail:
5791 invalidate_bdev(sb->s_bdev);
5792 sb->s_fs_info = NULL;
5793 return err;
5794 }
5795
ext4_fill_super(struct super_block * sb,struct fs_context * fc)5796 static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
5797 {
5798 struct ext4_fs_context *ctx = fc->fs_private;
5799 struct ext4_sb_info *sbi;
5800 const char *descr;
5801 int ret;
5802
5803 sbi = ext4_alloc_sbi(sb);
5804 if (!sbi)
5805 return -ENOMEM;
5806
5807 fc->s_fs_info = sbi;
5808
5809 /* Cleanup superblock name */
5810 strreplace(sb->s_id, '/', '!');
5811
5812 sbi->s_sb_block = 1; /* Default super block location */
5813 if (ctx->spec & EXT4_SPEC_s_sb_block)
5814 sbi->s_sb_block = ctx->s_sb_block;
5815
5816 ret = __ext4_fill_super(fc, sb);
5817 if (ret < 0)
5818 goto free_sbi;
5819
5820 if (sbi->s_journal) {
5821 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
5822 descr = " journalled data mode";
5823 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
5824 descr = " ordered data mode";
5825 else
5826 descr = " writeback data mode";
5827 } else
5828 descr = "out journal";
5829
5830 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
5831 ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
5832 "Quota mode: %s.", &sb->s_uuid,
5833 sb_rdonly(sb) ? "ro" : "r/w", descr,
5834 ext4_quota_mode(sb));
5835
5836 /* Update the s_overhead_clusters if necessary */
5837 ext4_update_overhead(sb, false);
5838 return 0;
5839
5840 free_sbi:
5841 ext4_free_sbi(sbi);
5842 fc->s_fs_info = NULL;
5843 return ret;
5844 }
5845
ext4_get_tree(struct fs_context * fc)5846 static int ext4_get_tree(struct fs_context *fc)
5847 {
5848 return get_tree_bdev(fc, ext4_fill_super);
5849 }
5850
5851 /*
5852 * Setup any per-fs journal parameters now. We'll do this both on
5853 * initial mount, once the journal has been initialised but before we've
5854 * done any recovery; and again on any subsequent remount.
5855 */
ext4_init_journal_params(struct super_block * sb,journal_t * journal)5856 static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
5857 {
5858 struct ext4_sb_info *sbi = EXT4_SB(sb);
5859
5860 journal->j_commit_interval = sbi->s_commit_interval;
5861 journal->j_min_batch_time = sbi->s_min_batch_time;
5862 journal->j_max_batch_time = sbi->s_max_batch_time;
5863 ext4_fc_init(sb, journal);
5864
5865 write_lock(&journal->j_state_lock);
5866 if (test_opt(sb, BARRIER))
5867 journal->j_flags |= JBD2_BARRIER;
5868 else
5869 journal->j_flags &= ~JBD2_BARRIER;
5870 /*
5871 * Always enable journal cycle record option, letting the journal
5872 * records log transactions continuously between each mount.
5873 */
5874 journal->j_flags |= JBD2_CYCLE_RECORD;
5875 write_unlock(&journal->j_state_lock);
5876 }
5877
ext4_get_journal_inode(struct super_block * sb,unsigned int journal_inum)5878 static struct inode *ext4_get_journal_inode(struct super_block *sb,
5879 unsigned int journal_inum)
5880 {
5881 struct inode *journal_inode;
5882
5883 /*
5884 * Test for the existence of a valid inode on disk. Bad things
5885 * happen if we iget() an unused inode, as the subsequent iput()
5886 * will try to delete it.
5887 */
5888 journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
5889 if (IS_ERR(journal_inode)) {
5890 ext4_msg(sb, KERN_ERR, "no journal found");
5891 return ERR_CAST(journal_inode);
5892 }
5893 if (!journal_inode->i_nlink) {
5894 make_bad_inode(journal_inode);
5895 iput(journal_inode);
5896 ext4_msg(sb, KERN_ERR, "journal inode is deleted");
5897 return ERR_PTR(-EFSCORRUPTED);
5898 }
5899 if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
5900 ext4_msg(sb, KERN_ERR, "invalid journal inode");
5901 iput(journal_inode);
5902 return ERR_PTR(-EFSCORRUPTED);
5903 }
5904
5905 ext4_debug("Journal inode found at %p: %lld bytes\n",
5906 journal_inode, journal_inode->i_size);
5907 return journal_inode;
5908 }
5909
/*
 * Journal block mapping callback (installed as journal->j_bmap): translate
 * the logical journal block in *@block into a physical block number.
 *
 * If the journal has no inode there is nothing to translate and 0 is
 * returned with *@block untouched.  Otherwise the block is looked up with
 * ext4_map_blocks(); on success *@block is replaced by the physical block
 * and 0 is returned.
 *
 * If the lookup fails or finds no mapping (ext4_map_blocks() returned 0)
 * the journal is aborted and a negative error is returned.  An unmapped
 * block is reported as -EFSCORRUPTED so that the caller can never mistake
 * the still-logical value left in *@block for a valid physical block.
 */
static int ext4_journal_bmap(journal_t *journal, sector_t *block)
{
	struct ext4_map_blocks map;
	int ret;

	if (journal->j_inode == NULL)
		return 0;

	map.m_lblk = *block;
	map.m_len = 1;
	ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
	if (ret <= 0) {
		ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
			 "journal bmap failed: block %llu ret %d\n",
			 *block, ret);
		/*
		 * ret == 0 means no block is mapped here.  Turn that into a
		 * real error: returning 0 would signal success while *block
		 * has not been translated.
		 */
		if (!ret)
			ret = -EFSCORRUPTED;
		jbd2_journal_abort(journal, ret);
		return ret;
	}
	*block = map.m_pblk;
	return 0;
}
5931
ext4_open_inode_journal(struct super_block * sb,unsigned int journal_inum)5932 static journal_t *ext4_open_inode_journal(struct super_block *sb,
5933 unsigned int journal_inum)
5934 {
5935 struct inode *journal_inode;
5936 journal_t *journal;
5937
5938 journal_inode = ext4_get_journal_inode(sb, journal_inum);
5939 if (IS_ERR(journal_inode))
5940 return ERR_CAST(journal_inode);
5941
5942 journal = jbd2_journal_init_inode(journal_inode);
5943 if (IS_ERR(journal)) {
5944 ext4_msg(sb, KERN_ERR, "Could not load journal inode");
5945 iput(journal_inode);
5946 return ERR_CAST(journal);
5947 }
5948 journal->j_private = sb;
5949 journal->j_bmap = ext4_journal_bmap;
5950 ext4_init_journal_params(sb, journal);
5951 return journal;
5952 }
5953
/*
 * Open the external journal device @j_dev for @sb and validate the ext4
 * superblock found on it.
 *
 * The device is opened read/write with restricted writes.  Its superblock
 * is read using the filesystem's block size and must carry the ext4 magic
 * and the JOURNAL_DEV incompat feature, have a correct checksum when
 * metadata checksums are enabled on it, and have a UUID equal to the
 * journal UUID recorded in the filesystem's own superblock.
 *
 * On success the opened file is returned, *@j_start is set to the block
 * following the journal device's superblock and *@j_len to its block count.
 * On failure an ERR_PTR is returned (-EINVAL or -EFSCORRUPTED, or the open
 * error) and the device is released again.
 */
static struct file *ext4_get_journal_blkdev(struct super_block *sb,
					dev_t j_dev, ext4_fsblk_t *j_start,
					ext4_fsblk_t *j_len)
{
	struct file *bdev_file;
	struct block_device *bdev;
	struct buffer_head *bh;
	struct ext4_super_block *es;
	ext4_fsblk_t sb_block;
	unsigned long offset;
	int blocksize, hblock;
	int err;

	bdev_file = bdev_file_open_by_dev(j_dev,
		BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
		sb, &fs_holder_ops);
	if (IS_ERR(bdev_file)) {
		ext4_msg(sb, KERN_ERR,
			 "failed to open journal device unknown-block(%u,%u) %ld",
			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
		return bdev_file;
	}

	bdev = file_bdev(bdev_file);
	blocksize = sb->s_blocksize;
	hblock = bdev_logical_block_size(bdev);

	/* The two failures below are both reported as -EINVAL */
	err = -EINVAL;
	if (hblock > blocksize) {
		ext4_msg(sb, KERN_ERR,
			"blocksize too small for journal device");
		goto out_bdev;
	}

	/*
	 * The superblock sits EXT4_MIN_BLOCK_SIZE bytes into the device;
	 * work out which block that is and the offset inside that block.
	 */
	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
	set_blocksize(bdev_file, blocksize);
	bh = __bread(bdev, sb_block, blocksize);
	if (!bh) {
		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
		       "external journal");
		goto out_bdev;
	}

	/* Every validation failure from here on is -EFSCORRUPTED */
	err = -EFSCORRUPTED;
	es = (struct ext4_super_block *) (bh->b_data + offset);

	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
	    !(le32_to_cpu(es->s_feature_incompat) &
	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
		ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
		goto out_bh;
	}

	if ((le32_to_cpu(es->s_feature_ro_compat) &
	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
	    es->s_checksum != ext4_superblock_csum(es)) {
		ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
		goto out_bh;
	}

	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
		goto out_bh;
	}

	*j_start = sb_block + 1;
	*j_len = ext4_blocks_count(es);
	brelse(bh);
	return bdev_file;

out_bh:
	brelse(bh);
out_bdev:
	bdev_fput(bdev_file);
	return ERR_PTR(err);
}
6032
ext4_open_dev_journal(struct super_block * sb,dev_t j_dev)6033 static journal_t *ext4_open_dev_journal(struct super_block *sb,
6034 dev_t j_dev)
6035 {
6036 journal_t *journal;
6037 ext4_fsblk_t j_start;
6038 ext4_fsblk_t j_len;
6039 struct file *bdev_file;
6040 int errno = 0;
6041
6042 bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
6043 if (IS_ERR(bdev_file))
6044 return ERR_CAST(bdev_file);
6045
6046 journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
6047 j_len, sb->s_blocksize);
6048 if (IS_ERR(journal)) {
6049 ext4_msg(sb, KERN_ERR, "failed to create device journal");
6050 errno = PTR_ERR(journal);
6051 goto out_bdev;
6052 }
6053 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
6054 ext4_msg(sb, KERN_ERR, "External journal has more than one "
6055 "user (unsupported) - %d",
6056 be32_to_cpu(journal->j_superblock->s_nr_users));
6057 errno = -EINVAL;
6058 goto out_journal;
6059 }
6060 journal->j_private = sb;
6061 EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
6062 ext4_init_journal_params(sb, journal);
6063 return journal;
6064
6065 out_journal:
6066 ext4_journal_destroy(EXT4_SB(sb), journal);
6067 out_bdev:
6068 bdev_fput(bdev_file);
6069 return ERR_PTR(errno);
6070 }
6071
/*
 * Open and load the journal described by the on-disk superblock @es.
 *
 * @journal_devnum, when non-zero, is an encoded device number that takes
 * precedence over the journal device recorded in @es.
 *
 * The journal is opened either from the journal inode or from the journal
 * device (having both is rejected with -EINVAL), read-only constraints are
 * checked, and the journal is then wiped (when no recovery is needed) and
 * loaded.  On success the journal is stored in the sb info, any error
 * recorded in the journal is transferred to the superblock, and a changed
 * journal device number is written back when write access is available.
 *
 * Returns 0 on success or a negative error; on failure after the journal
 * has been opened it is destroyed again.
 */
static int ext4_load_journal(struct super_block *sb,
			     struct ext4_super_block *es,
			     unsigned long journal_devnum)
{
	journal_t *journal;
	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
	dev_t journal_dev;
	int err = 0;
	int really_read_only;
	int journal_dev_ro;

	if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
		return -EFSCORRUPTED;

	/* A caller-supplied device number overrides the one stored on disk */
	if (journal_devnum &&
	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
			"numbers have changed");
		journal_dev = new_decode_dev(journal_devnum);
	} else
		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));

	if (journal_inum && journal_dev) {
		ext4_msg(sb, KERN_ERR,
			 "filesystem has both journal inode and journal device!");
		return -EINVAL;
	}

	if (journal_inum) {
		journal = ext4_open_inode_journal(sb, journal_inum);
		if (IS_ERR(journal))
			return PTR_ERR(journal);
	} else {
		journal = ext4_open_dev_journal(sb, journal_dev);
		if (IS_ERR(journal))
			return PTR_ERR(journal);
	}

	/*
	 * really_read_only is set when either the filesystem device or the
	 * journal device is read-only at the block layer, as opposed to the
	 * mount merely being requested read-only.
	 */
	journal_dev_ro = bdev_read_only(journal->j_dev);
	really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;

	if (journal_dev_ro && !sb_rdonly(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "journal device read-only, try mounting with '-o ro'");
		err = -EROFS;
		goto err_out;
	}

	/*
	 * Are we loading a blank journal or performing recovery after a
	 * crash?  For recovery, we need to check in advance whether we
	 * can get read-write access to the device.
	 */
	if (ext4_has_feature_journal_needs_recovery(sb)) {
		if (sb_rdonly(sb)) {
			ext4_msg(sb, KERN_INFO, "INFO: recovery "
					"required on readonly filesystem");
			if (really_read_only) {
				ext4_msg(sb, KERN_ERR, "write access "
					"unavailable, cannot proceed "
					"(try mounting with noload)");
				err = -EROFS;
				goto err_out;
			}
			ext4_msg(sb, KERN_INFO, "write access will "
			       "be enabled during recovery");
		}
	}

	if (!(journal->j_flags & JBD2_BARRIER))
		ext4_msg(sb, KERN_INFO, "barriers disabled");

	/* No recovery pending: wipe the journal before loading it */
	if (!ext4_has_feature_journal_needs_recovery(sb))
		err = jbd2_journal_wipe(journal, !really_read_only);
	if (!err) {
		/*
		 * Snapshot the error region of the superblock
		 * (EXT4_S_ERR_LEN bytes starting at EXT4_S_ERR_START) so it
		 * can be put back if it differs after the journal load.  If
		 * the allocation fails the snapshot is simply skipped.
		 */
		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
		__le16 orig_state;
		bool changed = false;

		if (save)
			memcpy(save, ((char *) es) +
			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
		err = jbd2_journal_load(journal);
		if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
				   save, EXT4_S_ERR_LEN)) {
			memcpy(((char *) es) + EXT4_S_ERR_START,
			       save, EXT4_S_ERR_LEN);
			changed = true;
		}
		kfree(save);
		/* Carry the in-memory error flag over to the on-disk state */
		orig_state = es->s_state;
		es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
					   EXT4_ERROR_FS);
		if (orig_state != es->s_state)
			changed = true;
		/* Write out restored error information to the superblock */
		if (changed && !really_read_only) {
			int err2;
			err2 = ext4_commit_super(sb);
			/* A journal load error wins over a commit error */
			err = err ? : err2;
		}
	}

	if (err) {
		ext4_msg(sb, KERN_ERR, "error loading journal");
		goto err_out;
	}

	EXT4_SB(sb)->s_journal = journal;
	err = ext4_clear_journal_err(sb, es);
	if (err) {
		ext4_journal_destroy(EXT4_SB(sb), journal);
		return err;
	}

	/* Record a changed journal device number when we are able to write */
	if (!really_read_only && journal_devnum &&
	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
		es->s_journal_dev = cpu_to_le32(journal_devnum);
		ext4_commit_super(sb);
	}
	if (!really_read_only && journal_inum &&
	    journal_inum != le32_to_cpu(es->s_journal_inum)) {
		es->s_journal_inum = cpu_to_le32(journal_inum);
		ext4_commit_super(sb);
	}

	return 0;

err_out:
	ext4_journal_destroy(EXT4_SB(sb), journal);
	return err;
}
6204
6205 /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */
ext4_update_super(struct super_block * sb)6206 static void ext4_update_super(struct super_block *sb)
6207 {
6208 struct ext4_sb_info *sbi = EXT4_SB(sb);
6209 struct ext4_super_block *es = sbi->s_es;
6210 struct buffer_head *sbh = sbi->s_sbh;
6211
6212 lock_buffer(sbh);
6213 /*
6214 * If the file system is mounted read-only, don't update the
6215 * superblock write time. This avoids updating the superblock
6216 * write time when we are mounting the root file system
6217 * read/only but we need to replay the journal; at that point,
6218 * for people who are east of GMT and who make their clock
6219 * tick in localtime for Windows bug-for-bug compatibility,
6220 * the clock is set in the future, and this will cause e2fsck
6221 * to complain and force a full file system check.
6222 */
6223 if (!sb_rdonly(sb))
6224 ext4_update_tstamp(es, s_wtime);
6225 es->s_kbytes_written =
6226 cpu_to_le64(sbi->s_kbytes_written +
6227 ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
6228 sbi->s_sectors_written_start) >> 1));
6229 if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
6230 ext4_free_blocks_count_set(es,
6231 EXT4_C2B(sbi, percpu_counter_sum_positive(
6232 &sbi->s_freeclusters_counter)));
6233 if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
6234 es->s_free_inodes_count =
6235 cpu_to_le32(percpu_counter_sum_positive(
6236 &sbi->s_freeinodes_counter));
6237 /* Copy error information to the on-disk superblock */
6238 spin_lock(&sbi->s_error_lock);
6239 if (sbi->s_add_error_count > 0) {
6240 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6241 if (!es->s_first_error_time && !es->s_first_error_time_hi) {
6242 __ext4_update_tstamp(&es->s_first_error_time,
6243 &es->s_first_error_time_hi,
6244 sbi->s_first_error_time);
6245 strtomem_pad(es->s_first_error_func,
6246 sbi->s_first_error_func, 0);
6247 es->s_first_error_line =
6248 cpu_to_le32(sbi->s_first_error_line);
6249 es->s_first_error_ino =
6250 cpu_to_le32(sbi->s_first_error_ino);
6251 es->s_first_error_block =
6252 cpu_to_le64(sbi->s_first_error_block);
6253 es->s_first_error_errcode =
6254 ext4_errno_to_code(sbi->s_first_error_code);
6255 }
6256 __ext4_update_tstamp(&es->s_last_error_time,
6257 &es->s_last_error_time_hi,
6258 sbi->s_last_error_time);
6259 strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
6260 es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
6261 es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
6262 es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
6263 es->s_last_error_errcode =
6264 ext4_errno_to_code(sbi->s_last_error_code);
6265 /*
6266 * Start the daily error reporting function if it hasn't been
6267 * started already and sbi->s_err_report_sec is not zero
6268 */
6269 if (!es->s_error_count && !sbi->s_err_report_sec)
6270 mod_timer(&sbi->s_err_report,
6271 jiffies + secs_to_jiffies(sbi->s_err_report_sec));
6272 le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
6273 sbi->s_add_error_count = 0;
6274 }
6275 spin_unlock(&sbi->s_error_lock);
6276
6277 ext4_superblock_csum_set(sb);
6278 unlock_buffer(sbh);
6279 }
6280
ext4_commit_super(struct super_block * sb)6281 static int ext4_commit_super(struct super_block *sb)
6282 {
6283 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
6284
6285 if (!sbh)
6286 return -EINVAL;
6287
6288 ext4_update_super(sb);
6289
6290 lock_buffer(sbh);
6291 /* Buffer got discarded which means block device got invalidated */
6292 if (!buffer_mapped(sbh)) {
6293 unlock_buffer(sbh);
6294 return -EIO;
6295 }
6296
6297 if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
6298 /*
6299 * Oh, dear. A previous attempt to write the
6300 * superblock failed. This could happen because the
6301 * USB device was yanked out. Or it could happen to
6302 * be a transient write error and maybe the block will
6303 * be remapped. Nothing we can do but to retry the
6304 * write and hope for the best.
6305 */
6306 ext4_msg(sb, KERN_ERR, "previous I/O error to "
6307 "superblock detected");
6308 clear_buffer_write_io_error(sbh);
6309 set_buffer_uptodate(sbh);
6310 }
6311 get_bh(sbh);
6312 /* Clear potential dirty bit if it was journalled update */
6313 clear_buffer_dirty(sbh);
6314 sbh->b_end_io = end_buffer_write_sync;
6315 submit_bh(REQ_OP_WRITE | REQ_SYNC |
6316 (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
6317 wait_on_buffer(sbh);
6318 if (buffer_write_io_error(sbh)) {
6319 ext4_msg(sb, KERN_ERR, "I/O error while writing "
6320 "superblock");
6321 clear_buffer_write_io_error(sbh);
6322 set_buffer_uptodate(sbh);
6323 return -EIO;
6324 }
6325 return 0;
6326 }
6327
6328 /*
6329 * Have we just finished recovery? If so, and if we are mounting (or
6330 * remounting) the filesystem readonly, then we will end up with a
6331 * consistent fs on disk. Record that fact.
6332 */
ext4_mark_recovery_complete(struct super_block * sb,struct ext4_super_block * es)6333 static int ext4_mark_recovery_complete(struct super_block *sb,
6334 struct ext4_super_block *es)
6335 {
6336 int err;
6337 journal_t *journal = EXT4_SB(sb)->s_journal;
6338
6339 if (!ext4_has_feature_journal(sb)) {
6340 if (journal != NULL) {
6341 ext4_error(sb, "Journal got removed while the fs was "
6342 "mounted!");
6343 return -EFSCORRUPTED;
6344 }
6345 return 0;
6346 }
6347 jbd2_journal_lock_updates(journal);
6348 err = jbd2_journal_flush(journal, 0);
6349 if (err < 0)
6350 goto out;
6351
6352 if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
6353 ext4_has_feature_orphan_present(sb))) {
6354 if (!ext4_orphan_file_empty(sb)) {
6355 ext4_error(sb, "Orphan file not empty on read-only fs.");
6356 err = -EFSCORRUPTED;
6357 goto out;
6358 }
6359 ext4_clear_feature_journal_needs_recovery(sb);
6360 ext4_clear_feature_orphan_present(sb);
6361 ext4_commit_super(sb);
6362 }
6363 out:
6364 jbd2_journal_unlock_updates(journal);
6365 return err;
6366 }
6367
6368 /*
6369 * If we are mounting (or read-write remounting) a filesystem whose journal
6370 * has recorded an error from a previous lifetime, move that error to the
6371 * main filesystem now.
6372 */
ext4_clear_journal_err(struct super_block * sb,struct ext4_super_block * es)6373 static int ext4_clear_journal_err(struct super_block *sb,
6374 struct ext4_super_block *es)
6375 {
6376 journal_t *journal;
6377 int j_errno;
6378 const char *errstr;
6379
6380 if (!ext4_has_feature_journal(sb)) {
6381 ext4_error(sb, "Journal got removed while the fs was mounted!");
6382 return -EFSCORRUPTED;
6383 }
6384
6385 journal = EXT4_SB(sb)->s_journal;
6386
6387 /*
6388 * Now check for any error status which may have been recorded in the
6389 * journal by a prior ext4_error() or ext4_abort()
6390 */
6391
6392 j_errno = jbd2_journal_errno(journal);
6393 if (j_errno) {
6394 char nbuf[16];
6395
6396 errstr = ext4_decode_error(sb, j_errno, nbuf);
6397 ext4_warning(sb, "Filesystem error recorded "
6398 "from previous mount: %s", errstr);
6399
6400 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
6401 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
6402 j_errno = ext4_commit_super(sb);
6403 if (j_errno)
6404 return j_errno;
6405 ext4_warning(sb, "Marked fs in need of filesystem check.");
6406
6407 jbd2_journal_clear_err(journal);
6408 jbd2_journal_update_sb_errno(journal);
6409 }
6410 return 0;
6411 }
6412
6413 /*
6414 * Force the running and committing transactions to commit,
6415 * and wait on the commit.
6416 */
ext4_force_commit(struct super_block * sb)6417 int ext4_force_commit(struct super_block *sb)
6418 {
6419 return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
6420 }
6421
ext4_sync_fs(struct super_block * sb,int wait)6422 static int ext4_sync_fs(struct super_block *sb, int wait)
6423 {
6424 int ret = 0;
6425 tid_t target;
6426 bool needs_barrier = false;
6427 struct ext4_sb_info *sbi = EXT4_SB(sb);
6428
6429 ret = ext4_emergency_state(sb);
6430 if (unlikely(ret))
6431 return ret;
6432
6433 trace_ext4_sync_fs(sb, wait);
6434 flush_workqueue(sbi->rsv_conversion_wq);
6435 /*
6436 * Writeback quota in non-journalled quota case - journalled quota has
6437 * no dirty dquots
6438 */
6439 dquot_writeback_dquots(sb, -1);
6440 /*
6441 * Data writeback is possible w/o journal transaction, so barrier must
6442 * being sent at the end of the function. But we can skip it if
6443 * transaction_commit will do it for us.
6444 */
6445 if (sbi->s_journal) {
6446 target = jbd2_get_latest_transaction(sbi->s_journal);
6447 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
6448 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
6449 needs_barrier = true;
6450
6451 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
6452 if (wait)
6453 ret = jbd2_log_wait_commit(sbi->s_journal,
6454 target);
6455 }
6456 } else if (wait && test_opt(sb, BARRIER))
6457 needs_barrier = true;
6458 if (needs_barrier) {
6459 int err;
6460 err = blkdev_issue_flush(sb->s_bdev);
6461 if (!ret)
6462 ret = err;
6463 }
6464
6465 return ret;
6466 }
6467
6468 /*
6469 * LVM calls this function before a (read-only) snapshot is created. This
6470 * gives us a chance to flush the journal completely and mark the fs clean.
6471 *
6472 * Note that only this function cannot bring a filesystem to be in a clean
6473 * state independently. It relies on upper layer to stop all data & metadata
6474 * modifications.
6475 */
ext4_freeze(struct super_block * sb)6476 static int ext4_freeze(struct super_block *sb)
6477 {
6478 int error = 0;
6479 journal_t *journal = EXT4_SB(sb)->s_journal;
6480
6481 if (journal) {
6482 /* Now we set up the journal barrier. */
6483 jbd2_journal_lock_updates(journal);
6484
6485 /*
6486 * Don't clear the needs_recovery flag if we failed to
6487 * flush the journal.
6488 */
6489 error = jbd2_journal_flush(journal, 0);
6490 if (error < 0)
6491 goto out;
6492
6493 /* Journal blocked and flushed, clear needs_recovery flag. */
6494 ext4_clear_feature_journal_needs_recovery(sb);
6495 if (ext4_orphan_file_empty(sb))
6496 ext4_clear_feature_orphan_present(sb);
6497 }
6498
6499 error = ext4_commit_super(sb);
6500 out:
6501 if (journal)
6502 /* we rely on upper layer to stop further updates */
6503 jbd2_journal_unlock_updates(journal);
6504 return error;
6505 }
6506
6507 /*
6508 * Called by LVM after the snapshot is done. We need to reset the RECOVER
6509 * flag here, even though the filesystem is not technically dirty yet.
6510 */
ext4_unfreeze(struct super_block * sb)6511 static int ext4_unfreeze(struct super_block *sb)
6512 {
6513 if (ext4_emergency_state(sb))
6514 return 0;
6515
6516 if (EXT4_SB(sb)->s_journal) {
6517 /* Reset the needs_recovery flag before the fs is unlocked. */
6518 ext4_set_feature_journal_needs_recovery(sb);
6519 if (ext4_has_feature_orphan_file(sb))
6520 ext4_set_feature_orphan_present(sb);
6521 }
6522
6523 ext4_commit_super(sb);
6524 return 0;
6525 }
6526
6527 /*
6528 * Structure to save mount options for ext4_remount's benefit
6529 */
6530 struct ext4_mount_options {
6531 unsigned long s_mount_opt;
6532 unsigned long s_mount_opt2;
6533 kuid_t s_resuid;
6534 kgid_t s_resgid;
6535 unsigned long s_commit_interval;
6536 u32 s_min_batch_time, s_max_batch_time;
6537 #ifdef CONFIG_QUOTA
6538 int s_jquota_fmt;
6539 char *s_qf_names[EXT4_MAXQUOTAS];
6540 #endif
6541 };
6542
__ext4_remount(struct fs_context * fc,struct super_block * sb)6543 static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
6544 {
6545 struct ext4_fs_context *ctx = fc->fs_private;
6546 struct ext4_super_block *es;
6547 struct ext4_sb_info *sbi = EXT4_SB(sb);
6548 unsigned long old_sb_flags;
6549 struct ext4_mount_options old_opts;
6550 ext4_group_t g;
6551 int err = 0;
6552 int alloc_ctx;
6553 #ifdef CONFIG_QUOTA
6554 int enable_quota = 0;
6555 int i, j;
6556 char *to_free[EXT4_MAXQUOTAS];
6557 #endif
6558
6559
6560 /* Store the original options */
6561 old_sb_flags = sb->s_flags;
6562 old_opts.s_mount_opt = sbi->s_mount_opt;
6563 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
6564 old_opts.s_resuid = sbi->s_resuid;
6565 old_opts.s_resgid = sbi->s_resgid;
6566 old_opts.s_commit_interval = sbi->s_commit_interval;
6567 old_opts.s_min_batch_time = sbi->s_min_batch_time;
6568 old_opts.s_max_batch_time = sbi->s_max_batch_time;
6569 #ifdef CONFIG_QUOTA
6570 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
6571 for (i = 0; i < EXT4_MAXQUOTAS; i++)
6572 if (sbi->s_qf_names[i]) {
6573 char *qf_name = get_qf_name(sb, sbi, i);
6574
6575 old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
6576 if (!old_opts.s_qf_names[i]) {
6577 for (j = 0; j < i; j++)
6578 kfree(old_opts.s_qf_names[j]);
6579 return -ENOMEM;
6580 }
6581 } else
6582 old_opts.s_qf_names[i] = NULL;
6583 #endif
6584 if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
6585 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
6586 ctx->journal_ioprio =
6587 sbi->s_journal->j_task->io_context->ioprio;
6588 else
6589 ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
6590
6591 }
6592
6593 if ((ctx->spec & EXT4_SPEC_s_stripe) &&
6594 ext4_is_stripe_incompatible(sb, ctx->s_stripe)) {
6595 ext4_msg(sb, KERN_WARNING,
6596 "stripe (%lu) is not aligned with cluster size (%u), "
6597 "stripe is disabled",
6598 ctx->s_stripe, sbi->s_cluster_ratio);
6599 ctx->s_stripe = 0;
6600 }
6601
6602 /*
6603 * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
6604 * two calls to ext4_should_dioread_nolock() to return inconsistent
6605 * values, triggering WARN_ON in ext4_add_complete_io(). we grab
6606 * here s_writepages_rwsem to avoid race between writepages ops and
6607 * remount.
6608 */
6609 alloc_ctx = ext4_writepages_down_write(sb);
6610 ext4_apply_options(fc, sb);
6611 ext4_writepages_up_write(sb, alloc_ctx);
6612
6613 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
6614 test_opt(sb, JOURNAL_CHECKSUM)) {
6615 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
6616 "during remount not supported; ignoring");
6617 sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
6618 }
6619
6620 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
6621 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
6622 ext4_msg(sb, KERN_ERR, "can't mount with "
6623 "both data=journal and delalloc");
6624 err = -EINVAL;
6625 goto restore_opts;
6626 }
6627 if (test_opt(sb, DIOREAD_NOLOCK)) {
6628 ext4_msg(sb, KERN_ERR, "can't mount with "
6629 "both data=journal and dioread_nolock");
6630 err = -EINVAL;
6631 goto restore_opts;
6632 }
6633 } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
6634 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
6635 ext4_msg(sb, KERN_ERR, "can't mount with "
6636 "journal_async_commit in data=ordered mode");
6637 err = -EINVAL;
6638 goto restore_opts;
6639 }
6640 }
6641
6642 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
6643 ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
6644 err = -EINVAL;
6645 goto restore_opts;
6646 }
6647
6648 if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) &&
6649 !test_opt(sb, DELALLOC)) {
6650 ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount");
6651 err = -EINVAL;
6652 goto restore_opts;
6653 }
6654
6655 sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
6656 (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
6657
6658 es = sbi->s_es;
6659
6660 if (sbi->s_journal) {
6661 ext4_init_journal_params(sb, sbi->s_journal);
6662 set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
6663 }
6664
6665 /* Flush outstanding errors before changing fs state */
6666 flush_work(&sbi->s_sb_upd_work);
6667
6668 if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
6669 if (ext4_emergency_state(sb)) {
6670 err = -EROFS;
6671 goto restore_opts;
6672 }
6673
6674 if (fc->sb_flags & SB_RDONLY) {
6675 err = sync_filesystem(sb);
6676 if (err < 0)
6677 goto restore_opts;
6678 err = dquot_suspend(sb, -1);
6679 if (err < 0)
6680 goto restore_opts;
6681
6682 /*
6683 * First of all, the unconditional stuff we have to do
6684 * to disable replay of the journal when we next remount
6685 */
6686 sb->s_flags |= SB_RDONLY;
6687
6688 /*
6689 * OK, test if we are remounting a valid rw partition
6690 * readonly, and if so set the rdonly flag and then
6691 * mark the partition as valid again.
6692 */
6693 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
6694 (sbi->s_mount_state & EXT4_VALID_FS))
6695 es->s_state = cpu_to_le16(sbi->s_mount_state);
6696
6697 if (sbi->s_journal) {
6698 /*
6699 * We let remount-ro finish even if marking fs
6700 * as clean failed...
6701 */
6702 ext4_mark_recovery_complete(sb, es);
6703 }
6704 } else {
6705 /* Make sure we can mount this feature set readwrite */
6706 if (ext4_has_feature_readonly(sb) ||
6707 !ext4_feature_set_ok(sb, 0)) {
6708 err = -EROFS;
6709 goto restore_opts;
6710 }
6711 /*
6712 * Make sure the group descriptor checksums
6713 * are sane. If they aren't, refuse to remount r/w.
6714 */
6715 for (g = 0; g < sbi->s_groups_count; g++) {
6716 struct ext4_group_desc *gdp =
6717 ext4_get_group_desc(sb, g, NULL);
6718
6719 if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
6720 ext4_msg(sb, KERN_ERR,
6721 "ext4_remount: Checksum for group %u failed (%u!=%u)",
6722 g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
6723 le16_to_cpu(gdp->bg_checksum));
6724 err = -EFSBADCRC;
6725 goto restore_opts;
6726 }
6727 }
6728
6729 /*
6730 * If we have an unprocessed orphan list hanging
6731 * around from a previously readonly bdev mount,
6732 * require a full umount/remount for now.
6733 */
6734 if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
6735 ext4_msg(sb, KERN_WARNING, "Couldn't "
6736 "remount RDWR because of unprocessed "
6737 "orphan inode list. Please "
6738 "umount/remount instead");
6739 err = -EINVAL;
6740 goto restore_opts;
6741 }
6742
6743 /*
6744 * Mounting a RDONLY partition read-write, so reread
6745 * and store the current valid flag. (It may have
6746 * been changed by e2fsck since we originally mounted
6747 * the partition.)
6748 */
6749 if (sbi->s_journal) {
6750 err = ext4_clear_journal_err(sb, es);
6751 if (err)
6752 goto restore_opts;
6753 }
6754 sbi->s_mount_state = (le16_to_cpu(es->s_state) &
6755 ~EXT4_FC_REPLAY);
6756
6757 err = ext4_setup_super(sb, es, 0);
6758 if (err)
6759 goto restore_opts;
6760
6761 sb->s_flags &= ~SB_RDONLY;
6762 if (ext4_has_feature_mmp(sb)) {
6763 err = ext4_multi_mount_protect(sb,
6764 le64_to_cpu(es->s_mmp_block));
6765 if (err)
6766 goto restore_opts;
6767 }
6768 #ifdef CONFIG_QUOTA
6769 enable_quota = 1;
6770 #endif
6771 }
6772 }
6773
6774 /*
6775 * Handle creation of system zone data early because it can fail.
6776 * Releasing of existing data is done when we are sure remount will
6777 * succeed.
6778 */
6779 if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
6780 err = ext4_setup_system_zone(sb);
6781 if (err)
6782 goto restore_opts;
6783 }
6784
6785 if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
6786 err = ext4_commit_super(sb);
6787 if (err)
6788 goto restore_opts;
6789 }
6790
6791 #ifdef CONFIG_QUOTA
6792 if (enable_quota) {
6793 if (sb_any_quota_suspended(sb))
6794 dquot_resume(sb, -1);
6795 else if (ext4_has_feature_quota(sb)) {
6796 err = ext4_enable_quotas(sb);
6797 if (err)
6798 goto restore_opts;
6799 }
6800 }
6801 /* Release old quota file names */
6802 for (i = 0; i < EXT4_MAXQUOTAS; i++)
6803 kfree(old_opts.s_qf_names[i]);
6804 #endif
6805 if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6806 ext4_release_system_zone(sb);
6807
6808 /*
6809 * Reinitialize lazy itable initialization thread based on
6810 * current settings
6811 */
6812 if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
6813 ext4_unregister_li_request(sb);
6814 else {
6815 ext4_group_t first_not_zeroed;
6816 first_not_zeroed = ext4_has_uninit_itable(sb);
6817 ext4_register_li_request(sb, first_not_zeroed);
6818 }
6819
6820 if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6821 ext4_stop_mmpd(sbi);
6822
6823 /*
6824 * Handle aborting the filesystem as the last thing during remount to
6825 * avoid obsure errors during remount when some option changes fail to
6826 * apply due to shutdown filesystem.
6827 */
6828 if (test_opt2(sb, ABORT))
6829 ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
6830
6831 return 0;
6832
6833 restore_opts:
6834 /*
6835 * If there was a failing r/w to ro transition, we may need to
6836 * re-enable quota
6837 */
6838 if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
6839 sb_any_quota_suspended(sb))
6840 dquot_resume(sb, -1);
6841
6842 alloc_ctx = ext4_writepages_down_write(sb);
6843 sb->s_flags = old_sb_flags;
6844 sbi->s_mount_opt = old_opts.s_mount_opt;
6845 sbi->s_mount_opt2 = old_opts.s_mount_opt2;
6846 sbi->s_resuid = old_opts.s_resuid;
6847 sbi->s_resgid = old_opts.s_resgid;
6848 sbi->s_commit_interval = old_opts.s_commit_interval;
6849 sbi->s_min_batch_time = old_opts.s_min_batch_time;
6850 sbi->s_max_batch_time = old_opts.s_max_batch_time;
6851 ext4_writepages_up_write(sb, alloc_ctx);
6852
6853 if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
6854 ext4_release_system_zone(sb);
6855 #ifdef CONFIG_QUOTA
6856 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
6857 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
6858 to_free[i] = get_qf_name(sb, sbi, i);
6859 rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
6860 }
6861 synchronize_rcu();
6862 for (i = 0; i < EXT4_MAXQUOTAS; i++)
6863 kfree(to_free[i]);
6864 #endif
6865 if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
6866 ext4_stop_mmpd(sbi);
6867 return err;
6868 }
6869
ext4_reconfigure(struct fs_context * fc)6870 static int ext4_reconfigure(struct fs_context *fc)
6871 {
6872 struct super_block *sb = fc->root->d_sb;
6873 int ret;
6874 bool old_ro = sb_rdonly(sb);
6875
6876 fc->s_fs_info = EXT4_SB(sb);
6877
6878 ret = ext4_check_opt_consistency(fc, sb);
6879 if (ret < 0)
6880 return ret;
6881
6882 ret = __ext4_remount(fc, sb);
6883 if (ret < 0)
6884 return ret;
6885
6886 ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
6887 &sb->s_uuid,
6888 (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
6889
6890 return 0;
6891 }
6892
6893 #ifdef CONFIG_QUOTA
/*
 * Clamp the statfs numbers in @buf to the quota limits of project @projid.
 *
 * For blocks and for inodes separately, the effective limit is the smaller
 * non-zero value of the soft and the hard limit.  When such a limit exists,
 * the totals in @buf are capped by it and the free counts are capped by what
 * is still left under the limit (zero once usage has reached the limit).
 *
 * Returns 0 on success or the error reported by dqget().
 */
static int ext4_statfs_project(struct super_block *sb,
			       kprojid_t projid, struct kstatfs *buf)
{
	struct dquot *dquot;
	u64 cap;
	u64 used;

	dquot = dqget(sb, make_kqid_projid(projid));
	if (IS_ERR(dquot))
		return PTR_ERR(dquot);

	/* All dq_dqb fields below are read with dq_dqb_lock held. */
	spin_lock(&dquot->dq_dqb_lock);

	/* Block limit, scaled down by the block size after selection. */
	cap = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
			   dquot->dq_dqb.dqb_bhardlimit);
	cap >>= sb->s_blocksize_bits;
	if (cap) {
		uint64_t left;

		/* Usage counts both current and reserved space. */
		used = (dquot->dq_dqb.dqb_curspace +
			dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
		left = cap > used ? cap - used : 0;

		buf->f_blocks = min(buf->f_blocks, cap);
		buf->f_bfree = min(buf->f_bfree, left);
		buf->f_bavail = min(buf->f_bavail, left);
	}

	/* Inode limit. */
	cap = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
			   dquot->dq_dqb.dqb_ihardlimit);
	if (cap) {
		uint64_t left;

		left = cap > dquot->dq_dqb.dqb_curinodes ?
		       cap - dquot->dq_dqb.dqb_curinodes : 0;

		buf->f_files = min(buf->f_files, cap);
		buf->f_ffree = min(buf->f_ffree, left);
	}

	spin_unlock(&dquot->dq_dqb_lock);
	dqput(dquot);
	return 0;
}
6941 #endif
6942
ext4_statfs(struct dentry * dentry,struct kstatfs * buf)6943 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
6944 {
6945 struct super_block *sb = dentry->d_sb;
6946 struct ext4_sb_info *sbi = EXT4_SB(sb);
6947 struct ext4_super_block *es = sbi->s_es;
6948 ext4_fsblk_t overhead = 0, resv_blocks;
6949 s64 bfree;
6950 resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
6951
6952 if (!test_opt(sb, MINIX_DF))
6953 overhead = sbi->s_overhead;
6954
6955 buf->f_type = EXT4_SUPER_MAGIC;
6956 buf->f_bsize = sb->s_blocksize;
6957 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
6958 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
6959 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
6960 /* prevent underflow in case that few free space is available */
6961 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
6962 buf->f_bavail = buf->f_bfree -
6963 (ext4_r_blocks_count(es) + resv_blocks);
6964 if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
6965 buf->f_bavail = 0;
6966 buf->f_files = le32_to_cpu(es->s_inodes_count);
6967 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
6968 buf->f_namelen = EXT4_NAME_LEN;
6969 buf->f_fsid = uuid_to_fsid(es->s_uuid);
6970
6971 #ifdef CONFIG_QUOTA
6972 if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
6973 sb_has_quota_limits_enabled(sb, PRJQUOTA))
6974 ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
6975 #endif
6976 return 0;
6977 }
6978
6979
6980 #ifdef CONFIG_QUOTA
6981
6982 /*
6983 * Helper functions so that transaction is started before we acquire dqio_sem
6984 * to keep correct lock ordering of transaction > dqio_sem
6985 */
dquot_to_inode(struct dquot * dquot)6986 static inline struct inode *dquot_to_inode(struct dquot *dquot)
6987 {
6988 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
6989 }
6990
ext4_write_dquot(struct dquot * dquot)6991 static int ext4_write_dquot(struct dquot *dquot)
6992 {
6993 int ret, err;
6994 handle_t *handle;
6995 struct inode *inode;
6996
6997 inode = dquot_to_inode(dquot);
6998 handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
6999 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
7000 if (IS_ERR(handle))
7001 return PTR_ERR(handle);
7002 ret = dquot_commit(dquot);
7003 if (ret < 0)
7004 ext4_error_err(dquot->dq_sb, -ret,
7005 "Failed to commit dquot type %d",
7006 dquot->dq_id.type);
7007 err = ext4_journal_stop(handle);
7008 if (!ret)
7009 ret = err;
7010 return ret;
7011 }
7012
ext4_acquire_dquot(struct dquot * dquot)7013 static int ext4_acquire_dquot(struct dquot *dquot)
7014 {
7015 int ret, err;
7016 handle_t *handle;
7017
7018 handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
7019 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
7020 if (IS_ERR(handle))
7021 return PTR_ERR(handle);
7022 ret = dquot_acquire(dquot);
7023 if (ret < 0)
7024 ext4_error_err(dquot->dq_sb, -ret,
7025 "Failed to acquire dquot type %d",
7026 dquot->dq_id.type);
7027 err = ext4_journal_stop(handle);
7028 if (!ret)
7029 ret = err;
7030 return ret;
7031 }
7032
ext4_release_dquot(struct dquot * dquot)7033 static int ext4_release_dquot(struct dquot *dquot)
7034 {
7035 int ret, err;
7036 handle_t *handle;
7037 bool freeze_protected = false;
7038
7039 /*
7040 * Trying to sb_start_intwrite() in a running transaction
7041 * can result in a deadlock. Further, running transactions
7042 * are already protected from freezing.
7043 */
7044 if (!ext4_journal_current_handle()) {
7045 sb_start_intwrite(dquot->dq_sb);
7046 freeze_protected = true;
7047 }
7048
7049 handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
7050 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
7051 if (IS_ERR(handle)) {
7052 /* Release dquot anyway to avoid endless cycle in dqput() */
7053 dquot_release(dquot);
7054 if (freeze_protected)
7055 sb_end_intwrite(dquot->dq_sb);
7056 return PTR_ERR(handle);
7057 }
7058 ret = dquot_release(dquot);
7059 if (ret < 0)
7060 ext4_error_err(dquot->dq_sb, -ret,
7061 "Failed to release dquot type %d",
7062 dquot->dq_id.type);
7063 err = ext4_journal_stop(handle);
7064 if (!ret)
7065 ret = err;
7066
7067 if (freeze_protected)
7068 sb_end_intwrite(dquot->dq_sb);
7069
7070 return ret;
7071 }
7072
ext4_mark_dquot_dirty(struct dquot * dquot)7073 static int ext4_mark_dquot_dirty(struct dquot *dquot)
7074 {
7075 struct super_block *sb = dquot->dq_sb;
7076
7077 if (ext4_is_quota_journalled(sb)) {
7078 dquot_mark_dquot_dirty(dquot);
7079 return ext4_write_dquot(dquot);
7080 } else {
7081 return dquot_mark_dquot_dirty(dquot);
7082 }
7083 }
7084
ext4_write_info(struct super_block * sb,int type)7085 static int ext4_write_info(struct super_block *sb, int type)
7086 {
7087 int ret, err;
7088 handle_t *handle;
7089
7090 /* Data block + inode block */
7091 handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
7092 if (IS_ERR(handle))
7093 return PTR_ERR(handle);
7094 ret = dquot_commit_info(sb, type);
7095 err = ext4_journal_stop(handle);
7096 if (!ret)
7097 ret = err;
7098 return ret;
7099 }
7100
/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.
 *
 * Callers in this file pass I_DATA_SEM_QUOTA when an inode starts being
 * used as a quota file and I_DATA_SEM_NORMAL when that use ends or fails.
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The first argument of lockdep_set_subclass has to be
	 * *exactly* the same as the argument to init_rwsem() --- in
	 * this case, in init_once() --- or lockdep gets unhappy
	 * because the name of the lock is set using the
	 * stringification of the argument to init_rwsem().
	 * Do not rename "ei" or rewrite the expression below.
	 */
	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
	lockdep_set_subclass(&ei->i_data_sem, subclass);
}
7114
7115 /*
7116 * Standard function to be called on quota_on
7117 */
ext4_quota_on(struct super_block * sb,int type,int format_id,const struct path * path)7118 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
7119 const struct path *path)
7120 {
7121 int err;
7122
7123 if (!test_opt(sb, QUOTA))
7124 return -EINVAL;
7125
7126 /* Quotafile not on the same filesystem? */
7127 if (path->dentry->d_sb != sb)
7128 return -EXDEV;
7129
7130 /* Quota already enabled for this file? */
7131 if (IS_NOQUOTA(d_inode(path->dentry)))
7132 return -EBUSY;
7133
7134 /* Journaling quota? */
7135 if (EXT4_SB(sb)->s_qf_names[type]) {
7136 /* Quotafile not in fs root? */
7137 if (path->dentry->d_parent != sb->s_root)
7138 ext4_msg(sb, KERN_WARNING,
7139 "Quota file not on filesystem root. "
7140 "Journaled quota will not work");
7141 sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
7142 } else {
7143 /*
7144 * Clear the flag just in case mount options changed since
7145 * last time.
7146 */
7147 sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
7148 }
7149
7150 lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
7151 err = dquot_quota_on(sb, type, format_id, path);
7152 if (!err) {
7153 struct inode *inode = d_inode(path->dentry);
7154 handle_t *handle;
7155
7156 /*
7157 * Set inode flags to prevent userspace from messing with quota
7158 * files. If this fails, we return success anyway since quotas
7159 * are already enabled and this is not a hard failure.
7160 */
7161 inode_lock(inode);
7162 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
7163 if (IS_ERR(handle))
7164 goto unlock_inode;
7165 EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
7166 inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
7167 S_NOATIME | S_IMMUTABLE);
7168 err = ext4_mark_inode_dirty(handle, inode);
7169 ext4_journal_stop(handle);
7170 unlock_inode:
7171 inode_unlock(inode);
7172 if (err)
7173 dquot_quota_off(sb, type);
7174 }
7175 if (err)
7176 lockdep_set_quota_inode(path->dentry->d_inode,
7177 I_DATA_SEM_NORMAL);
7178 return err;
7179 }
7180
ext4_check_quota_inum(int type,unsigned long qf_inum)7181 static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
7182 {
7183 switch (type) {
7184 case USRQUOTA:
7185 return qf_inum == EXT4_USR_QUOTA_INO;
7186 case GRPQUOTA:
7187 return qf_inum == EXT4_GRP_QUOTA_INO;
7188 case PRJQUOTA:
7189 return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
7190 default:
7191 BUG();
7192 }
7193 }
7194
ext4_quota_enable(struct super_block * sb,int type,int format_id,unsigned int flags)7195 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
7196 unsigned int flags)
7197 {
7198 int err;
7199 struct inode *qf_inode;
7200 unsigned long qf_inums[EXT4_MAXQUOTAS] = {
7201 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
7202 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
7203 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
7204 };
7205
7206 BUG_ON(!ext4_has_feature_quota(sb));
7207
7208 if (!qf_inums[type])
7209 return -EPERM;
7210
7211 if (!ext4_check_quota_inum(type, qf_inums[type])) {
7212 ext4_error(sb, "Bad quota inum: %lu, type: %d",
7213 qf_inums[type], type);
7214 return -EUCLEAN;
7215 }
7216
7217 qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
7218 if (IS_ERR(qf_inode)) {
7219 ext4_error(sb, "Bad quota inode: %lu, type: %d",
7220 qf_inums[type], type);
7221 return PTR_ERR(qf_inode);
7222 }
7223
7224 /* Don't account quota for quota files to avoid recursion */
7225 qf_inode->i_flags |= S_NOQUOTA;
7226 lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
7227 err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
7228 if (err)
7229 lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
7230 iput(qf_inode);
7231
7232 return err;
7233 }
7234
7235 /* Enable usage tracking for all quota types. */
ext4_enable_quotas(struct super_block * sb)7236 int ext4_enable_quotas(struct super_block *sb)
7237 {
7238 int type, err = 0;
7239 unsigned long qf_inums[EXT4_MAXQUOTAS] = {
7240 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
7241 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
7242 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
7243 };
7244 bool quota_mopt[EXT4_MAXQUOTAS] = {
7245 test_opt(sb, USRQUOTA),
7246 test_opt(sb, GRPQUOTA),
7247 test_opt(sb, PRJQUOTA),
7248 };
7249
7250 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
7251 for (type = 0; type < EXT4_MAXQUOTAS; type++) {
7252 if (qf_inums[type]) {
7253 err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
7254 DQUOT_USAGE_ENABLED |
7255 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
7256 if (err) {
7257 ext4_warning(sb,
7258 "Failed to enable quota tracking "
7259 "(type=%d, err=%d, ino=%lu). "
7260 "Please run e2fsck to fix.", type,
7261 err, qf_inums[type]);
7262
7263 ext4_quotas_off(sb, type);
7264 return err;
7265 }
7266 }
7267 }
7268 return 0;
7269 }
7270
ext4_quota_off(struct super_block * sb,int type)7271 static int ext4_quota_off(struct super_block *sb, int type)
7272 {
7273 struct inode *inode = sb_dqopt(sb)->files[type];
7274 handle_t *handle;
7275 int err;
7276
7277 /* Force all delayed allocation blocks to be allocated.
7278 * Caller already holds s_umount sem */
7279 if (test_opt(sb, DELALLOC))
7280 sync_filesystem(sb);
7281
7282 if (!inode || !igrab(inode))
7283 goto out;
7284
7285 err = dquot_quota_off(sb, type);
7286 if (err || ext4_has_feature_quota(sb))
7287 goto out_put;
7288 /*
7289 * When the filesystem was remounted read-only first, we cannot cleanup
7290 * inode flags here. Bad luck but people should be using QUOTA feature
7291 * these days anyway.
7292 */
7293 if (sb_rdonly(sb))
7294 goto out_put;
7295
7296 inode_lock(inode);
7297 /*
7298 * Update modification times of quota files when userspace can
7299 * start looking at them. If we fail, we return success anyway since
7300 * this is not a hard failure and quotas are already disabled.
7301 */
7302 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
7303 if (IS_ERR(handle)) {
7304 err = PTR_ERR(handle);
7305 goto out_unlock;
7306 }
7307 EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
7308 inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
7309 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
7310 err = ext4_mark_inode_dirty(handle, inode);
7311 ext4_journal_stop(handle);
7312 out_unlock:
7313 inode_unlock(inode);
7314 out_put:
7315 lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
7316 iput(inode);
7317 return err;
7318 out:
7319 return dquot_quota_off(sb, type);
7320 }
7321
7322 /* Read data from quotafile - avoid pagecache and such because we cannot afford
7323 * acquiring the locks... As quota files are never truncated and quota code
7324 * itself serializes the operations (and no one else should touch the files)
7325 * we don't have to be afraid of races */
ext4_quota_read(struct super_block * sb,int type,char * data,size_t len,loff_t off)7326 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
7327 size_t len, loff_t off)
7328 {
7329 struct inode *inode = sb_dqopt(sb)->files[type];
7330 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7331 int offset = off & (sb->s_blocksize - 1);
7332 int tocopy;
7333 size_t toread;
7334 struct buffer_head *bh;
7335 loff_t i_size = i_size_read(inode);
7336
7337 if (off > i_size)
7338 return 0;
7339 if (off+len > i_size)
7340 len = i_size-off;
7341 toread = len;
7342 while (toread > 0) {
7343 tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
7344 bh = ext4_bread(NULL, inode, blk, 0);
7345 if (IS_ERR(bh))
7346 return PTR_ERR(bh);
7347 if (!bh) /* A hole? */
7348 memset(data, 0, tocopy);
7349 else
7350 memcpy(data, bh->b_data+offset, tocopy);
7351 brelse(bh);
7352 offset = 0;
7353 toread -= tocopy;
7354 data += tocopy;
7355 blk++;
7356 }
7357 return len;
7358 }
7359
7360 /* Write to quotafile (we know the transaction is already started and has
7361 * enough credits) */
ext4_quota_write(struct super_block * sb,int type,const char * data,size_t len,loff_t off)7362 static ssize_t ext4_quota_write(struct super_block *sb, int type,
7363 const char *data, size_t len, loff_t off)
7364 {
7365 struct inode *inode = sb_dqopt(sb)->files[type];
7366 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
7367 int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
7368 int retries = 0;
7369 struct buffer_head *bh;
7370 handle_t *handle = journal_current_handle();
7371
7372 if (!handle) {
7373 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7374 " cancelled because transaction is not started",
7375 (unsigned long long)off, (unsigned long long)len);
7376 return -EIO;
7377 }
7378 /*
7379 * Since we account only one data block in transaction credits,
7380 * then it is impossible to cross a block boundary.
7381 */
7382 if (sb->s_blocksize - offset < len) {
7383 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
7384 " cancelled because not block aligned",
7385 (unsigned long long)off, (unsigned long long)len);
7386 return -EIO;
7387 }
7388
7389 do {
7390 bh = ext4_bread(handle, inode, blk,
7391 EXT4_GET_BLOCKS_CREATE |
7392 EXT4_GET_BLOCKS_METADATA_NOFAIL);
7393 } while (PTR_ERR(bh) == -ENOSPC &&
7394 ext4_should_retry_alloc(inode->i_sb, &retries));
7395 if (IS_ERR(bh))
7396 return PTR_ERR(bh);
7397 if (!bh)
7398 goto out;
7399 BUFFER_TRACE(bh, "get write access");
7400 err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
7401 if (err) {
7402 brelse(bh);
7403 return err;
7404 }
7405 lock_buffer(bh);
7406 memcpy(bh->b_data+offset, data, len);
7407 flush_dcache_folio(bh->b_folio);
7408 unlock_buffer(bh);
7409 err = ext4_handle_dirty_metadata(handle, NULL, bh);
7410 brelse(bh);
7411 out:
7412 if (inode->i_size < off + len) {
7413 i_size_write(inode, off + len);
7414 EXT4_I(inode)->i_disksize = inode->i_size;
7415 err2 = ext4_mark_inode_dirty(handle, inode);
7416 if (unlikely(err2 && !err))
7417 err = err2;
7418 }
7419 return err ? err : len;
7420 }
7421 #endif
7422
7423 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
register_as_ext2(void)7424 static inline void register_as_ext2(void)
7425 {
7426 int err = register_filesystem(&ext2_fs_type);
7427 if (err)
7428 printk(KERN_WARNING
7429 "EXT4-fs: Unable to register as ext2 (%d)\n", err);
7430 }
7431
unregister_as_ext2(void)7432 static inline void unregister_as_ext2(void)
7433 {
7434 unregister_filesystem(&ext2_fs_type);
7435 }
7436
/*
 * Decide whether @sb can be handled under the ext2 feature set.
 *
 * Returns 0 when @sb has incompat features unknown to ext2.  Otherwise
 * returns 1 for a read-only superblock, and for a writable one only when it
 * has no ro_compat features unknown to ext2.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	/* Unknown ro_compat features only matter for writable mounts. */
	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext2_ro_compat_features(sb);
}
7447 #else
/* Stub for configurations where ext4 does not stand in for ext2. */
static inline void register_as_ext2(void)
{
}
/* Stub for configurations where ext4 does not stand in for ext2. */
static inline void unregister_as_ext2(void)
{
}
/*
 * Stub for configurations where ext4 does not stand in for ext2: no
 * superblock is ever accepted as ext2.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	return 0;
}
7451 #endif
7452
register_as_ext3(void)7453 static inline void register_as_ext3(void)
7454 {
7455 int err = register_filesystem(&ext3_fs_type);
7456 if (err)
7457 printk(KERN_WARNING
7458 "EXT4-fs: Unable to register as ext3 (%d)\n", err);
7459 }
7460
unregister_as_ext3(void)7461 static inline void unregister_as_ext3(void)
7462 {
7463 unregister_filesystem(&ext3_fs_type);
7464 }
7465
/*
 * Decide whether @sb can be handled under the ext3 feature set.
 *
 * Returns 0 when @sb has incompat features unknown to ext3 or lacks the
 * journal feature.  Otherwise returns 1 for a read-only superblock, and for
 * a writable one only when it has no ro_compat features unknown to ext3.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	/* Unknown ro_compat features only matter for writable mounts. */
	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext3_ro_compat_features(sb);
}
7478
ext4_kill_sb(struct super_block * sb)7479 static void ext4_kill_sb(struct super_block *sb)
7480 {
7481 struct ext4_sb_info *sbi = EXT4_SB(sb);
7482 struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
7483
7484 kill_block_super(sb);
7485
7486 if (bdev_file)
7487 bdev_fput(bdev_file);
7488 }
7489
7490 static struct file_system_type ext4_fs_type = {
7491 .owner = THIS_MODULE,
7492 .name = "ext4",
7493 .init_fs_context = ext4_init_fs_context,
7494 .parameters = ext4_param_specs,
7495 .kill_sb = ext4_kill_sb,
7496 .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
7497 FS_LBS,
7498 };
7499 MODULE_ALIAS_FS("ext4");
7500
ext4_init_fs(void)7501 static int __init ext4_init_fs(void)
7502 {
7503 int err;
7504
7505 ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
7506 ext4_li_info = NULL;
7507
7508 /* Build-time check for flags consistency */
7509 ext4_check_flag_values();
7510
7511 err = ext4_init_es();
7512 if (err)
7513 return err;
7514
7515 err = ext4_init_pending();
7516 if (err)
7517 goto out7;
7518
7519 err = ext4_init_post_read_processing();
7520 if (err)
7521 goto out6;
7522
7523 err = ext4_init_pageio();
7524 if (err)
7525 goto out5;
7526
7527 err = ext4_init_system_zone();
7528 if (err)
7529 goto out4;
7530
7531 err = ext4_init_sysfs();
7532 if (err)
7533 goto out3;
7534
7535 err = ext4_init_mballoc();
7536 if (err)
7537 goto out2;
7538 err = init_inodecache();
7539 if (err)
7540 goto out1;
7541
7542 err = ext4_fc_init_dentry_cache();
7543 if (err)
7544 goto out05;
7545
7546 register_as_ext3();
7547 register_as_ext2();
7548 err = register_filesystem(&ext4_fs_type);
7549 if (err)
7550 goto out;
7551
7552 return 0;
7553 out:
7554 unregister_as_ext2();
7555 unregister_as_ext3();
7556 ext4_fc_destroy_dentry_cache();
7557 out05:
7558 destroy_inodecache();
7559 out1:
7560 ext4_exit_mballoc();
7561 out2:
7562 ext4_exit_sysfs();
7563 out3:
7564 ext4_exit_system_zone();
7565 out4:
7566 ext4_exit_pageio();
7567 out5:
7568 ext4_exit_post_read_processing();
7569 out6:
7570 ext4_exit_pending();
7571 out7:
7572 ext4_exit_es();
7573
7574 return err;
7575 }
7576
/*
 * Module exit: destroy the lazyinit thread, unregister the ext2/ext3/ext4
 * filesystem types and then tear down what ext4_init_fs() set up.
 *
 * NOTE(review): ext4_exit_es() runs before ext4_exit_pending() here, while
 * the error path of ext4_init_fs() calls them in the opposite order --
 * confirm the two are independent.
 */
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	ext4_fc_destroy_dentry_cache();
	destroy_inodecache();
	ext4_exit_mballoc();
	ext4_exit_sysfs();
	ext4_exit_system_zone();
	ext4_exit_pageio();
	ext4_exit_post_read_processing();
	ext4_exit_es();
	ext4_exit_pending();
}
7593
/* Module metadata, followed by the module's init and exit entry points. */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)
7599