xref: /linux/fs/ext4/mmp.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fs.h>
3 #include <linux/random.h>
4 #include <linux/buffer_head.h>
5 #include <linux/utsname.h>
6 #include <linux/kthread.h>
7 
8 #include "ext4.h"
9 
10 /* Checksumming functions */
11 static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
12 {
13 	struct ext4_sb_info *sbi = EXT4_SB(sb);
14 	int offset = offsetof(struct mmp_struct, mmp_checksum);
15 	__u32 csum;
16 
17 	csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
18 
19 	return cpu_to_le32(csum);
20 }
21 
22 static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
23 {
24 	if (!ext4_has_feature_metadata_csum(sb))
25 		return 1;
26 
27 	return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
28 }
29 
30 static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
31 {
32 	if (!ext4_has_feature_metadata_csum(sb))
33 		return;
34 
35 	mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
36 }
37 
38 /*
39  * Write the MMP block using REQ_SYNC to try to get the block on-disk
40  * faster.
41  */
42 static int write_mmp_block_thawed(struct super_block *sb,
43 				  struct buffer_head *bh)
44 {
45 	struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
46 
47 	ext4_mmp_csum_set(sb, mmp);
48 	lock_buffer(bh);
49 	bh->b_end_io = end_buffer_write_sync;
50 	get_bh(bh);
51 	submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
52 	wait_on_buffer(bh);
53 	if (unlikely(!buffer_uptodate(bh)))
54 		return -EIO;
55 	return 0;
56 }
57 
58 static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
59 {
60 	/*
61 	 * We protect against freezing so that we don't create dirty buffers
62 	 * on frozen filesystem.
63 	 */
64 	scoped_guard(super_write, sb)
65 		return write_mmp_block_thawed(sb, bh);
66 }
67 
68 /*
69  * Read the MMP block. It _must_ be read from disk and hence we clear the
70  * uptodate flag on the buffer.
71  */
72 static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
73 			  ext4_fsblk_t mmp_block)
74 {
75 	struct mmp_struct *mmp;
76 	int ret;
77 
78 	if (*bh)
79 		clear_buffer_uptodate(*bh);
80 
81 	/* This would be sb_bread(sb, mmp_block), except we need to be sure
82 	 * that the MD RAID device cache has been bypassed, and that the read
83 	 * is not blocked in the elevator. */
84 	if (!*bh) {
85 		*bh = sb_getblk(sb, mmp_block);
86 		if (!*bh) {
87 			ret = -ENOMEM;
88 			goto warn_exit;
89 		}
90 	}
91 
92 	lock_buffer(*bh);
93 	ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
94 	if (ret)
95 		goto warn_exit;
96 
97 	mmp = (struct mmp_struct *)((*bh)->b_data);
98 	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
99 		ret = -EFSCORRUPTED;
100 		goto warn_exit;
101 	}
102 	if (!ext4_mmp_csum_verify(sb, mmp)) {
103 		ret = -EFSBADCRC;
104 		goto warn_exit;
105 	}
106 	return 0;
107 warn_exit:
108 	brelse(*bh);
109 	*bh = NULL;
110 	ext4_warning(sb, "Error %d while reading MMP block %llu",
111 		     ret, mmp_block);
112 	return ret;
113 }
114 
115 /*
116  * Dump as much information as possible to help the admin.
117  */
118 void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
119 		    const char *function, unsigned int line, const char *msg)
120 {
121 	__ext4_warning(sb, function, line, "%s", msg);
122 	__ext4_warning(sb, function, line,
123 		       "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
124 		       (unsigned long long)le64_to_cpu(mmp->mmp_time),
125 		       (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
126 		       (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
127 }
128 
/*
 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
 *
 * @data: the struct super_block of the mounted filesystem.
 *
 * Thread body.  On every pass it bumps the sequence number, stamps the
 * current time and writes the MMP block held in s_mmp_bh.  If a pass took
 * longer than mmp_check_interval seconds the block is read back and compared
 * against what this node last wrote.  The thread never returns on its own:
 * after leaving the main loop (for any reason) it parks until
 * kthread_should_stop() is true.
 *
 * Returns the last value assigned to retval: the result of the final
 * "clean" write when the loop ended normally, the read error when the
 * read-back failed, -EBUSY when the read-back did not match, or the result
 * of the most recent periodic write (0 if none happened) when the MMP
 * feature was found disabled.
 */
static int kmmpd(void *data)
{
	struct super_block *sb = data;
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
	struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
	struct mmp_struct *mmp;
	ext4_fsblk_t mmp_block;
	u32 seq = 0;
	unsigned long failed_writes = 0;
	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
	unsigned mmp_check_interval;
	unsigned long last_update_time;
	unsigned long diff;
	int retval = 0;

	mmp_block = le64_to_cpu(es->s_mmp_block);
	mmp = (struct mmp_struct *)(bh->b_data);
	mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
	/*
	 * Start with the higher mmp_check_interval and reduce it if
	 * the MMP block is being updated on time.
	 */
	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
				 EXT4_MMP_MIN_CHECK_INTERVAL);
	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);

	/* Record which node owns the filesystem. */
	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
	       sizeof(mmp->mmp_nodename));

	while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
		if (!ext4_has_feature_mmp(sb)) {
			ext4_warning(sb, "kmmpd being stopped since MMP feature"
				     " has been disabled.");
			goto wait_to_exit;
		}
		/* Sequence numbers cycle through 1..EXT4_MMP_SEQ_MAX. */
		if (++seq > EXT4_MMP_SEQ_MAX)
			seq = 1;

		mmp->mmp_seq = cpu_to_le32(seq);
		mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
		/* Sampled before the write so that diff includes the I/O time. */
		last_update_time = jiffies;

		retval = write_mmp_block(sb, bh);
		/*
		 * Don't spew too many error messages. Print one every
		 * (s_mmp_update_interval * 60) seconds.
		 */
		if (retval) {
			if ((failed_writes % 60) == 0) {
				ext4_error_err(sb, -retval,
					       "Error writing to MMP block");
			}
			failed_writes++;
		}

		/* Sleep for whatever is left of the update interval. */
		diff = jiffies - last_update_time;
		if (diff < mmp_update_interval * HZ)
			schedule_timeout_interruptible(mmp_update_interval *
						       HZ - diff);

		/*
		 * We need to make sure that more than mmp_check_interval
		 * seconds have not passed since writing. If that has happened
		 * we need to check if the MMP block is as we left it.
		 */
		diff = jiffies - last_update_time;
		if (diff > mmp_check_interval * HZ) {
			struct buffer_head *bh_check = NULL;
			struct mmp_struct *mmp_check;

			/*
			 * NOTE(review): bh_check starts out NULL, so
			 * read_mmp_block() obtains the buffer via sb_getblk()
			 * for the same block number that s_mmp_bh covers.
			 * If that returns the same buffer_head as bh, then
			 * mmp_check aliases mmp and the comparison below
			 * compares the buffer with itself - TODO confirm.
			 */
			retval = read_mmp_block(sb, &bh_check, mmp_block);
			if (retval) {
				ext4_error_err(sb, -retval,
					       "error reading MMP data: %d",
					       retval);
				goto wait_to_exit;
			}

			mmp_check = (struct mmp_struct *)(bh_check->b_data);
			if (mmp->mmp_seq != mmp_check->mmp_seq ||
			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
				   sizeof(mmp->mmp_nodename))) {
				dump_mmp_msg(sb, mmp_check,
					     "Error while updating MMP info. "
					     "The filesystem seems to have been"
					     " multiply mounted.");
				ext4_error_err(sb, EBUSY, "abort");
				put_bh(bh_check);
				retval = -EBUSY;
				goto wait_to_exit;
			}
			put_bh(bh_check);
		}

		/*
		 * Adjust the mmp_check_interval depending on how much time
		 * it took for the MMP block to be written.
		 */
		mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ,
					   EXT4_MMP_MIN_CHECK_INTERVAL,
					   EXT4_MMP_MAX_CHECK_INTERVAL);
		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
	}

	/*
	 * Unmount seems to be clean.
	 */
	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
	mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());

	retval = write_mmp_block(sb, bh);

wait_to_exit:
	/* Park until asked to stop; retval is left untouched here. */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		/* Re-check after changing state so a stop request is not missed. */
		if (!kthread_should_stop())
			schedule();
	}
	set_current_state(TASK_RUNNING);
	return retval;
}
253 
254 void ext4_stop_mmpd(struct ext4_sb_info *sbi)
255 {
256 	if (sbi->s_mmp_tsk) {
257 		kthread_stop(sbi->s_mmp_tsk);
258 		brelse(sbi->s_mmp_bh);
259 		sbi->s_mmp_tsk = NULL;
260 	}
261 }
262 
263 /*
264  * Get a random new sequence number but make sure it is not greater than
265  * EXT4_MMP_SEQ_MAX.
266  */
267 static unsigned int mmp_new_seq(void)
268 {
269 	return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
270 }
271 
272 /*
273  * Protect the filesystem from being mounted more than once.
274  */
275 int ext4_multi_mount_protect(struct super_block *sb,
276 				    ext4_fsblk_t mmp_block)
277 {
278 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
279 	struct buffer_head *bh = NULL;
280 	struct mmp_struct *mmp = NULL;
281 	u32 seq;
282 	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
283 	unsigned int wait_time = 0;
284 	int retval;
285 
286 	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
287 	    mmp_block >= ext4_blocks_count(es)) {
288 		ext4_warning(sb, "Invalid MMP block in superblock");
289 		retval = -EINVAL;
290 		goto failed;
291 	}
292 
293 	retval = read_mmp_block(sb, &bh, mmp_block);
294 	if (retval)
295 		goto failed;
296 
297 	mmp = (struct mmp_struct *)(bh->b_data);
298 
299 	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
300 		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
301 
302 	/*
303 	 * If check_interval in MMP block is larger, use that instead of
304 	 * update_interval from the superblock.
305 	 */
306 	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
307 		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
308 
309 	seq = le32_to_cpu(mmp->mmp_seq);
310 	if (seq == EXT4_MMP_SEQ_CLEAN)
311 		goto skip;
312 
313 	if (seq == EXT4_MMP_SEQ_FSCK) {
314 		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
315 		retval = -EBUSY;
316 		goto failed;
317 	}
318 
319 	wait_time = min(mmp_check_interval * 2 + 1,
320 			mmp_check_interval + 60);
321 
322 	/* Print MMP interval if more than 20 secs. */
323 	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
324 		ext4_warning(sb, "MMP interval %u higher than expected, please"
325 			     " wait.\n", wait_time * 2);
326 
327 	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
328 		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
329 		retval = -ETIMEDOUT;
330 		goto failed;
331 	}
332 
333 	retval = read_mmp_block(sb, &bh, mmp_block);
334 	if (retval)
335 		goto failed;
336 	mmp = (struct mmp_struct *)(bh->b_data);
337 	if (seq != le32_to_cpu(mmp->mmp_seq)) {
338 		dump_mmp_msg(sb, mmp,
339 			     "Device is already active on another node.");
340 		retval = -EBUSY;
341 		goto failed;
342 	}
343 
344 skip:
345 	/*
346 	 * write a new random sequence number.
347 	 */
348 	seq = mmp_new_seq();
349 	mmp->mmp_seq = cpu_to_le32(seq);
350 
351 	/*
352 	 * On mount / remount we are protected against fs freezing (by s_umount
353 	 * semaphore) and grabbing freeze protection upsets lockdep
354 	 */
355 	retval = write_mmp_block_thawed(sb, bh);
356 	if (retval)
357 		goto failed;
358 
359 	/*
360 	 * wait for MMP interval and check mmp_seq.
361 	 */
362 	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
363 		ext4_warning(sb, "MMP startup interrupted, failing mount");
364 		retval = -ETIMEDOUT;
365 		goto failed;
366 	}
367 
368 	retval = read_mmp_block(sb, &bh, mmp_block);
369 	if (retval)
370 		goto failed;
371 	mmp = (struct mmp_struct *)(bh->b_data);
372 	if (seq != le32_to_cpu(mmp->mmp_seq)) {
373 		dump_mmp_msg(sb, mmp,
374 			     "Device is already active on another node.");
375 		retval = -EBUSY;
376 		goto failed;
377 	}
378 
379 	EXT4_SB(sb)->s_mmp_bh = bh;
380 
381 	BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
382 	snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
383 		 "%pg", bh->b_bdev);
384 
385 	/*
386 	 * Start a kernel thread to update the MMP block periodically.
387 	 */
388 	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
389 					     (int)sizeof(mmp->mmp_bdevname),
390 					     mmp->mmp_bdevname);
391 	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
392 		EXT4_SB(sb)->s_mmp_tsk = NULL;
393 		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
394 			     sb->s_id);
395 		retval = -ENOMEM;
396 		goto failed;
397 	}
398 
399 	return 0;
400 
401 failed:
402 	brelse(bh);
403 	return retval;
404 }
405