xref: /linux/block/badblocks.c (revision 987b741c52c7c6c68d46fbaeb95b8d1087f10b7f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Bad block management
4  *
5  * - Heavily based on MD badblocks code from Neil Brown
6  *
7  * Copyright (c) 2015, Intel Corporation.
8  */
9 
10 #include <linux/badblocks.h>
11 #include <linux/seqlock.h>
12 #include <linux/device.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/stddef.h>
16 #include <linux/types.h>
17 #include <linux/slab.h>
18 
19 /**
20  * badblocks_check() - check a given range for bad sectors
21  * @bb:		the badblocks structure that holds all badblock information
22  * @s:		sector (start) at which to check for badblocks
23  * @sectors:	number of sectors to check for badblocks
24  * @first_bad:	pointer to store location of the first badblock
25  * @bad_sectors: pointer to store number of badblocks after @first_bad
26  *
27  * We can record which blocks on each device are 'bad' and so just
28  * fail those blocks, or that stripe, rather than the whole device.
29  * Entries in the bad-block table are 64bits wide.  This comprises:
30  * Length of bad-range, in sectors: 0-511 for lengths 1-512
31  * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
32  *  A 'shift' can be set so that larger blocks are tracked and
33  *  consequently larger devices can be covered.
34  * 'Acknowledged' flag - 1 bit. - the most significant bit.
35  *
36  * Locking of the bad-block table uses a seqlock so badblocks_check
37  * might need to retry if it is very unlucky.
38  * We will sometimes want to check for bad blocks in a bi_end_io function,
39  * so we use the write_seqlock_irq variant.
40  *
41  * When looking for a bad block we specify a range and want to
42  * know if any block in the range is bad.  So we binary-search
43  * to the last range that starts at-or-before the given endpoint,
44  * (or "before the sector after the target range")
45  * then see if it ends after the given start.
46  *
47  * Return:
48  *  0: there are no known bad blocks in the range
49  *  1: there are known bad block which are all acknowledged
50  * -1: there are bad blocks which have not yet been acknowledged in metadata.
51  * plus the start/length of the first bad section we overlap.
52  */
53 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
54 			sector_t *first_bad, int *bad_sectors)
55 {
56 	int hi;
57 	int lo;
58 	u64 *p = bb->page;
59 	int rv;
60 	sector_t target = s + sectors;
61 	unsigned seq;
62 
63 	if (bb->shift > 0) {
64 		/* round the start down, and the end up */
65 		s >>= bb->shift;
66 		target += (1<<bb->shift) - 1;
67 		target >>= bb->shift;
68 		sectors = target - s;
69 	}
70 	/* 'target' is now the first block after the bad range */
71 
72 retry:
73 	seq = read_seqbegin(&bb->lock);
74 	lo = 0;
75 	rv = 0;
76 	hi = bb->count;
77 
78 	/* Binary search between lo and hi for 'target'
79 	 * i.e. for the last range that starts before 'target'
80 	 */
81 	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
82 	 * are known not to be the last range before target.
83 	 * VARIANT: hi-lo is the number of possible
84 	 * ranges, and decreases until it reaches 1
85 	 */
86 	while (hi - lo > 1) {
87 		int mid = (lo + hi) / 2;
88 		sector_t a = BB_OFFSET(p[mid]);
89 
90 		if (a < target)
91 			/* This could still be the one, earlier ranges
92 			 * could not.
93 			 */
94 			lo = mid;
95 		else
96 			/* This and later ranges are definitely out. */
97 			hi = mid;
98 	}
99 	/* 'lo' might be the last that started before target, but 'hi' isn't */
100 	if (hi > lo) {
101 		/* need to check all range that end after 's' to see if
102 		 * any are unacknowledged.
103 		 */
104 		while (lo >= 0 &&
105 		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
106 			if (BB_OFFSET(p[lo]) < target) {
107 				/* starts before the end, and finishes after
108 				 * the start, so they must overlap
109 				 */
110 				if (rv != -1 && BB_ACK(p[lo]))
111 					rv = 1;
112 				else
113 					rv = -1;
114 				*first_bad = BB_OFFSET(p[lo]);
115 				*bad_sectors = BB_LEN(p[lo]);
116 			}
117 			lo--;
118 		}
119 	}
120 
121 	if (read_seqretry(&bb->lock, seq))
122 		goto retry;
123 
124 	return rv;
125 }
126 EXPORT_SYMBOL_GPL(badblocks_check);
127 
128 static void badblocks_update_acked(struct badblocks *bb)
129 {
130 	u64 *p = bb->page;
131 	int i;
132 	bool unacked = false;
133 
134 	if (!bb->unacked_exist)
135 		return;
136 
137 	for (i = 0; i < bb->count ; i++) {
138 		if (!BB_ACK(p[i])) {
139 			unacked = true;
140 			break;
141 		}
142 	}
143 
144 	if (!unacked)
145 		bb->unacked_exist = 0;
146 }
147 
148 /**
149  * badblocks_set() - Add a range of bad blocks to the table.
150  * @bb:		the badblocks structure that holds all badblock information
151  * @s:		first sector to mark as bad
152  * @sectors:	number of sectors to mark as bad
153  * @acknowledged: weather to mark the bad sectors as acknowledged
154  *
155  * This might extend the table, or might contract it if two adjacent ranges
156  * can be merged. We binary-search to find the 'insertion' point, then
157  * decide how best to handle it.
158  *
159  * Return:
160  *  0: success
161  *  1: failed to set badblocks (out of space)
162  */
163 int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
164 			int acknowledged)
165 {
166 	u64 *p;
167 	int lo, hi;
168 	int rv = 0;
169 	unsigned long flags;
170 
171 	if (bb->shift < 0)
172 		/* badblocks are disabled */
173 		return 1;
174 
175 	if (bb->shift) {
176 		/* round the start down, and the end up */
177 		sector_t next = s + sectors;
178 
179 		s >>= bb->shift;
180 		next += (1<<bb->shift) - 1;
181 		next >>= bb->shift;
182 		sectors = next - s;
183 	}
184 
185 	write_seqlock_irqsave(&bb->lock, flags);
186 
187 	p = bb->page;
188 	lo = 0;
189 	hi = bb->count;
190 	/* Find the last range that starts at-or-before 's' */
191 	while (hi - lo > 1) {
192 		int mid = (lo + hi) / 2;
193 		sector_t a = BB_OFFSET(p[mid]);
194 
195 		if (a <= s)
196 			lo = mid;
197 		else
198 			hi = mid;
199 	}
200 	if (hi > lo && BB_OFFSET(p[lo]) > s)
201 		hi = lo;
202 
203 	if (hi > lo) {
204 		/* we found a range that might merge with the start
205 		 * of our new range
206 		 */
207 		sector_t a = BB_OFFSET(p[lo]);
208 		sector_t e = a + BB_LEN(p[lo]);
209 		int ack = BB_ACK(p[lo]);
210 
211 		if (e >= s) {
212 			/* Yes, we can merge with a previous range */
213 			if (s == a && s + sectors >= e)
214 				/* new range covers old */
215 				ack = acknowledged;
216 			else
217 				ack = ack && acknowledged;
218 
219 			if (e < s + sectors)
220 				e = s + sectors;
221 			if (e - a <= BB_MAX_LEN) {
222 				p[lo] = BB_MAKE(a, e-a, ack);
223 				s = e;
224 			} else {
225 				/* does not all fit in one range,
226 				 * make p[lo] maximal
227 				 */
228 				if (BB_LEN(p[lo]) != BB_MAX_LEN)
229 					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
230 				s = a + BB_MAX_LEN;
231 			}
232 			sectors = e - s;
233 		}
234 	}
235 	if (sectors && hi < bb->count) {
236 		/* 'hi' points to the first range that starts after 's'.
237 		 * Maybe we can merge with the start of that range
238 		 */
239 		sector_t a = BB_OFFSET(p[hi]);
240 		sector_t e = a + BB_LEN(p[hi]);
241 		int ack = BB_ACK(p[hi]);
242 
243 		if (a <= s + sectors) {
244 			/* merging is possible */
245 			if (e <= s + sectors) {
246 				/* full overlap */
247 				e = s + sectors;
248 				ack = acknowledged;
249 			} else
250 				ack = ack && acknowledged;
251 
252 			a = s;
253 			if (e - a <= BB_MAX_LEN) {
254 				p[hi] = BB_MAKE(a, e-a, ack);
255 				s = e;
256 			} else {
257 				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
258 				s = a + BB_MAX_LEN;
259 			}
260 			sectors = e - s;
261 			lo = hi;
262 			hi++;
263 		}
264 	}
265 	if (sectors == 0 && hi < bb->count) {
266 		/* we might be able to combine lo and hi */
267 		/* Note: 's' is at the end of 'lo' */
268 		sector_t a = BB_OFFSET(p[hi]);
269 		int lolen = BB_LEN(p[lo]);
270 		int hilen = BB_LEN(p[hi]);
271 		int newlen = lolen + hilen - (s - a);
272 
273 		if (s >= a && newlen < BB_MAX_LEN) {
274 			/* yes, we can combine them */
275 			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
276 
277 			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
278 			memmove(p + hi, p + hi + 1,
279 				(bb->count - hi - 1) * 8);
280 			bb->count--;
281 		}
282 	}
283 	while (sectors) {
284 		/* didn't merge (it all).
285 		 * Need to add a range just before 'hi'
286 		 */
287 		if (bb->count >= MAX_BADBLOCKS) {
288 			/* No room for more */
289 			rv = 1;
290 			break;
291 		} else {
292 			int this_sectors = sectors;
293 
294 			memmove(p + hi + 1, p + hi,
295 				(bb->count - hi) * 8);
296 			bb->count++;
297 
298 			if (this_sectors > BB_MAX_LEN)
299 				this_sectors = BB_MAX_LEN;
300 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
301 			sectors -= this_sectors;
302 			s += this_sectors;
303 		}
304 	}
305 
306 	bb->changed = 1;
307 	if (!acknowledged)
308 		bb->unacked_exist = 1;
309 	else
310 		badblocks_update_acked(bb);
311 	write_sequnlock_irqrestore(&bb->lock, flags);
312 
313 	return rv;
314 }
315 EXPORT_SYMBOL_GPL(badblocks_set);
316 
317 /**
318  * badblocks_clear() - Remove a range of bad blocks to the table.
319  * @bb:		the badblocks structure that holds all badblock information
320  * @s:		first sector to mark as bad
321  * @sectors:	number of sectors to mark as bad
322  *
323  * This may involve extending the table if we spilt a region,
324  * but it must not fail.  So if the table becomes full, we just
325  * drop the remove request.
326  *
327  * Return:
328  *  0: success
329  *  1: failed to clear badblocks
330  */
331 int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
332 {
333 	u64 *p;
334 	int lo, hi;
335 	sector_t target = s + sectors;
336 	int rv = 0;
337 
338 	if (bb->shift > 0) {
339 		/* When clearing we round the start up and the end down.
340 		 * This should not matter as the shift should align with
341 		 * the block size and no rounding should ever be needed.
342 		 * However it is better the think a block is bad when it
343 		 * isn't than to think a block is not bad when it is.
344 		 */
345 		s += (1<<bb->shift) - 1;
346 		s >>= bb->shift;
347 		target >>= bb->shift;
348 		sectors = target - s;
349 	}
350 
351 	write_seqlock_irq(&bb->lock);
352 
353 	p = bb->page;
354 	lo = 0;
355 	hi = bb->count;
356 	/* Find the last range that starts before 'target' */
357 	while (hi - lo > 1) {
358 		int mid = (lo + hi) / 2;
359 		sector_t a = BB_OFFSET(p[mid]);
360 
361 		if (a < target)
362 			lo = mid;
363 		else
364 			hi = mid;
365 	}
366 	if (hi > lo) {
367 		/* p[lo] is the last range that could overlap the
368 		 * current range.  Earlier ranges could also overlap,
369 		 * but only this one can overlap the end of the range.
370 		 */
371 		if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
372 		    (BB_OFFSET(p[lo]) < target)) {
373 			/* Partial overlap, leave the tail of this range */
374 			int ack = BB_ACK(p[lo]);
375 			sector_t a = BB_OFFSET(p[lo]);
376 			sector_t end = a + BB_LEN(p[lo]);
377 
378 			if (a < s) {
379 				/* we need to split this range */
380 				if (bb->count >= MAX_BADBLOCKS) {
381 					rv = -ENOSPC;
382 					goto out;
383 				}
384 				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
385 				bb->count++;
386 				p[lo] = BB_MAKE(a, s-a, ack);
387 				lo++;
388 			}
389 			p[lo] = BB_MAKE(target, end - target, ack);
390 			/* there is no longer an overlap */
391 			hi = lo;
392 			lo--;
393 		}
394 		while (lo >= 0 &&
395 		       (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
396 		       (BB_OFFSET(p[lo]) < target)) {
397 			/* This range does overlap */
398 			if (BB_OFFSET(p[lo]) < s) {
399 				/* Keep the early parts of this range. */
400 				int ack = BB_ACK(p[lo]);
401 				sector_t start = BB_OFFSET(p[lo]);
402 
403 				p[lo] = BB_MAKE(start, s - start, ack);
404 				/* now low doesn't overlap, so.. */
405 				break;
406 			}
407 			lo--;
408 		}
409 		/* 'lo' is strictly before, 'hi' is strictly after,
410 		 * anything between needs to be discarded
411 		 */
412 		if (hi - lo > 1) {
413 			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
414 			bb->count -= (hi - lo - 1);
415 		}
416 	}
417 
418 	badblocks_update_acked(bb);
419 	bb->changed = 1;
420 out:
421 	write_sequnlock_irq(&bb->lock);
422 	return rv;
423 }
424 EXPORT_SYMBOL_GPL(badblocks_clear);
425 
426 /**
427  * ack_all_badblocks() - Acknowledge all bad blocks in a list.
428  * @bb:		the badblocks structure that holds all badblock information
429  *
430  * This only succeeds if ->changed is clear.  It is used by
431  * in-kernel metadata updates
432  */
433 void ack_all_badblocks(struct badblocks *bb)
434 {
435 	if (bb->page == NULL || bb->changed)
436 		/* no point even trying */
437 		return;
438 	write_seqlock_irq(&bb->lock);
439 
440 	if (bb->changed == 0 && bb->unacked_exist) {
441 		u64 *p = bb->page;
442 		int i;
443 
444 		for (i = 0; i < bb->count ; i++) {
445 			if (!BB_ACK(p[i])) {
446 				sector_t start = BB_OFFSET(p[i]);
447 				int len = BB_LEN(p[i]);
448 
449 				p[i] = BB_MAKE(start, len, 1);
450 			}
451 		}
452 		bb->unacked_exist = 0;
453 	}
454 	write_sequnlock_irq(&bb->lock);
455 }
456 EXPORT_SYMBOL_GPL(ack_all_badblocks);
457 
458 /**
459  * badblocks_show() - sysfs access to bad-blocks list
460  * @bb:		the badblocks structure that holds all badblock information
461  * @page:	buffer received from sysfs
462  * @unack:	weather to show unacknowledged badblocks
463  *
464  * Return:
465  *  Length of returned data
466  */
467 ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
468 {
469 	size_t len;
470 	int i;
471 	u64 *p = bb->page;
472 	unsigned seq;
473 
474 	if (bb->shift < 0)
475 		return 0;
476 
477 retry:
478 	seq = read_seqbegin(&bb->lock);
479 
480 	len = 0;
481 	i = 0;
482 
483 	while (len < PAGE_SIZE && i < bb->count) {
484 		sector_t s = BB_OFFSET(p[i]);
485 		unsigned int length = BB_LEN(p[i]);
486 		int ack = BB_ACK(p[i]);
487 
488 		i++;
489 
490 		if (unack && ack)
491 			continue;
492 
493 		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
494 				(unsigned long long)s << bb->shift,
495 				length << bb->shift);
496 	}
497 	if (unack && len == 0)
498 		bb->unacked_exist = 0;
499 
500 	if (read_seqretry(&bb->lock, seq))
501 		goto retry;
502 
503 	return len;
504 }
505 EXPORT_SYMBOL_GPL(badblocks_show);
506 
507 /**
508  * badblocks_store() - sysfs access to bad-blocks list
509  * @bb:		the badblocks structure that holds all badblock information
510  * @page:	buffer received from sysfs
511  * @len:	length of data received from sysfs
512  * @unack:	weather to show unacknowledged badblocks
513  *
514  * Return:
515  *  Length of the buffer processed or -ve error.
516  */
517 ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
518 			int unack)
519 {
520 	unsigned long long sector;
521 	int length;
522 	char newline;
523 
524 	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
525 	case 3:
526 		if (newline != '\n')
527 			return -EINVAL;
528 		fallthrough;
529 	case 2:
530 		if (length <= 0)
531 			return -EINVAL;
532 		break;
533 	default:
534 		return -EINVAL;
535 	}
536 
537 	if (badblocks_set(bb, sector, length, !unack))
538 		return -ENOSPC;
539 	else
540 		return len;
541 }
542 EXPORT_SYMBOL_GPL(badblocks_store);
543 
544 static int __badblocks_init(struct device *dev, struct badblocks *bb,
545 		int enable)
546 {
547 	bb->dev = dev;
548 	bb->count = 0;
549 	if (enable)
550 		bb->shift = 0;
551 	else
552 		bb->shift = -1;
553 	if (dev)
554 		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
555 	else
556 		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
557 	if (!bb->page) {
558 		bb->shift = -1;
559 		return -ENOMEM;
560 	}
561 	seqlock_init(&bb->lock);
562 
563 	return 0;
564 }
565 
566 /**
567  * badblocks_init() - initialize the badblocks structure
568  * @bb:		the badblocks structure that holds all badblock information
569  * @enable:	weather to enable badblocks accounting
570  *
571  * Return:
572  *  0: success
573  *  -ve errno: on error
574  */
575 int badblocks_init(struct badblocks *bb, int enable)
576 {
577 	return __badblocks_init(NULL, bb, enable);
578 }
579 EXPORT_SYMBOL_GPL(badblocks_init);
580 
581 int devm_init_badblocks(struct device *dev, struct badblocks *bb)
582 {
583 	if (!bb)
584 		return -EINVAL;
585 	return __badblocks_init(dev, bb, 1);
586 }
587 EXPORT_SYMBOL_GPL(devm_init_badblocks);
588 
589 /**
590  * badblocks_exit() - free the badblocks structure
591  * @bb:		the badblocks structure that holds all badblock information
592  */
593 void badblocks_exit(struct badblocks *bb)
594 {
595 	if (!bb)
596 		return;
597 	if (bb->dev)
598 		devm_kfree(bb->dev, bb->page);
599 	else
600 		kfree(bb->page);
601 	bb->page = NULL;
602 }
603 EXPORT_SYMBOL_GPL(badblocks_exit);
604