xref: /linux/block/badblocks.c (revision 98838d95075a5295f3478ceba18bcccf472e30f4)
1 /*
2  * Bad block management
3  *
4  * - Heavily based on MD badblocks code from Neil Brown
5  *
6  * Copyright (c) 2015, Intel Corporation.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  */
17 
18 #include <linux/badblocks.h>
19 #include <linux/seqlock.h>
20 #include <linux/device.h>
21 #include <linux/kernel.h>
22 #include <linux/module.h>
23 #include <linux/stddef.h>
24 #include <linux/types.h>
25 #include <linux/slab.h>
26 
27 /**
28  * badblocks_check() - check a given range for bad sectors
29  * @bb:		the badblocks structure that holds all badblock information
30  * @s:		sector (start) at which to check for badblocks
31  * @sectors:	number of sectors to check for badblocks
32  * @first_bad:	pointer to store location of the first badblock
33  * @bad_sectors: pointer to store number of badblocks after @first_bad
34  *
35  * We can record which blocks on each device are 'bad' and so just
36  * fail those blocks, or that stripe, rather than the whole device.
37  * Entries in the bad-block table are 64bits wide.  This comprises:
38  * Length of bad-range, in sectors: 0-511 for lengths 1-512
39  * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
40  *  A 'shift' can be set so that larger blocks are tracked and
41  *  consequently larger devices can be covered.
42  * 'Acknowledged' flag - 1 bit. - the most significant bit.
43  *
44  * Locking of the bad-block table uses a seqlock so badblocks_check
45  * might need to retry if it is very unlucky.
46  * We will sometimes want to check for bad blocks in a bi_end_io function,
47  * so we use the write_seqlock_irq variant.
48  *
49  * When looking for a bad block we specify a range and want to
50  * know if any block in the range is bad.  So we binary-search
51  * to the last range that starts at-or-before the given endpoint,
52  * (or "before the sector after the target range")
53  * then see if it ends after the given start.
54  *
55  * Return:
56  *  0: there are no known bad blocks in the range
57  *  1: there are known bad block which are all acknowledged
58  * -1: there are bad blocks which have not yet been acknowledged in metadata.
59  * plus the start/length of the first bad section we overlap.
60  */
61 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
62 			sector_t *first_bad, int *bad_sectors)
63 {
64 	int hi;
65 	int lo;
66 	u64 *p = bb->page;
67 	int rv;
68 	sector_t target = s + sectors;
69 	unsigned seq;
70 
71 	if (bb->shift > 0) {
72 		/* round the start down, and the end up */
73 		s >>= bb->shift;
74 		target += (1<<bb->shift) - 1;
75 		target >>= bb->shift;
76 		sectors = target - s;
77 	}
78 	/* 'target' is now the first block after the bad range */
79 
80 retry:
81 	seq = read_seqbegin(&bb->lock);
82 	lo = 0;
83 	rv = 0;
84 	hi = bb->count;
85 
86 	/* Binary search between lo and hi for 'target'
87 	 * i.e. for the last range that starts before 'target'
88 	 */
89 	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
90 	 * are known not to be the last range before target.
91 	 * VARIANT: hi-lo is the number of possible
92 	 * ranges, and decreases until it reaches 1
93 	 */
94 	while (hi - lo > 1) {
95 		int mid = (lo + hi) / 2;
96 		sector_t a = BB_OFFSET(p[mid]);
97 
98 		if (a < target)
99 			/* This could still be the one, earlier ranges
100 			 * could not.
101 			 */
102 			lo = mid;
103 		else
104 			/* This and later ranges are definitely out. */
105 			hi = mid;
106 	}
107 	/* 'lo' might be the last that started before target, but 'hi' isn't */
108 	if (hi > lo) {
109 		/* need to check all range that end after 's' to see if
110 		 * any are unacknowledged.
111 		 */
112 		while (lo >= 0 &&
113 		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
114 			if (BB_OFFSET(p[lo]) < target) {
115 				/* starts before the end, and finishes after
116 				 * the start, so they must overlap
117 				 */
118 				if (rv != -1 && BB_ACK(p[lo]))
119 					rv = 1;
120 				else
121 					rv = -1;
122 				*first_bad = BB_OFFSET(p[lo]);
123 				*bad_sectors = BB_LEN(p[lo]);
124 			}
125 			lo--;
126 		}
127 	}
128 
129 	if (read_seqretry(&bb->lock, seq))
130 		goto retry;
131 
132 	return rv;
133 }
134 EXPORT_SYMBOL_GPL(badblocks_check);
135 
136 static void badblocks_update_acked(struct badblocks *bb)
137 {
138 	u64 *p = bb->page;
139 	int i;
140 	bool unacked = false;
141 
142 	if (!bb->unacked_exist)
143 		return;
144 
145 	for (i = 0; i < bb->count ; i++) {
146 		if (!BB_ACK(p[i])) {
147 			unacked = true;
148 			break;
149 		}
150 	}
151 
152 	if (!unacked)
153 		bb->unacked_exist = 0;
154 }
155 
156 /**
157  * badblocks_set() - Add a range of bad blocks to the table.
158  * @bb:		the badblocks structure that holds all badblock information
159  * @s:		first sector to mark as bad
160  * @sectors:	number of sectors to mark as bad
161  * @acknowledged: weather to mark the bad sectors as acknowledged
162  *
163  * This might extend the table, or might contract it if two adjacent ranges
164  * can be merged. We binary-search to find the 'insertion' point, then
165  * decide how best to handle it.
166  *
167  * Return:
168  *  0: success
169  *  1: failed to set badblocks (out of space)
170  */
171 int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
172 			int acknowledged)
173 {
174 	u64 *p;
175 	int lo, hi;
176 	int rv = 0;
177 	unsigned long flags;
178 
179 	if (bb->shift < 0)
180 		/* badblocks are disabled */
181 		return 0;
182 
183 	if (bb->shift) {
184 		/* round the start down, and the end up */
185 		sector_t next = s + sectors;
186 
187 		s >>= bb->shift;
188 		next += (1<<bb->shift) - 1;
189 		next >>= bb->shift;
190 		sectors = next - s;
191 	}
192 
193 	write_seqlock_irqsave(&bb->lock, flags);
194 
195 	p = bb->page;
196 	lo = 0;
197 	hi = bb->count;
198 	/* Find the last range that starts at-or-before 's' */
199 	while (hi - lo > 1) {
200 		int mid = (lo + hi) / 2;
201 		sector_t a = BB_OFFSET(p[mid]);
202 
203 		if (a <= s)
204 			lo = mid;
205 		else
206 			hi = mid;
207 	}
208 	if (hi > lo && BB_OFFSET(p[lo]) > s)
209 		hi = lo;
210 
211 	if (hi > lo) {
212 		/* we found a range that might merge with the start
213 		 * of our new range
214 		 */
215 		sector_t a = BB_OFFSET(p[lo]);
216 		sector_t e = a + BB_LEN(p[lo]);
217 		int ack = BB_ACK(p[lo]);
218 
219 		if (e >= s) {
220 			/* Yes, we can merge with a previous range */
221 			if (s == a && s + sectors >= e)
222 				/* new range covers old */
223 				ack = acknowledged;
224 			else
225 				ack = ack && acknowledged;
226 
227 			if (e < s + sectors)
228 				e = s + sectors;
229 			if (e - a <= BB_MAX_LEN) {
230 				p[lo] = BB_MAKE(a, e-a, ack);
231 				s = e;
232 			} else {
233 				/* does not all fit in one range,
234 				 * make p[lo] maximal
235 				 */
236 				if (BB_LEN(p[lo]) != BB_MAX_LEN)
237 					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
238 				s = a + BB_MAX_LEN;
239 			}
240 			sectors = e - s;
241 		}
242 	}
243 	if (sectors && hi < bb->count) {
244 		/* 'hi' points to the first range that starts after 's'.
245 		 * Maybe we can merge with the start of that range
246 		 */
247 		sector_t a = BB_OFFSET(p[hi]);
248 		sector_t e = a + BB_LEN(p[hi]);
249 		int ack = BB_ACK(p[hi]);
250 
251 		if (a <= s + sectors) {
252 			/* merging is possible */
253 			if (e <= s + sectors) {
254 				/* full overlap */
255 				e = s + sectors;
256 				ack = acknowledged;
257 			} else
258 				ack = ack && acknowledged;
259 
260 			a = s;
261 			if (e - a <= BB_MAX_LEN) {
262 				p[hi] = BB_MAKE(a, e-a, ack);
263 				s = e;
264 			} else {
265 				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
266 				s = a + BB_MAX_LEN;
267 			}
268 			sectors = e - s;
269 			lo = hi;
270 			hi++;
271 		}
272 	}
273 	if (sectors == 0 && hi < bb->count) {
274 		/* we might be able to combine lo and hi */
275 		/* Note: 's' is at the end of 'lo' */
276 		sector_t a = BB_OFFSET(p[hi]);
277 		int lolen = BB_LEN(p[lo]);
278 		int hilen = BB_LEN(p[hi]);
279 		int newlen = lolen + hilen - (s - a);
280 
281 		if (s >= a && newlen < BB_MAX_LEN) {
282 			/* yes, we can combine them */
283 			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
284 
285 			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
286 			memmove(p + hi, p + hi + 1,
287 				(bb->count - hi - 1) * 8);
288 			bb->count--;
289 		}
290 	}
291 	while (sectors) {
292 		/* didn't merge (it all).
293 		 * Need to add a range just before 'hi'
294 		 */
295 		if (bb->count >= MAX_BADBLOCKS) {
296 			/* No room for more */
297 			rv = 1;
298 			break;
299 		} else {
300 			int this_sectors = sectors;
301 
302 			memmove(p + hi + 1, p + hi,
303 				(bb->count - hi) * 8);
304 			bb->count++;
305 
306 			if (this_sectors > BB_MAX_LEN)
307 				this_sectors = BB_MAX_LEN;
308 			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
309 			sectors -= this_sectors;
310 			s += this_sectors;
311 		}
312 	}
313 
314 	bb->changed = 1;
315 	if (!acknowledged)
316 		bb->unacked_exist = 1;
317 	else
318 		badblocks_update_acked(bb);
319 	write_sequnlock_irqrestore(&bb->lock, flags);
320 
321 	return rv;
322 }
323 EXPORT_SYMBOL_GPL(badblocks_set);
324 
325 /**
326  * badblocks_clear() - Remove a range of bad blocks to the table.
327  * @bb:		the badblocks structure that holds all badblock information
328  * @s:		first sector to mark as bad
329  * @sectors:	number of sectors to mark as bad
330  *
331  * This may involve extending the table if we spilt a region,
332  * but it must not fail.  So if the table becomes full, we just
333  * drop the remove request.
334  *
335  * Return:
336  *  0: success
337  *  1: failed to clear badblocks
338  */
339 int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
340 {
341 	u64 *p;
342 	int lo, hi;
343 	sector_t target = s + sectors;
344 	int rv = 0;
345 
346 	if (bb->shift > 0) {
347 		/* When clearing we round the start up and the end down.
348 		 * This should not matter as the shift should align with
349 		 * the block size and no rounding should ever be needed.
350 		 * However it is better the think a block is bad when it
351 		 * isn't than to think a block is not bad when it is.
352 		 */
353 		s += (1<<bb->shift) - 1;
354 		s >>= bb->shift;
355 		target >>= bb->shift;
356 		sectors = target - s;
357 	}
358 
359 	write_seqlock_irq(&bb->lock);
360 
361 	p = bb->page;
362 	lo = 0;
363 	hi = bb->count;
364 	/* Find the last range that starts before 'target' */
365 	while (hi - lo > 1) {
366 		int mid = (lo + hi) / 2;
367 		sector_t a = BB_OFFSET(p[mid]);
368 
369 		if (a < target)
370 			lo = mid;
371 		else
372 			hi = mid;
373 	}
374 	if (hi > lo) {
375 		/* p[lo] is the last range that could overlap the
376 		 * current range.  Earlier ranges could also overlap,
377 		 * but only this one can overlap the end of the range.
378 		 */
379 		if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
380 		    (BB_OFFSET(p[lo]) < target)) {
381 			/* Partial overlap, leave the tail of this range */
382 			int ack = BB_ACK(p[lo]);
383 			sector_t a = BB_OFFSET(p[lo]);
384 			sector_t end = a + BB_LEN(p[lo]);
385 
386 			if (a < s) {
387 				/* we need to split this range */
388 				if (bb->count >= MAX_BADBLOCKS) {
389 					rv = -ENOSPC;
390 					goto out;
391 				}
392 				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
393 				bb->count++;
394 				p[lo] = BB_MAKE(a, s-a, ack);
395 				lo++;
396 			}
397 			p[lo] = BB_MAKE(target, end - target, ack);
398 			/* there is no longer an overlap */
399 			hi = lo;
400 			lo--;
401 		}
402 		while (lo >= 0 &&
403 		       (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
404 		       (BB_OFFSET(p[lo]) < target)) {
405 			/* This range does overlap */
406 			if (BB_OFFSET(p[lo]) < s) {
407 				/* Keep the early parts of this range. */
408 				int ack = BB_ACK(p[lo]);
409 				sector_t start = BB_OFFSET(p[lo]);
410 
411 				p[lo] = BB_MAKE(start, s - start, ack);
412 				/* now low doesn't overlap, so.. */
413 				break;
414 			}
415 			lo--;
416 		}
417 		/* 'lo' is strictly before, 'hi' is strictly after,
418 		 * anything between needs to be discarded
419 		 */
420 		if (hi - lo > 1) {
421 			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
422 			bb->count -= (hi - lo - 1);
423 		}
424 	}
425 
426 	badblocks_update_acked(bb);
427 	bb->changed = 1;
428 out:
429 	write_sequnlock_irq(&bb->lock);
430 	return rv;
431 }
432 EXPORT_SYMBOL_GPL(badblocks_clear);
433 
434 /**
435  * ack_all_badblocks() - Acknowledge all bad blocks in a list.
436  * @bb:		the badblocks structure that holds all badblock information
437  *
438  * This only succeeds if ->changed is clear.  It is used by
439  * in-kernel metadata updates
440  */
441 void ack_all_badblocks(struct badblocks *bb)
442 {
443 	if (bb->page == NULL || bb->changed)
444 		/* no point even trying */
445 		return;
446 	write_seqlock_irq(&bb->lock);
447 
448 	if (bb->changed == 0 && bb->unacked_exist) {
449 		u64 *p = bb->page;
450 		int i;
451 
452 		for (i = 0; i < bb->count ; i++) {
453 			if (!BB_ACK(p[i])) {
454 				sector_t start = BB_OFFSET(p[i]);
455 				int len = BB_LEN(p[i]);
456 
457 				p[i] = BB_MAKE(start, len, 1);
458 			}
459 		}
460 		bb->unacked_exist = 0;
461 	}
462 	write_sequnlock_irq(&bb->lock);
463 }
464 EXPORT_SYMBOL_GPL(ack_all_badblocks);
465 
466 /**
467  * badblocks_show() - sysfs access to bad-blocks list
468  * @bb:		the badblocks structure that holds all badblock information
469  * @page:	buffer received from sysfs
470  * @unack:	weather to show unacknowledged badblocks
471  *
472  * Return:
473  *  Length of returned data
474  */
475 ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
476 {
477 	size_t len;
478 	int i;
479 	u64 *p = bb->page;
480 	unsigned seq;
481 
482 	if (bb->shift < 0)
483 		return 0;
484 
485 retry:
486 	seq = read_seqbegin(&bb->lock);
487 
488 	len = 0;
489 	i = 0;
490 
491 	while (len < PAGE_SIZE && i < bb->count) {
492 		sector_t s = BB_OFFSET(p[i]);
493 		unsigned int length = BB_LEN(p[i]);
494 		int ack = BB_ACK(p[i]);
495 
496 		i++;
497 
498 		if (unack && ack)
499 			continue;
500 
501 		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
502 				(unsigned long long)s << bb->shift,
503 				length << bb->shift);
504 	}
505 	if (unack && len == 0)
506 		bb->unacked_exist = 0;
507 
508 	if (read_seqretry(&bb->lock, seq))
509 		goto retry;
510 
511 	return len;
512 }
513 EXPORT_SYMBOL_GPL(badblocks_show);
514 
515 /**
516  * badblocks_store() - sysfs access to bad-blocks list
517  * @bb:		the badblocks structure that holds all badblock information
518  * @page:	buffer received from sysfs
519  * @len:	length of data received from sysfs
520  * @unack:	weather to show unacknowledged badblocks
521  *
522  * Return:
523  *  Length of the buffer processed or -ve error.
524  */
525 ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
526 			int unack)
527 {
528 	unsigned long long sector;
529 	int length;
530 	char newline;
531 
532 	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
533 	case 3:
534 		if (newline != '\n')
535 			return -EINVAL;
536 	case 2:
537 		if (length <= 0)
538 			return -EINVAL;
539 		break;
540 	default:
541 		return -EINVAL;
542 	}
543 
544 	if (badblocks_set(bb, sector, length, !unack))
545 		return -ENOSPC;
546 	else
547 		return len;
548 }
549 EXPORT_SYMBOL_GPL(badblocks_store);
550 
551 static int __badblocks_init(struct device *dev, struct badblocks *bb,
552 		int enable)
553 {
554 	bb->dev = dev;
555 	bb->count = 0;
556 	if (enable)
557 		bb->shift = 0;
558 	else
559 		bb->shift = -1;
560 	if (dev)
561 		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
562 	else
563 		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
564 	if (!bb->page) {
565 		bb->shift = -1;
566 		return -ENOMEM;
567 	}
568 	seqlock_init(&bb->lock);
569 
570 	return 0;
571 }
572 
573 /**
574  * badblocks_init() - initialize the badblocks structure
575  * @bb:		the badblocks structure that holds all badblock information
576  * @enable:	weather to enable badblocks accounting
577  *
578  * Return:
579  *  0: success
580  *  -ve errno: on error
581  */
582 int badblocks_init(struct badblocks *bb, int enable)
583 {
584 	return __badblocks_init(NULL, bb, enable);
585 }
586 EXPORT_SYMBOL_GPL(badblocks_init);
587 
588 int devm_init_badblocks(struct device *dev, struct badblocks *bb)
589 {
590 	if (!bb)
591 		return -EINVAL;
592 	return __badblocks_init(dev, bb, 1);
593 }
594 EXPORT_SYMBOL_GPL(devm_init_badblocks);
595 
596 /**
597  * badblocks_exit() - free the badblocks structure
598  * @bb:		the badblocks structure that holds all badblock information
599  */
600 void badblocks_exit(struct badblocks *bb)
601 {
602 	if (!bb)
603 		return;
604 	if (bb->dev)
605 		devm_kfree(bb->dev, bb->page);
606 	else
607 		kfree(bb->page);
608 	bb->page = NULL;
609 }
610 EXPORT_SYMBOL_GPL(badblocks_exit);
611