xref: /linux/block/badblocks.c (revision c48a7c44a1d02516309015b6134c9bb982e17008)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Bad block management
4  *
5  * - Heavily based on MD badblocks code from Neil Brown
6  *
7  * Copyright (c) 2015, Intel Corporation.
8  */
9 
10 #include <linux/badblocks.h>
11 #include <linux/seqlock.h>
12 #include <linux/device.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/stddef.h>
16 #include <linux/types.h>
17 #include <linux/slab.h>
18 
19 /*
20  * The purpose of badblocks set/clear is to manage bad blocks ranges which are
21  * identified by LBA addresses.
22  *
23  * When the caller of badblocks_set() wants to set a range of bad blocks, the
24  * setting range can be acked or unacked. And the setting range may merge,
25  * overwrite, skip the overlapped already set range, depends on who they are
26  * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
27  * more complicated when the setting range covers multiple already set bad block
28  * ranges, with restrictions of maximum length of each bad range and the bad
29  * table space limitation.
30  *
31  * It is difficult and unnecessary to take care of all the possible situations,
32  * for setting a large range of bad blocks, we can handle it by dividing the
33  * large range into smaller ones when encounter overlap, max range length or
34  * bad table full conditions. Every time only a smaller piece of the bad range
35  * is handled with a limited number of conditions how it is interacted with
36  * possible overlapped or adjacent already set bad block ranges. Then the hard
37  * complicated problem can be much simpler to handle in proper way.
38  *
39  * When setting a range of bad blocks to the bad table, the simplified situations
40  * to be considered are, (The already set bad blocks ranges are naming with
41  *  prefix E, and the setting bad blocks range is naming with prefix S)
42  *
43  * 1) A setting range is not overlapped or adjacent to any other already set bad
44  *    block range.
45  *                         +--------+
46  *                         |    S   |
47  *                         +--------+
48  *        +-------------+               +-------------+
49  *        |      E1     |               |      E2     |
50  *        +-------------+               +-------------+
51  *    For this situation if the bad blocks table is not full, just allocate a
52  *    free slot from the bad blocks table to mark the setting range S. The
53  *    result is,
54  *        +-------------+  +--------+   +-------------+
55  *        |      E1     |  |    S   |   |      E2     |
56  *        +-------------+  +--------+   +-------------+
57  * 2) A setting range starts exactly at a start LBA of an already set bad blocks
58  *    range.
59  * 2.1) The setting range size < already set range size
60  *        +--------+
61  *        |    S   |
62  *        +--------+
63  *        +-------------+
64  *        |      E      |
65  *        +-------------+
66  * 2.1.1) If S and E are both acked or unacked range, the setting range S can
67  *    be merged into existing bad range E. The result is,
68  *        +-------------+
69  *        |      S      |
70  *        +-------------+
71  * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
72  *    the result is,
73  *        +-------------+
74  *        |      E      |
75  *        +-------------+
76  * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
77  *    An extra slot from the bad blocks table will be allocated for S, and head
78  *    of E will move to end of the inserted range S. The result is,
79  *        +--------+----+
80  *        |    S   | E  |
81  *        +--------+----+
82  * 2.2) The setting range size == already set range size
83  * 2.2.1) If S and E are both acked or unacked range, the setting range S can
84  *    be merged into existing bad range E. The result is,
85  *        +-------------+
86  *        |      S      |
87  *        +-------------+
88  * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
89  *    the result is,
90  *        +-------------+
91  *        |      E      |
92  *        +-------------+
93  * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
94       bad blocks range E. The result is,
95  *        +-------------+
96  *        |      S      |
97  *        +-------------+
98  * 2.3) The setting range size > already set range size
99  *        +-------------------+
100  *        |          S        |
101  *        +-------------------+
102  *        +-------------+
103  *        |      E      |
104  *        +-------------+
105  *    For such situation, the setting range S can be treated as two parts, the
106  *    first part (S1) is as same size as the already set range E, the second
107  *    part (S2) is the rest of setting range.
108  *        +-------------+-----+        +-------------+       +-----+
109  *        |    S1       | S2  |        |     S1      |       | S2  |
110  *        +-------------+-----+  ===>  +-------------+       +-----+
111  *        +-------------+              +-------------+
112  *        |      E      |              |      E      |
113  *        +-------------+              +-------------+
114  *    Now we only focus on how to handle the setting range S1 and already set
115  *    range E, which are already explained in 2.2), for the rest S2 it will be
116  *    handled later in next loop.
117  * 3) A setting range starts before the start LBA of an already set bad blocks
118  *    range.
119  *        +-------------+
120  *        |      S      |
121  *        +-------------+
122  *             +-------------+
123  *             |      E      |
124  *             +-------------+
125  *    For this situation, the setting range S can be divided into two parts, the
126  *    first (S1) ends at the start LBA of already set range E, the second part
127  *    (S2) starts exactly at a start LBA of the already set range E.
128  *        +----+---------+             +----+      +---------+
129  *        | S1 |    S2   |             | S1 |      |    S2   |
130  *        +----+---------+      ===>   +----+      +---------+
131  *             +-------------+                     +-------------+
132  *             |      E      |                     |      E      |
133  *             +-------------+                     +-------------+
134  *    Now only the first part S1 should be handled in this loop, which is in
135  *    similar condition as 1). The rest part S2 has exact same start LBA address
136  *    of the already set range E, they will be handled in next loop in one of
137  *    situations in 2).
138  * 4) A setting range starts after the start LBA of an already set bad blocks
139  *    range.
140  * 4.1) If the setting range S exactly matches the tail part of already set bad
141  *    blocks range E, like the following chart shows,
142  *            +---------+
143  *            |   S     |
144  *            +---------+
145  *        +-------------+
146  *        |      E      |
147  *        +-------------+
148  * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
149  *    they will be merged into one, the result is,
150  *        +-------------+
151  *        |      S      |
152  *        +-------------+
153  * 4.1.2) If range E is acked and the setting range S is unacked, the setting
154  *    request of S will be rejected, the result is,
155  *        +-------------+
156  *        |      E      |
157  *        +-------------+
158  * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
159  *    overwrite the overlapped range of E, the result is,
160  *        +---+---------+
161  *        | E |    S    |
162  *        +---+---------+
163  * 4.2) If the setting range S stays in middle of an already set range E, like
164  *    the following chart shows,
165  *             +----+
166  *             | S  |
167  *             +----+
168  *        +--------------+
169  *        |       E      |
170  *        +--------------+
171  * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
172  *    they will be merged into one, the result is,
173  *        +--------------+
174  *        |       S      |
175  *        +--------------+
176  * 4.2.2) If range E is acked and the setting range S is unacked, the setting
177  *    request of S will be rejected, the result is also,
178  *        +--------------+
179  *        |       E      |
180  *        +--------------+
181  * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
182  *    inserted into middle of E and split previous range E into two parts (E1
183  *    and E2), the result is,
184  *        +----+----+----+
185  *        | E1 |  S | E2 |
186  *        +----+----+----+
187  * 4.3) If the setting bad blocks range S is overlapped with an already set bad
188  *    blocks range E. The range S starts after the start LBA of range E, and
189  *    ends after the end LBA of range E, as the following chart shows,
190  *            +-------------------+
191  *            |          S        |
192  *            +-------------------+
193  *        +-------------+
194  *        |      E      |
195  *        +-------------+
196  *    For this situation the range S can be divided into two parts, the first
197  *    part (S1) ends at end range E, and the second part (S2) has rest range of
198  *    origin S.
199  *            +---------+---------+            +---------+      +---------+
200  *            |    S1   |    S2   |            |    S1   |      |    S2   |
201  *            +---------+---------+  ===>      +---------+      +---------+
202  *        +-------------+                  +-------------+
203  *        |      E      |                  |      E      |
204  *        +-------------+                  +-------------+
205  *     Now in this loop the setting range S1 and already set range E can be
206  *     handled as the situations 4.1), the rest range S2 will be handled in next
207  *     loop and ignored in this loop.
208  * 5) A setting bad blocks range S is adjacent to one or more already set bad
209  *    blocks range(s), and they are all acked or unacked range.
210  * 5.1) Front merge: If the already set bad blocks range E is before setting
211  *    range S and they are adjacent,
212  *                +------+
213  *                |  S   |
214  *                +------+
215  *        +-------+
216  *        |   E   |
217  *        +-------+
218  * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
219  *    values are same, the setting range S can front merges into range E. The
220  *    result is,
221  *        +--------------+
222  *        |       S      |
223  *        +--------------+
224  * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
225  *    range S right after already set range E into the bad blocks table. The
226  *    result is,
227  *        +--------+------+
228  *        |   E    |   S  |
229  *        +--------+------+
230  * 6) Special cases which above conditions cannot handle
231  * 6.1) Multiple already set ranges may merge into less ones in a full bad table
232  *        +-------------------------------------------------------+
233  *        |                           S                           |
234  *        +-------------------------------------------------------+
235  *        |<----- BB_MAX_LEN ----->|
236  *                                 +-----+     +-----+   +-----+
237  *                                 | E1  |     | E2  |   | E3  |
238  *                                 +-----+     +-----+   +-----+
239  *     In the above example, when the bad blocks table is full, inserting the
240  *     first part of setting range S will fail because no more available slot
241  *     can be allocated from bad blocks table. In this situation a proper
242  *     setting method should be go though all the setting bad blocks range and
243  *     look for chance to merge already set ranges into less ones. When there
244  *     is available slot from bad blocks table, re-try again to handle more
245  *     setting bad blocks ranges as many as possible.
246  *        +------------------------+
247  *        |          S3            |
248  *        +------------------------+
249  *        |<----- BB_MAX_LEN ----->|
250  *                                 +-----+-----+-----+---+-----+--+
251  *                                 |       S1        |     S2     |
252  *                                 +-----+-----+-----+---+-----+--+
253  *     The above chart shows although the first part (S3) cannot be inserted due
254  *     to no-space in bad blocks table, but the following E1, E2 and E3 ranges
255  *     can be merged with rest part of S into less range S1 and S2. Now there is
256  *     1 free slot in bad blocks table.
257  *        +------------------------+-----+-----+-----+---+-----+--+
258  *        |           S3           |       S1        |     S2     |
259  *        +------------------------+-----+-----+-----+---+-----+--+
260  *     Since the bad blocks table is not full anymore, re-try again for the
261  *     origin setting range S. Now the setting range S3 can be inserted into the
262  *     bad blocks table with previous freed slot from multiple ranges merge.
263  * 6.2) Front merge after overwrite
264  *    In the following example, in bad blocks table, E1 is an acked bad blocks
265  *    range and E2 is an unacked bad blocks range, therefore they are not able
266  *    to merge into a larger range. The setting bad blocks range S is acked,
267  *    therefore part of E2 can be overwritten by S.
268  *                      +--------+
269  *                      |    S   |                             acknowledged
270  *                      +--------+                         S:       1
271  *              +-------+-------------+                   E1:       1
272  *              |   E1  |    E2       |                   E2:       0
273  *              +-------+-------------+
274  *     With previous simplified routines, after overwriting part of E2 with S,
275  *     the bad blocks table should be (E3 is remaining part of E2 which is not
276  *     overwritten by S),
277  *                                                             acknowledged
278  *              +-------+--------+----+                    S:       1
279  *              |   E1  |    S   | E3 |                   E1:       1
280  *              +-------+--------+----+                   E3:       0
281  *     The above result is correct but not perfect. Range E1 and S in the bad
282  *     blocks table are all acked, merging them into a larger one range may
283  *     occupy less bad blocks table space and make badblocks_check() faster.
284  *     Therefore in such situation, after overwriting range S, the previous range
285  *     E1 should be checked for possible front combination. Then the ideal
286  *     result can be,
287  *              +----------------+----+                        acknowledged
288  *              |       E1       | E3 |                   E1:       1
289  *              +----------------+----+                   E3:       0
290  * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
291  *    range S and they are adjacent. Normally we don't need to care about this
292  *    because front merge handles this while going though range S from head to
293  *    tail, except for the tail part of range S. When the setting range S are
294  *    fully handled, all the above simplified routine doesn't check whether the
295  *    tail LBA of range S is adjacent to the next already set range and not
296  *    merge them even it is possible.
297  *        +------+
298  *        |  S   |
299  *        +------+
300  *               +-------+
301  *               |   E   |
302  *               +-------+
303  *    For the above special situation, when the setting range S are all handled
304  *    and the loop ends, an extra check is necessary for whether next already
305  *    set range E is right after S and mergeable.
306  * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
307  *    values are same, the setting range S can behind merges into range E. The
308  *    result is,
309  *        +--------------+
310  *        |       S      |
311  *        +--------------+
312  * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
313  *     S in front of the already set range E in the bad blocks table. The result
314  *     is,
315  *        +------+-------+
316  *        |  S   |   E   |
317  *        +------+-------+
318  *
319  * All the above 5 simplified situations and 3 special cases may cover 99%+ of
320  * the bad block range setting conditions. Maybe there is some rare corner case
321  * is not considered and optimized, it won't hurt if badblocks_set() fails due
322  * to no space, or some ranges are not merged to save bad blocks table space.
323  *
324  * Inside badblocks_set() each loop starts by jumping to re_insert label, every
325  * time for the new loop prev_badblocks() is called to find an already set range
326  * which starts before or at current setting range. Since the setting bad blocks
327  * range is handled from head to tail, most of the cases it is unnecessary to do
328  * the binary search inside prev_badblocks(), it is possible to provide a hint
329  * to prev_badblocks() for a fast path, then the expensive binary search can be
330  * avoided. In my test with the hint to prev_badblocks(), except for the first
331  * loop, all rested calls to prev_badblocks() can go into the fast path and
332  * return correct bad blocks table index immediately.
333  *
334  *
335  * Clearing a bad blocks range from the bad block table has similar idea as
336  * setting does, but much more simpler. The only thing needs to be noticed is
337  * when the clearing range hits middle of a bad block range, the existing bad
338  * block range will split into two, and one more item should be added into the
339  * bad block table. The simplified situations to be considered are, (The already
340  * set bad blocks ranges in bad block table are naming with prefix E, and the
341  * clearing bad blocks range is naming with prefix C)
342  *
343  * 1) A clearing range is not overlapped to any already set ranges in bad block
344  *    table.
345  *    +-----+         |          +-----+         |          +-----+
346  *    |  C  |         |          |  C  |         |          |  C  |
347  *    +-----+         or         +-----+         or         +-----+
348  *            +---+   |   +----+         +----+  |  +---+
349  *            | E |   |   | E1 |         | E2 |  |  | E |
350  *            +---+   |   +----+         +----+  |  +---+
351  *    For the above situations, no bad block to be cleared and no failure
352  *    happens, simply returns 0.
353  * 2) The clearing range hits middle of an already setting bad blocks range in
354  *    the bad block table.
355  *            +---+
356  *            | C |
357  *            +---+
358  *     +-----------------+
359  *     |         E       |
360  *     +-----------------+
361  *    In this situation if the bad block table is not full, the range E will be
362  *    split into two ranges E1 and E2. The result is,
363  *     +------+   +------+
364  *     |  E1  |   |  E2  |
365  *     +------+   +------+
366  * 3) The clearing range starts exactly at same LBA as an already set bad block range
367  *    from the bad block table.
368  * 3.1) Partially covered at head part
369  *         +------------+
370  *         |     C      |
371  *         +------------+
372  *         +-----------------+
373  *         |         E       |
374  *         +-----------------+
375  *    For this situation, the overlapped already set range will update the
376  *    start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
377  *    item deleted from bad block table. The result is,
378  *                      +----+
379  *                      | E1 |
380  *                      +----+
381  * 3.2) Exact fully covered
382  *         +-----------------+
383  *         |         C       |
384  *         +-----------------+
385  *         +-----------------+
386  *         |         E       |
387  *         +-----------------+
388  *    For this situation the whole bad blocks range E will be cleared and its
389  *    corresponded item is deleted from the bad block table.
390  * 4) The clearing range exactly ends at same LBA as an already set bad block
391  *    range.
392  *                   +-------+
393  *                   |   C   |
394  *                   +-------+
395  *         +-----------------+
396  *         |         E       |
397  *         +-----------------+
398  *    For the above situation, the already set range E is updated to shrink its
399  *    end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
400  *    The result is,
401  *         +---------+
402  *         |    E    |
403  *         +---------+
404  * 5) The clearing range is partially overlapped with an already set bad block
405  *    range from the bad block table.
406  * 5.1) The already set bad block range is front overlapped with the clearing
407  *    range.
408  *         +----------+
409  *         |     C    |
410  *         +----------+
411  *              +------------+
412  *              |      E     |
413  *              +------------+
414  *   For such situation, the clearing range C can be treated as two parts. The
415  *   first part ends at the start LBA of range E, and the second part starts at
416  *   same LBA of range E.
417  *         +----+-----+               +----+   +-----+
418  *         | C1 | C2  |               | C1 |   | C2  |
419  *         +----+-----+         ===>  +----+   +-----+
420  *              +------------+                 +------------+
421  *              |      E     |                 |      E     |
422  *              +------------+                 +------------+
423  *   Now the first part C1 can be handled as condition 1), and the second part C2 can be
424  *   handled as condition 3.1) in next loop.
425  * 5.2) The already set bad block range is behind overlaopped with the clearing
426  *   range.
427  *                 +----------+
428  *                 |     C    |
429  *                 +----------+
430  *         +------------+
431  *         |      E     |
432  *         +------------+
433  *   For such situation, the clearing range C can be treated as two parts. The
434  *   first part C1 ends at same end LBA of range E, and the second part starts
435  *   at end LBA of range E.
436  *                 +----+-----+                 +----+    +-----+
437  *                 | C1 | C2  |                 | C1 |    | C2  |
438  *                 +----+-----+  ===>           +----+    +-----+
439  *         +------------+               +------------+
440  *         |      E     |               |      E     |
441  *         +------------+               +------------+
442  *   Now the first part clearing range C1 can be handled as condition 4), and
443  *   the second part clearing range C2 can be handled as condition 1) in next
444  *   loop.
445  *
446  *   All bad blocks range clearing can be simplified into the above 5 situations
447  *   by only handling the head part of the clearing range in each run of the
448  *   while-loop. The idea is similar to bad blocks range setting but much
449  *   simpler.
450  */
451 
452 /*
453  * Find the range starts at-or-before 's' from bad table. The search
454  * starts from index 'hint' and stops at index 'hint_end' from the bad
455  * table.
456  */
457 static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
458 {
459 	int hint_end = hint + 2;
460 	u64 *p = bb->page;
461 	int ret = -1;
462 
463 	while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
464 	       (BB_OFFSET(p[hint]) <= s)) {
465 		if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
466 			ret = hint;
467 			break;
468 		}
469 		hint++;
470 	}
471 
472 	return ret;
473 }
474 
475 /*
476  * Find the range starts at-or-before bad->start. If 'hint' is provided
477  * (hint >= 0) then search in the bad table from hint firstly. It is
478  * very probably the wanted bad range can be found from the hint index,
479  * then the unnecessary while-loop iteration can be avoided.
480  */
481 static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
482 			  int hint)
483 {
484 	sector_t s = bad->start;
485 	int ret = -1;
486 	int lo, hi;
487 	u64 *p;
488 
489 	if (!bb->count)
490 		goto out;
491 
492 	if (hint >= 0) {
493 		ret = prev_by_hint(bb, s, hint);
494 		if (ret >= 0)
495 			goto out;
496 	}
497 
498 	lo = 0;
499 	hi = bb->count;
500 	p = bb->page;
501 
502 	/* The following bisect search might be unnecessary */
503 	if (BB_OFFSET(p[lo]) > s)
504 		return -1;
505 	if (BB_OFFSET(p[hi - 1]) <= s)
506 		return hi - 1;
507 
508 	/* Do bisect search in bad table */
509 	while (hi - lo > 1) {
510 		int mid = (lo + hi)/2;
511 		sector_t a = BB_OFFSET(p[mid]);
512 
513 		if (a == s) {
514 			ret = mid;
515 			goto out;
516 		}
517 
518 		if (a < s)
519 			lo = mid;
520 		else
521 			hi = mid;
522 	}
523 
524 	if (BB_OFFSET(p[lo]) <= s)
525 		ret = lo;
526 out:
527 	return ret;
528 }
529 
530 /*
531  * Return 'true' if the range indicated by 'bad' can be backward merged
532  * with the bad range (from the bad table) index by 'behind'.
533  */
534 static bool can_merge_behind(struct badblocks *bb,
535 			     struct badblocks_context *bad, int behind)
536 {
537 	sector_t sectors = bad->len;
538 	sector_t s = bad->start;
539 	u64 *p = bb->page;
540 
541 	if ((s < BB_OFFSET(p[behind])) &&
542 	    ((s + sectors) >= BB_OFFSET(p[behind])) &&
543 	    ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
544 	    BB_ACK(p[behind]) == bad->ack)
545 		return true;
546 	return false;
547 }
548 
549 /*
550  * Do backward merge for range indicated by 'bad' and the bad range
551  * (from the bad table) indexed by 'behind'. The return value is merged
552  * sectors from bad->len.
553  */
554 static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
555 			int behind)
556 {
557 	sector_t sectors = bad->len;
558 	sector_t s = bad->start;
559 	u64 *p = bb->page;
560 	int merged = 0;
561 
562 	WARN_ON(s >= BB_OFFSET(p[behind]));
563 	WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
564 
565 	if (s < BB_OFFSET(p[behind])) {
566 		merged = BB_OFFSET(p[behind]) - s;
567 		p[behind] =  BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
568 
569 		WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
570 	}
571 
572 	return merged;
573 }
574 
575 /*
576  * Return 'true' if the range indicated by 'bad' can be forward
577  * merged with the bad range (from the bad table) indexed by 'prev'.
578  */
579 static bool can_merge_front(struct badblocks *bb, int prev,
580 			    struct badblocks_context *bad)
581 {
582 	sector_t s = bad->start;
583 	u64 *p = bb->page;
584 
585 	if (BB_ACK(p[prev]) == bad->ack &&
586 	    (s < BB_END(p[prev]) ||
587 	     (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
588 		return true;
589 	return false;
590 }
591 
592 /*
593  * Do forward merge for range indicated by 'bad' and the bad range
594  * (from bad table) indexed by 'prev'. The return value is sectors
595  * merged from bad->len.
596  */
597 static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
598 {
599 	sector_t sectors = bad->len;
600 	sector_t s = bad->start;
601 	u64 *p = bb->page;
602 	int merged = 0;
603 
604 	WARN_ON(s > BB_END(p[prev]));
605 
606 	if (s < BB_END(p[prev])) {
607 		merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
608 	} else {
609 		merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
610 		if ((prev + 1) < bb->count &&
611 		    merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
612 			merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
613 		}
614 
615 		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
616 				  BB_LEN(p[prev]) + merged, bad->ack);
617 	}
618 
619 	return merged;
620 }
621 
622 /*
623  * 'Combine' is a special case which can_merge_front() is not able to
624  * handle: If a bad range (indexed by 'prev' from bad table) exactly
625  * starts as bad->start, and the bad range ahead of 'prev' (indexed by
626  * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
627  * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
628  * these two bad range (from bad table) can be combined.
629  *
630  * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
631  * table can be combined.
632  */
633 static bool can_combine_front(struct badblocks *bb, int prev,
634 			      struct badblocks_context *bad)
635 {
636 	u64 *p = bb->page;
637 
638 	if ((prev > 0) &&
639 	    (BB_OFFSET(p[prev]) == bad->start) &&
640 	    (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
641 	    (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
642 	    (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
643 		return true;
644 	return false;
645 }
646 
647 /*
648  * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
649  * table) into one larger bad range, and the new range is indexed by
650  * 'prev - 1'.
651  * The caller of front_combine() will decrease bb->count, therefore
652  * it is unnecessary to clear p[perv] after front merge.
653  */
654 static void front_combine(struct badblocks *bb, int prev)
655 {
656 	u64 *p = bb->page;
657 
658 	p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
659 			      BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
660 			      BB_ACK(p[prev]));
661 	if ((prev + 1) < bb->count)
662 		memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
663 }
664 
665 /*
666  * Return 'true' if the range indicated by 'bad' is exactly forward
667  * overlapped with the bad range (from bad table) indexed by 'front'.
668  * Exactly forward overlap means the bad range (from bad table) indexed
669  * by 'prev' does not cover the whole range indicated by 'bad'.
670  */
671 static bool overlap_front(struct badblocks *bb, int front,
672 			  struct badblocks_context *bad)
673 {
674 	u64 *p = bb->page;
675 
676 	if (bad->start >= BB_OFFSET(p[front]) &&
677 	    bad->start < BB_END(p[front]))
678 		return true;
679 	return false;
680 }
681 
682 /*
683  * Return 'true' if the range indicated by 'bad' is exactly backward
684  * overlapped with the bad range (from bad table) indexed by 'behind'.
685  */
686 static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
687 			   int behind)
688 {
689 	u64 *p = bb->page;
690 
691 	if (bad->start < BB_OFFSET(p[behind]) &&
692 	    (bad->start + bad->len) > BB_OFFSET(p[behind]))
693 		return true;
694 	return false;
695 }
696 
697 /*
698  * Return 'true' if the range indicated by 'bad' can overwrite the bad
699  * range (from bad table) indexed by 'prev'.
700  *
701  * The range indicated by 'bad' can overwrite the bad range indexed by
702  * 'prev' when,
703  * 1) The whole range indicated by 'bad' can cover partial or whole bad
704  *    range (from bad table) indexed by 'prev'.
705  * 2) The ack value of 'bad' is larger or equal to the ack value of bad
706  *    range 'prev'.
707  *
708  * If the overwriting doesn't cover the whole bad range (from bad table)
709  * indexed by 'prev', new range might be split from existing bad range,
710  * 1) The overwrite covers head or tail part of existing bad range, 1
711  *    extra bad range will be split and added into the bad table.
712  * 2) The overwrite covers middle of existing bad range, 2 extra bad
713  *    ranges will be split (ahead and after the overwritten range) and
714  *    added into the bad table.
715  * The number of extra split ranges of the overwriting is stored in
716  * 'extra' and returned for the caller.
717  */
718 static bool can_front_overwrite(struct badblocks *bb, int prev,
719 				struct badblocks_context *bad, int *extra)
720 {
721 	u64 *p = bb->page;
722 	int len;
723 
724 	WARN_ON(!overlap_front(bb, prev, bad));
725 
726 	if (BB_ACK(p[prev]) >= bad->ack)
727 		return false;
728 
729 	if (BB_END(p[prev]) <= (bad->start + bad->len)) {
730 		len = BB_END(p[prev]) - bad->start;
731 		if (BB_OFFSET(p[prev]) == bad->start)
732 			*extra = 0;
733 		else
734 			*extra = 1;
735 
736 		bad->len = len;
737 	} else {
738 		if (BB_OFFSET(p[prev]) == bad->start)
739 			*extra = 1;
740 		else
741 		/*
742 		 * prev range will be split into two, beside the overwritten
743 		 * one, an extra slot needed from bad table.
744 		 */
745 			*extra = 2;
746 	}
747 
748 	if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
749 		return false;
750 
751 	return true;
752 }
753 
754 /*
755  * Do the overwrite from the range indicated by 'bad' to the bad range
756  * (from bad table) indexed by 'prev'.
757  * The previously called can_front_overwrite() will provide how many
758  * extra bad range(s) might be split and added into the bad table. All
759  * the splitting cases in the bad table will be handled here.
760  */
761 static int front_overwrite(struct badblocks *bb, int prev,
762 			   struct badblocks_context *bad, int extra)
763 {
764 	u64 *p = bb->page;
765 	sector_t orig_end = BB_END(p[prev]);
766 	int orig_ack = BB_ACK(p[prev]);
767 
768 	switch (extra) {
769 	case 0:
770 		p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
771 				  bad->ack);
772 		break;
773 	case 1:
774 		if (BB_OFFSET(p[prev]) == bad->start) {
775 			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
776 					  bad->len, bad->ack);
777 			memmove(p + prev + 2, p + prev + 1,
778 				(bb->count - prev - 1) * 8);
779 			p[prev + 1] = BB_MAKE(bad->start + bad->len,
780 					      orig_end - BB_END(p[prev]),
781 					      orig_ack);
782 		} else {
783 			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
784 					  bad->start - BB_OFFSET(p[prev]),
785 					  orig_ack);
786 			/*
787 			 * prev +2 -> prev + 1 + 1, which is for,
788 			 * 1) prev + 1: the slot index of the previous one
789 			 * 2) + 1: one more slot for extra being 1.
790 			 */
791 			memmove(p + prev + 2, p + prev + 1,
792 				(bb->count - prev - 1) * 8);
793 			p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
794 		}
795 		break;
796 	case 2:
797 		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
798 				  bad->start - BB_OFFSET(p[prev]),
799 				  orig_ack);
800 		/*
801 		 * prev + 3 -> prev + 1 + 2, which is for,
802 		 * 1) prev + 1: the slot index of the previous one
803 		 * 2) + 2: two more slots for extra being 2.
804 		 */
805 		memmove(p + prev + 3, p + prev + 1,
806 			(bb->count - prev - 1) * 8);
807 		p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
808 		p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
809 				      orig_end - BB_END(p[prev + 1]),
810 				      orig_ack);
811 		break;
812 	default:
813 		break;
814 	}
815 
816 	return bad->len;
817 }
818 
819 /*
820  * Explicitly insert a range indicated by 'bad' to the bad table, where
821  * the location is indexed by 'at'.
822  */
823 static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
824 {
825 	u64 *p = bb->page;
826 	int len;
827 
828 	WARN_ON(badblocks_full(bb));
829 
830 	len = min_t(sector_t, bad->len, BB_MAX_LEN);
831 	if (at < bb->count)
832 		memmove(p + at + 1, p + at, (bb->count - at) * 8);
833 	p[at] = BB_MAKE(bad->start, len, bad->ack);
834 
835 	return len;
836 }
837 
838 static void badblocks_update_acked(struct badblocks *bb)
839 {
840 	bool unacked = false;
841 	u64 *p = bb->page;
842 	int i;
843 
844 	if (!bb->unacked_exist)
845 		return;
846 
847 	for (i = 0; i < bb->count ; i++) {
848 		if (!BB_ACK(p[i])) {
849 			unacked = true;
850 			break;
851 		}
852 	}
853 
854 	if (!unacked)
855 		bb->unacked_exist = 0;
856 }
857 
858 /* Do exact work to set bad block range into the bad block table */
859 static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
860 			  int acknowledged)
861 {
862 	int retried = 0, space_desired = 0;
863 	int orig_len, len = 0, added = 0;
864 	struct badblocks_context bad;
865 	int prev = -1, hint = -1;
866 	sector_t orig_start;
867 	unsigned long flags;
868 	int rv = 0;
869 	u64 *p;
870 
871 	if (bb->shift < 0)
872 		/* badblocks are disabled */
873 		return 1;
874 
875 	if (sectors == 0)
876 		/* Invalid sectors number */
877 		return 1;
878 
879 	if (bb->shift) {
880 		/* round the start down, and the end up */
881 		sector_t next = s + sectors;
882 
883 		rounddown(s, bb->shift);
884 		roundup(next, bb->shift);
885 		sectors = next - s;
886 	}
887 
888 	write_seqlock_irqsave(&bb->lock, flags);
889 
890 	orig_start = s;
891 	orig_len = sectors;
892 	bad.ack = acknowledged;
893 	p = bb->page;
894 
895 re_insert:
896 	bad.start = s;
897 	bad.len = sectors;
898 	len = 0;
899 
900 	if (badblocks_empty(bb)) {
901 		len = insert_at(bb, 0, &bad);
902 		bb->count++;
903 		added++;
904 		goto update_sectors;
905 	}
906 
907 	prev = prev_badblocks(bb, &bad, hint);
908 
909 	/* start before all badblocks */
910 	if (prev < 0) {
911 		if (!badblocks_full(bb)) {
912 			/* insert on the first */
913 			if (bad.len > (BB_OFFSET(p[0]) - bad.start))
914 				bad.len = BB_OFFSET(p[0]) - bad.start;
915 			len = insert_at(bb, 0, &bad);
916 			bb->count++;
917 			added++;
918 			hint = 0;
919 			goto update_sectors;
920 		}
921 
922 		/* No sapce, try to merge */
923 		if (overlap_behind(bb, &bad, 0)) {
924 			if (can_merge_behind(bb, &bad, 0)) {
925 				len = behind_merge(bb, &bad, 0);
926 				added++;
927 			} else {
928 				len = BB_OFFSET(p[0]) - s;
929 				space_desired = 1;
930 			}
931 			hint = 0;
932 			goto update_sectors;
933 		}
934 
935 		/* no table space and give up */
936 		goto out;
937 	}
938 
939 	/* in case p[prev-1] can be merged with p[prev] */
940 	if (can_combine_front(bb, prev, &bad)) {
941 		front_combine(bb, prev);
942 		bb->count--;
943 		added++;
944 		hint = prev;
945 		goto update_sectors;
946 	}
947 
948 	if (overlap_front(bb, prev, &bad)) {
949 		if (can_merge_front(bb, prev, &bad)) {
950 			len = front_merge(bb, prev, &bad);
951 			added++;
952 		} else {
953 			int extra = 0;
954 
955 			if (!can_front_overwrite(bb, prev, &bad, &extra)) {
956 				len = min_t(sector_t,
957 					    BB_END(p[prev]) - s, sectors);
958 				hint = prev;
959 				goto update_sectors;
960 			}
961 
962 			len = front_overwrite(bb, prev, &bad, extra);
963 			added++;
964 			bb->count += extra;
965 
966 			if (can_combine_front(bb, prev, &bad)) {
967 				front_combine(bb, prev);
968 				bb->count--;
969 			}
970 		}
971 		hint = prev;
972 		goto update_sectors;
973 	}
974 
975 	if (can_merge_front(bb, prev, &bad)) {
976 		len = front_merge(bb, prev, &bad);
977 		added++;
978 		hint = prev;
979 		goto update_sectors;
980 	}
981 
982 	/* if no space in table, still try to merge in the covered range */
983 	if (badblocks_full(bb)) {
984 		/* skip the cannot-merge range */
985 		if (((prev + 1) < bb->count) &&
986 		    overlap_behind(bb, &bad, prev + 1) &&
987 		    ((s + sectors) >= BB_END(p[prev + 1]))) {
988 			len = BB_END(p[prev + 1]) - s;
989 			hint = prev + 1;
990 			goto update_sectors;
991 		}
992 
993 		/* no retry any more */
994 		len = sectors;
995 		space_desired = 1;
996 		hint = -1;
997 		goto update_sectors;
998 	}
999 
1000 	/* cannot merge and there is space in bad table */
1001 	if ((prev + 1) < bb->count &&
1002 	    overlap_behind(bb, &bad, prev + 1))
1003 		bad.len = min_t(sector_t,
1004 				bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
1005 
1006 	len = insert_at(bb, prev + 1, &bad);
1007 	bb->count++;
1008 	added++;
1009 	hint = prev + 1;
1010 
1011 update_sectors:
1012 	s += len;
1013 	sectors -= len;
1014 
1015 	if (sectors > 0)
1016 		goto re_insert;
1017 
1018 	WARN_ON(sectors < 0);
1019 
1020 	/*
1021 	 * Check whether the following already set range can be
1022 	 * merged. (prev < 0) condition is not handled here,
1023 	 * because it's already complicated enough.
1024 	 */
1025 	if (prev >= 0 &&
1026 	    (prev + 1) < bb->count &&
1027 	    BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
1028 	    (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
1029 	    BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
1030 		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
1031 				  BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
1032 				  BB_ACK(p[prev]));
1033 
1034 		if ((prev + 2) < bb->count)
1035 			memmove(p + prev + 1, p + prev + 2,
1036 				(bb->count -  (prev + 2)) * 8);
1037 		bb->count--;
1038 	}
1039 
1040 	if (space_desired && !badblocks_full(bb)) {
1041 		s = orig_start;
1042 		sectors = orig_len;
1043 		space_desired = 0;
1044 		if (retried++ < 3)
1045 			goto re_insert;
1046 	}
1047 
1048 out:
1049 	if (added) {
1050 		set_changed(bb);
1051 
1052 		if (!acknowledged)
1053 			bb->unacked_exist = 1;
1054 		else
1055 			badblocks_update_acked(bb);
1056 	}
1057 
1058 	write_sequnlock_irqrestore(&bb->lock, flags);
1059 
1060 	if (!added)
1061 		rv = 1;
1062 
1063 	return rv;
1064 }
1065 
1066 /*
1067  * Clear the bad block range from bad block table which is front overlapped
1068  * with the clearing range. The return value is how many sectors from an
1069  * already set bad block range are cleared. If the whole bad block range is
1070  * covered by the clearing range and fully cleared, 'delete' is set as 1 for
1071  * the caller to reduce bb->count.
1072  */
1073 static int front_clear(struct badblocks *bb, int prev,
1074 		       struct badblocks_context *bad, int *deleted)
1075 {
1076 	sector_t sectors = bad->len;
1077 	sector_t s = bad->start;
1078 	u64 *p = bb->page;
1079 	int cleared = 0;
1080 
1081 	*deleted = 0;
1082 	if (s == BB_OFFSET(p[prev])) {
1083 		if (BB_LEN(p[prev]) > sectors) {
1084 			p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
1085 					  BB_LEN(p[prev]) - sectors,
1086 					  BB_ACK(p[prev]));
1087 			cleared = sectors;
1088 		} else {
1089 			/* BB_LEN(p[prev]) <= sectors */
1090 			cleared = BB_LEN(p[prev]);
1091 			if ((prev + 1) < bb->count)
1092 				memmove(p + prev, p + prev + 1,
1093 				       (bb->count - prev - 1) * 8);
1094 			*deleted = 1;
1095 		}
1096 	} else if (s > BB_OFFSET(p[prev])) {
1097 		if (BB_END(p[prev]) <= (s + sectors)) {
1098 			cleared = BB_END(p[prev]) - s;
1099 			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
1100 					  s - BB_OFFSET(p[prev]),
1101 					  BB_ACK(p[prev]));
1102 		} else {
1103 			/* Splitting is handled in front_splitting_clear() */
1104 			BUG();
1105 		}
1106 	}
1107 
1108 	return cleared;
1109 }
1110 
1111 /*
1112  * Handle the condition that the clearing range hits middle of an already set
1113  * bad block range from bad block table. In this condition the existing bad
1114  * block range is split into two after the middle part is cleared.
1115  */
1116 static int front_splitting_clear(struct badblocks *bb, int prev,
1117 				  struct badblocks_context *bad)
1118 {
1119 	u64 *p = bb->page;
1120 	u64 end = BB_END(p[prev]);
1121 	int ack = BB_ACK(p[prev]);
1122 	sector_t sectors = bad->len;
1123 	sector_t s = bad->start;
1124 
1125 	p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
1126 			  s - BB_OFFSET(p[prev]),
1127 			  ack);
1128 	memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
1129 	p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
1130 	return sectors;
1131 }
1132 
1133 /* Do the exact work to clear bad block range from the bad block table */
1134 static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
1135 {
1136 	struct badblocks_context bad;
1137 	int prev = -1, hint = -1;
1138 	int len = 0, cleared = 0;
1139 	int rv = 0;
1140 	u64 *p;
1141 
1142 	if (bb->shift < 0)
1143 		/* badblocks are disabled */
1144 		return 1;
1145 
1146 	if (sectors == 0)
1147 		/* Invalid sectors number */
1148 		return 1;
1149 
1150 	if (bb->shift) {
1151 		sector_t target;
1152 
1153 		/* When clearing we round the start up and the end down.
1154 		 * This should not matter as the shift should align with
1155 		 * the block size and no rounding should ever be needed.
1156 		 * However it is better the think a block is bad when it
1157 		 * isn't than to think a block is not bad when it is.
1158 		 */
1159 		target = s + sectors;
1160 		roundup(s, bb->shift);
1161 		rounddown(target, bb->shift);
1162 		sectors = target - s;
1163 	}
1164 
1165 	write_seqlock_irq(&bb->lock);
1166 
1167 	bad.ack = true;
1168 	p = bb->page;
1169 
1170 re_clear:
1171 	bad.start = s;
1172 	bad.len = sectors;
1173 
1174 	if (badblocks_empty(bb)) {
1175 		len = sectors;
1176 		cleared++;
1177 		goto update_sectors;
1178 	}
1179 
1180 
1181 	prev = prev_badblocks(bb, &bad, hint);
1182 
1183 	/* Start before all badblocks */
1184 	if (prev < 0) {
1185 		if (overlap_behind(bb, &bad, 0)) {
1186 			len = BB_OFFSET(p[0]) - s;
1187 			hint = 0;
1188 		} else {
1189 			len = sectors;
1190 		}
1191 		/*
1192 		 * Both situations are to clear non-bad range,
1193 		 * should be treated as successful
1194 		 */
1195 		cleared++;
1196 		goto update_sectors;
1197 	}
1198 
1199 	/* Start after all badblocks */
1200 	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
1201 		len = sectors;
1202 		cleared++;
1203 		goto update_sectors;
1204 	}
1205 
1206 	/* Clear will split a bad record but the table is full */
1207 	if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
1208 	    (BB_END(p[prev]) > (bad.start + sectors))) {
1209 		len = sectors;
1210 		goto update_sectors;
1211 	}
1212 
1213 	if (overlap_front(bb, prev, &bad)) {
1214 		if ((BB_OFFSET(p[prev]) < bad.start) &&
1215 		    (BB_END(p[prev]) > (bad.start + bad.len))) {
1216 			/* Splitting */
1217 			if ((bb->count + 1) < MAX_BADBLOCKS) {
1218 				len = front_splitting_clear(bb, prev, &bad);
1219 				bb->count += 1;
1220 				cleared++;
1221 			} else {
1222 				/* No space to split, give up */
1223 				len = sectors;
1224 			}
1225 		} else {
1226 			int deleted = 0;
1227 
1228 			len = front_clear(bb, prev, &bad, &deleted);
1229 			bb->count -= deleted;
1230 			cleared++;
1231 			hint = prev;
1232 		}
1233 
1234 		goto update_sectors;
1235 	}
1236 
1237 	/* Not front overlap, but behind overlap */
1238 	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
1239 		len = BB_OFFSET(p[prev + 1]) - bad.start;
1240 		hint = prev + 1;
1241 		/* Clear non-bad range should be treated as successful */
1242 		cleared++;
1243 		goto update_sectors;
1244 	}
1245 
1246 	/* Not cover any badblocks range in the table */
1247 	len = sectors;
1248 	/* Clear non-bad range should be treated as successful */
1249 	cleared++;
1250 
1251 update_sectors:
1252 	s += len;
1253 	sectors -= len;
1254 
1255 	if (sectors > 0)
1256 		goto re_clear;
1257 
1258 	WARN_ON(sectors < 0);
1259 
1260 	if (cleared) {
1261 		badblocks_update_acked(bb);
1262 		set_changed(bb);
1263 	}
1264 
1265 	write_sequnlock_irq(&bb->lock);
1266 
1267 	if (!cleared)
1268 		rv = 1;
1269 
1270 	return rv;
1271 }
1272 
1273 /* Do the exact work to check bad blocks range from the bad block table */
1274 static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
1275 			    sector_t *first_bad, int *bad_sectors)
1276 {
1277 	int unacked_badblocks, acked_badblocks;
1278 	int prev = -1, hint = -1, set = 0;
1279 	struct badblocks_context bad;
1280 	unsigned int seq;
1281 	int len, rv;
1282 	u64 *p;
1283 
1284 	WARN_ON(bb->shift < 0 || sectors == 0);
1285 
1286 	if (bb->shift > 0) {
1287 		sector_t target;
1288 
1289 		/* round the start down, and the end up */
1290 		target = s + sectors;
1291 		rounddown(s, bb->shift);
1292 		roundup(target, bb->shift);
1293 		sectors = target - s;
1294 	}
1295 
1296 retry:
1297 	seq = read_seqbegin(&bb->lock);
1298 
1299 	p = bb->page;
1300 	unacked_badblocks = 0;
1301 	acked_badblocks = 0;
1302 
1303 re_check:
1304 	bad.start = s;
1305 	bad.len = sectors;
1306 
1307 	if (badblocks_empty(bb)) {
1308 		len = sectors;
1309 		goto update_sectors;
1310 	}
1311 
1312 	prev = prev_badblocks(bb, &bad, hint);
1313 
1314 	/* start after all badblocks */
1315 	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
1316 		len = sectors;
1317 		goto update_sectors;
1318 	}
1319 
1320 	if (overlap_front(bb, prev, &bad)) {
1321 		if (BB_ACK(p[prev]))
1322 			acked_badblocks++;
1323 		else
1324 			unacked_badblocks++;
1325 
1326 		if (BB_END(p[prev]) >= (s + sectors))
1327 			len = sectors;
1328 		else
1329 			len = BB_END(p[prev]) - s;
1330 
1331 		if (set == 0) {
1332 			*first_bad = BB_OFFSET(p[prev]);
1333 			*bad_sectors = BB_LEN(p[prev]);
1334 			set = 1;
1335 		}
1336 		goto update_sectors;
1337 	}
1338 
1339 	/* Not front overlap, but behind overlap */
1340 	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
1341 		len = BB_OFFSET(p[prev + 1]) - bad.start;
1342 		hint = prev + 1;
1343 		goto update_sectors;
1344 	}
1345 
1346 	/* not cover any badblocks range in the table */
1347 	len = sectors;
1348 
1349 update_sectors:
1350 	s += len;
1351 	sectors -= len;
1352 
1353 	if (sectors > 0)
1354 		goto re_check;
1355 
1356 	WARN_ON(sectors < 0);
1357 
1358 	if (unacked_badblocks > 0)
1359 		rv = -1;
1360 	else if (acked_badblocks > 0)
1361 		rv = 1;
1362 	else
1363 		rv = 0;
1364 
1365 	if (read_seqretry(&bb->lock, seq))
1366 		goto retry;
1367 
1368 	return rv;
1369 }
1370 
1371 /**
1372  * badblocks_check() - check a given range for bad sectors
1373  * @bb:		the badblocks structure that holds all badblock information
1374  * @s:		sector (start) at which to check for badblocks
1375  * @sectors:	number of sectors to check for badblocks
1376  * @first_bad:	pointer to store location of the first badblock
1377  * @bad_sectors: pointer to store number of badblocks after @first_bad
1378  *
1379  * We can record which blocks on each device are 'bad' and so just
1380  * fail those blocks, or that stripe, rather than the whole device.
1381  * Entries in the bad-block table are 64bits wide.  This comprises:
1382  * Length of bad-range, in sectors: 0-511 for lengths 1-512
1383  * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
1384  *  A 'shift' can be set so that larger blocks are tracked and
1385  *  consequently larger devices can be covered.
1386  * 'Acknowledged' flag - 1 bit. - the most significant bit.
1387  *
1388  * Locking of the bad-block table uses a seqlock so badblocks_check
1389  * might need to retry if it is very unlucky.
1390  * We will sometimes want to check for bad blocks in a bi_end_io function,
1391  * so we use the write_seqlock_irq variant.
1392  *
1393  * When looking for a bad block we specify a range and want to
1394  * know if any block in the range is bad.  So we binary-search
1395  * to the last range that starts at-or-before the given endpoint,
1396  * (or "before the sector after the target range")
1397  * then see if it ends after the given start.
1398  *
1399  * Return:
1400  *  0: there are no known bad blocks in the range
1401  *  1: there are known bad block which are all acknowledged
1402  * -1: there are bad blocks which have not yet been acknowledged in metadata.
1403  * plus the start/length of the first bad section we overlap.
1404  */
1405 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
1406 			sector_t *first_bad, int *bad_sectors)
1407 {
1408 	return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
1409 }
1410 EXPORT_SYMBOL_GPL(badblocks_check);
1411 
1412 /**
1413  * badblocks_set() - Add a range of bad blocks to the table.
1414  * @bb:		the badblocks structure that holds all badblock information
1415  * @s:		first sector to mark as bad
1416  * @sectors:	number of sectors to mark as bad
1417  * @acknowledged: weather to mark the bad sectors as acknowledged
1418  *
1419  * This might extend the table, or might contract it if two adjacent ranges
1420  * can be merged. We binary-search to find the 'insertion' point, then
1421  * decide how best to handle it.
1422  *
1423  * Return:
1424  *  0: success
1425  *  1: failed to set badblocks (out of space)
1426  */
1427 int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
1428 			int acknowledged)
1429 {
1430 	return _badblocks_set(bb, s, sectors, acknowledged);
1431 }
1432 EXPORT_SYMBOL_GPL(badblocks_set);
1433 
1434 /**
1435  * badblocks_clear() - Remove a range of bad blocks to the table.
1436  * @bb:		the badblocks structure that holds all badblock information
1437  * @s:		first sector to mark as bad
1438  * @sectors:	number of sectors to mark as bad
1439  *
1440  * This may involve extending the table if we spilt a region,
1441  * but it must not fail.  So if the table becomes full, we just
1442  * drop the remove request.
1443  *
1444  * Return:
1445  *  0: success
1446  *  1: failed to clear badblocks
1447  */
1448 int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
1449 {
1450 	return _badblocks_clear(bb, s, sectors);
1451 }
1452 EXPORT_SYMBOL_GPL(badblocks_clear);
1453 
1454 /**
1455  * ack_all_badblocks() - Acknowledge all bad blocks in a list.
1456  * @bb:		the badblocks structure that holds all badblock information
1457  *
1458  * This only succeeds if ->changed is clear.  It is used by
1459  * in-kernel metadata updates
1460  */
1461 void ack_all_badblocks(struct badblocks *bb)
1462 {
1463 	if (bb->page == NULL || bb->changed)
1464 		/* no point even trying */
1465 		return;
1466 	write_seqlock_irq(&bb->lock);
1467 
1468 	if (bb->changed == 0 && bb->unacked_exist) {
1469 		u64 *p = bb->page;
1470 		int i;
1471 
1472 		for (i = 0; i < bb->count ; i++) {
1473 			if (!BB_ACK(p[i])) {
1474 				sector_t start = BB_OFFSET(p[i]);
1475 				int len = BB_LEN(p[i]);
1476 
1477 				p[i] = BB_MAKE(start, len, 1);
1478 			}
1479 		}
1480 		bb->unacked_exist = 0;
1481 	}
1482 	write_sequnlock_irq(&bb->lock);
1483 }
1484 EXPORT_SYMBOL_GPL(ack_all_badblocks);
1485 
1486 /**
1487  * badblocks_show() - sysfs access to bad-blocks list
1488  * @bb:		the badblocks structure that holds all badblock information
1489  * @page:	buffer received from sysfs
1490  * @unack:	weather to show unacknowledged badblocks
1491  *
1492  * Return:
1493  *  Length of returned data
1494  */
1495 ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
1496 {
1497 	size_t len;
1498 	int i;
1499 	u64 *p = bb->page;
1500 	unsigned seq;
1501 
1502 	if (bb->shift < 0)
1503 		return 0;
1504 
1505 retry:
1506 	seq = read_seqbegin(&bb->lock);
1507 
1508 	len = 0;
1509 	i = 0;
1510 
1511 	while (len < PAGE_SIZE && i < bb->count) {
1512 		sector_t s = BB_OFFSET(p[i]);
1513 		unsigned int length = BB_LEN(p[i]);
1514 		int ack = BB_ACK(p[i]);
1515 
1516 		i++;
1517 
1518 		if (unack && ack)
1519 			continue;
1520 
1521 		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
1522 				(unsigned long long)s << bb->shift,
1523 				length << bb->shift);
1524 	}
1525 	if (unack && len == 0)
1526 		bb->unacked_exist = 0;
1527 
1528 	if (read_seqretry(&bb->lock, seq))
1529 		goto retry;
1530 
1531 	return len;
1532 }
1533 EXPORT_SYMBOL_GPL(badblocks_show);
1534 
1535 /**
1536  * badblocks_store() - sysfs access to bad-blocks list
1537  * @bb:		the badblocks structure that holds all badblock information
1538  * @page:	buffer received from sysfs
1539  * @len:	length of data received from sysfs
1540  * @unack:	weather to show unacknowledged badblocks
1541  *
1542  * Return:
1543  *  Length of the buffer processed or -ve error.
1544  */
1545 ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
1546 			int unack)
1547 {
1548 	unsigned long long sector;
1549 	int length;
1550 	char newline;
1551 
1552 	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
1553 	case 3:
1554 		if (newline != '\n')
1555 			return -EINVAL;
1556 		fallthrough;
1557 	case 2:
1558 		if (length <= 0)
1559 			return -EINVAL;
1560 		break;
1561 	default:
1562 		return -EINVAL;
1563 	}
1564 
1565 	if (badblocks_set(bb, sector, length, !unack))
1566 		return -ENOSPC;
1567 	else
1568 		return len;
1569 }
1570 EXPORT_SYMBOL_GPL(badblocks_store);
1571 
1572 static int __badblocks_init(struct device *dev, struct badblocks *bb,
1573 		int enable)
1574 {
1575 	bb->dev = dev;
1576 	bb->count = 0;
1577 	if (enable)
1578 		bb->shift = 0;
1579 	else
1580 		bb->shift = -1;
1581 	if (dev)
1582 		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
1583 	else
1584 		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
1585 	if (!bb->page) {
1586 		bb->shift = -1;
1587 		return -ENOMEM;
1588 	}
1589 	seqlock_init(&bb->lock);
1590 
1591 	return 0;
1592 }
1593 
1594 /**
1595  * badblocks_init() - initialize the badblocks structure
1596  * @bb:		the badblocks structure that holds all badblock information
1597  * @enable:	weather to enable badblocks accounting
1598  *
1599  * Return:
1600  *  0: success
1601  *  -ve errno: on error
1602  */
1603 int badblocks_init(struct badblocks *bb, int enable)
1604 {
1605 	return __badblocks_init(NULL, bb, enable);
1606 }
1607 EXPORT_SYMBOL_GPL(badblocks_init);
1608 
1609 int devm_init_badblocks(struct device *dev, struct badblocks *bb)
1610 {
1611 	if (!bb)
1612 		return -EINVAL;
1613 	return __badblocks_init(dev, bb, 1);
1614 }
1615 EXPORT_SYMBOL_GPL(devm_init_badblocks);
1616 
1617 /**
1618  * badblocks_exit() - free the badblocks structure
1619  * @bb:		the badblocks structure that holds all badblock information
1620  */
1621 void badblocks_exit(struct badblocks *bb)
1622 {
1623 	if (!bb)
1624 		return;
1625 	if (bb->dev)
1626 		devm_kfree(bb->dev, bb->page);
1627 	else
1628 		kfree(bb->page);
1629 	bb->page = NULL;
1630 }
1631 EXPORT_SYMBOL_GPL(badblocks_exit);
1632