xref: /linux/fs/btrfs/raid56.c (revision f3827213abae9291b7525b05e6fd29b1f0536ce6)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 #include "file-item.h"
23 #include "btrfs_inode.h"
24 
25 /* set when additional merges to this rbio are not allowed */
26 #define RBIO_RMW_LOCKED_BIT	1
27 
28 /*
29  * set when this rbio is sitting in the hash, but it is just a cache
30  * of past RMW
31  */
32 #define RBIO_CACHE_BIT		2
33 
34 /*
35  * set when it is safe to trust the stripe_pages for caching
36  */
37 #define RBIO_CACHE_READY_BIT	3
38 
39 #define RBIO_CACHE_SIZE 1024
40 
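/*
 * With 11 bits, the table allocated by btrfs_alloc_stripe_hash_table() below
 * holds 1U << 11 == 2048 buckets; rbio_bucket() hashes a full stripe's start
 * logical address into that range.
 */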
41 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
42 
43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44 {
45 	if (unlikely(!bioc)) {
46 		btrfs_crit(fs_info, "bioc=NULL");
47 		return;
48 	}
49 	btrfs_crit(fs_info,
50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 		bioc->logical, bioc->full_stripe_logical, bioc->size,
52 		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 		bioc->replace_stripe_src, bioc->num_stripes);
54 	for (int i = 0; i < bioc->num_stripes; i++) {
55 		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
56 			   i, bioc->stripes[i].dev->devid,
57 			   bioc->stripes[i].physical);
58 	}
59 }
60 
61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 			    const struct btrfs_raid_bio *rbio)
63 {
64 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 		return;
66 
67 	dump_bioc(fs_info, rbio->bioc);
68 	btrfs_crit(fs_info,
69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
70 		rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 		rbio->real_stripes, rbio->stripe_nsectors,
72 		rbio->scrubp, rbio->dbitmap);
73 }
74 
75 #define ASSERT_RBIO(expr, rbio)						\
76 ({									\
77 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
78 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
79 					(rbio)->bioc->fs_info : NULL;	\
80 									\
81 		btrfs_dump_rbio(__fs_info, (rbio));			\
82 	}								\
83 	ASSERT((expr));							\
84 })
85 
86 #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
87 ({									\
88 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
89 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
90 					(rbio)->bioc->fs_info : NULL;	\
91 									\
92 		btrfs_dump_rbio(__fs_info, (rbio));			\
93 		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
94 	}								\
95 	ASSERT((expr));							\
96 })
97 
98 #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
99 ({									\
100 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
101 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
102 					(rbio)->bioc->fs_info : NULL;	\
103 									\
104 		btrfs_dump_rbio(__fs_info, (rbio));			\
105 		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
106 	}								\
107 	ASSERT((expr));							\
108 })
109 
110 #define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
111 ({									\
112 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
113 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
114 					(rbio)->bioc->fs_info : NULL;	\
115 									\
116 		btrfs_dump_rbio(__fs_info, (rbio));			\
117 		btrfs_crit(__fs_info, "logical=%llu", (logical));		\
118 	}								\
119 	ASSERT((expr));							\
120 })
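/*
 * These macros are used like plain ASSERT(), but dump the rbio (plus the
 * offending stripe/sector/logical value) before tripping the assertion, e.g.:
 *
 *	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
 *
 * (Illustrative; see rbio_stripe_sector_index() below for a real caller.)
 */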
121 
122 /* Used by the raid56 code to lock stripes for read/modify/write */
123 struct btrfs_stripe_hash {
124 	struct list_head hash_list;
125 	spinlock_t lock;
126 };
127 
128 /* Used by the raid56 code to lock stripes for read/modify/write */
129 struct btrfs_stripe_hash_table {
130 	struct list_head stripe_cache;
131 	spinlock_t cache_lock;
132 	int cache_size;
133 	struct btrfs_stripe_hash table[];
134 };
135 
136 /*
137  * A structure to represent a sector inside a page; the length is fixed to
138  * the sectorsize.
139  */
140 struct sector_ptr {
141 	/*
142 	 * Blocks from the bio list can still be highmem.
143 	 * So here we use the physical address to represent the page and the offset inside it.
144 	 */
145 	phys_addr_t paddr;
146 	bool has_paddr;
147 	bool uptodate;
148 };
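/*
 * For illustration: index_stripe_sectors() fills in paddr for sector i as
 *	page_to_phys(stripe_pages[(i * sectorsize) >> PAGE_SHIFT]) +
 *	offset_in_page(i * sectorsize)
 * while sectors coming from the bio list get their paddr straight from the
 * bio's blocks (see index_one_bio()).
 */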
149 
150 static void rmw_rbio_work(struct work_struct *work);
151 static void rmw_rbio_work_locked(struct work_struct *work);
152 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
153 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
154 
155 static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
156 static void scrub_rbio_work_locked(struct work_struct *work);
157 
158 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
159 {
160 	bitmap_free(rbio->error_bitmap);
161 	kfree(rbio->stripe_pages);
162 	kfree(rbio->bio_sectors);
163 	kfree(rbio->stripe_sectors);
164 	kfree(rbio->finish_pointers);
165 }
166 
167 static void free_raid_bio(struct btrfs_raid_bio *rbio)
168 {
169 	int i;
170 
171 	if (!refcount_dec_and_test(&rbio->refs))
172 		return;
173 
174 	WARN_ON(!list_empty(&rbio->stripe_cache));
175 	WARN_ON(!list_empty(&rbio->hash_list));
176 	WARN_ON(!bio_list_empty(&rbio->bio_list));
177 
178 	for (i = 0; i < rbio->nr_pages; i++) {
179 		if (rbio->stripe_pages[i]) {
180 			__free_page(rbio->stripe_pages[i]);
181 			rbio->stripe_pages[i] = NULL;
182 		}
183 	}
184 
185 	btrfs_put_bioc(rbio->bioc);
186 	free_raid_bio_pointers(rbio);
187 	kfree(rbio);
188 }
189 
190 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
191 {
192 	INIT_WORK(&rbio->work, work_func);
193 	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
194 }
195 
196 /*
197  * the stripe hash table is used for locking, and to collect
198  * bios in hopes of making a full stripe
199  */
200 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
201 {
202 	struct btrfs_stripe_hash_table *table;
203 	struct btrfs_stripe_hash_table *x;
204 	struct btrfs_stripe_hash *cur;
205 	struct btrfs_stripe_hash *h;
206 	unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
207 
208 	if (info->stripe_hash_table)
209 		return 0;
210 
211 	/*
212 	 * The table is large, starting with order 4 and can go as high as
213 	 * order 7 in case lock debugging is turned on.
214 	 *
215 	 * Try harder to allocate and fall back to vmalloc to lower the chance
216 	 * of a failing mount.
217 	 */
218 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
219 	if (!table)
220 		return -ENOMEM;
221 
222 	spin_lock_init(&table->cache_lock);
223 	INIT_LIST_HEAD(&table->stripe_cache);
224 
225 	h = table->table;
226 
227 	for (unsigned int i = 0; i < num_entries; i++) {
228 		cur = h + i;
229 		INIT_LIST_HEAD(&cur->hash_list);
230 		spin_lock_init(&cur->lock);
231 	}
232 
233 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
234 	kvfree(x);
235 	return 0;
236 }
237 
238 static void memcpy_sectors(const struct sector_ptr *dst,
239 			   const struct sector_ptr *src, u32 blocksize)
240 {
241 	memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr),
242 		    phys_to_page(src->paddr), offset_in_page(src->paddr),
243 		    blocksize);
244 }
245 
246 /*
247  * Caching an rbio means copying everything from the
248  * bio_sectors array into the stripe_pages array.  We
249  * use the uptodate bit of each stripe sector
250  * to indicate whether it holds valid data.
251  *
252  * Once the caching is done, we set the cache ready
253  * bit.
254  */
255 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
256 {
257 	int i;
258 	int ret;
259 
260 	ret = alloc_rbio_pages(rbio);
261 	if (ret)
262 		return;
263 
264 	for (i = 0; i < rbio->nr_sectors; i++) {
265 		/* Some range not covered by bio (partial write), skip it */
266 		if (!rbio->bio_sectors[i].has_paddr) {
267 			/*
268 			 * Even if the sector is not covered by bio, if it is
269 			 * a data sector it should still be uptodate as it is
270 			 * read from disk.
271 			 */
272 			if (i < rbio->nr_data * rbio->stripe_nsectors)
273 				ASSERT(rbio->stripe_sectors[i].uptodate);
274 			continue;
275 		}
276 
277 		memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i],
278 				rbio->bioc->fs_info->sectorsize);
279 		rbio->stripe_sectors[i].uptodate = 1;
280 	}
281 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
282 }
283 
284 /*
285  * we hash on the first logical address of the stripe
286  */
287 static int rbio_bucket(struct btrfs_raid_bio *rbio)
288 {
289 	u64 num = rbio->bioc->full_stripe_logical;
290 
291 	/*
292 	 * we shift down quite a bit.  We're using byte
293 	 * addressing, and most of the lower bits are zeros.
294 	 * This tends to upset hash_64, and it consistently
295 	 * returns just one or two different values.
296 	 *
297 	 * shifting off the lower bits fixes things.
298 	 */
299 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
300 }
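/*
 * For illustration: a full stripe starting at logical 1GiB (0x40000000) is
 * hashed as hash_64(0x40000000 >> 16, 11) == hash_64(0x4000, 11), giving a
 * bucket index in the range [0, 2047].
 */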
301 
302 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
303 				       unsigned int page_nr)
304 {
305 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
306 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
307 	int i;
308 
309 	ASSERT(page_nr < rbio->nr_pages);
310 
311 	for (i = sectors_per_page * page_nr;
312 	     i < sectors_per_page * page_nr + sectors_per_page;
313 	     i++) {
314 		if (!rbio->stripe_sectors[i].uptodate)
315 			return false;
316 	}
317 	return true;
318 }
319 
320 /*
321  * Update the stripe_sectors[] array to use correct page and pgoff
322  *
323  * Should be called every time any page pointer in stripe_pages[] is modified.
324  */
325 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
326 {
327 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
328 	u32 offset;
329 	int i;
330 
331 	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
332 		int page_index = offset >> PAGE_SHIFT;
333 
334 		ASSERT(page_index < rbio->nr_pages);
335 		if (!rbio->stripe_pages[page_index])
336 			continue;
337 
338 		rbio->stripe_sectors[i].has_paddr = true;
339 		rbio->stripe_sectors[i].paddr =
340 			page_to_phys(rbio->stripe_pages[page_index]) +
341 			offset_in_page(offset);
342 	}
343 }
344 
345 static void steal_rbio_page(struct btrfs_raid_bio *src,
346 			    struct btrfs_raid_bio *dest, int page_nr)
347 {
348 	const u32 sectorsize = src->bioc->fs_info->sectorsize;
349 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
350 	int i;
351 
352 	if (dest->stripe_pages[page_nr])
353 		__free_page(dest->stripe_pages[page_nr]);
354 	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
355 	src->stripe_pages[page_nr] = NULL;
356 
357 	/* Also update the sector->uptodate bits. */
358 	for (i = sectors_per_page * page_nr;
359 	     i < sectors_per_page * page_nr + sectors_per_page; i++)
360 		dest->stripe_sectors[i].uptodate = true;
361 }
362 
363 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
364 {
365 	const int sector_nr = (page_nr << PAGE_SHIFT) >>
366 			      rbio->bioc->fs_info->sectorsize_bits;
367 
368 	/*
369 	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
370 	 * we won't have a page which is half data half parity.
371 	 *
372 	 * Thus if the first sector of the page belongs to data stripes, then
373 	 * the full page belongs to data stripes.
374 	 */
375 	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
376 }
377 
378 /*
379  * Stealing an rbio means taking all the uptodate pages from the stripe array
380  * in the source rbio and putting them into the destination rbio.
381  *
382  * This will also update the involved stripe_sectors[] which are referring to
383  * the old pages.
384  */
385 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
386 {
387 	int i;
388 
389 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
390 		return;
391 
392 	for (i = 0; i < dest->nr_pages; i++) {
393 		struct page *p = src->stripe_pages[i];
394 
395 		/*
396 		 * We don't need to steal P/Q pages as they will always be
397 		 * regenerated for RMW or full write anyway.
398 		 */
399 		if (!is_data_stripe_page(src, i))
400 			continue;
401 
402 		/*
403 		 * If @src already has RBIO_CACHE_READY_BIT, it should have
404 		 * all data stripe pages present and uptodate.
405 		 */
406 		ASSERT(p);
407 		ASSERT(full_page_sectors_uptodate(src, i));
408 		steal_rbio_page(src, dest, i);
409 	}
410 	index_stripe_sectors(dest);
411 	index_stripe_sectors(src);
412 }
413 
414 /*
415  * merging means we take the bio_list from the victim and
416  * splice it into the destination.  The victim should
417  * be discarded afterwards.
418  *
419  * Must be called with dest->bio_list_lock held.
420  */
421 static void merge_rbio(struct btrfs_raid_bio *dest,
422 		       struct btrfs_raid_bio *victim)
423 {
424 	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
425 	dest->bio_list_bytes += victim->bio_list_bytes;
426 	/* Also inherit the bitmaps from @victim. */
427 	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
428 		  dest->stripe_nsectors);
429 }
430 
431 /*
432  * used to prune items that are in the cache.  The caller
433  * must hold the hash table lock.
434  */
435 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
436 {
437 	int bucket = rbio_bucket(rbio);
438 	struct btrfs_stripe_hash_table *table;
439 	struct btrfs_stripe_hash *h;
440 	int freeit = 0;
441 
442 	/*
443 	 * check the bit again under the hash table lock.
444 	 */
445 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
446 		return;
447 
448 	table = rbio->bioc->fs_info->stripe_hash_table;
449 	h = table->table + bucket;
450 
451 	/* hold the lock for the bucket because we may be
452 	 * removing it from the hash table
453 	 */
454 	spin_lock(&h->lock);
455 
456 	/*
457 	 * hold the lock for the bio list because we need
458 	 * to make sure the bio list is empty
459 	 */
460 	spin_lock(&rbio->bio_list_lock);
461 
462 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
463 		list_del_init(&rbio->stripe_cache);
464 		table->cache_size -= 1;
465 		freeit = 1;
466 
467 		/* if the bio list isn't empty, this rbio is
468 		 * still involved in an IO.  We take it out
469 		 * of the cache list, and drop the ref that
470 		 * was held for the list.
471 		 *
472 		 * If the bio_list was empty, we also remove
473 		 * the rbio from the hash_table, and drop
474 		 * the corresponding ref
475 		 */
476 		if (bio_list_empty(&rbio->bio_list)) {
477 			if (!list_empty(&rbio->hash_list)) {
478 				list_del_init(&rbio->hash_list);
479 				refcount_dec(&rbio->refs);
480 				BUG_ON(!list_empty(&rbio->plug_list));
481 			}
482 		}
483 	}
484 
485 	spin_unlock(&rbio->bio_list_lock);
486 	spin_unlock(&h->lock);
487 
488 	if (freeit)
489 		free_raid_bio(rbio);
490 }
491 
492 /*
493  * prune a given rbio from the cache
494  */
495 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
496 {
497 	struct btrfs_stripe_hash_table *table;
498 
499 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
500 		return;
501 
502 	table = rbio->bioc->fs_info->stripe_hash_table;
503 
504 	spin_lock(&table->cache_lock);
505 	__remove_rbio_from_cache(rbio);
506 	spin_unlock(&table->cache_lock);
507 }
508 
509 /*
510  * remove everything in the cache
511  */
512 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
513 {
514 	struct btrfs_stripe_hash_table *table;
515 	struct btrfs_raid_bio *rbio;
516 
517 	table = info->stripe_hash_table;
518 
519 	spin_lock(&table->cache_lock);
520 	while (!list_empty(&table->stripe_cache)) {
521 		rbio = list_first_entry(&table->stripe_cache,
522 					struct btrfs_raid_bio, stripe_cache);
523 		__remove_rbio_from_cache(rbio);
524 	}
525 	spin_unlock(&table->cache_lock);
526 }
527 
528 /*
529  * remove all cached entries and free the hash table
530  * used by unmount
531  */
532 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
533 {
534 	if (!info->stripe_hash_table)
535 		return;
536 	btrfs_clear_rbio_cache(info);
537 	kvfree(info->stripe_hash_table);
538 	info->stripe_hash_table = NULL;
539 }
540 
541 /*
542  * insert an rbio into the stripe cache.  It
543  * must have already been prepared by calling
544  * cache_rbio_pages
545  *
546  * If this rbio was already cached, it gets
547  * moved to the front of the lru.
548  *
549  * If the size of the rbio cache is too big, we
550  * prune an item.
551  */
552 static void cache_rbio(struct btrfs_raid_bio *rbio)
553 {
554 	struct btrfs_stripe_hash_table *table;
555 
556 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
557 		return;
558 
559 	table = rbio->bioc->fs_info->stripe_hash_table;
560 
561 	spin_lock(&table->cache_lock);
562 	spin_lock(&rbio->bio_list_lock);
563 
564 	/* bump our ref if we were not in the list before */
565 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
566 		refcount_inc(&rbio->refs);
567 
568 	if (!list_empty(&rbio->stripe_cache)) {
569 		list_move(&rbio->stripe_cache, &table->stripe_cache);
570 	} else {
571 		list_add(&rbio->stripe_cache, &table->stripe_cache);
572 		table->cache_size += 1;
573 	}
574 
575 	spin_unlock(&rbio->bio_list_lock);
576 
577 	if (table->cache_size > RBIO_CACHE_SIZE) {
578 		struct btrfs_raid_bio *found;
579 
580 		found = list_last_entry(&table->stripe_cache,
581 					struct btrfs_raid_bio,
582 					stripe_cache);
583 
584 		if (found != rbio)
585 			__remove_rbio_from_cache(found);
586 	}
587 
588 	spin_unlock(&table->cache_lock);
589 }
590 
591 /*
592  * helper function to run the xor_blocks api.  It is only
593  * able to do MAX_XOR_BLOCKS at a time, so we need to
594  * loop through.
595  */
596 static void run_xor(void **pages, int src_cnt, ssize_t len)
597 {
598 	int src_off = 0;
599 	int xor_src_cnt = 0;
600 	void *dest = pages[src_cnt];
601 
602 	while (src_cnt > 0) {
603 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
604 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
605 
606 		src_cnt -= xor_src_cnt;
607 		src_off += xor_src_cnt;
608 	}
609 }
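/*
 * For illustration: if MAX_XOR_BLOCKS were 4 and src_cnt were 6, the loop
 * would make two calls, xor_blocks(4, ...) over pages[0..3] and then
 * xor_blocks(2, ...) over pages[4..5], both accumulating into dest
 * (pages[src_cnt]).
 */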
610 
611 /*
612  * Returns true if the bio list inside this rbio covers an entire stripe (no
613  * rmw required).
614  */
615 static int rbio_is_full(struct btrfs_raid_bio *rbio)
616 {
617 	unsigned long size = rbio->bio_list_bytes;
618 	int ret = 1;
619 
620 	spin_lock(&rbio->bio_list_lock);
621 	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
622 		ret = 0;
623 	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
624 	spin_unlock(&rbio->bio_list_lock);
625 
626 	return ret;
627 }
628 
629 /*
630  * returns 1 if it is safe to merge two rbios together.
631  * The merging is safe if the two rbios correspond to
632  * the same stripe and if they are both going in the same
633  * direction (read vs write), and if neither one is
634  * locked for final IO
635  *
636  * The caller is responsible for locking such that
637  * rmw_locked is safe to test
638  */
639 static int rbio_can_merge(struct btrfs_raid_bio *last,
640 			  struct btrfs_raid_bio *cur)
641 {
642 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
643 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
644 		return 0;
645 
646 	/*
647 	 * we can't merge with cached rbios, since the
648 	 * idea is that when we merge the destination
649 	 * rbio is going to run our IO for us.  We can
650 	 * steal from cached rbios though, other functions
651 	 * handle that.
652 	 */
653 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
654 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
655 		return 0;
656 
657 	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
658 		return 0;
659 
660 	/* we can't merge with different operations */
661 	if (last->operation != cur->operation)
662 		return 0;
663 	/*
664 	 * We need to read the full stripe from the drive, check
665 	 * and repair the parity and write the new results.
666 	 *
667 	 * We're not allowed to add any new bios to the
668 	 * bio list here, anyone else that wants to
669 	 * change this stripe needs to do their own rmw.
670 	 */
671 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
672 		return 0;
673 
674 	if (last->operation == BTRFS_RBIO_READ_REBUILD)
675 		return 0;
676 
677 	return 1;
678 }
679 
680 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
681 					     unsigned int stripe_nr,
682 					     unsigned int sector_nr)
683 {
684 	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
685 	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
686 
687 	return stripe_nr * rbio->stripe_nsectors + sector_nr;
688 }
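/*
 * For illustration: with the fixed 64K stripe length and a 4K sectorsize
 * (an assumption, the sectorsize can differ), stripe_nsectors is 16, so
 * stripe 2 / sector 5 maps to index 2 * 16 + 5 == 37.
 */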
689 
690 /* Return a sector from rbio->stripe_sectors, not from the bio list */
691 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
692 					     unsigned int stripe_nr,
693 					     unsigned int sector_nr)
694 {
695 	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
696 							      sector_nr)];
697 }
698 
699 /* Grab a sector inside P stripe */
700 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
701 					      unsigned int sector_nr)
702 {
703 	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
704 }
705 
706 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
707 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
708 					      unsigned int sector_nr)
709 {
710 	if (rbio->nr_data + 1 == rbio->real_stripes)
711 		return NULL;
712 	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
713 }
714 
715 /*
716  * The first stripe in the table for a logical address
717  * has the lock.  rbios are added in one of three ways:
718  *
719  * 1) Nobody has the stripe locked yet.  The rbio is given
720  * the lock and 0 is returned.  The caller must start the IO
721  * themselves.
722  *
723  * 2) Someone has the stripe locked, but we're able to merge
724  * with the lock owner.  The rbio is freed and the IO will
725  * start automatically along with the existing rbio.  1 is returned.
726  *
727  * 3) Someone has the stripe locked, but we're not able to merge.
728  * The rbio is added to the lock owner's plug list, or merged into
729  * an rbio already on the plug list.  When the lock owner unlocks,
730  * the next rbio on the list is run and the IO is started automatically.
731  * 1 is returned
732  *
733  * If we return 0, the caller still owns the rbio and must continue with
734  * IO submission.  If we return 1, the caller must assume the rbio has
735  * already been freed.
736  */
737 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
738 {
739 	struct btrfs_stripe_hash *h;
740 	struct btrfs_raid_bio *cur;
741 	struct btrfs_raid_bio *pending;
742 	struct btrfs_raid_bio *freeit = NULL;
743 	struct btrfs_raid_bio *cache_drop = NULL;
744 	int ret = 0;
745 
746 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
747 
748 	spin_lock(&h->lock);
749 	list_for_each_entry(cur, &h->hash_list, hash_list) {
750 		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
751 			continue;
752 
753 		spin_lock(&cur->bio_list_lock);
754 
755 		/* Can we steal this cached rbio's pages? */
756 		if (bio_list_empty(&cur->bio_list) &&
757 		    list_empty(&cur->plug_list) &&
758 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
759 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
760 			list_del_init(&cur->hash_list);
761 			refcount_dec(&cur->refs);
762 
763 			steal_rbio(cur, rbio);
764 			cache_drop = cur;
765 			spin_unlock(&cur->bio_list_lock);
766 
767 			goto lockit;
768 		}
769 
770 		/* Can we merge into the lock owner? */
771 		if (rbio_can_merge(cur, rbio)) {
772 			merge_rbio(cur, rbio);
773 			spin_unlock(&cur->bio_list_lock);
774 			freeit = rbio;
775 			ret = 1;
776 			goto out;
777 		}
778 
779 
780 		/*
781 		 * We couldn't merge with the running rbio, see if we can merge
782 		 * with the pending ones.  We don't have to check for rmw_locked
783 		 * because there is no way they are inside finish_rmw right now
784 		 */
785 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
786 			if (rbio_can_merge(pending, rbio)) {
787 				merge_rbio(pending, rbio);
788 				spin_unlock(&cur->bio_list_lock);
789 				freeit = rbio;
790 				ret = 1;
791 				goto out;
792 			}
793 		}
794 
795 		/*
796 		 * No merging, put us on the tail of the plug list, our rbio
797 		 * will be started when the currently running rbio unlocks.
798 		 */
799 		list_add_tail(&rbio->plug_list, &cur->plug_list);
800 		spin_unlock(&cur->bio_list_lock);
801 		ret = 1;
802 		goto out;
803 	}
804 lockit:
805 	refcount_inc(&rbio->refs);
806 	list_add(&rbio->hash_list, &h->hash_list);
807 out:
808 	spin_unlock(&h->lock);
809 	if (cache_drop)
810 		remove_rbio_from_cache(cache_drop);
811 	if (freeit)
812 		free_raid_bio(freeit);
813 	return ret;
814 }
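/*
 * A minimal usage sketch (illustrative; the real write path does this at the
 * end of raid56_parity_write()):
 *
 *	if (!lock_stripe_add(rbio))
 *		start_async_work(rbio, rmw_rbio_work_locked);
 *
 * A return of 1 means the rbio was merged or queued and must not be touched
 * again by the caller.
 */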
815 
816 static void recover_rbio_work_locked(struct work_struct *work);
817 
818 /*
819  * called as rmw or parity rebuild is completed.  If the plug list has more
820  * rbios waiting for this stripe, the next one on the list will be started
821  */
822 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
823 {
824 	int bucket;
825 	struct btrfs_stripe_hash *h;
826 	int keep_cache = 0;
827 
828 	bucket = rbio_bucket(rbio);
829 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
830 
831 	if (list_empty(&rbio->plug_list))
832 		cache_rbio(rbio);
833 
834 	spin_lock(&h->lock);
835 	spin_lock(&rbio->bio_list_lock);
836 
837 	if (!list_empty(&rbio->hash_list)) {
838 		/*
839 		 * if we're still cached and there is no other IO
840 		 * to perform, just leave this rbio here for others
841 		 * to steal from later
842 		 */
843 		if (list_empty(&rbio->plug_list) &&
844 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
845 			keep_cache = 1;
846 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
847 			BUG_ON(!bio_list_empty(&rbio->bio_list));
848 			goto done;
849 		}
850 
851 		list_del_init(&rbio->hash_list);
852 		refcount_dec(&rbio->refs);
853 
854 		/*
855 		 * we use the plug list to hold all the rbios
856 		 * waiting for the chance to lock this stripe.
857 		 * hand the lock over to one of them.
858 		 */
859 		if (!list_empty(&rbio->plug_list)) {
860 			struct btrfs_raid_bio *next;
861 			struct list_head *head = rbio->plug_list.next;
862 
863 			next = list_entry(head, struct btrfs_raid_bio,
864 					  plug_list);
865 
866 			list_del_init(&rbio->plug_list);
867 
868 			list_add(&next->hash_list, &h->hash_list);
869 			refcount_inc(&next->refs);
870 			spin_unlock(&rbio->bio_list_lock);
871 			spin_unlock(&h->lock);
872 
873 			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
874 				start_async_work(next, recover_rbio_work_locked);
875 			} else if (next->operation == BTRFS_RBIO_WRITE) {
876 				steal_rbio(rbio, next);
877 				start_async_work(next, rmw_rbio_work_locked);
878 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
879 				steal_rbio(rbio, next);
880 				start_async_work(next, scrub_rbio_work_locked);
881 			}
882 
883 			goto done_nolock;
884 		}
885 	}
886 done:
887 	spin_unlock(&rbio->bio_list_lock);
888 	spin_unlock(&h->lock);
889 
890 done_nolock:
891 	if (!keep_cache)
892 		remove_rbio_from_cache(rbio);
893 }
894 
895 static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
896 {
897 	struct bio *next;
898 
899 	while (cur) {
900 		next = cur->bi_next;
901 		cur->bi_next = NULL;
902 		cur->bi_status = status;
903 		bio_endio(cur);
904 		cur = next;
905 	}
906 }
907 
908 /*
909  * this frees the rbio and runs through all the bios in the
910  * bio_list and calls end_io on them
911  */
912 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
913 {
914 	struct bio *cur = bio_list_get(&rbio->bio_list);
915 	struct bio *extra;
916 
917 	kfree(rbio->csum_buf);
918 	bitmap_free(rbio->csum_bitmap);
919 	rbio->csum_buf = NULL;
920 	rbio->csum_bitmap = NULL;
921 
922 	/*
923 	 * Clear the data bitmap, as the rbio may be cached for later usage.
924 	 * Do this before unlock_stripe() so there will be no new bio
925 	 * for this rbio.
926 	 */
927 	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
928 
929 	/*
930 	 * At this moment, rbio->bio_list is empty, however since rbio does not
931 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
932 	 * hash list, rbio may be merged with others so that rbio->bio_list
933 	 * becomes non-empty.
934 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
935 	 * more and we can call bio_endio() on all queued bios.
936 	 */
937 	unlock_stripe(rbio);
938 	extra = bio_list_get(&rbio->bio_list);
939 	free_raid_bio(rbio);
940 
941 	rbio_endio_bio_list(cur, status);
942 	if (extra)
943 		rbio_endio_bio_list(extra, status);
944 }
945 
946 /*
947  * Get a sector pointer specified by its @stripe_nr and @sector_nr.
948  *
949  * @rbio:               The raid bio
950  * @stripe_nr:          Stripe number, valid range [0, real_stripe)
951  * @sector_nr:		Sector number inside the stripe,
952  *			valid range [0, stripe_nsectors)
953  * @bio_list_only:      Whether to use sectors inside the bio list only.
954  *
955  * The read/modify/write code wants to reuse the original bio page as much
956  * as possible, and only use stripe_sectors as fallback.
957  */
958 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
959 					 int stripe_nr, int sector_nr,
960 					 bool bio_list_only)
961 {
962 	struct sector_ptr *sector;
963 	int index;
964 
965 	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
966 			   rbio, stripe_nr);
967 	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
968 			   rbio, sector_nr);
969 
970 	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
971 	ASSERT(index >= 0 && index < rbio->nr_sectors);
972 
973 	spin_lock(&rbio->bio_list_lock);
974 	sector = &rbio->bio_sectors[index];
975 	if (sector->has_paddr || bio_list_only) {
976 		/* Don't return sector without a valid page pointer */
977 		if (!sector->has_paddr)
978 			sector = NULL;
979 		spin_unlock(&rbio->bio_list_lock);
980 		return sector;
981 	}
982 	spin_unlock(&rbio->bio_list_lock);
983 
984 	return &rbio->stripe_sectors[index];
985 }
986 
987 /*
988  * Allocation and initial setup for the btrfs_raid_bio.  Note that
989  * this does not allocate any pages for rbio->stripe_pages.
990  */
991 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
992 					 struct btrfs_io_context *bioc)
993 {
994 	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
995 	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
996 	const unsigned int num_pages = stripe_npages * real_stripes;
997 	const unsigned int stripe_nsectors =
998 		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
999 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
1000 	struct btrfs_raid_bio *rbio;
1001 
1002 	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1003 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
1004 	/*
1005 	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1006 	 * (at most 16) should be no larger than BITS_PER_LONG.
1007 	 */
1008 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
1009 
1010 	/*
1011 	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1012 	 * (limited by u8).
1013 	 */
1014 	ASSERT(real_stripes >= 2);
1015 	ASSERT(real_stripes <= U8_MAX);
1016 
1017 	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
1018 	if (!rbio)
1019 		return ERR_PTR(-ENOMEM);
1020 	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1021 				     GFP_NOFS);
1022 	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1023 				    GFP_NOFS);
1024 	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1025 				       GFP_NOFS);
1026 	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1027 	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1028 
1029 	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
1030 	    !rbio->finish_pointers || !rbio->error_bitmap) {
1031 		free_raid_bio_pointers(rbio);
1032 		kfree(rbio);
1033 		return ERR_PTR(-ENOMEM);
1034 	}
1035 
1036 	bio_list_init(&rbio->bio_list);
1037 	init_waitqueue_head(&rbio->io_wait);
1038 	INIT_LIST_HEAD(&rbio->plug_list);
1039 	spin_lock_init(&rbio->bio_list_lock);
1040 	INIT_LIST_HEAD(&rbio->stripe_cache);
1041 	INIT_LIST_HEAD(&rbio->hash_list);
1042 	btrfs_get_bioc(bioc);
1043 	rbio->bioc = bioc;
1044 	rbio->nr_pages = num_pages;
1045 	rbio->nr_sectors = num_sectors;
1046 	rbio->real_stripes = real_stripes;
1047 	rbio->stripe_npages = stripe_npages;
1048 	rbio->stripe_nsectors = stripe_nsectors;
1049 	refcount_set(&rbio->refs, 1);
1050 	atomic_set(&rbio->stripes_pending, 0);
1051 
1052 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1053 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1054 	ASSERT(rbio->nr_data > 0);
1055 
1056 	return rbio;
1057 }
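/*
 * For illustration, assuming 4K pages and a 4K sectorsize: a 3-disk RAID5
 * bioc (2 data + P, no replace target) gives stripe_npages = 16,
 * stripe_nsectors = 16, num_pages = num_sectors = 48 and nr_data = 2.
 */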
1058 
1059 /* allocate pages for all the stripes in the bio, including parity */
1060 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1061 {
1062 	int ret;
1063 
1064 	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1065 	if (ret < 0)
1066 		return ret;
1067 	/* Mapping all sectors */
1068 	index_stripe_sectors(rbio);
1069 	return 0;
1070 }
1071 
1072 /* only allocate pages for p/q stripes */
1073 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1074 {
1075 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1076 	int ret;
1077 
1078 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1079 				     rbio->stripe_pages + data_pages, false);
1080 	if (ret < 0)
1081 		return ret;
1082 
1083 	index_stripe_sectors(rbio);
1084 	return 0;
1085 }
1086 
1087 /*
1088  * Return the total number of errors found in the vertical stripe of @sector_nr.
1089  *
1090  * @faila and @failb will also be updated to the first and second stripe
1091  * number of the errors.
1092  */
1093 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1094 				     int *faila, int *failb)
1095 {
1096 	int stripe_nr;
1097 	int found_errors = 0;
1098 
1099 	if (faila || failb) {
1100 		/*
1101 		 * Both @faila and @failb should be valid pointers if any of
1102 		 * them is specified.
1103 		 */
1104 		ASSERT(faila && failb);
1105 		*faila = -1;
1106 		*failb = -1;
1107 	}
1108 
1109 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1110 		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1111 
1112 		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1113 			found_errors++;
1114 			if (faila) {
1115 				/* Update faila and failb. */
1116 				if (*faila < 0)
1117 					*faila = stripe_nr;
1118 				else if (*failb < 0)
1119 					*failb = stripe_nr;
1120 			}
1121 		}
1122 	}
1123 	return found_errors;
1124 }
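/*
 * For illustration: on a RAID6 stripe where the error_bitmap marks stripes 1
 * and 4 of this vertical stripe as bad, this returns 2 with *faila == 1 and
 * *failb == 4, which is still within bioc->max_errors (2 for RAID6).
 */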
1125 
1126 /*
1127  * Add a single sector @sector into our list of bios for IO.
1128  *
1129  * Return 0 if everything went well.
1130  * Return <0 for error.
1131  */
1132 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1133 			      struct bio_list *bio_list,
1134 			      struct sector_ptr *sector,
1135 			      unsigned int stripe_nr,
1136 			      unsigned int sector_nr,
1137 			      enum req_op op)
1138 {
1139 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1140 	struct bio *last = bio_list->tail;
1141 	int ret;
1142 	struct bio *bio;
1143 	struct btrfs_io_stripe *stripe;
1144 	u64 disk_start;
1145 
1146 	/*
1147 	 * Note: here stripe_nr has taken device replace into consideration,
1148 	 * thus it can be larger than rbio->real_stripes.
1149 	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1150 	 */
1151 	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1152 			   rbio, stripe_nr);
1153 	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1154 			   rbio, sector_nr);
1155 	ASSERT(sector->has_paddr);
1156 
1157 	stripe = &rbio->bioc->stripes[stripe_nr];
1158 	disk_start = stripe->physical + sector_nr * sectorsize;
1159 
1160 	/* if the device is missing, just fail this stripe */
1161 	if (!stripe->dev->bdev) {
1162 		int found_errors;
1163 
1164 		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1165 			rbio->error_bitmap);
1166 
1167 		/* Check if we have reached tolerance early. */
1168 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1169 							 NULL, NULL);
1170 		if (unlikely(found_errors > rbio->bioc->max_errors))
1171 			return -EIO;
1172 		return 0;
1173 	}
1174 
1175 	/* see if we can add this page onto our existing bio */
1176 	if (last) {
1177 		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1178 		last_end += last->bi_iter.bi_size;
1179 
1180 		/*
1181 		 * we can't merge these if they are from different
1182 		 * devices or if they are not contiguous
1183 		 */
1184 		if (last_end == disk_start && !last->bi_status &&
1185 		    last->bi_bdev == stripe->dev->bdev) {
1186 			ret = bio_add_page(last, phys_to_page(sector->paddr),
1187 					   sectorsize, offset_in_page(sector->paddr));
1188 			if (ret == sectorsize)
1189 				return 0;
1190 		}
1191 	}
1192 
1193 	/* put a new bio on the list */
1194 	bio = bio_alloc(stripe->dev->bdev,
1195 			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1196 			op, GFP_NOFS);
1197 	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1198 	bio->bi_private = rbio;
1199 
1200 	__bio_add_page(bio, phys_to_page(sector->paddr), sectorsize,
1201 		       offset_in_page(sector->paddr));
1202 	bio_list_add(bio_list, bio);
1203 	return 0;
1204 }
1205 
1206 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1207 {
1208 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1209 	const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
1210 	struct bvec_iter iter = bio->bi_iter;
1211 	phys_addr_t paddr;
1212 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1213 		     rbio->bioc->full_stripe_logical;
1214 
1215 	btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) {
1216 		unsigned int index = (offset >> sectorsize_bits);
1217 		struct sector_ptr *sector = &rbio->bio_sectors[index];
1218 
1219 		sector->has_paddr = true;
1220 		sector->paddr = paddr;
1221 		offset += sectorsize;
1222 	}
1223 }
1224 
1225 /*
1226  * Helper function to walk our bio list and populate the bio_sectors array
1227  * with the result.  This seems expensive, but it is faster than constantly
1228  * searching through the bio list as we set up the IO in finish_rmw or stripe
1229  * reconstruction.
1230  *
1231  * This must be called before you trust the answers from sector_in_rbio().
1232  */
1233 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1234 {
1235 	struct bio *bio;
1236 
1237 	spin_lock(&rbio->bio_list_lock);
1238 	bio_list_for_each(bio, &rbio->bio_list)
1239 		index_one_bio(rbio, bio);
1240 
1241 	spin_unlock(&rbio->bio_list_lock);
1242 }
1243 
1244 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1245 			       struct raid56_bio_trace_info *trace_info)
1246 {
1247 	const struct btrfs_io_context *bioc = rbio->bioc;
1248 	int i;
1249 
1250 	ASSERT(bioc);
1251 
1252 	/* We rely on bio->bi_bdev to find the stripe number. */
1253 	if (!bio->bi_bdev)
1254 		goto not_found;
1255 
1256 	for (i = 0; i < bioc->num_stripes; i++) {
1257 		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1258 			continue;
1259 		trace_info->stripe_nr = i;
1260 		trace_info->devid = bioc->stripes[i].dev->devid;
1261 		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1262 				     bioc->stripes[i].physical;
1263 		return;
1264 	}
1265 
1266 not_found:
1267 	trace_info->devid = -1;
1268 	trace_info->offset = -1;
1269 	trace_info->stripe_nr = -1;
1270 }
1271 
1272 static inline void bio_list_put(struct bio_list *bio_list)
1273 {
1274 	struct bio *bio;
1275 
1276 	while ((bio = bio_list_pop(bio_list)))
1277 		bio_put(bio);
1278 }
1279 
1280 static void assert_rbio(struct btrfs_raid_bio *rbio)
1281 {
1282 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
1283 		return;
1284 
1285 	/*
1286 	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1287 	 * we won't go beyond 256 disks anyway.
1288 	 */
1289 	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1290 	ASSERT_RBIO(rbio->nr_data > 0, rbio);
1291 
1292 	/*
1293 	 * This is another check to make sure nr data stripes is smaller
1294 	 * than total stripes.
1295 	 */
1296 	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1297 }
1298 
1299 static inline void *kmap_local_sector(const struct sector_ptr *sector)
1300 {
1301 	/* The sector pointer must have a page mapped to it. */
1302 	ASSERT(sector->has_paddr);
1303 
1304 	return kmap_local_page(phys_to_page(sector->paddr)) +
1305 	       offset_in_page(sector->paddr);
1306 }
1307 
1308 /* Generate PQ for one vertical stripe. */
1309 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1310 {
1311 	void **pointers = rbio->finish_pointers;
1312 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1313 	struct sector_ptr *sector;
1314 	int stripe;
1315 	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1316 
1317 	/* First collect one sector from each data stripe */
1318 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1319 		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1320 		pointers[stripe] = kmap_local_sector(sector);
1321 	}
1322 
1323 	/* Then add the parity stripe */
1324 	sector = rbio_pstripe_sector(rbio, sectornr);
1325 	sector->uptodate = 1;
1326 	pointers[stripe++] = kmap_local_sector(sector);
1327 
1328 	if (has_qstripe) {
1329 		/*
1330 		 * RAID6, add the qstripe and call the library function
1331 		 * to fill in our p/q
1332 		 */
1333 		sector = rbio_qstripe_sector(rbio, sectornr);
1334 		sector->uptodate = 1;
1335 		pointers[stripe++] = kmap_local_sector(sector);
1336 
1337 		assert_rbio(rbio);
1338 		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1339 					pointers);
1340 	} else {
1341 		/* raid5 */
1342 		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1343 		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1344 	}
1345 	for (stripe = stripe - 1; stripe >= 0; stripe--)
1346 		kunmap_local(pointers[stripe]);
1347 }
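/*
 * In other words (illustrative): for RAID5 the P sector ends up as
 * D0 ^ D1 ^ ... ^ D(nr_data - 1), built by the memcpy of D0 followed by
 * run_xor() over the remaining data pointers; for RAID6 the raid6 library
 * fills both P and Q from the same data pointers.
 */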
1348 
1349 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1350 				   struct bio_list *bio_list)
1351 {
1352 	/* The total sector number inside the full stripe. */
1353 	int total_sector_nr;
1354 	int sectornr;
1355 	int stripe;
1356 	int ret;
1357 
1358 	ASSERT(bio_list_size(bio_list) == 0);
1359 
1360 	/* We should have at least one data sector. */
1361 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1362 
1363 	/*
1364 	 * Reset errors, as we may have errors inherited from a degraded
1365 	 * write.
1366 	 */
1367 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1368 
1369 	/*
1370 	 * Start assembly.  Make bios for everything from the higher layers (the
1371 	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
1372 	 */
1373 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1374 	     total_sector_nr++) {
1375 		struct sector_ptr *sector;
1376 
1377 		stripe = total_sector_nr / rbio->stripe_nsectors;
1378 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1379 
1380 		/* This vertical stripe has no data, skip it. */
1381 		if (!test_bit(sectornr, &rbio->dbitmap))
1382 			continue;
1383 
1384 		if (stripe < rbio->nr_data) {
1385 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1386 			if (!sector)
1387 				continue;
1388 		} else {
1389 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1390 		}
1391 
1392 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1393 					 sectornr, REQ_OP_WRITE);
1394 		if (ret)
1395 			goto error;
1396 	}
1397 
1398 	if (likely(!rbio->bioc->replace_nr_stripes))
1399 		return 0;
1400 
1401 	/*
1402 	 * Make a copy for the replace target device.
1403 	 *
1404 	 * Thus the source stripe number (in replace_stripe_src) should be valid.
1405 	 */
1406 	ASSERT(rbio->bioc->replace_stripe_src >= 0);
1407 
1408 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1409 	     total_sector_nr++) {
1410 		struct sector_ptr *sector;
1411 
1412 		stripe = total_sector_nr / rbio->stripe_nsectors;
1413 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1414 
1415 		/*
1416 		 * For RAID56, there is only one device that can be replaced,
1417 		 * and replace_stripe_src indicates the stripe number we
1418 		 * need to copy from.
1419 		 */
1420 		if (stripe != rbio->bioc->replace_stripe_src) {
1421 			/*
1422 			 * We can skip the whole stripe completely, note
1423 			 * total_sector_nr will be increased by one anyway.
1424 			 */
1425 			ASSERT(sectornr == 0);
1426 			total_sector_nr += rbio->stripe_nsectors - 1;
1427 			continue;
1428 		}
1429 
1430 		/* This vertical stripe has no data, skip it. */
1431 		if (!test_bit(sectornr, &rbio->dbitmap))
1432 			continue;
1433 
1434 		if (stripe < rbio->nr_data) {
1435 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1436 			if (!sector)
1437 				continue;
1438 		} else {
1439 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1440 		}
1441 
1442 		ret = rbio_add_io_sector(rbio, bio_list, sector,
1443 					 rbio->real_stripes,
1444 					 sectornr, REQ_OP_WRITE);
1445 		if (ret)
1446 			goto error;
1447 	}
1448 
1449 	return 0;
1450 error:
1451 	bio_list_put(bio_list);
1452 	return -EIO;
1453 }
1454 
1455 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1456 {
1457 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1458 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1459 		     rbio->bioc->full_stripe_logical;
1460 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
1461 
1462 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1463 
1464 	bitmap_set(rbio->error_bitmap, total_nr_sector,
1465 		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1466 
1467 	/*
1468 	 * Special handling for raid56_alloc_missing_rbio() used by
1469 	 * scrub/replace.  Unlike the call path in raid56_parity_recover(), they
1470 	 * pass an empty bio here.  Thus we have to find out the missing device
1471 	 * and mark the stripe error instead.
1472 	 */
1473 	if (bio->bi_iter.bi_size == 0) {
1474 		bool found_missing = false;
1475 		int stripe_nr;
1476 
1477 		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1478 			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1479 				found_missing = true;
1480 				bitmap_set(rbio->error_bitmap,
1481 					   stripe_nr * rbio->stripe_nsectors,
1482 					   rbio->stripe_nsectors);
1483 			}
1484 		}
1485 		ASSERT(found_missing);
1486 	}
1487 }
1488 
1489 /*
1490  * For the subpage case, we can no longer set the page uptodate directly for
1491  * stripe_pages[], thus we need to locate the individual sector.
1492  */
1493 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1494 					     phys_addr_t paddr)
1495 {
1496 	int i;
1497 
1498 	for (i = 0; i < rbio->nr_sectors; i++) {
1499 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
1500 
1501 		if (sector->has_paddr && sector->paddr == paddr)
1502 			return sector;
1503 	}
1504 	return NULL;
1505 }
1506 
1507 /*
1508  * this sets each page in the bio uptodate.  It should only be used on private
1509  * rbio pages, nothing that comes in from the higher layers
1510  */
1511 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1512 {
1513 	const u32 blocksize = rbio->bioc->fs_info->sectorsize;
1514 	phys_addr_t paddr;
1515 
1516 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1517 
1518 	btrfs_bio_for_each_block_all(paddr, bio, blocksize) {
1519 		struct sector_ptr *sector = find_stripe_sector(rbio, paddr);
1520 
1521 		ASSERT(sector);
1522 		if (sector)
1523 			sector->uptodate = 1;
1524 	}
1525 }
1526 
1527 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1528 {
1529 	phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
1530 	int i;
1531 
1532 	for (i = 0; i < rbio->nr_sectors; i++) {
1533 		if (rbio->stripe_sectors[i].paddr == bvec_paddr)
1534 			break;
1535 		if (rbio->bio_sectors[i].has_paddr &&
1536 		    rbio->bio_sectors[i].paddr == bvec_paddr)
1537 			break;
1538 	}
1539 	ASSERT(i < rbio->nr_sectors);
1540 	return i;
1541 }
1542 
1543 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1544 {
1545 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1546 	u32 bio_size = 0;
1547 	struct bio_vec *bvec;
1548 	int i;
1549 
1550 	bio_for_each_bvec_all(bvec, bio, i)
1551 		bio_size += bvec->bv_len;
1552 
1553 	/*
1554 	 * Since we can have multiple bios touching the error_bitmap, we cannot
1555 	 * call bitmap_set() without protection.
1556 	 *
1557 	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1558 	 */
1559 	for (i = total_sector_nr; i < total_sector_nr +
1560 	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1561 		set_bit(i, rbio->error_bitmap);
1562 }
1563 
1564 /* Verify the data sectors at read time. */
1565 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1566 				    struct bio *bio)
1567 {
1568 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1569 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1570 	phys_addr_t paddr;
1571 
1572 	/* No data csum for the whole stripe, no need to verify. */
1573 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1574 		return;
1575 
1576 	/* P/Q stripes, they have no data csum to verify against. */
1577 	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1578 		return;
1579 
1580 	btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) {
1581 		u8 csum_buf[BTRFS_CSUM_SIZE];
1582 		u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
1583 		int ret;
1584 
1585 		/* No csum for this sector, skip to the next sector. */
1586 		if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1587 			continue;
1588 
1589 		ret = btrfs_check_block_csum(fs_info, paddr,
1590 					     csum_buf, expected_csum);
1591 		if (ret < 0)
1592 			set_bit(total_sector_nr, rbio->error_bitmap);
1593 		total_sector_nr++;
1594 	}
1595 }
1596 
1597 static void raid_wait_read_end_io(struct bio *bio)
1598 {
1599 	struct btrfs_raid_bio *rbio = bio->bi_private;
1600 
1601 	if (bio->bi_status) {
1602 		rbio_update_error_bitmap(rbio, bio);
1603 	} else {
1604 		set_bio_pages_uptodate(rbio, bio);
1605 		verify_bio_data_sectors(rbio, bio);
1606 	}
1607 
1608 	bio_put(bio);
1609 	if (atomic_dec_and_test(&rbio->stripes_pending))
1610 		wake_up(&rbio->io_wait);
1611 }
1612 
1613 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1614 			     struct bio_list *bio_list)
1615 {
1616 	struct bio *bio;
1617 
1618 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1619 	while ((bio = bio_list_pop(bio_list))) {
1620 		bio->bi_end_io = raid_wait_read_end_io;
1621 
1622 		if (trace_raid56_read_enabled()) {
1623 			struct raid56_bio_trace_info trace_info = { 0 };
1624 
1625 			bio_get_trace_info(rbio, bio, &trace_info);
1626 			trace_raid56_read(rbio, bio, &trace_info);
1627 		}
1628 		submit_bio(bio);
1629 	}
1630 
1631 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1632 }
1633 
1634 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1635 {
1636 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1637 	int ret;
1638 
1639 	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1640 	if (ret < 0)
1641 		return ret;
1642 
1643 	index_stripe_sectors(rbio);
1644 	return 0;
1645 }
1646 
1647 /*
1648  * We use plugging callbacks to collect full stripes.
1649  * Any time we get a partial stripe write while plugged
1650  * we collect it into a list.  When the unplug comes down,
1651  * we sort the list by logical block number and merge
1652  * everything we can into the same rbios.
1653  */
1654 struct btrfs_plug_cb {
1655 	struct blk_plug_cb cb;
1656 	struct btrfs_fs_info *info;
1657 	struct list_head rbio_list;
1658 };
1659 
1660 /*
1661  * rbios on the plug list are sorted for easier merging.
1662  */
1663 static int plug_cmp(void *priv, const struct list_head *a,
1664 		    const struct list_head *b)
1665 {
1666 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1667 						       plug_list);
1668 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1669 						       plug_list);
1670 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1671 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1672 
1673 	if (a_sector < b_sector)
1674 		return -1;
1675 	if (a_sector > b_sector)
1676 		return 1;
1677 	return 0;
1678 }
1679 
1680 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1681 {
1682 	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1683 	struct btrfs_raid_bio *cur;
1684 	struct btrfs_raid_bio *last = NULL;
1685 
1686 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1687 
1688 	while (!list_empty(&plug->rbio_list)) {
1689 		cur = list_first_entry(&plug->rbio_list,
1690 				       struct btrfs_raid_bio, plug_list);
1691 		list_del_init(&cur->plug_list);
1692 
1693 		if (rbio_is_full(cur)) {
1694 			/* We have a full stripe, queue it down. */
1695 			start_async_work(cur, rmw_rbio_work);
1696 			continue;
1697 		}
1698 		if (last) {
1699 			if (rbio_can_merge(last, cur)) {
1700 				merge_rbio(last, cur);
1701 				free_raid_bio(cur);
1702 				continue;
1703 			}
1704 			start_async_work(last, rmw_rbio_work);
1705 		}
1706 		last = cur;
1707 	}
1708 	if (last)
1709 		start_async_work(last, rmw_rbio_work);
1710 	kfree(plug);
1711 }
1712 
1713 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1714 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1715 {
1716 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1717 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1718 	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1719 	const u32 orig_len = orig_bio->bi_iter.bi_size;
1720 	const u32 sectorsize = fs_info->sectorsize;
1721 	u64 cur_logical;
1722 
1723 	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1724 			    orig_logical + orig_len <= full_stripe_start +
1725 			    rbio->nr_data * BTRFS_STRIPE_LEN,
1726 			    rbio, orig_logical);
1727 
1728 	bio_list_add(&rbio->bio_list, orig_bio);
1729 	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1730 
1731 	/* Update the dbitmap. */
1732 	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1733 	     cur_logical += sectorsize) {
1734 		int bit = ((u32)(cur_logical - full_stripe_start) >>
1735 			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1736 
1737 		set_bit(bit, &rbio->dbitmap);
1738 	}
1739 }
1740 
1741 /*
1742  * our main entry point for writes from the rest of the FS.
1743  */
1744 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1745 {
1746 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1747 	struct btrfs_raid_bio *rbio;
1748 	struct btrfs_plug_cb *plug = NULL;
1749 	struct blk_plug_cb *cb;
1750 
1751 	rbio = alloc_rbio(fs_info, bioc);
1752 	if (IS_ERR(rbio)) {
1753 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1754 		bio_endio(bio);
1755 		return;
1756 	}
1757 	rbio->operation = BTRFS_RBIO_WRITE;
1758 	rbio_add_bio(rbio, bio);
1759 
1760 	/*
1761 	 * Don't plug on full rbios, just get them out the door
1762 	 * as quickly as we can
1763 	 */
1764 	if (!rbio_is_full(rbio)) {
1765 		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1766 		if (cb) {
1767 			plug = container_of(cb, struct btrfs_plug_cb, cb);
1768 			if (!plug->info) {
1769 				plug->info = fs_info;
1770 				INIT_LIST_HEAD(&plug->rbio_list);
1771 			}
1772 			list_add_tail(&rbio->plug_list, &plug->rbio_list);
1773 			return;
1774 		}
1775 	}
1776 
1777 	/*
1778 	 * Either we don't have any existing plug, or we're doing a full stripe,
1779 	 * queue the rmw work now.
1780 	 */
1781 	start_async_work(rbio, rmw_rbio_work);
1782 }
1783 
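/*
 * Verify one data sector of a vertical stripe against the csums stored in
 * rbio->csum_buf.
 *
 * Returns 0 if there is nothing to verify (no csum buffer, or a P/Q stripe),
 * otherwise the result of btrfs_check_block_csum().
 */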
1784 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1785 			     int stripe_nr, int sector_nr)
1786 {
1787 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1788 	struct sector_ptr *sector;
1789 	u8 csum_buf[BTRFS_CSUM_SIZE];
1790 	u8 *csum_expected;
1791 	int ret;
1792 
1793 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1794 		return 0;
1795 
1796 	/* No way to verify P/Q as they are not covered by data csum. */
1797 	if (stripe_nr >= rbio->nr_data)
1798 		return 0;
1799 	/*
1800 	 * If we're rebuilding a read, we have to use pages from the
1801 	 * bio list if possible.
1802 	 */
1803 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1804 		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1805 	} else {
1806 		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1807 	}
1808 
1809 	csum_expected = rbio->csum_buf +
1810 			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
1811 			fs_info->csum_size;
1812 	ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected);
1813 	return ret;
1814 }
1815 
1816 /*
1817  * Recover a vertical stripe specified by @sector_nr.
1818  * @*pointers are the pre-allocated pointers by the caller, so we don't
1819  * need to allocate/free the pointers again and again.
1820  */
1821 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1822 			    void **pointers, void **unmap_array)
1823 {
1824 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1825 	struct sector_ptr *sector;
1826 	const u32 sectorsize = fs_info->sectorsize;
1827 	int found_errors;
1828 	int faila;
1829 	int failb;
1830 	int stripe_nr;
1831 	int ret = 0;
1832 
1833 	/*
1834 	 * Now we just use bitmap to mark the horizontal stripes in
1835 	 * which we have data when doing parity scrub.
1836 	 */
1837 	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1838 	    !test_bit(sector_nr, &rbio->dbitmap))
1839 		return 0;
1840 
1841 	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1842 						 &failb);
1843 	/*
1844 	 * No errors in the vertical stripe, skip it.  This can happen for a
1845 	 * recovery where only part of a stripe failed the csum check.
1846 	 */
1847 	if (!found_errors)
1848 		return 0;
1849 
1850 	if (unlikely(found_errors > rbio->bioc->max_errors))
1851 		return -EIO;
1852 
1853 	/*
1854 	 * Setup our array of pointers with sectors from each stripe
1855 	 *
1856 	 * NOTE: store a duplicate array of pointers to preserve the
1857 	 * pointer order.
1858 	 */
1859 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1860 		/*
1861 		 * If we're rebuilding a read, we have to use pages from the
1862 		 * bio list if possible.
1863 		 */
1864 		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1865 			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1866 		} else {
1867 			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1868 		}
1869 		pointers[stripe_nr] = kmap_local_sector(sector);
1870 		unmap_array[stripe_nr] = pointers[stripe_nr];
1871 	}
1872 
1873 	/* All raid6 handling here */
1874 	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1875 		/* Single failure, rebuild from parity raid5 style */
1876 		if (failb < 0) {
1877 			if (faila == rbio->nr_data)
1878 				/*
1879 				 * Just the P stripe has failed, without
1880 				 * a bad data or Q stripe.
1881 				 * We have nothing to do, just skip the
1882 				 * recovery for this stripe.
1883 				 */
1884 				goto cleanup;
1885 			/*
1886 			 * a single failure in raid6 is rebuilt
1887 			 * in the pstripe code below
1888 			 */
1889 			goto pstripe;
1890 		}
1891 
1892 		/*
1893 		 * If the Q stripe has failed, do a P stripe reconstruction from
1894 		 * the xors.
1895 		 * If both the Q stripe and the P stripe have failed, we're
1896 		 * here due to a crc mismatch and we can't give them the
1897 		 * data they want.
1898 		 */
1899 		if (failb == rbio->real_stripes - 1) {
1900 			if (faila == rbio->real_stripes - 2)
1901 				/*
1902 				 * Only P and Q are corrupted.
1903 				 * We only care about data stripes recovery,
1904 				 * can skip this vertical stripe.
1905 				 */
1906 				goto cleanup;
1907 			/*
1908 			 * Otherwise we have one bad data stripe and
1909 			 * a good P stripe.  raid5!
1910 			 */
1911 			goto pstripe;
1912 		}
1913 
1914 		if (failb == rbio->real_stripes - 2) {
1915 			raid6_datap_recov(rbio->real_stripes, sectorsize,
1916 					  faila, pointers);
1917 		} else {
1918 			raid6_2data_recov(rbio->real_stripes, sectorsize,
1919 					  faila, failb, pointers);
1920 		}
1921 	} else {
1922 		void *p;
1923 
1924 		/* Rebuild from P stripe here (raid5 or raid6). */
1925 		ASSERT(failb == -1);
1926 pstripe:
1927 		/* Copy parity block into failed block to start with */
1928 		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1929 
1930 		/* Rearrange the pointer array */
1931 		p = pointers[faila];
1932 		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1933 		     stripe_nr++)
1934 			pointers[stripe_nr] = pointers[stripe_nr + 1];
1935 		pointers[rbio->nr_data - 1] = p;
1936 
1937 		/* Xor in the rest */
1938 		run_xor(pointers, rbio->nr_data - 1, sectorsize);
1939 
1940 	}
1941 
1942 	/*
1943 	 * No matter if this is a RMW or recovery, we should have all
1944 	 * failed sectors repaired in the vertical stripe, thus they are now
1945 	 * uptodate.
1946 	 * Especially if we decide to cache the rbio, we need to
1947 	 * have at least all data sectors uptodate.
1948 	 *
1949 	 * If possible, also check if the repaired sector matches its data
1950 	 * checksum.
1951 	 */
1952 	if (faila >= 0) {
1953 		ret = verify_one_sector(rbio, faila, sector_nr);
1954 		if (ret < 0)
1955 			goto cleanup;
1956 
1957 		sector = rbio_stripe_sector(rbio, faila, sector_nr);
1958 		sector->uptodate = 1;
1959 	}
1960 	if (failb >= 0) {
1961 		ret = verify_one_sector(rbio, failb, sector_nr);
1962 		if (ret < 0)
1963 			goto cleanup;
1964 
1965 		sector = rbio_stripe_sector(rbio, failb, sector_nr);
1966 		sector->uptodate = 1;
1967 	}
1968 
1969 cleanup:
1970 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1971 		kunmap_local(unmap_array[stripe_nr]);
1972 	return ret;
1973 }
1974 
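/*
 * Recover all vertical stripes of the rbio which have errors recorded in the
 * error bitmap.  This is shared by the dedicated recovery path and the RMW
 * path after its initial read.
 */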
1975 static int recover_sectors(struct btrfs_raid_bio *rbio)
1976 {
1977 	void **pointers = NULL;
1978 	void **unmap_array = NULL;
1979 	int sectornr;
1980 	int ret = 0;
1981 
1982 	/*
1983 	 * @pointers array stores the pointer for each sector.
1984 	 *
1985 	 * @unmap_array stores copy of pointers that does not get reordered
1986 	 * during reconstruction so that kunmap_local works.
1987 	 */
1988 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1989 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1990 	if (!pointers || !unmap_array) {
1991 		ret = -ENOMEM;
1992 		goto out;
1993 	}
1994 
1995 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1996 		spin_lock(&rbio->bio_list_lock);
1997 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1998 		spin_unlock(&rbio->bio_list_lock);
1999 	}
2000 
2001 	index_rbio_pages(rbio);
2002 
2003 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2004 		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2005 		if (ret < 0)
2006 			break;
2007 	}
2008 
2009 out:
2010 	kfree(pointers);
2011 	kfree(unmap_array);
2012 	return ret;
2013 }
2014 
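/*
 * The main recovery routine: allocate pages for all stripes, re-read every
 * sector which is not already marked as failed (never trusting cached data),
 * rebuild the failed sectors and then complete the original bios.
 */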
2015 static void recover_rbio(struct btrfs_raid_bio *rbio)
2016 {
2017 	struct bio_list bio_list = BIO_EMPTY_LIST;
2018 	int total_sector_nr;
2019 	int ret = 0;
2020 
2021 	/*
2022 	 * Either we're doing recovery for a read failure or a degraded write;
2023 	 * the caller should have set the error bitmap correctly.
2024 	 */
2025 	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2026 
2027 	/* For recovery, we need to read all sectors including P/Q. */
2028 	ret = alloc_rbio_pages(rbio);
2029 	if (ret < 0)
2030 		goto out;
2031 
2032 	index_rbio_pages(rbio);
2033 
2034 	/*
2035 	 * Read everything that hasn't failed.  However this time we will
2036 	 * not trust any cached sector.
2037 	 * A cached sector may contain stale data from a part of the stripe
2038 	 * that the higher layer is not reading.
2039 	 *
2040 	 * So here we always re-read everything in the recovery path.
2041 	 */
2042 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2043 	     total_sector_nr++) {
2044 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2045 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2046 		struct sector_ptr *sector;
2047 
2048 		/*
2049 		 * Skip the range which has error.  It can be a range which is
2050 		 * Skip the range which has an error.  It can be a range which
2051 		 * is marked as an error (for a csum mismatch), or it can be on
2052 		 * a missing device.
2053 		if (!rbio->bioc->stripes[stripe].dev->bdev ||
2054 		    test_bit(total_sector_nr, rbio->error_bitmap)) {
2055 			/*
2056 			 * Also set the error bit for missing device, which
2057 			 * may not yet have its error bit set.
2058 			 */
2059 			set_bit(total_sector_nr, rbio->error_bitmap);
2060 			continue;
2061 		}
2062 
2063 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2064 		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2065 					 sectornr, REQ_OP_READ);
2066 		if (ret < 0) {
2067 			bio_list_put(&bio_list);
2068 			goto out;
2069 		}
2070 	}
2071 
2072 	submit_read_wait_bio_list(rbio, &bio_list);
2073 	ret = recover_sectors(rbio);
2074 out:
2075 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2076 }
2077 
2078 static void recover_rbio_work(struct work_struct *work)
2079 {
2080 	struct btrfs_raid_bio *rbio;
2081 
2082 	rbio = container_of(work, struct btrfs_raid_bio, work);
2083 	if (!lock_stripe_add(rbio))
2084 		recover_rbio(rbio);
2085 }
2086 
2087 static void recover_rbio_work_locked(struct work_struct *work)
2088 {
2089 	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2090 }
2091 
2092 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2093 {
2094 	bool found = false;
2095 	int sector_nr;
2096 
2097 	/*
2098 	 * This is for RAID6 extra recovery tries, thus the mirror number
2099 	 * should be larger than 2.
2100 	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2101 	 * RAID5 methods.
2102 	 */
2103 	ASSERT(mirror_num > 2);
2104 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2105 		int found_errors;
2106 		int faila;
2107 		int failb;
2108 
2109 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2110 							 &faila, &failb);
2111 		/* This vertical stripe doesn't have errors. */
2112 		if (!found_errors)
2113 			continue;
2114 
2115 		/*
2116 		 * If we found errors, there should be only one error marked
2117 		 * by previous set_rbio_range_error().
2118 		 */
2119 		ASSERT(found_errors == 1);
2120 		found = true;
2121 
2122 		/* Now select another stripe to mark as error. */
2123 		failb = rbio->real_stripes - (mirror_num - 1);
2124 		if (failb <= faila)
2125 			failb--;
2126 
2127 		/* Set the extra bit in error bitmap. */
2128 		if (failb >= 0)
2129 			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2130 				rbio->error_bitmap);
2131 	}
2132 
2133 	/* We should have found at least one vertical stripe with an error. */
2134 	ASSERT(found);
2135 }
2136 
2137 /*
2138  * the main entry point for reads from the higher layers.  This
2139  * is really only called when the normal read path had a failure,
2140  * so we assume the bio they send down corresponds to a failed part
2141  * of the drive.
2142  */
2143 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2144 			   int mirror_num)
2145 {
2146 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2147 	struct btrfs_raid_bio *rbio;
2148 
2149 	rbio = alloc_rbio(fs_info, bioc);
2150 	if (IS_ERR(rbio)) {
2151 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2152 		bio_endio(bio);
2153 		return;
2154 	}
2155 
2156 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2157 	rbio_add_bio(rbio, bio);
2158 
2159 	set_rbio_range_error(rbio, bio);
2160 
2161 	/*
2162 	 * Loop retry:
2163 	 * for 'mirror_num == 2', reconstruct from all other stripes.
2164 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2165 	 */
2166 	if (mirror_num > 2)
2167 		set_rbio_raid6_extra_error(rbio, mirror_num);
2168 
2169 	start_async_work(rbio, recover_rbio_work);
2170 }
2171 
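/*
 * Look up the data csums covering the data stripes of the full stripe and
 * store them into rbio->csum_buf / rbio->csum_bitmap, so that the read end
 * io handler can verify the data read from disk.
 *
 * Failure here is not fatal, the rbio just continues without csum
 * verification for this sub-stripe write.
 */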
2172 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2173 {
2174 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2175 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2176 						       rbio->bioc->full_stripe_logical);
2177 	const u64 start = rbio->bioc->full_stripe_logical;
2178 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2179 			fs_info->sectorsize_bits;
2180 	int ret;
2181 
2182 	/* The rbio should not have its csum buffer initialized. */
2183 	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2184 
2185 	/*
2186 	 * Skip the csum search if:
2187 	 *
2188 	 * - The rbio doesn't belong to data block groups
2189 	 *   Then we are doing IO for tree blocks, no need to search csums.
2190 	 *
2191 	 * - The rbio belongs to mixed block groups
2192 	 *   This is to avoid deadlock, as we're already holding the full
2193 	 *   stripe lock, if we trigger a metadata read, and it needs to do
2194 	 *   raid56 recovery, we will deadlock.
2195 	 */
2196 	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2197 	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2198 		return;
2199 
2200 	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2201 				 fs_info->csum_size, GFP_NOFS);
2202 	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2203 					  GFP_NOFS);
2204 	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2205 		ret = -ENOMEM;
2206 		goto error;
2207 	}
2208 
2209 	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2210 					rbio->csum_buf, rbio->csum_bitmap);
2211 	if (ret < 0)
2212 		goto error;
2213 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2214 		goto no_csum;
2215 	return;
2216 
2217 error:
2218 	/*
2219 	 * We failed to allocate memory or grab the csum, but it's not fatal,
2220 	 * we can still continue.  But better to warn users that RMW is no
2221 	 * longer safe for this particular sub-stripe write.
2222 	 */
2223 	btrfs_warn_rl(fs_info,
2224 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2225 			rbio->bioc->full_stripe_logical, ret);
2226 no_csum:
2227 	kfree(rbio->csum_buf);
2228 	bitmap_free(rbio->csum_bitmap);
2229 	rbio->csum_buf = NULL;
2230 	rbio->csum_bitmap = NULL;
2231 }
2232 
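/*
 * For a sub-stripe (RMW) write: read every sector of the full stripe, wait
 * for the reads to finish and then repair whatever failed the read or the
 * csum verification.
 */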
2233 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2234 {
2235 	struct bio_list bio_list = BIO_EMPTY_LIST;
2236 	int total_sector_nr;
2237 	int ret = 0;
2238 
2239 	/*
2240 	 * Fill the data csums we need for data verification.  We need to fill
2241 	 * the csum_bitmap/csum_buf first, as our endio function will try to
2242 	 * verify the data sectors.
2243 	 */
2244 	fill_data_csums(rbio);
2245 
2246 	/*
2247 	 * Build a list of bios to read all sectors (including data and P/Q).
2248 	 *
2249 	 * Reading everything is needed for the later csum verification and recovery.
2250 	 */
2251 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2252 	     total_sector_nr++) {
2253 		struct sector_ptr *sector;
2254 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2255 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2256 
2257 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2258 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2259 			       stripe, sectornr, REQ_OP_READ);
2260 		if (ret) {
2261 			bio_list_put(&bio_list);
2262 			return ret;
2263 		}
2264 	}
2265 
2266 	/*
2267 	 * We may or may not have any corrupted sectors (including missing dev
2268 	 * and csum mismatch), just let recover_sectors() handle them all.
2269 	 */
2270 	submit_read_wait_bio_list(rbio, &bio_list);
2271 	return recover_sectors(rbio);
2272 }
2273 
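/*
 * End io handler for the writes submitted by submit_write_bios().  Failed
 * bios are recorded in the error bitmap and the waiter is woken up once the
 * last pending bio completes.
 */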
2274 static void raid_wait_write_end_io(struct bio *bio)
2275 {
2276 	struct btrfs_raid_bio *rbio = bio->bi_private;
2277 
2278 	if (bio->bi_status)
2279 		rbio_update_error_bitmap(rbio, bio);
2280 	bio_put(bio);
2281 	if (atomic_dec_and_test(&rbio->stripes_pending))
2282 		wake_up(&rbio->io_wait);
2283 }
2284 
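/*
 * Submit all write bios in @bio_list.  Unlike the read variant this does not
 * wait for completion, the callers wait on rbio->io_wait themselves.
 */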
2285 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2286 			      struct bio_list *bio_list)
2287 {
2288 	struct bio *bio;
2289 
2290 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2291 	while ((bio = bio_list_pop(bio_list))) {
2292 		bio->bi_end_io = raid_wait_write_end_io;
2293 
2294 		if (trace_raid56_write_enabled()) {
2295 			struct raid56_bio_trace_info trace_info = { 0 };
2296 
2297 			bio_get_trace_info(rbio, bio, &trace_info);
2298 			trace_raid56_write(rbio, bio, &trace_info);
2299 		}
2300 		submit_bio(bio);
2301 	}
2302 }
2303 
2304 /*
2305  * Determine if we need to read any sector from the disk.
2306  * Should only be used in the RMW path, to skip reads for a cached rbio.
2307  */
2308 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2309 {
2310 	int i;
2311 
2312 	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2313 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
2314 
2315 		/*
2316 		 * We have a sector which has no backing page or is not
2317 		 * uptodate, thus this rbio cannot be a cached one, as a cached
2318 		 * one must have all its data sectors present and uptodate.
2319 		 */
2320 		if (!sector->has_paddr || !sector->uptodate)
2321 			return true;
2322 	}
2323 	return false;
2324 }
2325 
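/*
 * The read-modify-write path for one full stripe:
 *
 * 1) Allocate the parity pages, and for sub-stripe writes also the data
 *    pages, reading and repairing the missing data sectors.
 * 2) Lock out further merging and regenerate P/Q for every vertical stripe.
 * 3) Assemble and submit the write bios, wait for them and check the result
 *    against the error tolerance.
 */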
2326 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2327 {
2328 	struct bio_list bio_list;
2329 	int sectornr;
2330 	int ret = 0;
2331 
2332 	/*
2333 	 * Allocate the pages for parity first, as P/Q pages will always be
2334 	 * needed for both full-stripe and sub-stripe writes.
2335 	 */
2336 	ret = alloc_rbio_parity_pages(rbio);
2337 	if (ret < 0)
2338 		goto out;
2339 
2340 	/*
2341 	 * For a full stripe write, or if we have every data sector already
2342 	 * cached, we can go to the write path immediately.
2343 	 */
2344 	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2345 		/*
2346 		 * Now we're doing sub-stripe write, also need all data stripes
2347 		 * to do the full RMW.
2348 		 */
2349 		ret = alloc_rbio_data_pages(rbio);
2350 		if (ret < 0)
2351 			goto out;
2352 
2353 		index_rbio_pages(rbio);
2354 
2355 		ret = rmw_read_wait_recover(rbio);
2356 		if (ret < 0)
2357 			goto out;
2358 	}
2359 
2360 	/*
2361 	 * At this stage we're not allowed to add any new bios to the
2362 	 * bio list any more; anyone else that wants to change this stripe
2363 	 * needs to do their own rmw.
2364 	 */
2365 	spin_lock(&rbio->bio_list_lock);
2366 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2367 	spin_unlock(&rbio->bio_list_lock);
2368 
2369 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2370 
2371 	index_rbio_pages(rbio);
2372 
2373 	/*
2374 	 * We don't cache full rbios because we're assuming
2375 	 * the higher layers are unlikely to use this area of
2376 	 * the disk again soon.  If they do use it again,
2377 	 * hopefully they will send another full bio.
2378 	 */
2379 	if (!rbio_is_full(rbio))
2380 		cache_rbio_pages(rbio);
2381 	else
2382 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2383 
2384 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2385 		generate_pq_vertical(rbio, sectornr);
2386 
2387 	bio_list_init(&bio_list);
2388 	ret = rmw_assemble_write_bios(rbio, &bio_list);
2389 	if (ret < 0)
2390 		goto out;
2391 
2392 	/* We should have at least one bio assembled. */
2393 	ASSERT(bio_list_size(&bio_list));
2394 	submit_write_bios(rbio, &bio_list);
2395 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2396 
2397 	/* We may have more errors than our tolerance during the read. */
2398 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2399 		int found_errors;
2400 
2401 		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2402 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2403 			ret = -EIO;
2404 			break;
2405 		}
2406 	}
2407 out:
2408 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2409 }
2410 
2411 static void rmw_rbio_work(struct work_struct *work)
2412 {
2413 	struct btrfs_raid_bio *rbio;
2414 
2415 	rbio = container_of(work, struct btrfs_raid_bio, work);
2416 	if (lock_stripe_add(rbio) == 0)
2417 		rmw_rbio(rbio);
2418 }
2419 
2420 static void rmw_rbio_work_locked(struct work_struct *work)
2421 {
2422 	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2423 }
2424 
2425 /*
2426  * The following code is used to scrub/replace the parity stripe
2427  *
2428  * Caller must have already increased bio_counter for getting @bioc.
2429  *
2430  * Note: We need to make sure all the pages added into the scrub/replace
2431  * raid bio are correct and not changed during the scrub/replace.  That is,
2432  * those pages hold only metadata or file data covered by checksums.
2433  */
2434 
2435 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2436 				struct btrfs_io_context *bioc,
2437 				struct btrfs_device *scrub_dev,
2438 				unsigned long *dbitmap, int stripe_nsectors)
2439 {
2440 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2441 	struct btrfs_raid_bio *rbio;
2442 	int i;
2443 
2444 	rbio = alloc_rbio(fs_info, bioc);
2445 	if (IS_ERR(rbio))
2446 		return NULL;
2447 	bio_list_add(&rbio->bio_list, bio);
2448 	/*
2449 	 * This is a special bio which is used to hold the completion handler
2450 	 * and make the scrub rbio similar to the other types.
2451 	 */
2452 	ASSERT(!bio->bi_iter.bi_size);
2453 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2454 
2455 	/*
2456 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2457 	 * to the end position, so this search can start from the first parity
2458 	 * stripe.
2459 	 */
2460 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2461 		if (bioc->stripes[i].dev == scrub_dev) {
2462 			rbio->scrubp = i;
2463 			break;
2464 		}
2465 	}
2466 	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2467 
2468 	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2469 	return rbio;
2470 }
2471 
2472 /*
2473  * We just scrub the parity that we have correct data on the same horizontal,
2474  * so we needn't allocate all pages for all the stripes.
2475  */
2476 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2477 {
2478 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2479 	int total_sector_nr;
2480 
2481 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2482 	     total_sector_nr++) {
2483 		struct page *page;
2484 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2485 		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2486 
2487 		if (!test_bit(sectornr, &rbio->dbitmap))
2488 			continue;
2489 		if (rbio->stripe_pages[index])
2490 			continue;
2491 		page = alloc_page(GFP_NOFS);
2492 		if (!page)
2493 			return -ENOMEM;
2494 		rbio->stripe_pages[index] = page;
2495 	}
2496 	index_stripe_sectors(rbio);
2497 	return 0;
2498 }
2499 
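/*
 * Recompute the parity for every vertical stripe marked in dbitmap, compare
 * it with the parity currently held for the scrubbed device (rbio->scrubp)
 * and write back only the sectors which differ.  If a replace is running
 * against the scrubbed device, the writes are duplicated to the replace
 * target.
 */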
2500 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2501 {
2502 	struct btrfs_io_context *bioc = rbio->bioc;
2503 	const u32 sectorsize = bioc->fs_info->sectorsize;
2504 	void **pointers = rbio->finish_pointers;
2505 	unsigned long *pbitmap = &rbio->finish_pbitmap;
2506 	int nr_data = rbio->nr_data;
2507 	int stripe;
2508 	int sectornr;
2509 	bool has_qstripe;
2510 	struct page *page;
2511 	struct sector_ptr p_sector = { 0 };
2512 	struct sector_ptr q_sector = { 0 };
2513 	struct bio_list bio_list;
2514 	int is_replace = 0;
2515 	int ret;
2516 
2517 	bio_list_init(&bio_list);
2518 
2519 	if (rbio->real_stripes - rbio->nr_data == 1)
2520 		has_qstripe = false;
2521 	else if (rbio->real_stripes - rbio->nr_data == 2)
2522 		has_qstripe = true;
2523 	else
2524 		BUG();
2525 
2526 	/*
2527 	 * If replace is running and our P/Q stripe is being replaced, we
2528 	 * need to duplicate the final write to the replace target.
2529 	 */
2530 	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2531 		is_replace = 1;
2532 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2533 	}
2534 
2535 	/*
2536 	 * The higher layers (scrubber) are unlikely to use this
2537 	 * area of the disk again soon, so don't cache
2538 	 * it.
2539 	 */
2540 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2541 
2542 	page = alloc_page(GFP_NOFS);
2543 	if (!page)
2544 		return -ENOMEM;
2545 	p_sector.has_paddr = true;
2546 	p_sector.paddr = page_to_phys(page);
2547 	p_sector.uptodate = 1;
2548 	page = NULL;
2549 
2550 	if (has_qstripe) {
2551 		/* RAID6, allocate and map temp space for the Q stripe */
2552 		page = alloc_page(GFP_NOFS);
2553 		if (!page) {
2554 			__free_page(phys_to_page(p_sector.paddr));
2555 			p_sector.has_paddr = false;
2556 			return -ENOMEM;
2557 		}
2558 		q_sector.has_paddr = true;
2559 		q_sector.paddr = page_to_phys(page);
2560 		q_sector.uptodate = 1;
2561 		page = NULL;
2562 		pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
2563 	}
2564 
2565 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2566 
2567 	/* Map the parity stripe just once */
2568 	pointers[nr_data] = kmap_local_sector(&p_sector);
2569 
2570 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2571 		struct sector_ptr *sector;
2572 		void *parity;
2573 
2574 		/* first collect one page from each data stripe */
2575 		for (stripe = 0; stripe < nr_data; stripe++) {
2576 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2577 			pointers[stripe] = kmap_local_sector(sector);
2578 		}
2579 
2580 		if (has_qstripe) {
2581 			assert_rbio(rbio);
2582 			/* RAID6, call the library function to fill in our P/Q */
2583 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2584 						pointers);
2585 		} else {
2586 			/* raid5 */
2587 			memcpy(pointers[nr_data], pointers[0], sectorsize);
2588 			run_xor(pointers + 1, nr_data - 1, sectorsize);
2589 		}
2590 
2591 		/* Check scrubbing parity and repair it */
2592 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2593 		parity = kmap_local_sector(sector);
2594 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2595 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2596 		else
2597 			/* Parity is right, needn't writeback */
2598 			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2599 		kunmap_local(parity);
2600 
2601 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2602 			kunmap_local(pointers[stripe]);
2603 	}
2604 
2605 	kunmap_local(pointers[nr_data]);
2606 	__free_page(phys_to_page(p_sector.paddr));
2607 	p_sector.has_paddr = false;
2608 	if (q_sector.has_paddr) {
2609 		__free_page(phys_to_page(q_sector.paddr));
2610 		q_sector.has_paddr = false;
2611 	}
2612 
2613 	/*
2614 	 * time to start writing.  Make bios for everything from the
2615 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2616 	 * everything else.
2617 	 */
2618 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2619 		struct sector_ptr *sector;
2620 
2621 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2622 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2623 					 sectornr, REQ_OP_WRITE);
2624 		if (ret)
2625 			goto cleanup;
2626 	}
2627 
2628 	if (!is_replace)
2629 		goto submit_write;
2630 
2631 	/*
2632 	 * Replace is running and our parity stripe needs to be duplicated to
2633 	 * the target device.  Check we have a valid source stripe number.
2634 	 */
2635 	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2636 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2637 		struct sector_ptr *sector;
2638 
2639 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2640 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2641 					 rbio->real_stripes,
2642 					 sectornr, REQ_OP_WRITE);
2643 		if (ret)
2644 			goto cleanup;
2645 	}
2646 
2647 submit_write:
2648 	submit_write_bios(rbio, &bio_list);
2649 	return 0;
2650 
2651 cleanup:
2652 	bio_list_put(&bio_list);
2653 	return ret;
2654 }
2655 
2656 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2657 {
2658 	if (stripe >= 0 && stripe < rbio->nr_data)
2659 		return 1;
2660 	return 0;
2661 }
2662 
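/*
 * Repair the failed sectors found during a scrub read.  This is more
 * restrictive than the regular recovery path, as the parity being scrubbed
 * cannot be used to repair data.
 */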
2663 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2664 {
2665 	void **pointers = NULL;
2666 	void **unmap_array = NULL;
2667 	int sector_nr;
2668 	int ret = 0;
2669 
2670 	/*
2671 	 * @pointers array stores the pointer for each sector.
2672 	 *
2673 	 * @unmap_array stores copy of pointers that does not get reordered
2674 	 * during reconstruction so that kunmap_local works.
2675 	 */
2676 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2677 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2678 	if (!pointers || !unmap_array) {
2679 		ret = -ENOMEM;
2680 		goto out;
2681 	}
2682 
2683 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2684 		int dfail = 0, failp = -1;
2685 		int faila;
2686 		int failb;
2687 		int found_errors;
2688 
2689 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2690 							 &faila, &failb);
2691 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2692 			ret = -EIO;
2693 			goto out;
2694 		}
2695 		if (found_errors == 0)
2696 			continue;
2697 
2698 		/* We should have at least one error here. */
2699 		ASSERT(faila >= 0 || failb >= 0);
2700 
2701 		if (is_data_stripe(rbio, faila))
2702 			dfail++;
2703 		else if (is_parity_stripe(faila))
2704 			failp = faila;
2705 
2706 		if (is_data_stripe(rbio, failb))
2707 			dfail++;
2708 		else if (is_parity_stripe(failb))
2709 			failp = failb;
2710 		/*
2711 		 * Because we cannot use the parity being scrubbed to repair
2712 		 * the data, our repair capability is reduced.  (In the
2713 		 * case of RAID5, we cannot repair anything.)
2714 		 */
2715 		if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
2716 			ret = -EIO;
2717 			goto out;
2718 		}
2719 		/*
2720 		 * If all the data is good and only the parity is bad, just
2721 		 * repair the parity; no need to recover data stripes.
2722 		 */
2723 		if (dfail == 0)
2724 			continue;
2725 
2726 		/*
2727 		 * Here we have one corrupted data stripe and one corrupted
2728 		 * parity on RAID6.  If the corrupted parity is the one being
2729 		 * scrubbed, we can use the other parity to repair the data;
2730 		 * otherwise we cannot repair the data stripe.
2731 		 */
2732 		if (unlikely(failp != rbio->scrubp)) {
2733 			ret = -EIO;
2734 			goto out;
2735 		}
2736 
2737 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2738 		if (ret < 0)
2739 			goto out;
2740 	}
2741 out:
2742 	kfree(pointers);
2743 	kfree(unmap_array);
2744 	return ret;
2745 }
2746 
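/*
 * Read every sector of the vertical stripes covered by dbitmap which is not
 * already provided by the bio list or uptodate in the stripe cache, then
 * wait for the reads to finish.
 */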
2747 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2748 {
2749 	struct bio_list bio_list = BIO_EMPTY_LIST;
2750 	int total_sector_nr;
2751 	int ret = 0;
2752 
2753 	/* Build a list of bios to read all the missing parts. */
2754 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2755 	     total_sector_nr++) {
2756 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2757 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2758 		struct sector_ptr *sector;
2759 
2760 		/* No data in the vertical stripe, no need to read. */
2761 		if (!test_bit(sectornr, &rbio->dbitmap))
2762 			continue;
2763 
2764 		/*
2765 		 * We want to find all the sectors missing from the rbio and
2766 		 * read them from the disk. If sector_in_rbio() finds a sector
2767 		 * in the bio list we don't need to read it off the stripe.
2768 		 */
2769 		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2770 		if (sector)
2771 			continue;
2772 
2773 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2774 		/*
2775 		 * The bio cache may have handed us an uptodate sector.  If so,
2776 		 * use it.
2777 		 */
2778 		if (sector->uptodate)
2779 			continue;
2780 
2781 		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2782 					 sectornr, REQ_OP_READ);
2783 		if (ret) {
2784 			bio_list_put(&bio_list);
2785 			return ret;
2786 		}
2787 	}
2788 
2789 	submit_read_wait_bio_list(rbio, &bio_list);
2790 	return 0;
2791 }
2792 
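/*
 * The parity scrub path: read the needed sectors, repair any read failures,
 * then recompute the parity and write back whatever differs through
 * finish_parity_scrub().
 */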
2793 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2794 {
2795 	int sector_nr;
2796 	int ret;
2797 
2798 	ret = alloc_rbio_essential_pages(rbio);
2799 	if (ret)
2800 		goto out;
2801 
2802 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2803 
2804 	ret = scrub_assemble_read_bios(rbio);
2805 	if (ret < 0)
2806 		goto out;
2807 
2808 	/* We may have some failures, recover the failed sectors first. */
2809 	ret = recover_scrub_rbio(rbio);
2810 	if (ret < 0)
2811 		goto out;
2812 
2813 	/*
2814 	 * We have every sector properly prepared.  We can finish the scrub
2815 	 * and write back the good content.
2816 	 */
2817 	ret = finish_parity_scrub(rbio);
2818 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2819 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2820 		int found_errors;
2821 
2822 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2823 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
2824 			ret = -EIO;
2825 			break;
2826 		}
2827 	}
2828 out:
2829 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2830 }
2831 
2832 static void scrub_rbio_work_locked(struct work_struct *work)
2833 {
2834 	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2835 }
2836 
2837 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2838 {
2839 	if (!lock_stripe_add(rbio))
2840 		start_async_work(rbio, scrub_rbio_work_locked);
2841 }
2842 
2843 /*
2844  * This is for scrub call sites where we already have correct data contents.
2845  * This allows us to avoid reading data stripes again.
2846  *
2847  * Unfortunately here we have to copy the folios, rather than reusing the pages.
2848  * This is due to the fact that the rbio has its own page management for its cache.
2849  */
2850 void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
2851 				     struct folio **data_folios, u64 data_logical)
2852 {
2853 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2854 	const u64 offset_in_full_stripe = data_logical -
2855 					  rbio->bioc->full_stripe_logical;
2856 	unsigned int findex = 0;
2857 	unsigned int foffset = 0;
2858 	int ret;
2859 
2860 	/* We shouldn't hit RAID56 for bs > ps cases for now. */
2861 	ASSERT(fs_info->sectorsize <= PAGE_SIZE);
2862 
2863 	/*
2864 	 * If we hit ENOMEM here but the allocation later at
2865 	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do
2866 	 * the extra read, not a big deal.
2867 	 *
2868 	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2869 	 * the bio will get a proper error number set.
2870 	 */
2871 	ret = alloc_rbio_data_pages(rbio);
2872 	if (ret < 0)
2873 		return;
2874 
2875 	/* data_logical must be at stripe boundary and inside the full stripe. */
2876 	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2877 	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2878 
2879 	for (unsigned int cur_off = offset_in_full_stripe;
2880 	     cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
2881 	     cur_off += PAGE_SIZE) {
2882 		const unsigned int pindex = cur_off >> PAGE_SHIFT;
2883 		void *kaddr;
2884 
2885 		kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
2886 		memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
2887 		kunmap_local(kaddr);
2888 
2889 		foffset += PAGE_SIZE;
2890 		ASSERT(foffset <= folio_size(data_folios[findex]));
2891 		if (foffset == folio_size(data_folios[findex])) {
2892 			findex++;
2893 			foffset = 0;
2894 		}
2895 	}
2896 	for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits;
2897 	     sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits;
2898 	     sector_nr++)
2899 		rbio->stripe_sectors[sector_nr].uptodate = true;
2900 }
2901