xref: /linux/fs/btrfs/volumes.c (revision 93d90ad708b8da6efc0e487b66111aa9db7f70c7)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <linux/ratelimit.h>
27 #include <linux/kthread.h>
28 #include <linux/raid/pq.h>
29 #include <linux/semaphore.h>
30 #include <asm/div64.h>
31 #include "ctree.h"
32 #include "extent_map.h"
33 #include "disk-io.h"
34 #include "transaction.h"
35 #include "print-tree.h"
36 #include "volumes.h"
37 #include "raid56.h"
38 #include "async-thread.h"
39 #include "check-integrity.h"
40 #include "rcu-string.h"
41 #include "math.h"
42 #include "dev-replace.h"
43 #include "sysfs.h"
44 
45 static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 				struct btrfs_root *root,
47 				struct btrfs_device *device);
48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
50 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
51 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
52 
53 DEFINE_MUTEX(uuid_mutex);
54 static LIST_HEAD(fs_uuids);
55 
56 static struct btrfs_fs_devices *__alloc_fs_devices(void)
57 {
58 	struct btrfs_fs_devices *fs_devs;
59 
60 	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
61 	if (!fs_devs)
62 		return ERR_PTR(-ENOMEM);
63 
64 	mutex_init(&fs_devs->device_list_mutex);
65 
66 	INIT_LIST_HEAD(&fs_devs->devices);
67 	INIT_LIST_HEAD(&fs_devs->resized_devices);
68 	INIT_LIST_HEAD(&fs_devs->alloc_list);
69 	INIT_LIST_HEAD(&fs_devs->list);
70 
71 	return fs_devs;
72 }
73 
74 /**
75  * alloc_fs_devices - allocate struct btrfs_fs_devices
76  * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
77  *		generated.
78  *
79  * Return: a pointer to a new &struct btrfs_fs_devices on success;
80  * ERR_PTR() on error.  Returned struct is not linked onto any lists and
81  * can be destroyed with kfree() right away.
82  */
83 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
84 {
85 	struct btrfs_fs_devices *fs_devs;
86 
87 	fs_devs = __alloc_fs_devices();
88 	if (IS_ERR(fs_devs))
89 		return fs_devs;
90 
91 	if (fsid)
92 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
93 	else
94 		generate_random_uuid(fs_devs->fsid);
95 
96 	return fs_devs;
97 }
98 
99 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
100 {
101 	struct btrfs_device *device;
102 	WARN_ON(fs_devices->opened);
103 	while (!list_empty(&fs_devices->devices)) {
104 		device = list_entry(fs_devices->devices.next,
105 				    struct btrfs_device, dev_list);
106 		list_del(&device->dev_list);
107 		rcu_string_free(device->name);
108 		kfree(device);
109 	}
110 	kfree(fs_devices);
111 }
112 
113 static void btrfs_kobject_uevent(struct block_device *bdev,
114 				 enum kobject_action action)
115 {
116 	int ret;
117 
118 	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
119 	if (ret)
120 		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
121 			action,
122 			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
123 			&disk_to_dev(bdev->bd_disk)->kobj);
124 }
125 
126 void btrfs_cleanup_fs_uuids(void)
127 {
128 	struct btrfs_fs_devices *fs_devices;
129 
130 	while (!list_empty(&fs_uuids)) {
131 		fs_devices = list_entry(fs_uuids.next,
132 					struct btrfs_fs_devices, list);
133 		list_del(&fs_devices->list);
134 		free_fs_devices(fs_devices);
135 	}
136 }
137 
138 static struct btrfs_device *__alloc_device(void)
139 {
140 	struct btrfs_device *dev;
141 
142 	dev = kzalloc(sizeof(*dev), GFP_NOFS);
143 	if (!dev)
144 		return ERR_PTR(-ENOMEM);
145 
146 	INIT_LIST_HEAD(&dev->dev_list);
147 	INIT_LIST_HEAD(&dev->dev_alloc_list);
148 	INIT_LIST_HEAD(&dev->resized_list);
149 
150 	spin_lock_init(&dev->io_lock);
151 
152 	spin_lock_init(&dev->reada_lock);
153 	atomic_set(&dev->reada_in_flight, 0);
154 	atomic_set(&dev->dev_stats_ccnt, 0);
155 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
156 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
157 
158 	return dev;
159 }
160 
161 static noinline struct btrfs_device *__find_device(struct list_head *head,
162 						   u64 devid, u8 *uuid)
163 {
164 	struct btrfs_device *dev;
165 
166 	list_for_each_entry(dev, head, dev_list) {
167 		if (dev->devid == devid &&
168 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
169 			return dev;
170 		}
171 	}
172 	return NULL;
173 }
174 
175 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
176 {
177 	struct btrfs_fs_devices *fs_devices;
178 
179 	list_for_each_entry(fs_devices, &fs_uuids, list) {
180 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
181 			return fs_devices;
182 	}
183 	return NULL;
184 }
185 
186 static int
187 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
188 		      int flush, struct block_device **bdev,
189 		      struct buffer_head **bh)
190 {
191 	int ret;
192 
193 	*bdev = blkdev_get_by_path(device_path, flags, holder);
194 
195 	if (IS_ERR(*bdev)) {
196 		ret = PTR_ERR(*bdev);
197 		printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
198 		goto error;
199 	}
200 
201 	if (flush)
202 		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
203 	ret = set_blocksize(*bdev, 4096);
204 	if (ret) {
205 		blkdev_put(*bdev, flags);
206 		goto error;
207 	}
208 	invalidate_bdev(*bdev);
209 	*bh = btrfs_read_dev_super(*bdev);
210 	if (!*bh) {
211 		ret = -EINVAL;
212 		blkdev_put(*bdev, flags);
213 		goto error;
214 	}
215 
216 	return 0;
217 
218 error:
219 	*bdev = NULL;
220 	*bh = NULL;
221 	return ret;
222 }
223 
224 static void requeue_list(struct btrfs_pending_bios *pending_bios,
225 			struct bio *head, struct bio *tail)
226 {
227 
228 	struct bio *old_head;
229 
230 	old_head = pending_bios->head;
231 	pending_bios->head = head;
232 	if (pending_bios->tail)
233 		tail->bi_next = old_head;
234 	else
235 		pending_bios->tail = tail;
236 }
237 
238 /*
239  * we try to collect pending bios for a device so we don't get a large
240  * number of procs sending bios down to the same device.  This greatly
241  * improves the schedulers ability to collect and merge the bios.
242  *
243  * But, it also turns into a long list of bios to process and that is sure
244  * to eventually make the worker thread block.  The solution here is to
245  * make some progress and then put this work struct back at the end of
246  * the list if the block device is congested.  This way, multiple devices
247  * can make progress from a single worker thread.
248  */
249 static noinline void run_scheduled_bios(struct btrfs_device *device)
250 {
251 	struct bio *pending;
252 	struct backing_dev_info *bdi;
253 	struct btrfs_fs_info *fs_info;
254 	struct btrfs_pending_bios *pending_bios;
255 	struct bio *tail;
256 	struct bio *cur;
257 	int again = 0;
258 	unsigned long num_run;
259 	unsigned long batch_run = 0;
260 	unsigned long limit;
261 	unsigned long last_waited = 0;
262 	int force_reg = 0;
263 	int sync_pending = 0;
264 	struct blk_plug plug;
265 
266 	/*
267 	 * this function runs all the bios we've collected for
268 	 * a particular device.  We don't want to wander off to
269 	 * another device without first sending all of these down.
270 	 * So, setup a plug here and finish it off before we return
271 	 */
272 	blk_start_plug(&plug);
273 
274 	bdi = blk_get_backing_dev_info(device->bdev);
275 	fs_info = device->dev_root->fs_info;
276 	limit = btrfs_async_submit_limit(fs_info);
277 	limit = limit * 2 / 3;
278 
279 loop:
280 	spin_lock(&device->io_lock);
281 
282 loop_lock:
283 	num_run = 0;
284 
285 	/* take all the bios off the list at once and process them
286 	 * later on (without the lock held).  But, remember the
287 	 * tail and other pointers so the bios can be properly reinserted
288 	 * into the list if we hit congestion
289 	 */
290 	if (!force_reg && device->pending_sync_bios.head) {
291 		pending_bios = &device->pending_sync_bios;
292 		force_reg = 1;
293 	} else {
294 		pending_bios = &device->pending_bios;
295 		force_reg = 0;
296 	}
297 
298 	pending = pending_bios->head;
299 	tail = pending_bios->tail;
300 	WARN_ON(pending && !tail);
301 
302 	/*
303 	 * if pending was null this time around, no bios need processing
304 	 * at all and we can stop.  Otherwise it'll loop back up again
305 	 * and do an additional check so no bios are missed.
306 	 *
307 	 * device->running_pending is used to synchronize with the
308 	 * schedule_bio code.
309 	 */
310 	if (device->pending_sync_bios.head == NULL &&
311 	    device->pending_bios.head == NULL) {
312 		again = 0;
313 		device->running_pending = 0;
314 	} else {
315 		again = 1;
316 		device->running_pending = 1;
317 	}
318 
319 	pending_bios->head = NULL;
320 	pending_bios->tail = NULL;
321 
322 	spin_unlock(&device->io_lock);
323 
324 	while (pending) {
325 
326 		rmb();
327 		/* we want to work on both lists, but do more bios on the
328 		 * sync list than the regular list
329 		 */
330 		if ((num_run > 32 &&
331 		    pending_bios != &device->pending_sync_bios &&
332 		    device->pending_sync_bios.head) ||
333 		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
334 		    device->pending_bios.head)) {
335 			spin_lock(&device->io_lock);
336 			requeue_list(pending_bios, pending, tail);
337 			goto loop_lock;
338 		}
339 
340 		cur = pending;
341 		pending = pending->bi_next;
342 		cur->bi_next = NULL;
343 
344 		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
345 		    waitqueue_active(&fs_info->async_submit_wait))
346 			wake_up(&fs_info->async_submit_wait);
347 
348 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
349 
350 		/*
351 		 * if we're doing the sync list, record that our
352 		 * plug has some sync requests on it
353 		 *
354 		 * If we're doing the regular list and there are
355 		 * sync requests sitting around, unplug before
356 		 * we add more
357 		 */
358 		if (pending_bios == &device->pending_sync_bios) {
359 			sync_pending = 1;
360 		} else if (sync_pending) {
361 			blk_finish_plug(&plug);
362 			blk_start_plug(&plug);
363 			sync_pending = 0;
364 		}
365 
366 		btrfsic_submit_bio(cur->bi_rw, cur);
367 		num_run++;
368 		batch_run++;
369 		if (need_resched())
370 			cond_resched();
371 
372 		/*
373 		 * we made progress, there is more work to do and the bdi
374 		 * is now congested.  Back off and let other work structs
375 		 * run instead
376 		 */
377 		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
378 		    fs_info->fs_devices->open_devices > 1) {
379 			struct io_context *ioc;
380 
381 			ioc = current->io_context;
382 
383 			/*
384 			 * the main goal here is that we don't want to
385 			 * block if we're going to be able to submit
386 			 * more requests without blocking.
387 			 *
388 			 * This code does two great things, it pokes into
389 			 * the elevator code from a filesystem _and_
390 			 * it makes assumptions about how batching works.
391 			 */
392 			if (ioc && ioc->nr_batch_requests > 0 &&
393 			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
394 			    (last_waited == 0 ||
395 			     ioc->last_waited == last_waited)) {
396 				/*
397 				 * we want to go through our batch of
398 				 * requests and stop.  So, we copy out
399 				 * the ioc->last_waited time and test
400 				 * against it before looping
401 				 */
402 				last_waited = ioc->last_waited;
403 				if (need_resched())
404 					cond_resched();
405 				continue;
406 			}
407 			spin_lock(&device->io_lock);
408 			requeue_list(pending_bios, pending, tail);
409 			device->running_pending = 1;
410 
411 			spin_unlock(&device->io_lock);
412 			btrfs_queue_work(fs_info->submit_workers,
413 					 &device->work);
414 			goto done;
415 		}
416 		/* unplug every 64 requests just for good measure */
417 		if (batch_run % 64 == 0) {
418 			blk_finish_plug(&plug);
419 			blk_start_plug(&plug);
420 			sync_pending = 0;
421 		}
422 	}
423 
424 	cond_resched();
425 	if (again)
426 		goto loop;
427 
428 	spin_lock(&device->io_lock);
429 	if (device->pending_bios.head || device->pending_sync_bios.head)
430 		goto loop_lock;
431 	spin_unlock(&device->io_lock);
432 
433 done:
434 	blk_finish_plug(&plug);
435 }
436 
437 static void pending_bios_fn(struct btrfs_work *work)
438 {
439 	struct btrfs_device *device;
440 
441 	device = container_of(work, struct btrfs_device, work);
442 	run_scheduled_bios(device);
443 }
444 
445 /*
446  * Add new device to list of registered devices
447  *
448  * Returns:
449  * 1   - first time device is seen
450  * 0   - device already known
451  * < 0 - error
452  */
453 static noinline int device_list_add(const char *path,
454 			   struct btrfs_super_block *disk_super,
455 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
456 {
457 	struct btrfs_device *device;
458 	struct btrfs_fs_devices *fs_devices;
459 	struct rcu_string *name;
460 	int ret = 0;
461 	u64 found_transid = btrfs_super_generation(disk_super);
462 
463 	fs_devices = find_fsid(disk_super->fsid);
464 	if (!fs_devices) {
465 		fs_devices = alloc_fs_devices(disk_super->fsid);
466 		if (IS_ERR(fs_devices))
467 			return PTR_ERR(fs_devices);
468 
469 		list_add(&fs_devices->list, &fs_uuids);
470 
471 		device = NULL;
472 	} else {
473 		device = __find_device(&fs_devices->devices, devid,
474 				       disk_super->dev_item.uuid);
475 	}
476 
477 	if (!device) {
478 		if (fs_devices->opened)
479 			return -EBUSY;
480 
481 		device = btrfs_alloc_device(NULL, &devid,
482 					    disk_super->dev_item.uuid);
483 		if (IS_ERR(device)) {
484 			/* we can safely leave the fs_devices entry around */
485 			return PTR_ERR(device);
486 		}
487 
488 		name = rcu_string_strdup(path, GFP_NOFS);
489 		if (!name) {
490 			kfree(device);
491 			return -ENOMEM;
492 		}
493 		rcu_assign_pointer(device->name, name);
494 
495 		mutex_lock(&fs_devices->device_list_mutex);
496 		list_add_rcu(&device->dev_list, &fs_devices->devices);
497 		fs_devices->num_devices++;
498 		mutex_unlock(&fs_devices->device_list_mutex);
499 
500 		ret = 1;
501 		device->fs_devices = fs_devices;
502 	} else if (!device->name || strcmp(device->name->str, path)) {
503 		/*
504 		 * When FS is already mounted.
505 		 * 1. If you are here and if the device->name is NULL that
506 		 *    means this device was missing at time of FS mount.
507 		 * 2. If you are here and if the device->name is different
508 		 *    from 'path' that means either
509 		 *      a. The same device disappeared and reappeared with
510 		 *         different name. or
511 		 *      b. The missing-disk-which-was-replaced, has
512 		 *         reappeared now.
513 		 *
514 		 * We must allow 1 and 2a above. But 2b would be a spurious
515 		 * and unintentional.
516 		 *
517 		 * Further in case of 1 and 2a above, the disk at 'path'
518 		 * would have missed some transaction when it was away and
519 		 * in case of 2a the stale bdev has to be updated as well.
520 		 * 2b must not be allowed at all time.
521 		 */
522 
523 		/*
524 		 * For now, we do allow update to btrfs_fs_device through the
525 		 * btrfs dev scan cli after FS has been mounted.  We're still
526 		 * tracking a problem where systems fail mount by subvolume id
527 		 * when we reject replacement on a mounted FS.
528 		 */
529 		if (!fs_devices->opened && found_transid < device->generation) {
530 			/*
531 			 * That is if the FS is _not_ mounted and if you
532 			 * are here, that means there is more than one
533 			 * disk with same uuid and devid.We keep the one
534 			 * with larger generation number or the last-in if
535 			 * generation are equal.
536 			 */
537 			return -EEXIST;
538 		}
539 
540 		name = rcu_string_strdup(path, GFP_NOFS);
541 		if (!name)
542 			return -ENOMEM;
543 		rcu_string_free(device->name);
544 		rcu_assign_pointer(device->name, name);
545 		if (device->missing) {
546 			fs_devices->missing_devices--;
547 			device->missing = 0;
548 		}
549 	}
550 
551 	/*
552 	 * Unmount does not free the btrfs_device struct but would zero
553 	 * generation along with most of the other members. So just update
554 	 * it back. We need it to pick the disk with largest generation
555 	 * (as above).
556 	 */
557 	if (!fs_devices->opened)
558 		device->generation = found_transid;
559 
560 	*fs_devices_ret = fs_devices;
561 
562 	return ret;
563 }
564 
565 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
566 {
567 	struct btrfs_fs_devices *fs_devices;
568 	struct btrfs_device *device;
569 	struct btrfs_device *orig_dev;
570 
571 	fs_devices = alloc_fs_devices(orig->fsid);
572 	if (IS_ERR(fs_devices))
573 		return fs_devices;
574 
575 	mutex_lock(&orig->device_list_mutex);
576 	fs_devices->total_devices = orig->total_devices;
577 
578 	/* We have held the volume lock, it is safe to get the devices. */
579 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
580 		struct rcu_string *name;
581 
582 		device = btrfs_alloc_device(NULL, &orig_dev->devid,
583 					    orig_dev->uuid);
584 		if (IS_ERR(device))
585 			goto error;
586 
587 		/*
588 		 * This is ok to do without rcu read locked because we hold the
589 		 * uuid mutex so nothing we touch in here is going to disappear.
590 		 */
591 		if (orig_dev->name) {
592 			name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
593 			if (!name) {
594 				kfree(device);
595 				goto error;
596 			}
597 			rcu_assign_pointer(device->name, name);
598 		}
599 
600 		list_add(&device->dev_list, &fs_devices->devices);
601 		device->fs_devices = fs_devices;
602 		fs_devices->num_devices++;
603 	}
604 	mutex_unlock(&orig->device_list_mutex);
605 	return fs_devices;
606 error:
607 	mutex_unlock(&orig->device_list_mutex);
608 	free_fs_devices(fs_devices);
609 	return ERR_PTR(-ENOMEM);
610 }
611 
612 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
613 			       struct btrfs_fs_devices *fs_devices, int step)
614 {
615 	struct btrfs_device *device, *next;
616 	struct btrfs_device *latest_dev = NULL;
617 
618 	mutex_lock(&uuid_mutex);
619 again:
620 	/* This is the initialized path, it is safe to release the devices. */
621 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
622 		if (device->in_fs_metadata) {
623 			if (!device->is_tgtdev_for_dev_replace &&
624 			    (!latest_dev ||
625 			     device->generation > latest_dev->generation)) {
626 				latest_dev = device;
627 			}
628 			continue;
629 		}
630 
631 		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
632 			/*
633 			 * In the first step, keep the device which has
634 			 * the correct fsid and the devid that is used
635 			 * for the dev_replace procedure.
636 			 * In the second step, the dev_replace state is
637 			 * read from the device tree and it is known
638 			 * whether the procedure is really active or
639 			 * not, which means whether this device is
640 			 * used or whether it should be removed.
641 			 */
642 			if (step == 0 || device->is_tgtdev_for_dev_replace) {
643 				continue;
644 			}
645 		}
646 		if (device->bdev) {
647 			blkdev_put(device->bdev, device->mode);
648 			device->bdev = NULL;
649 			fs_devices->open_devices--;
650 		}
651 		if (device->writeable) {
652 			list_del_init(&device->dev_alloc_list);
653 			device->writeable = 0;
654 			if (!device->is_tgtdev_for_dev_replace)
655 				fs_devices->rw_devices--;
656 		}
657 		list_del_init(&device->dev_list);
658 		fs_devices->num_devices--;
659 		rcu_string_free(device->name);
660 		kfree(device);
661 	}
662 
663 	if (fs_devices->seed) {
664 		fs_devices = fs_devices->seed;
665 		goto again;
666 	}
667 
668 	fs_devices->latest_bdev = latest_dev->bdev;
669 
670 	mutex_unlock(&uuid_mutex);
671 }
672 
673 static void __free_device(struct work_struct *work)
674 {
675 	struct btrfs_device *device;
676 
677 	device = container_of(work, struct btrfs_device, rcu_work);
678 
679 	if (device->bdev)
680 		blkdev_put(device->bdev, device->mode);
681 
682 	rcu_string_free(device->name);
683 	kfree(device);
684 }
685 
686 static void free_device(struct rcu_head *head)
687 {
688 	struct btrfs_device *device;
689 
690 	device = container_of(head, struct btrfs_device, rcu);
691 
692 	INIT_WORK(&device->rcu_work, __free_device);
693 	schedule_work(&device->rcu_work);
694 }
695 
696 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
697 {
698 	struct btrfs_device *device;
699 
700 	if (--fs_devices->opened > 0)
701 		return 0;
702 
703 	mutex_lock(&fs_devices->device_list_mutex);
704 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
705 		struct btrfs_device *new_device;
706 		struct rcu_string *name;
707 
708 		if (device->bdev)
709 			fs_devices->open_devices--;
710 
711 		if (device->writeable &&
712 		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
713 			list_del_init(&device->dev_alloc_list);
714 			fs_devices->rw_devices--;
715 		}
716 
717 		if (device->missing)
718 			fs_devices->missing_devices--;
719 
720 		new_device = btrfs_alloc_device(NULL, &device->devid,
721 						device->uuid);
722 		BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
723 
724 		/* Safe because we are under uuid_mutex */
725 		if (device->name) {
726 			name = rcu_string_strdup(device->name->str, GFP_NOFS);
727 			BUG_ON(!name); /* -ENOMEM */
728 			rcu_assign_pointer(new_device->name, name);
729 		}
730 
731 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
732 		new_device->fs_devices = device->fs_devices;
733 
734 		call_rcu(&device->rcu, free_device);
735 	}
736 	mutex_unlock(&fs_devices->device_list_mutex);
737 
738 	WARN_ON(fs_devices->open_devices);
739 	WARN_ON(fs_devices->rw_devices);
740 	fs_devices->opened = 0;
741 	fs_devices->seeding = 0;
742 
743 	return 0;
744 }
745 
746 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
747 {
748 	struct btrfs_fs_devices *seed_devices = NULL;
749 	int ret;
750 
751 	mutex_lock(&uuid_mutex);
752 	ret = __btrfs_close_devices(fs_devices);
753 	if (!fs_devices->opened) {
754 		seed_devices = fs_devices->seed;
755 		fs_devices->seed = NULL;
756 	}
757 	mutex_unlock(&uuid_mutex);
758 
759 	while (seed_devices) {
760 		fs_devices = seed_devices;
761 		seed_devices = fs_devices->seed;
762 		__btrfs_close_devices(fs_devices);
763 		free_fs_devices(fs_devices);
764 	}
765 	/*
766 	 * Wait for rcu kworkers under __btrfs_close_devices
767 	 * to finish all blkdev_puts so device is really
768 	 * free when umount is done.
769 	 */
770 	rcu_barrier();
771 	return ret;
772 }
773 
774 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
775 				fmode_t flags, void *holder)
776 {
777 	struct request_queue *q;
778 	struct block_device *bdev;
779 	struct list_head *head = &fs_devices->devices;
780 	struct btrfs_device *device;
781 	struct btrfs_device *latest_dev = NULL;
782 	struct buffer_head *bh;
783 	struct btrfs_super_block *disk_super;
784 	u64 devid;
785 	int seeding = 1;
786 	int ret = 0;
787 
788 	flags |= FMODE_EXCL;
789 
790 	list_for_each_entry(device, head, dev_list) {
791 		if (device->bdev)
792 			continue;
793 		if (!device->name)
794 			continue;
795 
796 		/* Just open everything we can; ignore failures here */
797 		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
798 					    &bdev, &bh))
799 			continue;
800 
801 		disk_super = (struct btrfs_super_block *)bh->b_data;
802 		devid = btrfs_stack_device_id(&disk_super->dev_item);
803 		if (devid != device->devid)
804 			goto error_brelse;
805 
806 		if (memcmp(device->uuid, disk_super->dev_item.uuid,
807 			   BTRFS_UUID_SIZE))
808 			goto error_brelse;
809 
810 		device->generation = btrfs_super_generation(disk_super);
811 		if (!latest_dev ||
812 		    device->generation > latest_dev->generation)
813 			latest_dev = device;
814 
815 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
816 			device->writeable = 0;
817 		} else {
818 			device->writeable = !bdev_read_only(bdev);
819 			seeding = 0;
820 		}
821 
822 		q = bdev_get_queue(bdev);
823 		if (blk_queue_discard(q))
824 			device->can_discard = 1;
825 
826 		device->bdev = bdev;
827 		device->in_fs_metadata = 0;
828 		device->mode = flags;
829 
830 		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
831 			fs_devices->rotating = 1;
832 
833 		fs_devices->open_devices++;
834 		if (device->writeable &&
835 		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
836 			fs_devices->rw_devices++;
837 			list_add(&device->dev_alloc_list,
838 				 &fs_devices->alloc_list);
839 		}
840 		brelse(bh);
841 		continue;
842 
843 error_brelse:
844 		brelse(bh);
845 		blkdev_put(bdev, flags);
846 		continue;
847 	}
848 	if (fs_devices->open_devices == 0) {
849 		ret = -EINVAL;
850 		goto out;
851 	}
852 	fs_devices->seeding = seeding;
853 	fs_devices->opened = 1;
854 	fs_devices->latest_bdev = latest_dev->bdev;
855 	fs_devices->total_rw_bytes = 0;
856 out:
857 	return ret;
858 }
859 
860 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
861 		       fmode_t flags, void *holder)
862 {
863 	int ret;
864 
865 	mutex_lock(&uuid_mutex);
866 	if (fs_devices->opened) {
867 		fs_devices->opened++;
868 		ret = 0;
869 	} else {
870 		ret = __btrfs_open_devices(fs_devices, flags, holder);
871 	}
872 	mutex_unlock(&uuid_mutex);
873 	return ret;
874 }
875 
876 /*
877  * Look for a btrfs signature on a device. This may be called out of the mount path
878  * and we are not allowed to call set_blocksize during the scan. The superblock
879  * is read via pagecache
880  */
881 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
882 			  struct btrfs_fs_devices **fs_devices_ret)
883 {
884 	struct btrfs_super_block *disk_super;
885 	struct block_device *bdev;
886 	struct page *page;
887 	void *p;
888 	int ret = -EINVAL;
889 	u64 devid;
890 	u64 transid;
891 	u64 total_devices;
892 	u64 bytenr;
893 	pgoff_t index;
894 
895 	/*
896 	 * we would like to check all the supers, but that would make
897 	 * a btrfs mount succeed after a mkfs from a different FS.
898 	 * So, we need to add a special mount option to scan for
899 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
900 	 */
901 	bytenr = btrfs_sb_offset(0);
902 	flags |= FMODE_EXCL;
903 	mutex_lock(&uuid_mutex);
904 
905 	bdev = blkdev_get_by_path(path, flags, holder);
906 
907 	if (IS_ERR(bdev)) {
908 		ret = PTR_ERR(bdev);
909 		goto error;
910 	}
911 
912 	/* make sure our super fits in the device */
913 	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
914 		goto error_bdev_put;
915 
916 	/* make sure our super fits in the page */
917 	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
918 		goto error_bdev_put;
919 
920 	/* make sure our super doesn't straddle pages on disk */
921 	index = bytenr >> PAGE_CACHE_SHIFT;
922 	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
923 		goto error_bdev_put;
924 
925 	/* pull in the page with our super */
926 	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
927 				   index, GFP_NOFS);
928 
929 	if (IS_ERR_OR_NULL(page))
930 		goto error_bdev_put;
931 
932 	p = kmap(page);
933 
934 	/* align our pointer to the offset of the super block */
935 	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
936 
937 	if (btrfs_super_bytenr(disk_super) != bytenr ||
938 	    btrfs_super_magic(disk_super) != BTRFS_MAGIC)
939 		goto error_unmap;
940 
941 	devid = btrfs_stack_device_id(&disk_super->dev_item);
942 	transid = btrfs_super_generation(disk_super);
943 	total_devices = btrfs_super_num_devices(disk_super);
944 
945 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
946 	if (ret > 0) {
947 		if (disk_super->label[0]) {
948 			if (disk_super->label[BTRFS_LABEL_SIZE - 1])
949 				disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
950 			printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
951 		} else {
952 			printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
953 		}
954 
955 		printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
956 		ret = 0;
957 	}
958 	if (!ret && fs_devices_ret)
959 		(*fs_devices_ret)->total_devices = total_devices;
960 
961 error_unmap:
962 	kunmap(page);
963 	page_cache_release(page);
964 
965 error_bdev_put:
966 	blkdev_put(bdev, flags);
967 error:
968 	mutex_unlock(&uuid_mutex);
969 	return ret;
970 }
971 
972 /* helper to account the used device space in the range */
973 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
974 				   u64 end, u64 *length)
975 {
976 	struct btrfs_key key;
977 	struct btrfs_root *root = device->dev_root;
978 	struct btrfs_dev_extent *dev_extent;
979 	struct btrfs_path *path;
980 	u64 extent_end;
981 	int ret;
982 	int slot;
983 	struct extent_buffer *l;
984 
985 	*length = 0;
986 
987 	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
988 		return 0;
989 
990 	path = btrfs_alloc_path();
991 	if (!path)
992 		return -ENOMEM;
993 	path->reada = 2;
994 
995 	key.objectid = device->devid;
996 	key.offset = start;
997 	key.type = BTRFS_DEV_EXTENT_KEY;
998 
999 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1000 	if (ret < 0)
1001 		goto out;
1002 	if (ret > 0) {
1003 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
1004 		if (ret < 0)
1005 			goto out;
1006 	}
1007 
1008 	while (1) {
1009 		l = path->nodes[0];
1010 		slot = path->slots[0];
1011 		if (slot >= btrfs_header_nritems(l)) {
1012 			ret = btrfs_next_leaf(root, path);
1013 			if (ret == 0)
1014 				continue;
1015 			if (ret < 0)
1016 				goto out;
1017 
1018 			break;
1019 		}
1020 		btrfs_item_key_to_cpu(l, &key, slot);
1021 
1022 		if (key.objectid < device->devid)
1023 			goto next;
1024 
1025 		if (key.objectid > device->devid)
1026 			break;
1027 
1028 		if (key.type != BTRFS_DEV_EXTENT_KEY)
1029 			goto next;
1030 
1031 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1032 		extent_end = key.offset + btrfs_dev_extent_length(l,
1033 								  dev_extent);
1034 		if (key.offset <= start && extent_end > end) {
1035 			*length = end - start + 1;
1036 			break;
1037 		} else if (key.offset <= start && extent_end > start)
1038 			*length += extent_end - start;
1039 		else if (key.offset > start && extent_end <= end)
1040 			*length += extent_end - key.offset;
1041 		else if (key.offset > start && key.offset <= end) {
1042 			*length += end - key.offset + 1;
1043 			break;
1044 		} else if (key.offset > end)
1045 			break;
1046 
1047 next:
1048 		path->slots[0]++;
1049 	}
1050 	ret = 0;
1051 out:
1052 	btrfs_free_path(path);
1053 	return ret;
1054 }
1055 
1056 static int contains_pending_extent(struct btrfs_trans_handle *trans,
1057 				   struct btrfs_device *device,
1058 				   u64 *start, u64 len)
1059 {
1060 	struct extent_map *em;
1061 	struct list_head *search_list = &trans->transaction->pending_chunks;
1062 	int ret = 0;
1063 
1064 again:
1065 	list_for_each_entry(em, search_list, list) {
1066 		struct map_lookup *map;
1067 		int i;
1068 
1069 		map = (struct map_lookup *)em->bdev;
1070 		for (i = 0; i < map->num_stripes; i++) {
1071 			if (map->stripes[i].dev != device)
1072 				continue;
1073 			if (map->stripes[i].physical >= *start + len ||
1074 			    map->stripes[i].physical + em->orig_block_len <=
1075 			    *start)
1076 				continue;
1077 			*start = map->stripes[i].physical +
1078 				em->orig_block_len;
1079 			ret = 1;
1080 		}
1081 	}
1082 	if (search_list == &trans->transaction->pending_chunks) {
1083 		search_list = &trans->root->fs_info->pinned_chunks;
1084 		goto again;
1085 	}
1086 
1087 	return ret;
1088 }
1089 
1090 
1091 /*
1092  * find_free_dev_extent - find free space in the specified device
1093  * @device:	the device which we search the free space in
1094  * @num_bytes:	the size of the free space that we need
1095  * @start:	store the start of the free space.
1096  * @len:	the size of the free space. that we find, or the size of the max
1097  * 		free space if we don't find suitable free space
1098  *
1099  * this uses a pretty simple search, the expectation is that it is
1100  * called very infrequently and that a given device has a small number
1101  * of extents
1102  *
1103  * @start is used to store the start of the free space if we find. But if we
1104  * don't find suitable free space, it will be used to store the start position
1105  * of the max free space.
1106  *
1107  * @len is used to store the size of the free space that we find.
1108  * But if we don't find suitable free space, it is used to store the size of
1109  * the max free space.
1110  */
1111 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1112 			 struct btrfs_device *device, u64 num_bytes,
1113 			 u64 *start, u64 *len)
1114 {
1115 	struct btrfs_key key;
1116 	struct btrfs_root *root = device->dev_root;
1117 	struct btrfs_dev_extent *dev_extent;
1118 	struct btrfs_path *path;
1119 	u64 hole_size;
1120 	u64 max_hole_start;
1121 	u64 max_hole_size;
1122 	u64 extent_end;
1123 	u64 search_start;
1124 	u64 search_end = device->total_bytes;
1125 	int ret;
1126 	int slot;
1127 	struct extent_buffer *l;
1128 
1129 	/* FIXME use last free of some kind */
1130 
1131 	/* we don't want to overwrite the superblock on the drive,
1132 	 * so we make sure to start at an offset of at least 1MB
1133 	 */
1134 	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1135 
1136 	path = btrfs_alloc_path();
1137 	if (!path)
1138 		return -ENOMEM;
1139 again:
1140 	max_hole_start = search_start;
1141 	max_hole_size = 0;
1142 	hole_size = 0;
1143 
1144 	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1145 		ret = -ENOSPC;
1146 		goto out;
1147 	}
1148 
1149 	path->reada = 2;
1150 	path->search_commit_root = 1;
1151 	path->skip_locking = 1;
1152 
1153 	key.objectid = device->devid;
1154 	key.offset = search_start;
1155 	key.type = BTRFS_DEV_EXTENT_KEY;
1156 
1157 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1158 	if (ret < 0)
1159 		goto out;
1160 	if (ret > 0) {
1161 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
1162 		if (ret < 0)
1163 			goto out;
1164 	}
1165 
1166 	while (1) {
1167 		l = path->nodes[0];
1168 		slot = path->slots[0];
1169 		if (slot >= btrfs_header_nritems(l)) {
1170 			ret = btrfs_next_leaf(root, path);
1171 			if (ret == 0)
1172 				continue;
1173 			if (ret < 0)
1174 				goto out;
1175 
1176 			break;
1177 		}
1178 		btrfs_item_key_to_cpu(l, &key, slot);
1179 
1180 		if (key.objectid < device->devid)
1181 			goto next;
1182 
1183 		if (key.objectid > device->devid)
1184 			break;
1185 
1186 		if (key.type != BTRFS_DEV_EXTENT_KEY)
1187 			goto next;
1188 
1189 		if (key.offset > search_start) {
1190 			hole_size = key.offset - search_start;
1191 
1192 			/*
1193 			 * Have to check before we set max_hole_start, otherwise
1194 			 * we could end up sending back this offset anyway.
1195 			 */
1196 			if (contains_pending_extent(trans, device,
1197 						    &search_start,
1198 						    hole_size))
1199 				hole_size = 0;
1200 
1201 			if (hole_size > max_hole_size) {
1202 				max_hole_start = search_start;
1203 				max_hole_size = hole_size;
1204 			}
1205 
1206 			/*
1207 			 * If this free space is greater than which we need,
1208 			 * it must be the max free space that we have found
1209 			 * until now, so max_hole_start must point to the start
1210 			 * of this free space and the length of this free space
1211 			 * is stored in max_hole_size. Thus, we return
1212 			 * max_hole_start and max_hole_size and go back to the
1213 			 * caller.
1214 			 */
1215 			if (hole_size >= num_bytes) {
1216 				ret = 0;
1217 				goto out;
1218 			}
1219 		}
1220 
1221 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1222 		extent_end = key.offset + btrfs_dev_extent_length(l,
1223 								  dev_extent);
1224 		if (extent_end > search_start)
1225 			search_start = extent_end;
1226 next:
1227 		path->slots[0]++;
1228 		cond_resched();
1229 	}
1230 
1231 	/*
1232 	 * At this point, search_start should be the end of
1233 	 * allocated dev extents, and when shrinking the device,
1234 	 * search_end may be smaller than search_start.
1235 	 */
1236 	if (search_end > search_start)
1237 		hole_size = search_end - search_start;
1238 
1239 	if (hole_size > max_hole_size) {
1240 		max_hole_start = search_start;
1241 		max_hole_size = hole_size;
1242 	}
1243 
1244 	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1245 		btrfs_release_path(path);
1246 		goto again;
1247 	}
1248 
1249 	/* See above. */
1250 	if (hole_size < num_bytes)
1251 		ret = -ENOSPC;
1252 	else
1253 		ret = 0;
1254 
1255 out:
1256 	btrfs_free_path(path);
1257 	*start = max_hole_start;
1258 	if (len)
1259 		*len = max_hole_size;
1260 	return ret;
1261 }
1262 
1263 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1264 			  struct btrfs_device *device,
1265 			  u64 start, u64 *dev_extent_len)
1266 {
1267 	int ret;
1268 	struct btrfs_path *path;
1269 	struct btrfs_root *root = device->dev_root;
1270 	struct btrfs_key key;
1271 	struct btrfs_key found_key;
1272 	struct extent_buffer *leaf = NULL;
1273 	struct btrfs_dev_extent *extent = NULL;
1274 
1275 	path = btrfs_alloc_path();
1276 	if (!path)
1277 		return -ENOMEM;
1278 
1279 	key.objectid = device->devid;
1280 	key.offset = start;
1281 	key.type = BTRFS_DEV_EXTENT_KEY;
1282 again:
1283 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1284 	if (ret > 0) {
1285 		ret = btrfs_previous_item(root, path, key.objectid,
1286 					  BTRFS_DEV_EXTENT_KEY);
1287 		if (ret)
1288 			goto out;
1289 		leaf = path->nodes[0];
1290 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1291 		extent = btrfs_item_ptr(leaf, path->slots[0],
1292 					struct btrfs_dev_extent);
1293 		BUG_ON(found_key.offset > start || found_key.offset +
1294 		       btrfs_dev_extent_length(leaf, extent) < start);
1295 		key = found_key;
1296 		btrfs_release_path(path);
1297 		goto again;
1298 	} else if (ret == 0) {
1299 		leaf = path->nodes[0];
1300 		extent = btrfs_item_ptr(leaf, path->slots[0],
1301 					struct btrfs_dev_extent);
1302 	} else {
1303 		btrfs_error(root->fs_info, ret, "Slot search failed");
1304 		goto out;
1305 	}
1306 
1307 	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1308 
1309 	ret = btrfs_del_item(trans, root, path);
1310 	if (ret) {
1311 		btrfs_error(root->fs_info, ret,
1312 			    "Failed to remove dev extent item");
1313 	}
1314 out:
1315 	btrfs_free_path(path);
1316 	return ret;
1317 }
1318 
1319 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1320 				  struct btrfs_device *device,
1321 				  u64 chunk_tree, u64 chunk_objectid,
1322 				  u64 chunk_offset, u64 start, u64 num_bytes)
1323 {
1324 	int ret;
1325 	struct btrfs_path *path;
1326 	struct btrfs_root *root = device->dev_root;
1327 	struct btrfs_dev_extent *extent;
1328 	struct extent_buffer *leaf;
1329 	struct btrfs_key key;
1330 
1331 	WARN_ON(!device->in_fs_metadata);
1332 	WARN_ON(device->is_tgtdev_for_dev_replace);
1333 	path = btrfs_alloc_path();
1334 	if (!path)
1335 		return -ENOMEM;
1336 
1337 	key.objectid = device->devid;
1338 	key.offset = start;
1339 	key.type = BTRFS_DEV_EXTENT_KEY;
1340 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1341 				      sizeof(*extent));
1342 	if (ret)
1343 		goto out;
1344 
1345 	leaf = path->nodes[0];
1346 	extent = btrfs_item_ptr(leaf, path->slots[0],
1347 				struct btrfs_dev_extent);
1348 	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1349 	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1350 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1351 
1352 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1353 		    btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
1354 
1355 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1356 	btrfs_mark_buffer_dirty(leaf);
1357 out:
1358 	btrfs_free_path(path);
1359 	return ret;
1360 }
1361 
1362 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1363 {
1364 	struct extent_map_tree *em_tree;
1365 	struct extent_map *em;
1366 	struct rb_node *n;
1367 	u64 ret = 0;
1368 
1369 	em_tree = &fs_info->mapping_tree.map_tree;
1370 	read_lock(&em_tree->lock);
1371 	n = rb_last(&em_tree->map);
1372 	if (n) {
1373 		em = rb_entry(n, struct extent_map, rb_node);
1374 		ret = em->start + em->len;
1375 	}
1376 	read_unlock(&em_tree->lock);
1377 
1378 	return ret;
1379 }
1380 
1381 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1382 				    u64 *devid_ret)
1383 {
1384 	int ret;
1385 	struct btrfs_key key;
1386 	struct btrfs_key found_key;
1387 	struct btrfs_path *path;
1388 
1389 	path = btrfs_alloc_path();
1390 	if (!path)
1391 		return -ENOMEM;
1392 
1393 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1394 	key.type = BTRFS_DEV_ITEM_KEY;
1395 	key.offset = (u64)-1;
1396 
1397 	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1398 	if (ret < 0)
1399 		goto error;
1400 
1401 	BUG_ON(ret == 0); /* Corruption */
1402 
1403 	ret = btrfs_previous_item(fs_info->chunk_root, path,
1404 				  BTRFS_DEV_ITEMS_OBJECTID,
1405 				  BTRFS_DEV_ITEM_KEY);
1406 	if (ret) {
1407 		*devid_ret = 1;
1408 	} else {
1409 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1410 				      path->slots[0]);
1411 		*devid_ret = found_key.offset + 1;
1412 	}
1413 	ret = 0;
1414 error:
1415 	btrfs_free_path(path);
1416 	return ret;
1417 }
1418 
1419 /*
1420  * the device information is stored in the chunk root
1421  * the btrfs_device struct should be fully filled in
1422  */
1423 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1424 			    struct btrfs_root *root,
1425 			    struct btrfs_device *device)
1426 {
1427 	int ret;
1428 	struct btrfs_path *path;
1429 	struct btrfs_dev_item *dev_item;
1430 	struct extent_buffer *leaf;
1431 	struct btrfs_key key;
1432 	unsigned long ptr;
1433 
1434 	root = root->fs_info->chunk_root;
1435 
1436 	path = btrfs_alloc_path();
1437 	if (!path)
1438 		return -ENOMEM;
1439 
1440 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1441 	key.type = BTRFS_DEV_ITEM_KEY;
1442 	key.offset = device->devid;
1443 
1444 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1445 				      sizeof(*dev_item));
1446 	if (ret)
1447 		goto out;
1448 
1449 	leaf = path->nodes[0];
1450 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1451 
1452 	btrfs_set_device_id(leaf, dev_item, device->devid);
1453 	btrfs_set_device_generation(leaf, dev_item, 0);
1454 	btrfs_set_device_type(leaf, dev_item, device->type);
1455 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1456 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1457 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1458 	btrfs_set_device_total_bytes(leaf, dev_item,
1459 				     btrfs_device_get_disk_total_bytes(device));
1460 	btrfs_set_device_bytes_used(leaf, dev_item,
1461 				    btrfs_device_get_bytes_used(device));
1462 	btrfs_set_device_group(leaf, dev_item, 0);
1463 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1464 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1465 	btrfs_set_device_start_offset(leaf, dev_item, 0);
1466 
1467 	ptr = btrfs_device_uuid(dev_item);
1468 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1469 	ptr = btrfs_device_fsid(dev_item);
1470 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1471 	btrfs_mark_buffer_dirty(leaf);
1472 
1473 	ret = 0;
1474 out:
1475 	btrfs_free_path(path);
1476 	return ret;
1477 }
1478 
1479 /*
1480  * Function to update ctime/mtime for a given device path.
1481  * Mainly used for ctime/mtime based probe like libblkid.
1482  */
1483 static void update_dev_time(char *path_name)
1484 {
1485 	struct file *filp;
1486 
1487 	filp = filp_open(path_name, O_RDWR, 0);
1488 	if (IS_ERR(filp))
1489 		return;
1490 	file_update_time(filp);
1491 	filp_close(filp, NULL);
1492 	return;
1493 }
1494 
1495 static int btrfs_rm_dev_item(struct btrfs_root *root,
1496 			     struct btrfs_device *device)
1497 {
1498 	int ret;
1499 	struct btrfs_path *path;
1500 	struct btrfs_key key;
1501 	struct btrfs_trans_handle *trans;
1502 
1503 	root = root->fs_info->chunk_root;
1504 
1505 	path = btrfs_alloc_path();
1506 	if (!path)
1507 		return -ENOMEM;
1508 
1509 	trans = btrfs_start_transaction(root, 0);
1510 	if (IS_ERR(trans)) {
1511 		btrfs_free_path(path);
1512 		return PTR_ERR(trans);
1513 	}
1514 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1515 	key.type = BTRFS_DEV_ITEM_KEY;
1516 	key.offset = device->devid;
1517 
1518 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1519 	if (ret < 0)
1520 		goto out;
1521 
1522 	if (ret > 0) {
1523 		ret = -ENOENT;
1524 		goto out;
1525 	}
1526 
1527 	ret = btrfs_del_item(trans, root, path);
1528 	if (ret)
1529 		goto out;
1530 out:
1531 	btrfs_free_path(path);
1532 	btrfs_commit_transaction(trans, root);
1533 	return ret;
1534 }
1535 
1536 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1537 {
1538 	struct btrfs_device *device;
1539 	struct btrfs_device *next_device;
1540 	struct block_device *bdev;
1541 	struct buffer_head *bh = NULL;
1542 	struct btrfs_super_block *disk_super;
1543 	struct btrfs_fs_devices *cur_devices;
1544 	u64 all_avail;
1545 	u64 devid;
1546 	u64 num_devices;
1547 	u8 *dev_uuid;
1548 	unsigned seq;
1549 	int ret = 0;
1550 	bool clear_super = false;
1551 
1552 	mutex_lock(&uuid_mutex);
1553 
1554 	do {
1555 		seq = read_seqbegin(&root->fs_info->profiles_lock);
1556 
1557 		all_avail = root->fs_info->avail_data_alloc_bits |
1558 			    root->fs_info->avail_system_alloc_bits |
1559 			    root->fs_info->avail_metadata_alloc_bits;
1560 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
1561 
1562 	num_devices = root->fs_info->fs_devices->num_devices;
1563 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1564 	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1565 		WARN_ON(num_devices < 1);
1566 		num_devices--;
1567 	}
1568 	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1569 
1570 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1571 		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1572 		goto out;
1573 	}
1574 
1575 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1576 		ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1577 		goto out;
1578 	}
1579 
1580 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1581 	    root->fs_info->fs_devices->rw_devices <= 2) {
1582 		ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1583 		goto out;
1584 	}
1585 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1586 	    root->fs_info->fs_devices->rw_devices <= 3) {
1587 		ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1588 		goto out;
1589 	}
1590 
1591 	if (strcmp(device_path, "missing") == 0) {
1592 		struct list_head *devices;
1593 		struct btrfs_device *tmp;
1594 
1595 		device = NULL;
1596 		devices = &root->fs_info->fs_devices->devices;
1597 		/*
1598 		 * It is safe to read the devices since the volume_mutex
1599 		 * is held.
1600 		 */
1601 		list_for_each_entry(tmp, devices, dev_list) {
1602 			if (tmp->in_fs_metadata &&
1603 			    !tmp->is_tgtdev_for_dev_replace &&
1604 			    !tmp->bdev) {
1605 				device = tmp;
1606 				break;
1607 			}
1608 		}
1609 		bdev = NULL;
1610 		bh = NULL;
1611 		disk_super = NULL;
1612 		if (!device) {
1613 			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1614 			goto out;
1615 		}
1616 	} else {
1617 		ret = btrfs_get_bdev_and_sb(device_path,
1618 					    FMODE_WRITE | FMODE_EXCL,
1619 					    root->fs_info->bdev_holder, 0,
1620 					    &bdev, &bh);
1621 		if (ret)
1622 			goto out;
1623 		disk_super = (struct btrfs_super_block *)bh->b_data;
1624 		devid = btrfs_stack_device_id(&disk_super->dev_item);
1625 		dev_uuid = disk_super->dev_item.uuid;
1626 		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1627 					   disk_super->fsid);
1628 		if (!device) {
1629 			ret = -ENOENT;
1630 			goto error_brelse;
1631 		}
1632 	}
1633 
1634 	if (device->is_tgtdev_for_dev_replace) {
1635 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1636 		goto error_brelse;
1637 	}
1638 
1639 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1640 		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1641 		goto error_brelse;
1642 	}
1643 
1644 	if (device->writeable) {
1645 		lock_chunks(root);
1646 		list_del_init(&device->dev_alloc_list);
1647 		device->fs_devices->rw_devices--;
1648 		unlock_chunks(root);
1649 		clear_super = true;
1650 	}
1651 
1652 	mutex_unlock(&uuid_mutex);
1653 	ret = btrfs_shrink_device(device, 0);
1654 	mutex_lock(&uuid_mutex);
1655 	if (ret)
1656 		goto error_undo;
1657 
1658 	/*
1659 	 * TODO: the superblock still includes this device in its num_devices
1660 	 * counter although write_all_supers() is not locked out. This
1661 	 * could give a filesystem state which requires a degraded mount.
1662 	 */
1663 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1664 	if (ret)
1665 		goto error_undo;
1666 
1667 	device->in_fs_metadata = 0;
1668 	btrfs_scrub_cancel_dev(root->fs_info, device);
1669 
1670 	/*
1671 	 * the device list mutex makes sure that we don't change
1672 	 * the device list while someone else is writing out all
1673 	 * the device supers. Whoever is writing all supers, should
1674 	 * lock the device list mutex before getting the number of
1675 	 * devices in the super block (super_copy). Conversely,
1676 	 * whoever updates the number of devices in the super block
1677 	 * (super_copy) should hold the device list mutex.
1678 	 */
1679 
1680 	cur_devices = device->fs_devices;
1681 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1682 	list_del_rcu(&device->dev_list);
1683 
1684 	device->fs_devices->num_devices--;
1685 	device->fs_devices->total_devices--;
1686 
1687 	if (device->missing)
1688 		device->fs_devices->missing_devices--;
1689 
1690 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
1691 				 struct btrfs_device, dev_list);
1692 	if (device->bdev == root->fs_info->sb->s_bdev)
1693 		root->fs_info->sb->s_bdev = next_device->bdev;
1694 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1695 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1696 
1697 	if (device->bdev) {
1698 		device->fs_devices->open_devices--;
1699 		/* remove sysfs entry */
1700 		btrfs_kobj_rm_device(root->fs_info, device);
1701 	}
1702 
1703 	call_rcu(&device->rcu, free_device);
1704 
1705 	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1706 	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1707 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1708 
1709 	if (cur_devices->open_devices == 0) {
1710 		struct btrfs_fs_devices *fs_devices;
1711 		fs_devices = root->fs_info->fs_devices;
1712 		while (fs_devices) {
1713 			if (fs_devices->seed == cur_devices) {
1714 				fs_devices->seed = cur_devices->seed;
1715 				break;
1716 			}
1717 			fs_devices = fs_devices->seed;
1718 		}
1719 		cur_devices->seed = NULL;
1720 		__btrfs_close_devices(cur_devices);
1721 		free_fs_devices(cur_devices);
1722 	}
1723 
1724 	root->fs_info->num_tolerated_disk_barrier_failures =
1725 		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1726 
1727 	/*
1728 	 * at this point, the device is zero sized.  We want to
1729 	 * remove it from the devices list and zero out the old super
1730 	 */
1731 	if (clear_super && disk_super) {
1732 		u64 bytenr;
1733 		int i;
1734 
1735 		/* make sure this device isn't detected as part of
1736 		 * the FS anymore
1737 		 */
1738 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1739 		set_buffer_dirty(bh);
1740 		sync_dirty_buffer(bh);
1741 
1742 		/* clear the mirror copies of super block on the disk
1743 		 * being removed, 0th copy is been taken care above and
1744 		 * the below would take of the rest
1745 		 */
1746 		for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1747 			bytenr = btrfs_sb_offset(i);
1748 			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1749 					i_size_read(bdev->bd_inode))
1750 				break;
1751 
1752 			brelse(bh);
1753 			bh = __bread(bdev, bytenr / 4096,
1754 					BTRFS_SUPER_INFO_SIZE);
1755 			if (!bh)
1756 				continue;
1757 
1758 			disk_super = (struct btrfs_super_block *)bh->b_data;
1759 
1760 			if (btrfs_super_bytenr(disk_super) != bytenr ||
1761 				btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1762 				continue;
1763 			}
1764 			memset(&disk_super->magic, 0,
1765 						sizeof(disk_super->magic));
1766 			set_buffer_dirty(bh);
1767 			sync_dirty_buffer(bh);
1768 		}
1769 	}
1770 
1771 	ret = 0;
1772 
1773 	if (bdev) {
1774 		/* Notify udev that device has changed */
1775 		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1776 
1777 		/* Update ctime/mtime for device path for libblkid */
1778 		update_dev_time(device_path);
1779 	}
1780 
1781 error_brelse:
1782 	brelse(bh);
1783 	if (bdev)
1784 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1785 out:
1786 	mutex_unlock(&uuid_mutex);
1787 	return ret;
1788 error_undo:
1789 	if (device->writeable) {
1790 		lock_chunks(root);
1791 		list_add(&device->dev_alloc_list,
1792 			 &root->fs_info->fs_devices->alloc_list);
1793 		device->fs_devices->rw_devices++;
1794 		unlock_chunks(root);
1795 	}
1796 	goto error_brelse;
1797 }
1798 
1799 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1800 					struct btrfs_device *srcdev)
1801 {
1802 	struct btrfs_fs_devices *fs_devices;
1803 
1804 	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1805 
1806 	/*
1807 	 * in case of fs with no seed, srcdev->fs_devices will point
1808 	 * to fs_devices of fs_info. However when the dev being replaced is
1809 	 * a seed dev it will point to the seed's local fs_devices. In short
1810 	 * srcdev will have its correct fs_devices in both the cases.
1811 	 */
1812 	fs_devices = srcdev->fs_devices;
1813 
1814 	list_del_rcu(&srcdev->dev_list);
1815 	list_del_rcu(&srcdev->dev_alloc_list);
1816 	fs_devices->num_devices--;
1817 	if (srcdev->missing)
1818 		fs_devices->missing_devices--;
1819 
1820 	if (srcdev->writeable) {
1821 		fs_devices->rw_devices--;
1822 		/* zero out the old super if it is writable */
1823 		btrfs_scratch_superblock(srcdev);
1824 	}
1825 
1826 	if (srcdev->bdev)
1827 		fs_devices->open_devices--;
1828 }
1829 
1830 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1831 				      struct btrfs_device *srcdev)
1832 {
1833 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1834 
1835 	call_rcu(&srcdev->rcu, free_device);
1836 
1837 	/*
1838 	 * unless fs_devices is seed fs, num_devices shouldn't go
1839 	 * zero
1840 	 */
1841 	BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
1842 
1843 	/* if this is no devs we rather delete the fs_devices */
1844 	if (!fs_devices->num_devices) {
1845 		struct btrfs_fs_devices *tmp_fs_devices;
1846 
1847 		tmp_fs_devices = fs_info->fs_devices;
1848 		while (tmp_fs_devices) {
1849 			if (tmp_fs_devices->seed == fs_devices) {
1850 				tmp_fs_devices->seed = fs_devices->seed;
1851 				break;
1852 			}
1853 			tmp_fs_devices = tmp_fs_devices->seed;
1854 		}
1855 		fs_devices->seed = NULL;
1856 		__btrfs_close_devices(fs_devices);
1857 		free_fs_devices(fs_devices);
1858 	}
1859 }
1860 
1861 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1862 				      struct btrfs_device *tgtdev)
1863 {
1864 	struct btrfs_device *next_device;
1865 
1866 	mutex_lock(&uuid_mutex);
1867 	WARN_ON(!tgtdev);
1868 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
1869 	if (tgtdev->bdev) {
1870 		btrfs_scratch_superblock(tgtdev);
1871 		fs_info->fs_devices->open_devices--;
1872 	}
1873 	fs_info->fs_devices->num_devices--;
1874 
1875 	next_device = list_entry(fs_info->fs_devices->devices.next,
1876 				 struct btrfs_device, dev_list);
1877 	if (tgtdev->bdev == fs_info->sb->s_bdev)
1878 		fs_info->sb->s_bdev = next_device->bdev;
1879 	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1880 		fs_info->fs_devices->latest_bdev = next_device->bdev;
1881 	list_del_rcu(&tgtdev->dev_list);
1882 
1883 	call_rcu(&tgtdev->rcu, free_device);
1884 
1885 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1886 	mutex_unlock(&uuid_mutex);
1887 }
1888 
1889 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1890 				     struct btrfs_device **device)
1891 {
1892 	int ret = 0;
1893 	struct btrfs_super_block *disk_super;
1894 	u64 devid;
1895 	u8 *dev_uuid;
1896 	struct block_device *bdev;
1897 	struct buffer_head *bh;
1898 
1899 	*device = NULL;
1900 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1901 				    root->fs_info->bdev_holder, 0, &bdev, &bh);
1902 	if (ret)
1903 		return ret;
1904 	disk_super = (struct btrfs_super_block *)bh->b_data;
1905 	devid = btrfs_stack_device_id(&disk_super->dev_item);
1906 	dev_uuid = disk_super->dev_item.uuid;
1907 	*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1908 				    disk_super->fsid);
1909 	brelse(bh);
1910 	if (!*device)
1911 		ret = -ENOENT;
1912 	blkdev_put(bdev, FMODE_READ);
1913 	return ret;
1914 }
1915 
1916 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1917 					 char *device_path,
1918 					 struct btrfs_device **device)
1919 {
1920 	*device = NULL;
1921 	if (strcmp(device_path, "missing") == 0) {
1922 		struct list_head *devices;
1923 		struct btrfs_device *tmp;
1924 
1925 		devices = &root->fs_info->fs_devices->devices;
1926 		/*
1927 		 * It is safe to read the devices since the volume_mutex
1928 		 * is held by the caller.
1929 		 */
1930 		list_for_each_entry(tmp, devices, dev_list) {
1931 			if (tmp->in_fs_metadata && !tmp->bdev) {
1932 				*device = tmp;
1933 				break;
1934 			}
1935 		}
1936 
1937 		if (!*device) {
1938 			btrfs_err(root->fs_info, "no missing device found");
1939 			return -ENOENT;
1940 		}
1941 
1942 		return 0;
1943 	} else {
1944 		return btrfs_find_device_by_path(root, device_path, device);
1945 	}
1946 }
1947 
1948 /*
1949  * does all the dirty work required for changing file system's UUID.
1950  */
1951 static int btrfs_prepare_sprout(struct btrfs_root *root)
1952 {
1953 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1954 	struct btrfs_fs_devices *old_devices;
1955 	struct btrfs_fs_devices *seed_devices;
1956 	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1957 	struct btrfs_device *device;
1958 	u64 super_flags;
1959 
1960 	BUG_ON(!mutex_is_locked(&uuid_mutex));
1961 	if (!fs_devices->seeding)
1962 		return -EINVAL;
1963 
1964 	seed_devices = __alloc_fs_devices();
1965 	if (IS_ERR(seed_devices))
1966 		return PTR_ERR(seed_devices);
1967 
1968 	old_devices = clone_fs_devices(fs_devices);
1969 	if (IS_ERR(old_devices)) {
1970 		kfree(seed_devices);
1971 		return PTR_ERR(old_devices);
1972 	}
1973 
1974 	list_add(&old_devices->list, &fs_uuids);
1975 
1976 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1977 	seed_devices->opened = 1;
1978 	INIT_LIST_HEAD(&seed_devices->devices);
1979 	INIT_LIST_HEAD(&seed_devices->alloc_list);
1980 	mutex_init(&seed_devices->device_list_mutex);
1981 
1982 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1983 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1984 			      synchronize_rcu);
1985 	list_for_each_entry(device, &seed_devices->devices, dev_list)
1986 		device->fs_devices = seed_devices;
1987 
1988 	lock_chunks(root);
1989 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1990 	unlock_chunks(root);
1991 
1992 	fs_devices->seeding = 0;
1993 	fs_devices->num_devices = 0;
1994 	fs_devices->open_devices = 0;
1995 	fs_devices->missing_devices = 0;
1996 	fs_devices->rotating = 0;
1997 	fs_devices->seed = seed_devices;
1998 
1999 	generate_random_uuid(fs_devices->fsid);
2000 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2001 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2002 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2003 
2004 	super_flags = btrfs_super_flags(disk_super) &
2005 		      ~BTRFS_SUPER_FLAG_SEEDING;
2006 	btrfs_set_super_flags(disk_super, super_flags);
2007 
2008 	return 0;
2009 }
2010 
2011 /*
2012  * strore the expected generation for seed devices in device items.
2013  */
2014 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2015 			       struct btrfs_root *root)
2016 {
2017 	struct btrfs_path *path;
2018 	struct extent_buffer *leaf;
2019 	struct btrfs_dev_item *dev_item;
2020 	struct btrfs_device *device;
2021 	struct btrfs_key key;
2022 	u8 fs_uuid[BTRFS_UUID_SIZE];
2023 	u8 dev_uuid[BTRFS_UUID_SIZE];
2024 	u64 devid;
2025 	int ret;
2026 
2027 	path = btrfs_alloc_path();
2028 	if (!path)
2029 		return -ENOMEM;
2030 
2031 	root = root->fs_info->chunk_root;
2032 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2033 	key.offset = 0;
2034 	key.type = BTRFS_DEV_ITEM_KEY;
2035 
2036 	while (1) {
2037 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2038 		if (ret < 0)
2039 			goto error;
2040 
2041 		leaf = path->nodes[0];
2042 next_slot:
2043 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2044 			ret = btrfs_next_leaf(root, path);
2045 			if (ret > 0)
2046 				break;
2047 			if (ret < 0)
2048 				goto error;
2049 			leaf = path->nodes[0];
2050 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2051 			btrfs_release_path(path);
2052 			continue;
2053 		}
2054 
2055 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2056 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2057 		    key.type != BTRFS_DEV_ITEM_KEY)
2058 			break;
2059 
2060 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
2061 					  struct btrfs_dev_item);
2062 		devid = btrfs_device_id(leaf, dev_item);
2063 		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2064 				   BTRFS_UUID_SIZE);
2065 		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2066 				   BTRFS_UUID_SIZE);
2067 		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
2068 					   fs_uuid);
2069 		BUG_ON(!device); /* Logic error */
2070 
2071 		if (device->fs_devices->seeding) {
2072 			btrfs_set_device_generation(leaf, dev_item,
2073 						    device->generation);
2074 			btrfs_mark_buffer_dirty(leaf);
2075 		}
2076 
2077 		path->slots[0]++;
2078 		goto next_slot;
2079 	}
2080 	ret = 0;
2081 error:
2082 	btrfs_free_path(path);
2083 	return ret;
2084 }
2085 
2086 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2087 {
2088 	struct request_queue *q;
2089 	struct btrfs_trans_handle *trans;
2090 	struct btrfs_device *device;
2091 	struct block_device *bdev;
2092 	struct list_head *devices;
2093 	struct super_block *sb = root->fs_info->sb;
2094 	struct rcu_string *name;
2095 	u64 tmp;
2096 	int seeding_dev = 0;
2097 	int ret = 0;
2098 
2099 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
2100 		return -EROFS;
2101 
2102 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2103 				  root->fs_info->bdev_holder);
2104 	if (IS_ERR(bdev))
2105 		return PTR_ERR(bdev);
2106 
2107 	if (root->fs_info->fs_devices->seeding) {
2108 		seeding_dev = 1;
2109 		down_write(&sb->s_umount);
2110 		mutex_lock(&uuid_mutex);
2111 	}
2112 
2113 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
2114 
2115 	devices = &root->fs_info->fs_devices->devices;
2116 
2117 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2118 	list_for_each_entry(device, devices, dev_list) {
2119 		if (device->bdev == bdev) {
2120 			ret = -EEXIST;
2121 			mutex_unlock(
2122 				&root->fs_info->fs_devices->device_list_mutex);
2123 			goto error;
2124 		}
2125 	}
2126 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2127 
2128 	device = btrfs_alloc_device(root->fs_info, NULL, NULL);
2129 	if (IS_ERR(device)) {
2130 		/* we can safely leave the fs_devices entry around */
2131 		ret = PTR_ERR(device);
2132 		goto error;
2133 	}
2134 
2135 	name = rcu_string_strdup(device_path, GFP_NOFS);
2136 	if (!name) {
2137 		kfree(device);
2138 		ret = -ENOMEM;
2139 		goto error;
2140 	}
2141 	rcu_assign_pointer(device->name, name);
2142 
2143 	trans = btrfs_start_transaction(root, 0);
2144 	if (IS_ERR(trans)) {
2145 		rcu_string_free(device->name);
2146 		kfree(device);
2147 		ret = PTR_ERR(trans);
2148 		goto error;
2149 	}
2150 
2151 	q = bdev_get_queue(bdev);
2152 	if (blk_queue_discard(q))
2153 		device->can_discard = 1;
2154 	device->writeable = 1;
2155 	device->generation = trans->transid;
2156 	device->io_width = root->sectorsize;
2157 	device->io_align = root->sectorsize;
2158 	device->sector_size = root->sectorsize;
2159 	device->total_bytes = i_size_read(bdev->bd_inode);
2160 	device->disk_total_bytes = device->total_bytes;
2161 	device->commit_total_bytes = device->total_bytes;
2162 	device->dev_root = root->fs_info->dev_root;
2163 	device->bdev = bdev;
2164 	device->in_fs_metadata = 1;
2165 	device->is_tgtdev_for_dev_replace = 0;
2166 	device->mode = FMODE_EXCL;
2167 	device->dev_stats_valid = 1;
2168 	set_blocksize(device->bdev, 4096);
2169 
2170 	if (seeding_dev) {
2171 		sb->s_flags &= ~MS_RDONLY;
2172 		ret = btrfs_prepare_sprout(root);
2173 		BUG_ON(ret); /* -ENOMEM */
2174 	}
2175 
2176 	device->fs_devices = root->fs_info->fs_devices;
2177 
2178 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2179 	lock_chunks(root);
2180 	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2181 	list_add(&device->dev_alloc_list,
2182 		 &root->fs_info->fs_devices->alloc_list);
2183 	root->fs_info->fs_devices->num_devices++;
2184 	root->fs_info->fs_devices->open_devices++;
2185 	root->fs_info->fs_devices->rw_devices++;
2186 	root->fs_info->fs_devices->total_devices++;
2187 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2188 
2189 	spin_lock(&root->fs_info->free_chunk_lock);
2190 	root->fs_info->free_chunk_space += device->total_bytes;
2191 	spin_unlock(&root->fs_info->free_chunk_lock);
2192 
2193 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2194 		root->fs_info->fs_devices->rotating = 1;
2195 
2196 	tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2197 	btrfs_set_super_total_bytes(root->fs_info->super_copy,
2198 				    tmp + device->total_bytes);
2199 
2200 	tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2201 	btrfs_set_super_num_devices(root->fs_info->super_copy,
2202 				    tmp + 1);
2203 
2204 	/* add sysfs device entry */
2205 	btrfs_kobj_add_device(root->fs_info, device);
2206 
2207 	/*
2208 	 * we've got more storage, clear any full flags on the space
2209 	 * infos
2210 	 */
2211 	btrfs_clear_space_info_full(root->fs_info);
2212 
2213 	unlock_chunks(root);
2214 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2215 
2216 	if (seeding_dev) {
2217 		lock_chunks(root);
2218 		ret = init_first_rw_device(trans, root, device);
2219 		unlock_chunks(root);
2220 		if (ret) {
2221 			btrfs_abort_transaction(trans, root, ret);
2222 			goto error_trans;
2223 		}
2224 	}
2225 
2226 	ret = btrfs_add_device(trans, root, device);
2227 	if (ret) {
2228 		btrfs_abort_transaction(trans, root, ret);
2229 		goto error_trans;
2230 	}
2231 
2232 	if (seeding_dev) {
2233 		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2234 
2235 		ret = btrfs_finish_sprout(trans, root);
2236 		if (ret) {
2237 			btrfs_abort_transaction(trans, root, ret);
2238 			goto error_trans;
2239 		}
2240 
2241 		/* Sprouting would change fsid of the mounted root,
2242 		 * so rename the fsid on the sysfs
2243 		 */
2244 		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2245 						root->fs_info->fsid);
2246 		if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2247 			goto error_trans;
2248 	}
2249 
2250 	root->fs_info->num_tolerated_disk_barrier_failures =
2251 		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2252 	ret = btrfs_commit_transaction(trans, root);
2253 
2254 	if (seeding_dev) {
2255 		mutex_unlock(&uuid_mutex);
2256 		up_write(&sb->s_umount);
2257 
2258 		if (ret) /* transaction commit */
2259 			return ret;
2260 
2261 		ret = btrfs_relocate_sys_chunks(root);
2262 		if (ret < 0)
2263 			btrfs_error(root->fs_info, ret,
2264 				    "Failed to relocate sys chunks after "
2265 				    "device initialization. This can be fixed "
2266 				    "using the \"btrfs balance\" command.");
2267 		trans = btrfs_attach_transaction(root);
2268 		if (IS_ERR(trans)) {
2269 			if (PTR_ERR(trans) == -ENOENT)
2270 				return 0;
2271 			return PTR_ERR(trans);
2272 		}
2273 		ret = btrfs_commit_transaction(trans, root);
2274 	}
2275 
2276 	/* Update ctime/mtime for libblkid */
2277 	update_dev_time(device_path);
2278 	return ret;
2279 
2280 error_trans:
2281 	btrfs_end_transaction(trans, root);
2282 	rcu_string_free(device->name);
2283 	btrfs_kobj_rm_device(root->fs_info, device);
2284 	kfree(device);
2285 error:
2286 	blkdev_put(bdev, FMODE_EXCL);
2287 	if (seeding_dev) {
2288 		mutex_unlock(&uuid_mutex);
2289 		up_write(&sb->s_umount);
2290 	}
2291 	return ret;
2292 }
2293 
2294 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2295 				  struct btrfs_device *srcdev,
2296 				  struct btrfs_device **device_out)
2297 {
2298 	struct request_queue *q;
2299 	struct btrfs_device *device;
2300 	struct block_device *bdev;
2301 	struct btrfs_fs_info *fs_info = root->fs_info;
2302 	struct list_head *devices;
2303 	struct rcu_string *name;
2304 	u64 devid = BTRFS_DEV_REPLACE_DEVID;
2305 	int ret = 0;
2306 
2307 	*device_out = NULL;
2308 	if (fs_info->fs_devices->seeding) {
2309 		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2310 		return -EINVAL;
2311 	}
2312 
2313 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2314 				  fs_info->bdev_holder);
2315 	if (IS_ERR(bdev)) {
2316 		btrfs_err(fs_info, "target device %s is invalid!", device_path);
2317 		return PTR_ERR(bdev);
2318 	}
2319 
2320 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
2321 
2322 	devices = &fs_info->fs_devices->devices;
2323 	list_for_each_entry(device, devices, dev_list) {
2324 		if (device->bdev == bdev) {
2325 			btrfs_err(fs_info, "target device is in the filesystem!");
2326 			ret = -EEXIST;
2327 			goto error;
2328 		}
2329 	}
2330 
2331 
2332 	if (i_size_read(bdev->bd_inode) <
2333 	    btrfs_device_get_total_bytes(srcdev)) {
2334 		btrfs_err(fs_info, "target device is smaller than source device!");
2335 		ret = -EINVAL;
2336 		goto error;
2337 	}
2338 
2339 
2340 	device = btrfs_alloc_device(NULL, &devid, NULL);
2341 	if (IS_ERR(device)) {
2342 		ret = PTR_ERR(device);
2343 		goto error;
2344 	}
2345 
2346 	name = rcu_string_strdup(device_path, GFP_NOFS);
2347 	if (!name) {
2348 		kfree(device);
2349 		ret = -ENOMEM;
2350 		goto error;
2351 	}
2352 	rcu_assign_pointer(device->name, name);
2353 
2354 	q = bdev_get_queue(bdev);
2355 	if (blk_queue_discard(q))
2356 		device->can_discard = 1;
2357 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2358 	device->writeable = 1;
2359 	device->generation = 0;
2360 	device->io_width = root->sectorsize;
2361 	device->io_align = root->sectorsize;
2362 	device->sector_size = root->sectorsize;
2363 	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2364 	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2365 	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2366 	ASSERT(list_empty(&srcdev->resized_list));
2367 	device->commit_total_bytes = srcdev->commit_total_bytes;
2368 	device->commit_bytes_used = device->bytes_used;
2369 	device->dev_root = fs_info->dev_root;
2370 	device->bdev = bdev;
2371 	device->in_fs_metadata = 1;
2372 	device->is_tgtdev_for_dev_replace = 1;
2373 	device->mode = FMODE_EXCL;
2374 	device->dev_stats_valid = 1;
2375 	set_blocksize(device->bdev, 4096);
2376 	device->fs_devices = fs_info->fs_devices;
2377 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
2378 	fs_info->fs_devices->num_devices++;
2379 	fs_info->fs_devices->open_devices++;
2380 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2381 
2382 	*device_out = device;
2383 	return ret;
2384 
2385 error:
2386 	blkdev_put(bdev, FMODE_EXCL);
2387 	return ret;
2388 }
2389 
2390 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2391 					      struct btrfs_device *tgtdev)
2392 {
2393 	WARN_ON(fs_info->fs_devices->rw_devices == 0);
2394 	tgtdev->io_width = fs_info->dev_root->sectorsize;
2395 	tgtdev->io_align = fs_info->dev_root->sectorsize;
2396 	tgtdev->sector_size = fs_info->dev_root->sectorsize;
2397 	tgtdev->dev_root = fs_info->dev_root;
2398 	tgtdev->in_fs_metadata = 1;
2399 }
2400 
2401 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2402 					struct btrfs_device *device)
2403 {
2404 	int ret;
2405 	struct btrfs_path *path;
2406 	struct btrfs_root *root;
2407 	struct btrfs_dev_item *dev_item;
2408 	struct extent_buffer *leaf;
2409 	struct btrfs_key key;
2410 
2411 	root = device->dev_root->fs_info->chunk_root;
2412 
2413 	path = btrfs_alloc_path();
2414 	if (!path)
2415 		return -ENOMEM;
2416 
2417 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2418 	key.type = BTRFS_DEV_ITEM_KEY;
2419 	key.offset = device->devid;
2420 
2421 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2422 	if (ret < 0)
2423 		goto out;
2424 
2425 	if (ret > 0) {
2426 		ret = -ENOENT;
2427 		goto out;
2428 	}
2429 
2430 	leaf = path->nodes[0];
2431 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2432 
2433 	btrfs_set_device_id(leaf, dev_item, device->devid);
2434 	btrfs_set_device_type(leaf, dev_item, device->type);
2435 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2436 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2437 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2438 	btrfs_set_device_total_bytes(leaf, dev_item,
2439 				     btrfs_device_get_disk_total_bytes(device));
2440 	btrfs_set_device_bytes_used(leaf, dev_item,
2441 				    btrfs_device_get_bytes_used(device));
2442 	btrfs_mark_buffer_dirty(leaf);
2443 
2444 out:
2445 	btrfs_free_path(path);
2446 	return ret;
2447 }
2448 
2449 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2450 		      struct btrfs_device *device, u64 new_size)
2451 {
2452 	struct btrfs_super_block *super_copy =
2453 		device->dev_root->fs_info->super_copy;
2454 	struct btrfs_fs_devices *fs_devices;
2455 	u64 old_total;
2456 	u64 diff;
2457 
2458 	if (!device->writeable)
2459 		return -EACCES;
2460 
2461 	lock_chunks(device->dev_root);
2462 	old_total = btrfs_super_total_bytes(super_copy);
2463 	diff = new_size - device->total_bytes;
2464 
2465 	if (new_size <= device->total_bytes ||
2466 	    device->is_tgtdev_for_dev_replace) {
2467 		unlock_chunks(device->dev_root);
2468 		return -EINVAL;
2469 	}
2470 
2471 	fs_devices = device->dev_root->fs_info->fs_devices;
2472 
2473 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
2474 	device->fs_devices->total_rw_bytes += diff;
2475 
2476 	btrfs_device_set_total_bytes(device, new_size);
2477 	btrfs_device_set_disk_total_bytes(device, new_size);
2478 	btrfs_clear_space_info_full(device->dev_root->fs_info);
2479 	if (list_empty(&device->resized_list))
2480 		list_add_tail(&device->resized_list,
2481 			      &fs_devices->resized_devices);
2482 	unlock_chunks(device->dev_root);
2483 
2484 	return btrfs_update_device(trans, device);
2485 }
2486 
2487 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2488 			    struct btrfs_root *root,
2489 			    u64 chunk_tree, u64 chunk_objectid,
2490 			    u64 chunk_offset)
2491 {
2492 	int ret;
2493 	struct btrfs_path *path;
2494 	struct btrfs_key key;
2495 
2496 	root = root->fs_info->chunk_root;
2497 	path = btrfs_alloc_path();
2498 	if (!path)
2499 		return -ENOMEM;
2500 
2501 	key.objectid = chunk_objectid;
2502 	key.offset = chunk_offset;
2503 	key.type = BTRFS_CHUNK_ITEM_KEY;
2504 
2505 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2506 	if (ret < 0)
2507 		goto out;
2508 	else if (ret > 0) { /* Logic error or corruption */
2509 		btrfs_error(root->fs_info, -ENOENT,
2510 			    "Failed lookup while freeing chunk.");
2511 		ret = -ENOENT;
2512 		goto out;
2513 	}
2514 
2515 	ret = btrfs_del_item(trans, root, path);
2516 	if (ret < 0)
2517 		btrfs_error(root->fs_info, ret,
2518 			    "Failed to delete chunk item.");
2519 out:
2520 	btrfs_free_path(path);
2521 	return ret;
2522 }
2523 
2524 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2525 			chunk_offset)
2526 {
2527 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2528 	struct btrfs_disk_key *disk_key;
2529 	struct btrfs_chunk *chunk;
2530 	u8 *ptr;
2531 	int ret = 0;
2532 	u32 num_stripes;
2533 	u32 array_size;
2534 	u32 len = 0;
2535 	u32 cur;
2536 	struct btrfs_key key;
2537 
2538 	lock_chunks(root);
2539 	array_size = btrfs_super_sys_array_size(super_copy);
2540 
2541 	ptr = super_copy->sys_chunk_array;
2542 	cur = 0;
2543 
2544 	while (cur < array_size) {
2545 		disk_key = (struct btrfs_disk_key *)ptr;
2546 		btrfs_disk_key_to_cpu(&key, disk_key);
2547 
2548 		len = sizeof(*disk_key);
2549 
2550 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2551 			chunk = (struct btrfs_chunk *)(ptr + len);
2552 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2553 			len += btrfs_chunk_item_size(num_stripes);
2554 		} else {
2555 			ret = -EIO;
2556 			break;
2557 		}
2558 		if (key.objectid == chunk_objectid &&
2559 		    key.offset == chunk_offset) {
2560 			memmove(ptr, ptr + len, array_size - (cur + len));
2561 			array_size -= len;
2562 			btrfs_set_super_sys_array_size(super_copy, array_size);
2563 		} else {
2564 			ptr += len;
2565 			cur += len;
2566 		}
2567 	}
2568 	unlock_chunks(root);
2569 	return ret;
2570 }
2571 
2572 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2573 		       struct btrfs_root *root, u64 chunk_offset)
2574 {
2575 	struct extent_map_tree *em_tree;
2576 	struct extent_map *em;
2577 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2578 	struct map_lookup *map;
2579 	u64 dev_extent_len = 0;
2580 	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2581 	u64 chunk_tree = root->fs_info->chunk_root->objectid;
2582 	int i, ret = 0;
2583 
2584 	/* Just in case */
2585 	root = root->fs_info->chunk_root;
2586 	em_tree = &root->fs_info->mapping_tree.map_tree;
2587 
2588 	read_lock(&em_tree->lock);
2589 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2590 	read_unlock(&em_tree->lock);
2591 
2592 	if (!em || em->start > chunk_offset ||
2593 	    em->start + em->len < chunk_offset) {
2594 		/*
2595 		 * This is a logic error, but we don't want to just rely on the
2596 		 * user having built with ASSERT enabled, so if ASSERT doens't
2597 		 * do anything we still error out.
2598 		 */
2599 		ASSERT(0);
2600 		if (em)
2601 			free_extent_map(em);
2602 		return -EINVAL;
2603 	}
2604 	map = (struct map_lookup *)em->bdev;
2605 
2606 	for (i = 0; i < map->num_stripes; i++) {
2607 		struct btrfs_device *device = map->stripes[i].dev;
2608 		ret = btrfs_free_dev_extent(trans, device,
2609 					    map->stripes[i].physical,
2610 					    &dev_extent_len);
2611 		if (ret) {
2612 			btrfs_abort_transaction(trans, root, ret);
2613 			goto out;
2614 		}
2615 
2616 		if (device->bytes_used > 0) {
2617 			lock_chunks(root);
2618 			btrfs_device_set_bytes_used(device,
2619 					device->bytes_used - dev_extent_len);
2620 			spin_lock(&root->fs_info->free_chunk_lock);
2621 			root->fs_info->free_chunk_space += dev_extent_len;
2622 			spin_unlock(&root->fs_info->free_chunk_lock);
2623 			btrfs_clear_space_info_full(root->fs_info);
2624 			unlock_chunks(root);
2625 		}
2626 
2627 		if (map->stripes[i].dev) {
2628 			ret = btrfs_update_device(trans, map->stripes[i].dev);
2629 			if (ret) {
2630 				btrfs_abort_transaction(trans, root, ret);
2631 				goto out;
2632 			}
2633 		}
2634 	}
2635 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2636 			       chunk_offset);
2637 	if (ret) {
2638 		btrfs_abort_transaction(trans, root, ret);
2639 		goto out;
2640 	}
2641 
2642 	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2643 
2644 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2645 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2646 		if (ret) {
2647 			btrfs_abort_transaction(trans, root, ret);
2648 			goto out;
2649 		}
2650 	}
2651 
2652 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2653 	if (ret) {
2654 		btrfs_abort_transaction(trans, extent_root, ret);
2655 		goto out;
2656 	}
2657 
2658 out:
2659 	/* once for us */
2660 	free_extent_map(em);
2661 	return ret;
2662 }
2663 
2664 static int btrfs_relocate_chunk(struct btrfs_root *root,
2665 			 u64 chunk_tree, u64 chunk_objectid,
2666 			 u64 chunk_offset)
2667 {
2668 	struct btrfs_root *extent_root;
2669 	struct btrfs_trans_handle *trans;
2670 	int ret;
2671 
2672 	root = root->fs_info->chunk_root;
2673 	extent_root = root->fs_info->extent_root;
2674 
2675 	ret = btrfs_can_relocate(extent_root, chunk_offset);
2676 	if (ret)
2677 		return -ENOSPC;
2678 
2679 	/* step one, relocate all the extents inside this chunk */
2680 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2681 	if (ret)
2682 		return ret;
2683 
2684 	trans = btrfs_start_transaction(root, 0);
2685 	if (IS_ERR(trans)) {
2686 		ret = PTR_ERR(trans);
2687 		btrfs_std_error(root->fs_info, ret);
2688 		return ret;
2689 	}
2690 
2691 	/*
2692 	 * step two, delete the device extents and the
2693 	 * chunk tree entries
2694 	 */
2695 	ret = btrfs_remove_chunk(trans, root, chunk_offset);
2696 	btrfs_end_transaction(trans, root);
2697 	return ret;
2698 }
2699 
2700 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2701 {
2702 	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2703 	struct btrfs_path *path;
2704 	struct extent_buffer *leaf;
2705 	struct btrfs_chunk *chunk;
2706 	struct btrfs_key key;
2707 	struct btrfs_key found_key;
2708 	u64 chunk_tree = chunk_root->root_key.objectid;
2709 	u64 chunk_type;
2710 	bool retried = false;
2711 	int failed = 0;
2712 	int ret;
2713 
2714 	path = btrfs_alloc_path();
2715 	if (!path)
2716 		return -ENOMEM;
2717 
2718 again:
2719 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2720 	key.offset = (u64)-1;
2721 	key.type = BTRFS_CHUNK_ITEM_KEY;
2722 
2723 	while (1) {
2724 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2725 		if (ret < 0)
2726 			goto error;
2727 		BUG_ON(ret == 0); /* Corruption */
2728 
2729 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
2730 					  key.type);
2731 		if (ret < 0)
2732 			goto error;
2733 		if (ret > 0)
2734 			break;
2735 
2736 		leaf = path->nodes[0];
2737 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2738 
2739 		chunk = btrfs_item_ptr(leaf, path->slots[0],
2740 				       struct btrfs_chunk);
2741 		chunk_type = btrfs_chunk_type(leaf, chunk);
2742 		btrfs_release_path(path);
2743 
2744 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2745 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2746 						   found_key.objectid,
2747 						   found_key.offset);
2748 			if (ret == -ENOSPC)
2749 				failed++;
2750 			else
2751 				BUG_ON(ret);
2752 		}
2753 
2754 		if (found_key.offset == 0)
2755 			break;
2756 		key.offset = found_key.offset - 1;
2757 	}
2758 	ret = 0;
2759 	if (failed && !retried) {
2760 		failed = 0;
2761 		retried = true;
2762 		goto again;
2763 	} else if (WARN_ON(failed && retried)) {
2764 		ret = -ENOSPC;
2765 	}
2766 error:
2767 	btrfs_free_path(path);
2768 	return ret;
2769 }
2770 
2771 static int insert_balance_item(struct btrfs_root *root,
2772 			       struct btrfs_balance_control *bctl)
2773 {
2774 	struct btrfs_trans_handle *trans;
2775 	struct btrfs_balance_item *item;
2776 	struct btrfs_disk_balance_args disk_bargs;
2777 	struct btrfs_path *path;
2778 	struct extent_buffer *leaf;
2779 	struct btrfs_key key;
2780 	int ret, err;
2781 
2782 	path = btrfs_alloc_path();
2783 	if (!path)
2784 		return -ENOMEM;
2785 
2786 	trans = btrfs_start_transaction(root, 0);
2787 	if (IS_ERR(trans)) {
2788 		btrfs_free_path(path);
2789 		return PTR_ERR(trans);
2790 	}
2791 
2792 	key.objectid = BTRFS_BALANCE_OBJECTID;
2793 	key.type = BTRFS_BALANCE_ITEM_KEY;
2794 	key.offset = 0;
2795 
2796 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2797 				      sizeof(*item));
2798 	if (ret)
2799 		goto out;
2800 
2801 	leaf = path->nodes[0];
2802 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2803 
2804 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2805 
2806 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2807 	btrfs_set_balance_data(leaf, item, &disk_bargs);
2808 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2809 	btrfs_set_balance_meta(leaf, item, &disk_bargs);
2810 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2811 	btrfs_set_balance_sys(leaf, item, &disk_bargs);
2812 
2813 	btrfs_set_balance_flags(leaf, item, bctl->flags);
2814 
2815 	btrfs_mark_buffer_dirty(leaf);
2816 out:
2817 	btrfs_free_path(path);
2818 	err = btrfs_commit_transaction(trans, root);
2819 	if (err && !ret)
2820 		ret = err;
2821 	return ret;
2822 }
2823 
2824 static int del_balance_item(struct btrfs_root *root)
2825 {
2826 	struct btrfs_trans_handle *trans;
2827 	struct btrfs_path *path;
2828 	struct btrfs_key key;
2829 	int ret, err;
2830 
2831 	path = btrfs_alloc_path();
2832 	if (!path)
2833 		return -ENOMEM;
2834 
2835 	trans = btrfs_start_transaction(root, 0);
2836 	if (IS_ERR(trans)) {
2837 		btrfs_free_path(path);
2838 		return PTR_ERR(trans);
2839 	}
2840 
2841 	key.objectid = BTRFS_BALANCE_OBJECTID;
2842 	key.type = BTRFS_BALANCE_ITEM_KEY;
2843 	key.offset = 0;
2844 
2845 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2846 	if (ret < 0)
2847 		goto out;
2848 	if (ret > 0) {
2849 		ret = -ENOENT;
2850 		goto out;
2851 	}
2852 
2853 	ret = btrfs_del_item(trans, root, path);
2854 out:
2855 	btrfs_free_path(path);
2856 	err = btrfs_commit_transaction(trans, root);
2857 	if (err && !ret)
2858 		ret = err;
2859 	return ret;
2860 }
2861 
2862 /*
2863  * This is a heuristic used to reduce the number of chunks balanced on
2864  * resume after balance was interrupted.
2865  */
2866 static void update_balance_args(struct btrfs_balance_control *bctl)
2867 {
2868 	/*
2869 	 * Turn on soft mode for chunk types that were being converted.
2870 	 */
2871 	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2872 		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2873 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2874 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2875 	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2876 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2877 
2878 	/*
2879 	 * Turn on usage filter if is not already used.  The idea is
2880 	 * that chunks that we have already balanced should be
2881 	 * reasonably full.  Don't do it for chunks that are being
2882 	 * converted - that will keep us from relocating unconverted
2883 	 * (albeit full) chunks.
2884 	 */
2885 	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2886 	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2887 		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2888 		bctl->data.usage = 90;
2889 	}
2890 	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2891 	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2892 		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2893 		bctl->sys.usage = 90;
2894 	}
2895 	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2896 	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2897 		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2898 		bctl->meta.usage = 90;
2899 	}
2900 }
2901 
2902 /*
2903  * Should be called with both balance and volume mutexes held to
2904  * serialize other volume operations (add_dev/rm_dev/resize) with
2905  * restriper.  Same goes for unset_balance_control.
2906  */
2907 static void set_balance_control(struct btrfs_balance_control *bctl)
2908 {
2909 	struct btrfs_fs_info *fs_info = bctl->fs_info;
2910 
2911 	BUG_ON(fs_info->balance_ctl);
2912 
2913 	spin_lock(&fs_info->balance_lock);
2914 	fs_info->balance_ctl = bctl;
2915 	spin_unlock(&fs_info->balance_lock);
2916 }
2917 
2918 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2919 {
2920 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2921 
2922 	BUG_ON(!fs_info->balance_ctl);
2923 
2924 	spin_lock(&fs_info->balance_lock);
2925 	fs_info->balance_ctl = NULL;
2926 	spin_unlock(&fs_info->balance_lock);
2927 
2928 	kfree(bctl);
2929 }
2930 
2931 /*
2932  * Balance filters.  Return 1 if chunk should be filtered out
2933  * (should not be balanced).
2934  */
2935 static int chunk_profiles_filter(u64 chunk_type,
2936 				 struct btrfs_balance_args *bargs)
2937 {
2938 	chunk_type = chunk_to_extended(chunk_type) &
2939 				BTRFS_EXTENDED_PROFILE_MASK;
2940 
2941 	if (bargs->profiles & chunk_type)
2942 		return 0;
2943 
2944 	return 1;
2945 }
2946 
2947 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2948 			      struct btrfs_balance_args *bargs)
2949 {
2950 	struct btrfs_block_group_cache *cache;
2951 	u64 chunk_used, user_thresh;
2952 	int ret = 1;
2953 
2954 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2955 	chunk_used = btrfs_block_group_used(&cache->item);
2956 
2957 	if (bargs->usage == 0)
2958 		user_thresh = 1;
2959 	else if (bargs->usage > 100)
2960 		user_thresh = cache->key.offset;
2961 	else
2962 		user_thresh = div_factor_fine(cache->key.offset,
2963 					      bargs->usage);
2964 
2965 	if (chunk_used < user_thresh)
2966 		ret = 0;
2967 
2968 	btrfs_put_block_group(cache);
2969 	return ret;
2970 }
2971 
2972 static int chunk_devid_filter(struct extent_buffer *leaf,
2973 			      struct btrfs_chunk *chunk,
2974 			      struct btrfs_balance_args *bargs)
2975 {
2976 	struct btrfs_stripe *stripe;
2977 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2978 	int i;
2979 
2980 	for (i = 0; i < num_stripes; i++) {
2981 		stripe = btrfs_stripe_nr(chunk, i);
2982 		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2983 			return 0;
2984 	}
2985 
2986 	return 1;
2987 }
2988 
2989 /* [pstart, pend) */
2990 static int chunk_drange_filter(struct extent_buffer *leaf,
2991 			       struct btrfs_chunk *chunk,
2992 			       u64 chunk_offset,
2993 			       struct btrfs_balance_args *bargs)
2994 {
2995 	struct btrfs_stripe *stripe;
2996 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2997 	u64 stripe_offset;
2998 	u64 stripe_length;
2999 	int factor;
3000 	int i;
3001 
3002 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3003 		return 0;
3004 
3005 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3006 	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3007 		factor = num_stripes / 2;
3008 	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3009 		factor = num_stripes - 1;
3010 	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3011 		factor = num_stripes - 2;
3012 	} else {
3013 		factor = num_stripes;
3014 	}
3015 
3016 	for (i = 0; i < num_stripes; i++) {
3017 		stripe = btrfs_stripe_nr(chunk, i);
3018 		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3019 			continue;
3020 
3021 		stripe_offset = btrfs_stripe_offset(leaf, stripe);
3022 		stripe_length = btrfs_chunk_length(leaf, chunk);
3023 		do_div(stripe_length, factor);
3024 
3025 		if (stripe_offset < bargs->pend &&
3026 		    stripe_offset + stripe_length > bargs->pstart)
3027 			return 0;
3028 	}
3029 
3030 	return 1;
3031 }
3032 
3033 /* [vstart, vend) */
3034 static int chunk_vrange_filter(struct extent_buffer *leaf,
3035 			       struct btrfs_chunk *chunk,
3036 			       u64 chunk_offset,
3037 			       struct btrfs_balance_args *bargs)
3038 {
3039 	if (chunk_offset < bargs->vend &&
3040 	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3041 		/* at least part of the chunk is inside this vrange */
3042 		return 0;
3043 
3044 	return 1;
3045 }
3046 
3047 static int chunk_soft_convert_filter(u64 chunk_type,
3048 				     struct btrfs_balance_args *bargs)
3049 {
3050 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3051 		return 0;
3052 
3053 	chunk_type = chunk_to_extended(chunk_type) &
3054 				BTRFS_EXTENDED_PROFILE_MASK;
3055 
3056 	if (bargs->target == chunk_type)
3057 		return 1;
3058 
3059 	return 0;
3060 }
3061 
3062 static int should_balance_chunk(struct btrfs_root *root,
3063 				struct extent_buffer *leaf,
3064 				struct btrfs_chunk *chunk, u64 chunk_offset)
3065 {
3066 	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3067 	struct btrfs_balance_args *bargs = NULL;
3068 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3069 
3070 	/* type filter */
3071 	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3072 	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3073 		return 0;
3074 	}
3075 
3076 	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3077 		bargs = &bctl->data;
3078 	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3079 		bargs = &bctl->sys;
3080 	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3081 		bargs = &bctl->meta;
3082 
3083 	/* profiles filter */
3084 	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3085 	    chunk_profiles_filter(chunk_type, bargs)) {
3086 		return 0;
3087 	}
3088 
3089 	/* usage filter */
3090 	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3091 	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3092 		return 0;
3093 	}
3094 
3095 	/* devid filter */
3096 	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3097 	    chunk_devid_filter(leaf, chunk, bargs)) {
3098 		return 0;
3099 	}
3100 
3101 	/* drange filter, makes sense only with devid filter */
3102 	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3103 	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3104 		return 0;
3105 	}
3106 
3107 	/* vrange filter */
3108 	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3109 	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3110 		return 0;
3111 	}
3112 
3113 	/* soft profile changing mode */
3114 	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3115 	    chunk_soft_convert_filter(chunk_type, bargs)) {
3116 		return 0;
3117 	}
3118 
3119 	/*
3120 	 * limited by count, must be the last filter
3121 	 */
3122 	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3123 		if (bargs->limit == 0)
3124 			return 0;
3125 		else
3126 			bargs->limit--;
3127 	}
3128 
3129 	return 1;
3130 }
3131 
3132 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3133 {
3134 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3135 	struct btrfs_root *chunk_root = fs_info->chunk_root;
3136 	struct btrfs_root *dev_root = fs_info->dev_root;
3137 	struct list_head *devices;
3138 	struct btrfs_device *device;
3139 	u64 old_size;
3140 	u64 size_to_free;
3141 	struct btrfs_chunk *chunk;
3142 	struct btrfs_path *path;
3143 	struct btrfs_key key;
3144 	struct btrfs_key found_key;
3145 	struct btrfs_trans_handle *trans;
3146 	struct extent_buffer *leaf;
3147 	int slot;
3148 	int ret;
3149 	int enospc_errors = 0;
3150 	bool counting = true;
3151 	u64 limit_data = bctl->data.limit;
3152 	u64 limit_meta = bctl->meta.limit;
3153 	u64 limit_sys = bctl->sys.limit;
3154 
3155 	/* step one make some room on all the devices */
3156 	devices = &fs_info->fs_devices->devices;
3157 	list_for_each_entry(device, devices, dev_list) {
3158 		old_size = btrfs_device_get_total_bytes(device);
3159 		size_to_free = div_factor(old_size, 1);
3160 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3161 		if (!device->writeable ||
3162 		    btrfs_device_get_total_bytes(device) -
3163 		    btrfs_device_get_bytes_used(device) > size_to_free ||
3164 		    device->is_tgtdev_for_dev_replace)
3165 			continue;
3166 
3167 		ret = btrfs_shrink_device(device, old_size - size_to_free);
3168 		if (ret == -ENOSPC)
3169 			break;
3170 		BUG_ON(ret);
3171 
3172 		trans = btrfs_start_transaction(dev_root, 0);
3173 		BUG_ON(IS_ERR(trans));
3174 
3175 		ret = btrfs_grow_device(trans, device, old_size);
3176 		BUG_ON(ret);
3177 
3178 		btrfs_end_transaction(trans, dev_root);
3179 	}
3180 
3181 	/* step two, relocate all the chunks */
3182 	path = btrfs_alloc_path();
3183 	if (!path) {
3184 		ret = -ENOMEM;
3185 		goto error;
3186 	}
3187 
3188 	/* zero out stat counters */
3189 	spin_lock(&fs_info->balance_lock);
3190 	memset(&bctl->stat, 0, sizeof(bctl->stat));
3191 	spin_unlock(&fs_info->balance_lock);
3192 again:
3193 	if (!counting) {
3194 		bctl->data.limit = limit_data;
3195 		bctl->meta.limit = limit_meta;
3196 		bctl->sys.limit = limit_sys;
3197 	}
3198 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3199 	key.offset = (u64)-1;
3200 	key.type = BTRFS_CHUNK_ITEM_KEY;
3201 
3202 	while (1) {
3203 		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3204 		    atomic_read(&fs_info->balance_cancel_req)) {
3205 			ret = -ECANCELED;
3206 			goto error;
3207 		}
3208 
3209 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3210 		if (ret < 0)
3211 			goto error;
3212 
3213 		/*
3214 		 * this shouldn't happen, it means the last relocate
3215 		 * failed
3216 		 */
3217 		if (ret == 0)
3218 			BUG(); /* FIXME break ? */
3219 
3220 		ret = btrfs_previous_item(chunk_root, path, 0,
3221 					  BTRFS_CHUNK_ITEM_KEY);
3222 		if (ret) {
3223 			ret = 0;
3224 			break;
3225 		}
3226 
3227 		leaf = path->nodes[0];
3228 		slot = path->slots[0];
3229 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3230 
3231 		if (found_key.objectid != key.objectid)
3232 			break;
3233 
3234 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3235 
3236 		if (!counting) {
3237 			spin_lock(&fs_info->balance_lock);
3238 			bctl->stat.considered++;
3239 			spin_unlock(&fs_info->balance_lock);
3240 		}
3241 
3242 		ret = should_balance_chunk(chunk_root, leaf, chunk,
3243 					   found_key.offset);
3244 		btrfs_release_path(path);
3245 		if (!ret)
3246 			goto loop;
3247 
3248 		if (counting) {
3249 			spin_lock(&fs_info->balance_lock);
3250 			bctl->stat.expected++;
3251 			spin_unlock(&fs_info->balance_lock);
3252 			goto loop;
3253 		}
3254 
3255 		ret = btrfs_relocate_chunk(chunk_root,
3256 					   chunk_root->root_key.objectid,
3257 					   found_key.objectid,
3258 					   found_key.offset);
3259 		if (ret && ret != -ENOSPC)
3260 			goto error;
3261 		if (ret == -ENOSPC) {
3262 			enospc_errors++;
3263 		} else {
3264 			spin_lock(&fs_info->balance_lock);
3265 			bctl->stat.completed++;
3266 			spin_unlock(&fs_info->balance_lock);
3267 		}
3268 loop:
3269 		if (found_key.offset == 0)
3270 			break;
3271 		key.offset = found_key.offset - 1;
3272 	}
3273 
3274 	if (counting) {
3275 		btrfs_release_path(path);
3276 		counting = false;
3277 		goto again;
3278 	}
3279 error:
3280 	btrfs_free_path(path);
3281 	if (enospc_errors) {
3282 		btrfs_info(fs_info, "%d enospc errors during balance",
3283 		       enospc_errors);
3284 		if (!ret)
3285 			ret = -ENOSPC;
3286 	}
3287 
3288 	return ret;
3289 }
3290 
3291 /**
3292  * alloc_profile_is_valid - see if a given profile is valid and reduced
3293  * @flags: profile to validate
3294  * @extended: if true @flags is treated as an extended profile
3295  */
3296 static int alloc_profile_is_valid(u64 flags, int extended)
3297 {
3298 	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3299 			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
3300 
3301 	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3302 
3303 	/* 1) check that all other bits are zeroed */
3304 	if (flags & ~mask)
3305 		return 0;
3306 
3307 	/* 2) see if profile is reduced */
3308 	if (flags == 0)
3309 		return !extended; /* "0" is valid for usual profiles */
3310 
3311 	/* true if exactly one bit set */
3312 	return (flags & (flags - 1)) == 0;
3313 }
3314 
3315 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3316 {
3317 	/* cancel requested || normal exit path */
3318 	return atomic_read(&fs_info->balance_cancel_req) ||
3319 		(atomic_read(&fs_info->balance_pause_req) == 0 &&
3320 		 atomic_read(&fs_info->balance_cancel_req) == 0);
3321 }
3322 
3323 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3324 {
3325 	int ret;
3326 
3327 	unset_balance_control(fs_info);
3328 	ret = del_balance_item(fs_info->tree_root);
3329 	if (ret)
3330 		btrfs_std_error(fs_info, ret);
3331 
3332 	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3333 }
3334 
3335 /*
3336  * Should be called with both balance and volume mutexes held
3337  */
3338 int btrfs_balance(struct btrfs_balance_control *bctl,
3339 		  struct btrfs_ioctl_balance_args *bargs)
3340 {
3341 	struct btrfs_fs_info *fs_info = bctl->fs_info;
3342 	u64 allowed;
3343 	int mixed = 0;
3344 	int ret;
3345 	u64 num_devices;
3346 	unsigned seq;
3347 
3348 	if (btrfs_fs_closing(fs_info) ||
3349 	    atomic_read(&fs_info->balance_pause_req) ||
3350 	    atomic_read(&fs_info->balance_cancel_req)) {
3351 		ret = -EINVAL;
3352 		goto out;
3353 	}
3354 
3355 	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3356 	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3357 		mixed = 1;
3358 
3359 	/*
3360 	 * In case of mixed groups both data and meta should be picked,
3361 	 * and identical options should be given for both of them.
3362 	 */
3363 	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3364 	if (mixed && (bctl->flags & allowed)) {
3365 		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3366 		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3367 		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3368 			btrfs_err(fs_info, "with mixed groups data and "
3369 				   "metadata balance options must be the same");
3370 			ret = -EINVAL;
3371 			goto out;
3372 		}
3373 	}
3374 
3375 	num_devices = fs_info->fs_devices->num_devices;
3376 	btrfs_dev_replace_lock(&fs_info->dev_replace);
3377 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3378 		BUG_ON(num_devices < 1);
3379 		num_devices--;
3380 	}
3381 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
3382 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3383 	if (num_devices == 1)
3384 		allowed |= BTRFS_BLOCK_GROUP_DUP;
3385 	else if (num_devices > 1)
3386 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3387 	if (num_devices > 2)
3388 		allowed |= BTRFS_BLOCK_GROUP_RAID5;
3389 	if (num_devices > 3)
3390 		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3391 			    BTRFS_BLOCK_GROUP_RAID6);
3392 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3393 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
3394 	     (bctl->data.target & ~allowed))) {
3395 		btrfs_err(fs_info, "unable to start balance with target "
3396 			   "data profile %llu",
3397 		       bctl->data.target);
3398 		ret = -EINVAL;
3399 		goto out;
3400 	}
3401 	if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3402 	    (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3403 	     (bctl->meta.target & ~allowed))) {
3404 		btrfs_err(fs_info,
3405 			   "unable to start balance with target metadata profile %llu",
3406 		       bctl->meta.target);
3407 		ret = -EINVAL;
3408 		goto out;
3409 	}
3410 	if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3411 	    (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3412 	     (bctl->sys.target & ~allowed))) {
3413 		btrfs_err(fs_info,
3414 			   "unable to start balance with target system profile %llu",
3415 		       bctl->sys.target);
3416 		ret = -EINVAL;
3417 		goto out;
3418 	}
3419 
3420 	/* allow dup'ed data chunks only in mixed mode */
3421 	if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3422 	    (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3423 		btrfs_err(fs_info, "dup for data is not allowed");
3424 		ret = -EINVAL;
3425 		goto out;
3426 	}
3427 
3428 	/* allow to reduce meta or sys integrity only if force set */
3429 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3430 			BTRFS_BLOCK_GROUP_RAID10 |
3431 			BTRFS_BLOCK_GROUP_RAID5 |
3432 			BTRFS_BLOCK_GROUP_RAID6;
3433 	do {
3434 		seq = read_seqbegin(&fs_info->profiles_lock);
3435 
3436 		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3437 		     (fs_info->avail_system_alloc_bits & allowed) &&
3438 		     !(bctl->sys.target & allowed)) ||
3439 		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3440 		     (fs_info->avail_metadata_alloc_bits & allowed) &&
3441 		     !(bctl->meta.target & allowed))) {
3442 			if (bctl->flags & BTRFS_BALANCE_FORCE) {
3443 				btrfs_info(fs_info, "force reducing metadata integrity");
3444 			} else {
3445 				btrfs_err(fs_info, "balance will reduce metadata "
3446 					   "integrity, use force if you want this");
3447 				ret = -EINVAL;
3448 				goto out;
3449 			}
3450 		}
3451 	} while (read_seqretry(&fs_info->profiles_lock, seq));
3452 
3453 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3454 		int num_tolerated_disk_barrier_failures;
3455 		u64 target = bctl->sys.target;
3456 
3457 		num_tolerated_disk_barrier_failures =
3458 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3459 		if (num_tolerated_disk_barrier_failures > 0 &&
3460 		    (target &
3461 		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3462 		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3463 			num_tolerated_disk_barrier_failures = 0;
3464 		else if (num_tolerated_disk_barrier_failures > 1 &&
3465 			 (target &
3466 			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3467 			num_tolerated_disk_barrier_failures = 1;
3468 
3469 		fs_info->num_tolerated_disk_barrier_failures =
3470 			num_tolerated_disk_barrier_failures;
3471 	}
3472 
3473 	ret = insert_balance_item(fs_info->tree_root, bctl);
3474 	if (ret && ret != -EEXIST)
3475 		goto out;
3476 
3477 	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3478 		BUG_ON(ret == -EEXIST);
3479 		set_balance_control(bctl);
3480 	} else {
3481 		BUG_ON(ret != -EEXIST);
3482 		spin_lock(&fs_info->balance_lock);
3483 		update_balance_args(bctl);
3484 		spin_unlock(&fs_info->balance_lock);
3485 	}
3486 
3487 	atomic_inc(&fs_info->balance_running);
3488 	mutex_unlock(&fs_info->balance_mutex);
3489 
3490 	ret = __btrfs_balance(fs_info);
3491 
3492 	mutex_lock(&fs_info->balance_mutex);
3493 	atomic_dec(&fs_info->balance_running);
3494 
3495 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3496 		fs_info->num_tolerated_disk_barrier_failures =
3497 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3498 	}
3499 
3500 	if (bargs) {
3501 		memset(bargs, 0, sizeof(*bargs));
3502 		update_ioctl_balance_args(fs_info, 0, bargs);
3503 	}
3504 
3505 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3506 	    balance_need_close(fs_info)) {
3507 		__cancel_balance(fs_info);
3508 	}
3509 
3510 	wake_up(&fs_info->balance_wait_q);
3511 
3512 	return ret;
3513 out:
3514 	if (bctl->flags & BTRFS_BALANCE_RESUME)
3515 		__cancel_balance(fs_info);
3516 	else {
3517 		kfree(bctl);
3518 		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3519 	}
3520 	return ret;
3521 }
3522 
3523 static int balance_kthread(void *data)
3524 {
3525 	struct btrfs_fs_info *fs_info = data;
3526 	int ret = 0;
3527 
3528 	mutex_lock(&fs_info->volume_mutex);
3529 	mutex_lock(&fs_info->balance_mutex);
3530 
3531 	if (fs_info->balance_ctl) {
3532 		btrfs_info(fs_info, "continuing balance");
3533 		ret = btrfs_balance(fs_info->balance_ctl, NULL);
3534 	}
3535 
3536 	mutex_unlock(&fs_info->balance_mutex);
3537 	mutex_unlock(&fs_info->volume_mutex);
3538 
3539 	return ret;
3540 }
3541 
3542 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3543 {
3544 	struct task_struct *tsk;
3545 
3546 	spin_lock(&fs_info->balance_lock);
3547 	if (!fs_info->balance_ctl) {
3548 		spin_unlock(&fs_info->balance_lock);
3549 		return 0;
3550 	}
3551 	spin_unlock(&fs_info->balance_lock);
3552 
3553 	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3554 		btrfs_info(fs_info, "force skipping balance");
3555 		return 0;
3556 	}
3557 
3558 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3559 	return PTR_ERR_OR_ZERO(tsk);
3560 }
3561 
3562 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3563 {
3564 	struct btrfs_balance_control *bctl;
3565 	struct btrfs_balance_item *item;
3566 	struct btrfs_disk_balance_args disk_bargs;
3567 	struct btrfs_path *path;
3568 	struct extent_buffer *leaf;
3569 	struct btrfs_key key;
3570 	int ret;
3571 
3572 	path = btrfs_alloc_path();
3573 	if (!path)
3574 		return -ENOMEM;
3575 
3576 	key.objectid = BTRFS_BALANCE_OBJECTID;
3577 	key.type = BTRFS_BALANCE_ITEM_KEY;
3578 	key.offset = 0;
3579 
3580 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3581 	if (ret < 0)
3582 		goto out;
3583 	if (ret > 0) { /* ret = -ENOENT; */
3584 		ret = 0;
3585 		goto out;
3586 	}
3587 
3588 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3589 	if (!bctl) {
3590 		ret = -ENOMEM;
3591 		goto out;
3592 	}
3593 
3594 	leaf = path->nodes[0];
3595 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3596 
3597 	bctl->fs_info = fs_info;
3598 	bctl->flags = btrfs_balance_flags(leaf, item);
3599 	bctl->flags |= BTRFS_BALANCE_RESUME;
3600 
3601 	btrfs_balance_data(leaf, item, &disk_bargs);
3602 	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3603 	btrfs_balance_meta(leaf, item, &disk_bargs);
3604 	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3605 	btrfs_balance_sys(leaf, item, &disk_bargs);
3606 	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3607 
3608 	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3609 
3610 	mutex_lock(&fs_info->volume_mutex);
3611 	mutex_lock(&fs_info->balance_mutex);
3612 
3613 	set_balance_control(bctl);
3614 
3615 	mutex_unlock(&fs_info->balance_mutex);
3616 	mutex_unlock(&fs_info->volume_mutex);
3617 out:
3618 	btrfs_free_path(path);
3619 	return ret;
3620 }
3621 
3622 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3623 {
3624 	int ret = 0;
3625 
3626 	mutex_lock(&fs_info->balance_mutex);
3627 	if (!fs_info->balance_ctl) {
3628 		mutex_unlock(&fs_info->balance_mutex);
3629 		return -ENOTCONN;
3630 	}
3631 
3632 	if (atomic_read(&fs_info->balance_running)) {
3633 		atomic_inc(&fs_info->balance_pause_req);
3634 		mutex_unlock(&fs_info->balance_mutex);
3635 
3636 		wait_event(fs_info->balance_wait_q,
3637 			   atomic_read(&fs_info->balance_running) == 0);
3638 
3639 		mutex_lock(&fs_info->balance_mutex);
3640 		/* we are good with balance_ctl ripped off from under us */
3641 		BUG_ON(atomic_read(&fs_info->balance_running));
3642 		atomic_dec(&fs_info->balance_pause_req);
3643 	} else {
3644 		ret = -ENOTCONN;
3645 	}
3646 
3647 	mutex_unlock(&fs_info->balance_mutex);
3648 	return ret;
3649 }
3650 
3651 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3652 {
3653 	if (fs_info->sb->s_flags & MS_RDONLY)
3654 		return -EROFS;
3655 
3656 	mutex_lock(&fs_info->balance_mutex);
3657 	if (!fs_info->balance_ctl) {
3658 		mutex_unlock(&fs_info->balance_mutex);
3659 		return -ENOTCONN;
3660 	}
3661 
3662 	atomic_inc(&fs_info->balance_cancel_req);
3663 	/*
3664 	 * if we are running just wait and return, balance item is
3665 	 * deleted in btrfs_balance in this case
3666 	 */
3667 	if (atomic_read(&fs_info->balance_running)) {
3668 		mutex_unlock(&fs_info->balance_mutex);
3669 		wait_event(fs_info->balance_wait_q,
3670 			   atomic_read(&fs_info->balance_running) == 0);
3671 		mutex_lock(&fs_info->balance_mutex);
3672 	} else {
3673 		/* __cancel_balance needs volume_mutex */
3674 		mutex_unlock(&fs_info->balance_mutex);
3675 		mutex_lock(&fs_info->volume_mutex);
3676 		mutex_lock(&fs_info->balance_mutex);
3677 
3678 		if (fs_info->balance_ctl)
3679 			__cancel_balance(fs_info);
3680 
3681 		mutex_unlock(&fs_info->volume_mutex);
3682 	}
3683 
3684 	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3685 	atomic_dec(&fs_info->balance_cancel_req);
3686 	mutex_unlock(&fs_info->balance_mutex);
3687 	return 0;
3688 }
3689 
3690 static int btrfs_uuid_scan_kthread(void *data)
3691 {
3692 	struct btrfs_fs_info *fs_info = data;
3693 	struct btrfs_root *root = fs_info->tree_root;
3694 	struct btrfs_key key;
3695 	struct btrfs_key max_key;
3696 	struct btrfs_path *path = NULL;
3697 	int ret = 0;
3698 	struct extent_buffer *eb;
3699 	int slot;
3700 	struct btrfs_root_item root_item;
3701 	u32 item_size;
3702 	struct btrfs_trans_handle *trans = NULL;
3703 
3704 	path = btrfs_alloc_path();
3705 	if (!path) {
3706 		ret = -ENOMEM;
3707 		goto out;
3708 	}
3709 
3710 	key.objectid = 0;
3711 	key.type = BTRFS_ROOT_ITEM_KEY;
3712 	key.offset = 0;
3713 
3714 	max_key.objectid = (u64)-1;
3715 	max_key.type = BTRFS_ROOT_ITEM_KEY;
3716 	max_key.offset = (u64)-1;
3717 
3718 	while (1) {
3719 		ret = btrfs_search_forward(root, &key, path, 0);
3720 		if (ret) {
3721 			if (ret > 0)
3722 				ret = 0;
3723 			break;
3724 		}
3725 
3726 		if (key.type != BTRFS_ROOT_ITEM_KEY ||
3727 		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
3728 		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
3729 		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
3730 			goto skip;
3731 
3732 		eb = path->nodes[0];
3733 		slot = path->slots[0];
3734 		item_size = btrfs_item_size_nr(eb, slot);
3735 		if (item_size < sizeof(root_item))
3736 			goto skip;
3737 
3738 		read_extent_buffer(eb, &root_item,
3739 				   btrfs_item_ptr_offset(eb, slot),
3740 				   (int)sizeof(root_item));
3741 		if (btrfs_root_refs(&root_item) == 0)
3742 			goto skip;
3743 
3744 		if (!btrfs_is_empty_uuid(root_item.uuid) ||
3745 		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
3746 			if (trans)
3747 				goto update_tree;
3748 
3749 			btrfs_release_path(path);
3750 			/*
3751 			 * 1 - subvol uuid item
3752 			 * 1 - received_subvol uuid item
3753 			 */
3754 			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
3755 			if (IS_ERR(trans)) {
3756 				ret = PTR_ERR(trans);
3757 				break;
3758 			}
3759 			continue;
3760 		} else {
3761 			goto skip;
3762 		}
3763 update_tree:
3764 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
3765 			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3766 						  root_item.uuid,
3767 						  BTRFS_UUID_KEY_SUBVOL,
3768 						  key.objectid);
3769 			if (ret < 0) {
3770 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
3771 					ret);
3772 				break;
3773 			}
3774 		}
3775 
3776 		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
3777 			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3778 						  root_item.received_uuid,
3779 						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3780 						  key.objectid);
3781 			if (ret < 0) {
3782 				btrfs_warn(fs_info, "uuid_tree_add failed %d",
3783 					ret);
3784 				break;
3785 			}
3786 		}
3787 
3788 skip:
3789 		if (trans) {
3790 			ret = btrfs_end_transaction(trans, fs_info->uuid_root);
3791 			trans = NULL;
3792 			if (ret)
3793 				break;
3794 		}
3795 
3796 		btrfs_release_path(path);
3797 		if (key.offset < (u64)-1) {
3798 			key.offset++;
3799 		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
3800 			key.offset = 0;
3801 			key.type = BTRFS_ROOT_ITEM_KEY;
3802 		} else if (key.objectid < (u64)-1) {
3803 			key.offset = 0;
3804 			key.type = BTRFS_ROOT_ITEM_KEY;
3805 			key.objectid++;
3806 		} else {
3807 			break;
3808 		}
3809 		cond_resched();
3810 	}
3811 
3812 out:
3813 	btrfs_free_path(path);
3814 	if (trans && !IS_ERR(trans))
3815 		btrfs_end_transaction(trans, fs_info->uuid_root);
3816 	if (ret)
3817 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3818 	else
3819 		fs_info->update_uuid_tree_gen = 1;
3820 	up(&fs_info->uuid_tree_rescan_sem);
3821 	return 0;
3822 }
3823 
3824 /*
3825  * Callback for btrfs_uuid_tree_iterate().
3826  * returns:
3827  * 0	check succeeded, the entry is not outdated.
3828  * < 0	if an error occured.
3829  * > 0	if the check failed, which means the caller shall remove the entry.
3830  */
3831 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
3832 				       u8 *uuid, u8 type, u64 subid)
3833 {
3834 	struct btrfs_key key;
3835 	int ret = 0;
3836 	struct btrfs_root *subvol_root;
3837 
3838 	if (type != BTRFS_UUID_KEY_SUBVOL &&
3839 	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
3840 		goto out;
3841 
3842 	key.objectid = subid;
3843 	key.type = BTRFS_ROOT_ITEM_KEY;
3844 	key.offset = (u64)-1;
3845 	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
3846 	if (IS_ERR(subvol_root)) {
3847 		ret = PTR_ERR(subvol_root);
3848 		if (ret == -ENOENT)
3849 			ret = 1;
3850 		goto out;
3851 	}
3852 
3853 	switch (type) {
3854 	case BTRFS_UUID_KEY_SUBVOL:
3855 		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
3856 			ret = 1;
3857 		break;
3858 	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
3859 		if (memcmp(uuid, subvol_root->root_item.received_uuid,
3860 			   BTRFS_UUID_SIZE))
3861 			ret = 1;
3862 		break;
3863 	}
3864 
3865 out:
3866 	return ret;
3867 }
3868 
3869 static int btrfs_uuid_rescan_kthread(void *data)
3870 {
3871 	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3872 	int ret;
3873 
3874 	/*
3875 	 * 1st step is to iterate through the existing UUID tree and
3876 	 * to delete all entries that contain outdated data.
3877 	 * 2nd step is to add all missing entries to the UUID tree.
3878 	 */
3879 	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3880 	if (ret < 0) {
3881 		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3882 		up(&fs_info->uuid_tree_rescan_sem);
3883 		return ret;
3884 	}
3885 	return btrfs_uuid_scan_kthread(data);
3886 }
3887 
3888 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3889 {
3890 	struct btrfs_trans_handle *trans;
3891 	struct btrfs_root *tree_root = fs_info->tree_root;
3892 	struct btrfs_root *uuid_root;
3893 	struct task_struct *task;
3894 	int ret;
3895 
3896 	/*
3897 	 * 1 - root node
3898 	 * 1 - root item
3899 	 */
3900 	trans = btrfs_start_transaction(tree_root, 2);
3901 	if (IS_ERR(trans))
3902 		return PTR_ERR(trans);
3903 
3904 	uuid_root = btrfs_create_tree(trans, fs_info,
3905 				      BTRFS_UUID_TREE_OBJECTID);
3906 	if (IS_ERR(uuid_root)) {
3907 		btrfs_abort_transaction(trans, tree_root,
3908 					PTR_ERR(uuid_root));
3909 		return PTR_ERR(uuid_root);
3910 	}
3911 
3912 	fs_info->uuid_root = uuid_root;
3913 
3914 	ret = btrfs_commit_transaction(trans, tree_root);
3915 	if (ret)
3916 		return ret;
3917 
3918 	down(&fs_info->uuid_tree_rescan_sem);
3919 	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3920 	if (IS_ERR(task)) {
3921 		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
3922 		btrfs_warn(fs_info, "failed to start uuid_scan task");
3923 		up(&fs_info->uuid_tree_rescan_sem);
3924 		return PTR_ERR(task);
3925 	}
3926 
3927 	return 0;
3928 }
3929 
3930 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3931 {
3932 	struct task_struct *task;
3933 
3934 	down(&fs_info->uuid_tree_rescan_sem);
3935 	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3936 	if (IS_ERR(task)) {
3937 		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
3938 		btrfs_warn(fs_info, "failed to start uuid_rescan task");
3939 		up(&fs_info->uuid_tree_rescan_sem);
3940 		return PTR_ERR(task);
3941 	}
3942 
3943 	return 0;
3944 }
3945 
3946 /*
3947  * shrinking a device means finding all of the device extents past
3948  * the new size, and then following the back refs to the chunks.
3949  * The chunk relocation code actually frees the device extent
3950  */
3951 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3952 {
3953 	struct btrfs_trans_handle *trans;
3954 	struct btrfs_root *root = device->dev_root;
3955 	struct btrfs_dev_extent *dev_extent = NULL;
3956 	struct btrfs_path *path;
3957 	u64 length;
3958 	u64 chunk_tree;
3959 	u64 chunk_objectid;
3960 	u64 chunk_offset;
3961 	int ret;
3962 	int slot;
3963 	int failed = 0;
3964 	bool retried = false;
3965 	struct extent_buffer *l;
3966 	struct btrfs_key key;
3967 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3968 	u64 old_total = btrfs_super_total_bytes(super_copy);
3969 	u64 old_size = btrfs_device_get_total_bytes(device);
3970 	u64 diff = old_size - new_size;
3971 
3972 	if (device->is_tgtdev_for_dev_replace)
3973 		return -EINVAL;
3974 
3975 	path = btrfs_alloc_path();
3976 	if (!path)
3977 		return -ENOMEM;
3978 
3979 	path->reada = 2;
3980 
3981 	lock_chunks(root);
3982 
3983 	btrfs_device_set_total_bytes(device, new_size);
3984 	if (device->writeable) {
3985 		device->fs_devices->total_rw_bytes -= diff;
3986 		spin_lock(&root->fs_info->free_chunk_lock);
3987 		root->fs_info->free_chunk_space -= diff;
3988 		spin_unlock(&root->fs_info->free_chunk_lock);
3989 	}
3990 	unlock_chunks(root);
3991 
3992 again:
3993 	key.objectid = device->devid;
3994 	key.offset = (u64)-1;
3995 	key.type = BTRFS_DEV_EXTENT_KEY;
3996 
3997 	do {
3998 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3999 		if (ret < 0)
4000 			goto done;
4001 
4002 		ret = btrfs_previous_item(root, path, 0, key.type);
4003 		if (ret < 0)
4004 			goto done;
4005 		if (ret) {
4006 			ret = 0;
4007 			btrfs_release_path(path);
4008 			break;
4009 		}
4010 
4011 		l = path->nodes[0];
4012 		slot = path->slots[0];
4013 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4014 
4015 		if (key.objectid != device->devid) {
4016 			btrfs_release_path(path);
4017 			break;
4018 		}
4019 
4020 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4021 		length = btrfs_dev_extent_length(l, dev_extent);
4022 
4023 		if (key.offset + length <= new_size) {
4024 			btrfs_release_path(path);
4025 			break;
4026 		}
4027 
4028 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
4029 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
4030 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4031 		btrfs_release_path(path);
4032 
4033 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
4034 					   chunk_offset);
4035 		if (ret && ret != -ENOSPC)
4036 			goto done;
4037 		if (ret == -ENOSPC)
4038 			failed++;
4039 	} while (key.offset-- > 0);
4040 
4041 	if (failed && !retried) {
4042 		failed = 0;
4043 		retried = true;
4044 		goto again;
4045 	} else if (failed && retried) {
4046 		ret = -ENOSPC;
4047 		lock_chunks(root);
4048 
4049 		btrfs_device_set_total_bytes(device, old_size);
4050 		if (device->writeable)
4051 			device->fs_devices->total_rw_bytes += diff;
4052 		spin_lock(&root->fs_info->free_chunk_lock);
4053 		root->fs_info->free_chunk_space += diff;
4054 		spin_unlock(&root->fs_info->free_chunk_lock);
4055 		unlock_chunks(root);
4056 		goto done;
4057 	}
4058 
4059 	/* Shrinking succeeded, else we would be at "done". */
4060 	trans = btrfs_start_transaction(root, 0);
4061 	if (IS_ERR(trans)) {
4062 		ret = PTR_ERR(trans);
4063 		goto done;
4064 	}
4065 
4066 	lock_chunks(root);
4067 	btrfs_device_set_disk_total_bytes(device, new_size);
4068 	if (list_empty(&device->resized_list))
4069 		list_add_tail(&device->resized_list,
4070 			      &root->fs_info->fs_devices->resized_devices);
4071 
4072 	WARN_ON(diff > old_total);
4073 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
4074 	unlock_chunks(root);
4075 
4076 	/* Now btrfs_update_device() will change the on-disk size. */
4077 	ret = btrfs_update_device(trans, device);
4078 	btrfs_end_transaction(trans, root);
4079 done:
4080 	btrfs_free_path(path);
4081 	return ret;
4082 }
4083 
4084 static int btrfs_add_system_chunk(struct btrfs_root *root,
4085 			   struct btrfs_key *key,
4086 			   struct btrfs_chunk *chunk, int item_size)
4087 {
4088 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4089 	struct btrfs_disk_key disk_key;
4090 	u32 array_size;
4091 	u8 *ptr;
4092 
4093 	lock_chunks(root);
4094 	array_size = btrfs_super_sys_array_size(super_copy);
4095 	if (array_size + item_size + sizeof(disk_key)
4096 			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4097 		unlock_chunks(root);
4098 		return -EFBIG;
4099 	}
4100 
4101 	ptr = super_copy->sys_chunk_array + array_size;
4102 	btrfs_cpu_key_to_disk(&disk_key, key);
4103 	memcpy(ptr, &disk_key, sizeof(disk_key));
4104 	ptr += sizeof(disk_key);
4105 	memcpy(ptr, chunk, item_size);
4106 	item_size += sizeof(disk_key);
4107 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4108 	unlock_chunks(root);
4109 
4110 	return 0;
4111 }
4112 
4113 /*
4114  * sort the devices in descending order by max_avail, total_avail
4115  */
4116 static int btrfs_cmp_device_info(const void *a, const void *b)
4117 {
4118 	const struct btrfs_device_info *di_a = a;
4119 	const struct btrfs_device_info *di_b = b;
4120 
4121 	if (di_a->max_avail > di_b->max_avail)
4122 		return -1;
4123 	if (di_a->max_avail < di_b->max_avail)
4124 		return 1;
4125 	if (di_a->total_avail > di_b->total_avail)
4126 		return -1;
4127 	if (di_a->total_avail < di_b->total_avail)
4128 		return 1;
4129 	return 0;
4130 }
4131 
4132 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4133 	[BTRFS_RAID_RAID10] = {
4134 		.sub_stripes	= 2,
4135 		.dev_stripes	= 1,
4136 		.devs_max	= 0,	/* 0 == as many as possible */
4137 		.devs_min	= 4,
4138 		.devs_increment	= 2,
4139 		.ncopies	= 2,
4140 	},
4141 	[BTRFS_RAID_RAID1] = {
4142 		.sub_stripes	= 1,
4143 		.dev_stripes	= 1,
4144 		.devs_max	= 2,
4145 		.devs_min	= 2,
4146 		.devs_increment	= 2,
4147 		.ncopies	= 2,
4148 	},
4149 	[BTRFS_RAID_DUP] = {
4150 		.sub_stripes	= 1,
4151 		.dev_stripes	= 2,
4152 		.devs_max	= 1,
4153 		.devs_min	= 1,
4154 		.devs_increment	= 1,
4155 		.ncopies	= 2,
4156 	},
4157 	[BTRFS_RAID_RAID0] = {
4158 		.sub_stripes	= 1,
4159 		.dev_stripes	= 1,
4160 		.devs_max	= 0,
4161 		.devs_min	= 2,
4162 		.devs_increment	= 1,
4163 		.ncopies	= 1,
4164 	},
4165 	[BTRFS_RAID_SINGLE] = {
4166 		.sub_stripes	= 1,
4167 		.dev_stripes	= 1,
4168 		.devs_max	= 1,
4169 		.devs_min	= 1,
4170 		.devs_increment	= 1,
4171 		.ncopies	= 1,
4172 	},
4173 	[BTRFS_RAID_RAID5] = {
4174 		.sub_stripes	= 1,
4175 		.dev_stripes	= 1,
4176 		.devs_max	= 0,
4177 		.devs_min	= 2,
4178 		.devs_increment	= 1,
4179 		.ncopies	= 2,
4180 	},
4181 	[BTRFS_RAID_RAID6] = {
4182 		.sub_stripes	= 1,
4183 		.dev_stripes	= 1,
4184 		.devs_max	= 0,
4185 		.devs_min	= 3,
4186 		.devs_increment	= 1,
4187 		.ncopies	= 3,
4188 	},
4189 };
4190 
4191 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4192 {
4193 	/* TODO allow them to set a preferred stripe size */
4194 	return 64 * 1024;
4195 }
4196 
4197 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4198 {
4199 	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
4200 		return;
4201 
4202 	btrfs_set_fs_incompat(info, RAID56);
4203 }
4204 
4205 #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)		\
4206 			- sizeof(struct btrfs_item)		\
4207 			- sizeof(struct btrfs_chunk))		\
4208 			/ sizeof(struct btrfs_stripe) + 1)
4209 
4210 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
4211 				- 2 * sizeof(struct btrfs_disk_key)	\
4212 				- 2 * sizeof(struct btrfs_chunk))	\
4213 				/ sizeof(struct btrfs_stripe) + 1)
4214 
4215 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4216 			       struct btrfs_root *extent_root, u64 start,
4217 			       u64 type)
4218 {
4219 	struct btrfs_fs_info *info = extent_root->fs_info;
4220 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
4221 	struct list_head *cur;
4222 	struct map_lookup *map = NULL;
4223 	struct extent_map_tree *em_tree;
4224 	struct extent_map *em;
4225 	struct btrfs_device_info *devices_info = NULL;
4226 	u64 total_avail;
4227 	int num_stripes;	/* total number of stripes to allocate */
4228 	int data_stripes;	/* number of stripes that count for
4229 				   block group size */
4230 	int sub_stripes;	/* sub_stripes info for map */
4231 	int dev_stripes;	/* stripes per dev */
4232 	int devs_max;		/* max devs to use */
4233 	int devs_min;		/* min devs needed */
4234 	int devs_increment;	/* ndevs has to be a multiple of this */
4235 	int ncopies;		/* how many copies to data has */
4236 	int ret;
4237 	u64 max_stripe_size;
4238 	u64 max_chunk_size;
4239 	u64 stripe_size;
4240 	u64 num_bytes;
4241 	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
4242 	int ndevs;
4243 	int i;
4244 	int j;
4245 	int index;
4246 
4247 	BUG_ON(!alloc_profile_is_valid(type, 0));
4248 
4249 	if (list_empty(&fs_devices->alloc_list))
4250 		return -ENOSPC;
4251 
4252 	index = __get_raid_index(type);
4253 
4254 	sub_stripes = btrfs_raid_array[index].sub_stripes;
4255 	dev_stripes = btrfs_raid_array[index].dev_stripes;
4256 	devs_max = btrfs_raid_array[index].devs_max;
4257 	devs_min = btrfs_raid_array[index].devs_min;
4258 	devs_increment = btrfs_raid_array[index].devs_increment;
4259 	ncopies = btrfs_raid_array[index].ncopies;
4260 
4261 	if (type & BTRFS_BLOCK_GROUP_DATA) {
4262 		max_stripe_size = 1024 * 1024 * 1024;
4263 		max_chunk_size = 10 * max_stripe_size;
4264 		if (!devs_max)
4265 			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4266 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4267 		/* for larger filesystems, use larger metadata chunks */
4268 		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
4269 			max_stripe_size = 1024 * 1024 * 1024;
4270 		else
4271 			max_stripe_size = 256 * 1024 * 1024;
4272 		max_chunk_size = max_stripe_size;
4273 		if (!devs_max)
4274 			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4275 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4276 		max_stripe_size = 32 * 1024 * 1024;
4277 		max_chunk_size = 2 * max_stripe_size;
4278 		if (!devs_max)
4279 			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4280 	} else {
4281 		btrfs_err(info, "invalid chunk type 0x%llx requested",
4282 		       type);
4283 		BUG_ON(1);
4284 	}
4285 
4286 	/* we don't want a chunk larger than 10% of writeable space */
4287 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4288 			     max_chunk_size);
4289 
4290 	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
4291 			       GFP_NOFS);
4292 	if (!devices_info)
4293 		return -ENOMEM;
4294 
4295 	cur = fs_devices->alloc_list.next;
4296 
4297 	/*
4298 	 * in the first pass through the devices list, we gather information
4299 	 * about the available holes on each device.
4300 	 */
4301 	ndevs = 0;
4302 	while (cur != &fs_devices->alloc_list) {
4303 		struct btrfs_device *device;
4304 		u64 max_avail;
4305 		u64 dev_offset;
4306 
4307 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
4308 
4309 		cur = cur->next;
4310 
4311 		if (!device->writeable) {
4312 			WARN(1, KERN_ERR
4313 			       "BTRFS: read-only device in alloc_list\n");
4314 			continue;
4315 		}
4316 
4317 		if (!device->in_fs_metadata ||
4318 		    device->is_tgtdev_for_dev_replace)
4319 			continue;
4320 
4321 		if (device->total_bytes > device->bytes_used)
4322 			total_avail = device->total_bytes - device->bytes_used;
4323 		else
4324 			total_avail = 0;
4325 
4326 		/* If there is no space on this device, skip it. */
4327 		if (total_avail == 0)
4328 			continue;
4329 
4330 		ret = find_free_dev_extent(trans, device,
4331 					   max_stripe_size * dev_stripes,
4332 					   &dev_offset, &max_avail);
4333 		if (ret && ret != -ENOSPC)
4334 			goto error;
4335 
4336 		if (ret == 0)
4337 			max_avail = max_stripe_size * dev_stripes;
4338 
4339 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4340 			continue;
4341 
4342 		if (ndevs == fs_devices->rw_devices) {
4343 			WARN(1, "%s: found more than %llu devices\n",
4344 			     __func__, fs_devices->rw_devices);
4345 			break;
4346 		}
4347 		devices_info[ndevs].dev_offset = dev_offset;
4348 		devices_info[ndevs].max_avail = max_avail;
4349 		devices_info[ndevs].total_avail = total_avail;
4350 		devices_info[ndevs].dev = device;
4351 		++ndevs;
4352 	}
4353 
4354 	/*
4355 	 * now sort the devices by hole size / available space
4356 	 */
4357 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4358 	     btrfs_cmp_device_info, NULL);
4359 
4360 	/* round down to number of usable stripes */
4361 	ndevs -= ndevs % devs_increment;
4362 
4363 	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4364 		ret = -ENOSPC;
4365 		goto error;
4366 	}
4367 
4368 	if (devs_max && ndevs > devs_max)
4369 		ndevs = devs_max;
4370 	/*
4371 	 * the primary goal is to maximize the number of stripes, so use as many
4372 	 * devices as possible, even if the stripes are not maximum sized.
4373 	 */
4374 	stripe_size = devices_info[ndevs-1].max_avail;
4375 	num_stripes = ndevs * dev_stripes;
4376 
4377 	/*
4378 	 * this will have to be fixed for RAID1 and RAID10 over
4379 	 * more drives
4380 	 */
4381 	data_stripes = num_stripes / ncopies;
4382 
4383 	if (type & BTRFS_BLOCK_GROUP_RAID5) {
4384 		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
4385 				 btrfs_super_stripesize(info->super_copy));
4386 		data_stripes = num_stripes - 1;
4387 	}
4388 	if (type & BTRFS_BLOCK_GROUP_RAID6) {
4389 		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
4390 				 btrfs_super_stripesize(info->super_copy));
4391 		data_stripes = num_stripes - 2;
4392 	}
4393 
4394 	/*
4395 	 * Use the number of data stripes to figure out how big this chunk
4396 	 * is really going to be in terms of logical address space,
4397 	 * and compare that answer with the max chunk size
4398 	 */
4399 	if (stripe_size * data_stripes > max_chunk_size) {
4400 		u64 mask = (1ULL << 24) - 1;
4401 		stripe_size = max_chunk_size;
4402 		do_div(stripe_size, data_stripes);
4403 
4404 		/* bump the answer up to a 16MB boundary */
4405 		stripe_size = (stripe_size + mask) & ~mask;
4406 
4407 		/* but don't go higher than the limits we found
4408 		 * while searching for free extents
4409 		 */
4410 		if (stripe_size > devices_info[ndevs-1].max_avail)
4411 			stripe_size = devices_info[ndevs-1].max_avail;
4412 	}
4413 
4414 	do_div(stripe_size, dev_stripes);
4415 
4416 	/* align to BTRFS_STRIPE_LEN */
4417 	do_div(stripe_size, raid_stripe_len);
4418 	stripe_size *= raid_stripe_len;
4419 
4420 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4421 	if (!map) {
4422 		ret = -ENOMEM;
4423 		goto error;
4424 	}
4425 	map->num_stripes = num_stripes;
4426 
4427 	for (i = 0; i < ndevs; ++i) {
4428 		for (j = 0; j < dev_stripes; ++j) {
4429 			int s = i * dev_stripes + j;
4430 			map->stripes[s].dev = devices_info[i].dev;
4431 			map->stripes[s].physical = devices_info[i].dev_offset +
4432 						   j * stripe_size;
4433 		}
4434 	}
4435 	map->sector_size = extent_root->sectorsize;
4436 	map->stripe_len = raid_stripe_len;
4437 	map->io_align = raid_stripe_len;
4438 	map->io_width = raid_stripe_len;
4439 	map->type = type;
4440 	map->sub_stripes = sub_stripes;
4441 
4442 	num_bytes = stripe_size * data_stripes;
4443 
4444 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
4445 
4446 	em = alloc_extent_map();
4447 	if (!em) {
4448 		kfree(map);
4449 		ret = -ENOMEM;
4450 		goto error;
4451 	}
4452 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4453 	em->bdev = (struct block_device *)map;
4454 	em->start = start;
4455 	em->len = num_bytes;
4456 	em->block_start = 0;
4457 	em->block_len = em->len;
4458 	em->orig_block_len = stripe_size;
4459 
4460 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4461 	write_lock(&em_tree->lock);
4462 	ret = add_extent_mapping(em_tree, em, 0);
4463 	if (!ret) {
4464 		list_add_tail(&em->list, &trans->transaction->pending_chunks);
4465 		atomic_inc(&em->refs);
4466 	}
4467 	write_unlock(&em_tree->lock);
4468 	if (ret) {
4469 		free_extent_map(em);
4470 		goto error;
4471 	}
4472 
4473 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
4474 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4475 				     start, num_bytes);
4476 	if (ret)
4477 		goto error_del_extent;
4478 
4479 	for (i = 0; i < map->num_stripes; i++) {
4480 		num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4481 		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4482 	}
4483 
4484 	spin_lock(&extent_root->fs_info->free_chunk_lock);
4485 	extent_root->fs_info->free_chunk_space -= (stripe_size *
4486 						   map->num_stripes);
4487 	spin_unlock(&extent_root->fs_info->free_chunk_lock);
4488 
4489 	free_extent_map(em);
4490 	check_raid56_incompat_flag(extent_root->fs_info, type);
4491 
4492 	kfree(devices_info);
4493 	return 0;
4494 
4495 error_del_extent:
4496 	write_lock(&em_tree->lock);
4497 	remove_extent_mapping(em_tree, em);
4498 	write_unlock(&em_tree->lock);
4499 
4500 	/* One for our allocation */
4501 	free_extent_map(em);
4502 	/* One for the tree reference */
4503 	free_extent_map(em);
4504 	/* One for the pending_chunks list reference */
4505 	free_extent_map(em);
4506 error:
4507 	kfree(devices_info);
4508 	return ret;
4509 }
4510 
4511 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4512 				struct btrfs_root *extent_root,
4513 				u64 chunk_offset, u64 chunk_size)
4514 {
4515 	struct btrfs_key key;
4516 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4517 	struct btrfs_device *device;
4518 	struct btrfs_chunk *chunk;
4519 	struct btrfs_stripe *stripe;
4520 	struct extent_map_tree *em_tree;
4521 	struct extent_map *em;
4522 	struct map_lookup *map;
4523 	size_t item_size;
4524 	u64 dev_offset;
4525 	u64 stripe_size;
4526 	int i = 0;
4527 	int ret;
4528 
4529 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4530 	read_lock(&em_tree->lock);
4531 	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
4532 	read_unlock(&em_tree->lock);
4533 
4534 	if (!em) {
4535 		btrfs_crit(extent_root->fs_info, "unable to find logical "
4536 			   "%Lu len %Lu", chunk_offset, chunk_size);
4537 		return -EINVAL;
4538 	}
4539 
4540 	if (em->start != chunk_offset || em->len != chunk_size) {
4541 		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4542 			  " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4543 			  chunk_size, em->start, em->len);
4544 		free_extent_map(em);
4545 		return -EINVAL;
4546 	}
4547 
4548 	map = (struct map_lookup *)em->bdev;
4549 	item_size = btrfs_chunk_item_size(map->num_stripes);
4550 	stripe_size = em->orig_block_len;
4551 
4552 	chunk = kzalloc(item_size, GFP_NOFS);
4553 	if (!chunk) {
4554 		ret = -ENOMEM;
4555 		goto out;
4556 	}
4557 
4558 	for (i = 0; i < map->num_stripes; i++) {
4559 		device = map->stripes[i].dev;
4560 		dev_offset = map->stripes[i].physical;
4561 
4562 		ret = btrfs_update_device(trans, device);
4563 		if (ret)
4564 			goto out;
4565 		ret = btrfs_alloc_dev_extent(trans, device,
4566 					     chunk_root->root_key.objectid,
4567 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4568 					     chunk_offset, dev_offset,
4569 					     stripe_size);
4570 		if (ret)
4571 			goto out;
4572 	}
4573 
4574 	stripe = &chunk->stripe;
4575 	for (i = 0; i < map->num_stripes; i++) {
4576 		device = map->stripes[i].dev;
4577 		dev_offset = map->stripes[i].physical;
4578 
4579 		btrfs_set_stack_stripe_devid(stripe, device->devid);
4580 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
4581 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4582 		stripe++;
4583 	}
4584 
4585 	btrfs_set_stack_chunk_length(chunk, chunk_size);
4586 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4587 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4588 	btrfs_set_stack_chunk_type(chunk, map->type);
4589 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4590 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4591 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4592 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4593 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4594 
4595 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4596 	key.type = BTRFS_CHUNK_ITEM_KEY;
4597 	key.offset = chunk_offset;
4598 
4599 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4600 	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4601 		/*
4602 		 * TODO: Cleanup of inserted chunk root in case of
4603 		 * failure.
4604 		 */
4605 		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4606 					     item_size);
4607 	}
4608 
4609 out:
4610 	kfree(chunk);
4611 	free_extent_map(em);
4612 	return ret;
4613 }
4614 
4615 /*
4616  * Chunk allocation falls into two parts. The first part does works
4617  * that make the new allocated chunk useable, but not do any operation
4618  * that modifies the chunk tree. The second part does the works that
4619  * require modifying the chunk tree. This division is important for the
4620  * bootstrap process of adding storage to a seed btrfs.
4621  */
4622 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4623 		      struct btrfs_root *extent_root, u64 type)
4624 {
4625 	u64 chunk_offset;
4626 
4627 	chunk_offset = find_next_chunk(extent_root->fs_info);
4628 	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4629 }
4630 
4631 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4632 					 struct btrfs_root *root,
4633 					 struct btrfs_device *device)
4634 {
4635 	u64 chunk_offset;
4636 	u64 sys_chunk_offset;
4637 	u64 alloc_profile;
4638 	struct btrfs_fs_info *fs_info = root->fs_info;
4639 	struct btrfs_root *extent_root = fs_info->extent_root;
4640 	int ret;
4641 
4642 	chunk_offset = find_next_chunk(fs_info);
4643 	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4644 	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4645 				  alloc_profile);
4646 	if (ret)
4647 		return ret;
4648 
4649 	sys_chunk_offset = find_next_chunk(root->fs_info);
4650 	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4651 	ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4652 				  alloc_profile);
4653 	return ret;
4654 }
4655 
4656 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4657 {
4658 	int max_errors;
4659 
4660 	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4661 			 BTRFS_BLOCK_GROUP_RAID10 |
4662 			 BTRFS_BLOCK_GROUP_RAID5 |
4663 			 BTRFS_BLOCK_GROUP_DUP)) {
4664 		max_errors = 1;
4665 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4666 		max_errors = 2;
4667 	} else {
4668 		max_errors = 0;
4669 	}
4670 
4671 	return max_errors;
4672 }
4673 
4674 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4675 {
4676 	struct extent_map *em;
4677 	struct map_lookup *map;
4678 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4679 	int readonly = 0;
4680 	int miss_ndevs = 0;
4681 	int i;
4682 
4683 	read_lock(&map_tree->map_tree.lock);
4684 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4685 	read_unlock(&map_tree->map_tree.lock);
4686 	if (!em)
4687 		return 1;
4688 
4689 	map = (struct map_lookup *)em->bdev;
4690 	for (i = 0; i < map->num_stripes; i++) {
4691 		if (map->stripes[i].dev->missing) {
4692 			miss_ndevs++;
4693 			continue;
4694 		}
4695 
4696 		if (!map->stripes[i].dev->writeable) {
4697 			readonly = 1;
4698 			goto end;
4699 		}
4700 	}
4701 
4702 	/*
4703 	 * If the number of missing devices is larger than max errors,
4704 	 * we can not write the data into that chunk successfully, so
4705 	 * set it readonly.
4706 	 */
4707 	if (miss_ndevs > btrfs_chunk_max_errors(map))
4708 		readonly = 1;
4709 end:
4710 	free_extent_map(em);
4711 	return readonly;
4712 }
4713 
4714 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4715 {
4716 	extent_map_tree_init(&tree->map_tree);
4717 }
4718 
4719 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4720 {
4721 	struct extent_map *em;
4722 
4723 	while (1) {
4724 		write_lock(&tree->map_tree.lock);
4725 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4726 		if (em)
4727 			remove_extent_mapping(&tree->map_tree, em);
4728 		write_unlock(&tree->map_tree.lock);
4729 		if (!em)
4730 			break;
4731 		/* once for us */
4732 		free_extent_map(em);
4733 		/* once for the tree */
4734 		free_extent_map(em);
4735 	}
4736 }
4737 
4738 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4739 {
4740 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4741 	struct extent_map *em;
4742 	struct map_lookup *map;
4743 	struct extent_map_tree *em_tree = &map_tree->map_tree;
4744 	int ret;
4745 
4746 	read_lock(&em_tree->lock);
4747 	em = lookup_extent_mapping(em_tree, logical, len);
4748 	read_unlock(&em_tree->lock);
4749 
4750 	/*
4751 	 * We could return errors for these cases, but that could get ugly and
4752 	 * we'd probably do the same thing which is just not do anything else
4753 	 * and exit, so return 1 so the callers don't try to use other copies.
4754 	 */
4755 	if (!em) {
4756 		btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4757 			    logical+len);
4758 		return 1;
4759 	}
4760 
4761 	if (em->start > logical || em->start + em->len < logical) {
4762 		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4763 			    "%Lu-%Lu", logical, logical+len, em->start,
4764 			    em->start + em->len);
4765 		free_extent_map(em);
4766 		return 1;
4767 	}
4768 
4769 	map = (struct map_lookup *)em->bdev;
4770 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4771 		ret = map->num_stripes;
4772 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4773 		ret = map->sub_stripes;
4774 	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4775 		ret = 2;
4776 	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4777 		ret = 3;
4778 	else
4779 		ret = 1;
4780 	free_extent_map(em);
4781 
4782 	btrfs_dev_replace_lock(&fs_info->dev_replace);
4783 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4784 		ret++;
4785 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
4786 
4787 	return ret;
4788 }
4789 
4790 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4791 				    struct btrfs_mapping_tree *map_tree,
4792 				    u64 logical)
4793 {
4794 	struct extent_map *em;
4795 	struct map_lookup *map;
4796 	struct extent_map_tree *em_tree = &map_tree->map_tree;
4797 	unsigned long len = root->sectorsize;
4798 
4799 	read_lock(&em_tree->lock);
4800 	em = lookup_extent_mapping(em_tree, logical, len);
4801 	read_unlock(&em_tree->lock);
4802 	BUG_ON(!em);
4803 
4804 	BUG_ON(em->start > logical || em->start + em->len < logical);
4805 	map = (struct map_lookup *)em->bdev;
4806 	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4807 			 BTRFS_BLOCK_GROUP_RAID6)) {
4808 		len = map->stripe_len * nr_data_stripes(map);
4809 	}
4810 	free_extent_map(em);
4811 	return len;
4812 }
4813 
4814 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4815 			   u64 logical, u64 len, int mirror_num)
4816 {
4817 	struct extent_map *em;
4818 	struct map_lookup *map;
4819 	struct extent_map_tree *em_tree = &map_tree->map_tree;
4820 	int ret = 0;
4821 
4822 	read_lock(&em_tree->lock);
4823 	em = lookup_extent_mapping(em_tree, logical, len);
4824 	read_unlock(&em_tree->lock);
4825 	BUG_ON(!em);
4826 
4827 	BUG_ON(em->start > logical || em->start + em->len < logical);
4828 	map = (struct map_lookup *)em->bdev;
4829 	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4830 			 BTRFS_BLOCK_GROUP_RAID6))
4831 		ret = 1;
4832 	free_extent_map(em);
4833 	return ret;
4834 }
4835 
4836 static int find_live_mirror(struct btrfs_fs_info *fs_info,
4837 			    struct map_lookup *map, int first, int num,
4838 			    int optimal, int dev_replace_is_ongoing)
4839 {
4840 	int i;
4841 	int tolerance;
4842 	struct btrfs_device *srcdev;
4843 
4844 	if (dev_replace_is_ongoing &&
4845 	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4846 	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4847 		srcdev = fs_info->dev_replace.srcdev;
4848 	else
4849 		srcdev = NULL;
4850 
4851 	/*
4852 	 * try to avoid the drive that is the source drive for a
4853 	 * dev-replace procedure, only choose it if no other non-missing
4854 	 * mirror is available
4855 	 */
4856 	for (tolerance = 0; tolerance < 2; tolerance++) {
4857 		if (map->stripes[optimal].dev->bdev &&
4858 		    (tolerance || map->stripes[optimal].dev != srcdev))
4859 			return optimal;
4860 		for (i = first; i < first + num; i++) {
4861 			if (map->stripes[i].dev->bdev &&
4862 			    (tolerance || map->stripes[i].dev != srcdev))
4863 				return i;
4864 		}
4865 	}
4866 
4867 	/* we couldn't find one that doesn't fail.  Just return something
4868 	 * and the io error handling code will clean up eventually
4869 	 */
4870 	return optimal;
4871 }
4872 
4873 static inline int parity_smaller(u64 a, u64 b)
4874 {
4875 	return a > b;
4876 }
4877 
4878 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4879 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4880 {
4881 	struct btrfs_bio_stripe s;
4882 	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4883 	int i;
4884 	u64 l;
4885 	int again = 1;
4886 	int m;
4887 
4888 	while (again) {
4889 		again = 0;
4890 		for (i = 0; i < real_stripes - 1; i++) {
4891 			if (parity_smaller(raid_map[i], raid_map[i+1])) {
4892 				s = bbio->stripes[i];
4893 				l = raid_map[i];
4894 				bbio->stripes[i] = bbio->stripes[i+1];
4895 				raid_map[i] = raid_map[i+1];
4896 				bbio->stripes[i+1] = s;
4897 				raid_map[i+1] = l;
4898 
4899 				if (bbio->tgtdev_map) {
4900 					m = bbio->tgtdev_map[i];
4901 					bbio->tgtdev_map[i] =
4902 							bbio->tgtdev_map[i + 1];
4903 					bbio->tgtdev_map[i + 1] = m;
4904 				}
4905 
4906 				again = 1;
4907 			}
4908 		}
4909 	}
4910 }
4911 
4912 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4913 			     u64 logical, u64 *length,
4914 			     struct btrfs_bio **bbio_ret,
4915 			     int mirror_num, u64 **raid_map_ret)
4916 {
4917 	struct extent_map *em;
4918 	struct map_lookup *map;
4919 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4920 	struct extent_map_tree *em_tree = &map_tree->map_tree;
4921 	u64 offset;
4922 	u64 stripe_offset;
4923 	u64 stripe_end_offset;
4924 	u64 stripe_nr;
4925 	u64 stripe_nr_orig;
4926 	u64 stripe_nr_end;
4927 	u64 stripe_len;
4928 	u64 *raid_map = NULL;
4929 	int stripe_index;
4930 	int i;
4931 	int ret = 0;
4932 	int num_stripes;
4933 	int max_errors = 0;
4934 	int tgtdev_indexes = 0;
4935 	struct btrfs_bio *bbio = NULL;
4936 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4937 	int dev_replace_is_ongoing = 0;
4938 	int num_alloc_stripes;
4939 	int patch_the_first_stripe_for_dev_replace = 0;
4940 	u64 physical_to_patch_in_first_stripe = 0;
4941 	u64 raid56_full_stripe_start = (u64)-1;
4942 
4943 	read_lock(&em_tree->lock);
4944 	em = lookup_extent_mapping(em_tree, logical, *length);
4945 	read_unlock(&em_tree->lock);
4946 
4947 	if (!em) {
4948 		btrfs_crit(fs_info, "unable to find logical %llu len %llu",
4949 			logical, *length);
4950 		return -EINVAL;
4951 	}
4952 
4953 	if (em->start > logical || em->start + em->len < logical) {
4954 		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4955 			   "found %Lu-%Lu", logical, em->start,
4956 			   em->start + em->len);
4957 		free_extent_map(em);
4958 		return -EINVAL;
4959 	}
4960 
4961 	map = (struct map_lookup *)em->bdev;
4962 	offset = logical - em->start;
4963 
4964 	stripe_len = map->stripe_len;
4965 	stripe_nr = offset;
4966 	/*
4967 	 * stripe_nr counts the total number of stripes we have to stride
4968 	 * to get to this block
4969 	 */
4970 	do_div(stripe_nr, stripe_len);
4971 
4972 	stripe_offset = stripe_nr * stripe_len;
4973 	BUG_ON(offset < stripe_offset);
4974 
4975 	/* stripe_offset is the offset of this block in its stripe*/
4976 	stripe_offset = offset - stripe_offset;
4977 
4978 	/* if we're here for raid56, we need to know the stripe aligned start */
4979 	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4980 		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4981 		raid56_full_stripe_start = offset;
4982 
4983 		/* allow a write of a full stripe, but make sure we don't
4984 		 * allow straddling of stripes
4985 		 */
4986 		do_div(raid56_full_stripe_start, full_stripe_len);
4987 		raid56_full_stripe_start *= full_stripe_len;
4988 	}
4989 
4990 	if (rw & REQ_DISCARD) {
4991 		/* we don't discard raid56 yet */
4992 		if (map->type &
4993 		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4994 			ret = -EOPNOTSUPP;
4995 			goto out;
4996 		}
4997 		*length = min_t(u64, em->len - offset, *length);
4998 	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4999 		u64 max_len;
5000 		/* For writes to RAID[56], allow a full stripeset across all disks.
5001 		   For other RAID types and for RAID[56] reads, just allow a single
5002 		   stripe (on a single disk). */
5003 		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
5004 		    (rw & REQ_WRITE)) {
5005 			max_len = stripe_len * nr_data_stripes(map) -
5006 				(offset - raid56_full_stripe_start);
5007 		} else {
5008 			/* we limit the length of each bio to what fits in a stripe */
5009 			max_len = stripe_len - stripe_offset;
5010 		}
5011 		*length = min_t(u64, em->len - offset, max_len);
5012 	} else {
5013 		*length = em->len - offset;
5014 	}
5015 
5016 	/* This is for when we're called from btrfs_merge_bio_hook() and all
5017 	   it cares about is the length */
5018 	if (!bbio_ret)
5019 		goto out;
5020 
5021 	btrfs_dev_replace_lock(dev_replace);
5022 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5023 	if (!dev_replace_is_ongoing)
5024 		btrfs_dev_replace_unlock(dev_replace);
5025 
5026 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5027 	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
5028 	    dev_replace->tgtdev != NULL) {
5029 		/*
5030 		 * in dev-replace case, for repair case (that's the only
5031 		 * case where the mirror is selected explicitly when
5032 		 * calling btrfs_map_block), blocks left of the left cursor
5033 		 * can also be read from the target drive.
5034 		 * For REQ_GET_READ_MIRRORS, the target drive is added as
5035 		 * the last one to the array of stripes. For READ, it also
5036 		 * needs to be supported using the same mirror number.
5037 		 * If the requested block is not left of the left cursor,
5038 		 * EIO is returned. This can happen because btrfs_num_copies()
5039 		 * returns one more in the dev-replace case.
5040 		 */
5041 		u64 tmp_length = *length;
5042 		struct btrfs_bio *tmp_bbio = NULL;
5043 		int tmp_num_stripes;
5044 		u64 srcdev_devid = dev_replace->srcdev->devid;
5045 		int index_srcdev = 0;
5046 		int found = 0;
5047 		u64 physical_of_found = 0;
5048 
5049 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
5050 			     logical, &tmp_length, &tmp_bbio, 0, NULL);
5051 		if (ret) {
5052 			WARN_ON(tmp_bbio != NULL);
5053 			goto out;
5054 		}
5055 
5056 		tmp_num_stripes = tmp_bbio->num_stripes;
5057 		if (mirror_num > tmp_num_stripes) {
5058 			/*
5059 			 * REQ_GET_READ_MIRRORS does not contain this
5060 			 * mirror, that means that the requested area
5061 			 * is not left of the left cursor
5062 			 */
5063 			ret = -EIO;
5064 			kfree(tmp_bbio);
5065 			goto out;
5066 		}
5067 
5068 		/*
5069 		 * process the rest of the function using the mirror_num
5070 		 * of the source drive. Therefore look it up first.
5071 		 * At the end, patch the device pointer to the one of the
5072 		 * target drive.
5073 		 */
5074 		for (i = 0; i < tmp_num_stripes; i++) {
5075 			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
5076 				/*
5077 				 * In case of DUP, in order to keep it
5078 				 * simple, only add the mirror with the
5079 				 * lowest physical address
5080 				 */
5081 				if (found &&
5082 				    physical_of_found <=
5083 				     tmp_bbio->stripes[i].physical)
5084 					continue;
5085 				index_srcdev = i;
5086 				found = 1;
5087 				physical_of_found =
5088 					tmp_bbio->stripes[i].physical;
5089 			}
5090 		}
5091 
5092 		if (found) {
5093 			mirror_num = index_srcdev + 1;
5094 			patch_the_first_stripe_for_dev_replace = 1;
5095 			physical_to_patch_in_first_stripe = physical_of_found;
5096 		} else {
5097 			WARN_ON(1);
5098 			ret = -EIO;
5099 			kfree(tmp_bbio);
5100 			goto out;
5101 		}
5102 
5103 		kfree(tmp_bbio);
5104 	} else if (mirror_num > map->num_stripes) {
5105 		mirror_num = 0;
5106 	}
5107 
5108 	num_stripes = 1;
5109 	stripe_index = 0;
5110 	stripe_nr_orig = stripe_nr;
5111 	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
5112 	do_div(stripe_nr_end, map->stripe_len);
5113 	stripe_end_offset = stripe_nr_end * map->stripe_len -
5114 			    (offset + *length);
5115 
5116 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5117 		if (rw & REQ_DISCARD)
5118 			num_stripes = min_t(u64, map->num_stripes,
5119 					    stripe_nr_end - stripe_nr_orig);
5120 		stripe_index = do_div(stripe_nr, map->num_stripes);
5121 		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5122 			mirror_num = 1;
5123 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5124 		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5125 			num_stripes = map->num_stripes;
5126 		else if (mirror_num)
5127 			stripe_index = mirror_num - 1;
5128 		else {
5129 			stripe_index = find_live_mirror(fs_info, map, 0,
5130 					    map->num_stripes,
5131 					    current->pid % map->num_stripes,
5132 					    dev_replace_is_ongoing);
5133 			mirror_num = stripe_index + 1;
5134 		}
5135 
5136 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5137 		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
5138 			num_stripes = map->num_stripes;
5139 		} else if (mirror_num) {
5140 			stripe_index = mirror_num - 1;
5141 		} else {
5142 			mirror_num = 1;
5143 		}
5144 
5145 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5146 		int factor = map->num_stripes / map->sub_stripes;
5147 
5148 		stripe_index = do_div(stripe_nr, factor);
5149 		stripe_index *= map->sub_stripes;
5150 
5151 		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5152 			num_stripes = map->sub_stripes;
5153 		else if (rw & REQ_DISCARD)
5154 			num_stripes = min_t(u64, map->sub_stripes *
5155 					    (stripe_nr_end - stripe_nr_orig),
5156 					    map->num_stripes);
5157 		else if (mirror_num)
5158 			stripe_index += mirror_num - 1;
5159 		else {
5160 			int old_stripe_index = stripe_index;
5161 			stripe_index = find_live_mirror(fs_info, map,
5162 					      stripe_index,
5163 					      map->sub_stripes, stripe_index +
5164 					      current->pid % map->sub_stripes,
5165 					      dev_replace_is_ongoing);
5166 			mirror_num = stripe_index - old_stripe_index + 1;
5167 		}
5168 
5169 	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5170 				BTRFS_BLOCK_GROUP_RAID6)) {
5171 		u64 tmp;
5172 
5173 		if (raid_map_ret &&
5174 		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 		     mirror_num > 1)) {
5176 			int i, rot;
5177 
5178 			/* push stripe_nr back to the start of the full stripe */
5179 			stripe_nr = raid56_full_stripe_start;
5180 			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5181 
5182 			/* RAID[56] write or recovery. Return all stripes */
5183 			num_stripes = map->num_stripes;
5184 			max_errors = nr_parity_stripes(map);
5185 
5186 			raid_map = kmalloc_array(num_stripes, sizeof(u64),
5187 					   GFP_NOFS);
5188 			if (!raid_map) {
5189 				ret = -ENOMEM;
5190 				goto out;
5191 			}
5192 
5193 			/* Work out the disk rotation on this stripe-set */
5194 			tmp = stripe_nr;
5195 			rot = do_div(tmp, num_stripes);
5196 
5197 			/* Fill in the logical address of each stripe */
5198 			tmp = stripe_nr * nr_data_stripes(map);
5199 			for (i = 0; i < nr_data_stripes(map); i++)
5200 				raid_map[(i+rot) % num_stripes] =
5201 					em->start + (tmp + i) * map->stripe_len;
5202 
5203 			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5204 			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5205 				raid_map[(i+rot+1) % num_stripes] =
5206 					RAID6_Q_STRIPE;
5207 
5208 			*length = map->stripe_len;
5209 			stripe_index = 0;
5210 			stripe_offset = 0;
5211 		} else {
5212 			/*
5213 			 * Mirror #0 or #1 means the original data block.
5214 			 * Mirror #2 is RAID5 parity block.
5215 			 * Mirror #3 is RAID6 Q block.
5216 			 */
5217 			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5218 			if (mirror_num > 1)
5219 				stripe_index = nr_data_stripes(map) +
5220 						mirror_num - 2;
5221 
5222 			/* We distribute the parity blocks across stripes */
5223 			tmp = stripe_nr + stripe_index;
5224 			stripe_index = do_div(tmp, map->num_stripes);
5225 			if (!(rw & (REQ_WRITE | REQ_DISCARD |
5226 				    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5227 				mirror_num = 1;
5228 		}
5229 	} else {
5230 		/*
5231 		 * after this do_div call, stripe_nr is the number of stripes
5232 		 * on this device we have to walk to find the data, and
5233 		 * stripe_index is the number of our device in the stripe array
5234 		 */
5235 		stripe_index = do_div(stripe_nr, map->num_stripes);
5236 		mirror_num = stripe_index + 1;
5237 	}
5238 	BUG_ON(stripe_index >= map->num_stripes);
5239 
5240 	num_alloc_stripes = num_stripes;
5241 	if (dev_replace_is_ongoing) {
5242 		if (rw & (REQ_WRITE | REQ_DISCARD))
5243 			num_alloc_stripes <<= 1;
5244 		if (rw & REQ_GET_READ_MIRRORS)
5245 			num_alloc_stripes++;
5246 		tgtdev_indexes = num_stripes;
5247 	}
5248 
5249 	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
5250 		       GFP_NOFS);
5251 	if (!bbio) {
5252 		kfree(raid_map);
5253 		ret = -ENOMEM;
5254 		goto out;
5255 	}
5256 	atomic_set(&bbio->error, 0);
5257 	if (dev_replace_is_ongoing)
5258 		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5259 
5260 	if (rw & REQ_DISCARD) {
5261 		int factor = 0;
5262 		int sub_stripes = 0;
5263 		u64 stripes_per_dev = 0;
5264 		u32 remaining_stripes = 0;
5265 		u32 last_stripe = 0;
5266 
5267 		if (map->type &
5268 		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
5269 			if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5270 				sub_stripes = 1;
5271 			else
5272 				sub_stripes = map->sub_stripes;
5273 
5274 			factor = map->num_stripes / sub_stripes;
5275 			stripes_per_dev = div_u64_rem(stripe_nr_end -
5276 						      stripe_nr_orig,
5277 						      factor,
5278 						      &remaining_stripes);
5279 			div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5280 			last_stripe *= sub_stripes;
5281 		}
5282 
5283 		for (i = 0; i < num_stripes; i++) {
5284 			bbio->stripes[i].physical =
5285 				map->stripes[stripe_index].physical +
5286 				stripe_offset + stripe_nr * map->stripe_len;
5287 			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5288 
5289 			if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5290 					 BTRFS_BLOCK_GROUP_RAID10)) {
5291 				bbio->stripes[i].length = stripes_per_dev *
5292 							  map->stripe_len;
5293 
5294 				if (i / sub_stripes < remaining_stripes)
5295 					bbio->stripes[i].length +=
5296 						map->stripe_len;
5297 
5298 				/*
5299 				 * Special for the first stripe and
5300 				 * the last stripe:
5301 				 *
5302 				 * |-------|...|-------|
5303 				 *     |----------|
5304 				 *    off     end_off
5305 				 */
5306 				if (i < sub_stripes)
5307 					bbio->stripes[i].length -=
5308 						stripe_offset;
5309 
5310 				if (stripe_index >= last_stripe &&
5311 				    stripe_index <= (last_stripe +
5312 						     sub_stripes - 1))
5313 					bbio->stripes[i].length -=
5314 						stripe_end_offset;
5315 
5316 				if (i == sub_stripes - 1)
5317 					stripe_offset = 0;
5318 			} else
5319 				bbio->stripes[i].length = *length;
5320 
5321 			stripe_index++;
5322 			if (stripe_index == map->num_stripes) {
5323 				/* This could only happen for RAID0/10 */
5324 				stripe_index = 0;
5325 				stripe_nr++;
5326 			}
5327 		}
5328 	} else {
5329 		for (i = 0; i < num_stripes; i++) {
5330 			bbio->stripes[i].physical =
5331 				map->stripes[stripe_index].physical +
5332 				stripe_offset +
5333 				stripe_nr * map->stripe_len;
5334 			bbio->stripes[i].dev =
5335 				map->stripes[stripe_index].dev;
5336 			stripe_index++;
5337 		}
5338 	}
5339 
5340 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5341 		max_errors = btrfs_chunk_max_errors(map);
5342 
5343 	tgtdev_indexes = 0;
5344 	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5345 	    dev_replace->tgtdev != NULL) {
5346 		int index_where_to_add;
5347 		u64 srcdev_devid = dev_replace->srcdev->devid;
5348 
5349 		/*
5350 		 * duplicate the write operations while the dev replace
5351 		 * procedure is running. Since the copying of the old disk
5352 		 * to the new disk takes place at run time while the
5353 		 * filesystem is mounted writable, the regular write
5354 		 * operations to the old disk have to be duplicated to go
5355 		 * to the new disk as well.
5356 		 * Note that device->missing is handled by the caller, and
5357 		 * that the write to the old disk is already set up in the
5358 		 * stripes array.
5359 		 */
5360 		index_where_to_add = num_stripes;
5361 		for (i = 0; i < num_stripes; i++) {
5362 			if (bbio->stripes[i].dev->devid == srcdev_devid) {
5363 				/* write to new disk, too */
5364 				struct btrfs_bio_stripe *new =
5365 					bbio->stripes + index_where_to_add;
5366 				struct btrfs_bio_stripe *old =
5367 					bbio->stripes + i;
5368 
5369 				new->physical = old->physical;
5370 				new->length = old->length;
5371 				new->dev = dev_replace->tgtdev;
5372 				bbio->tgtdev_map[i] = index_where_to_add;
5373 				index_where_to_add++;
5374 				max_errors++;
5375 				tgtdev_indexes++;
5376 			}
5377 		}
5378 		num_stripes = index_where_to_add;
5379 	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
5380 		   dev_replace->tgtdev != NULL) {
5381 		u64 srcdev_devid = dev_replace->srcdev->devid;
5382 		int index_srcdev = 0;
5383 		int found = 0;
5384 		u64 physical_of_found = 0;
5385 
5386 		/*
5387 		 * During the dev-replace procedure, the target drive can
5388 		 * also be used to read data in case it is needed to repair
5389 		 * a corrupt block elsewhere. This is possible if the
5390 		 * requested area is left of the left cursor. In this area,
5391 		 * the target drive is a full copy of the source drive.
5392 		 */
5393 		for (i = 0; i < num_stripes; i++) {
5394 			if (bbio->stripes[i].dev->devid == srcdev_devid) {
5395 				/*
5396 				 * In case of DUP, in order to keep it
5397 				 * simple, only add the mirror with the
5398 				 * lowest physical address
5399 				 */
5400 				if (found &&
5401 				    physical_of_found <=
5402 				     bbio->stripes[i].physical)
5403 					continue;
5404 				index_srcdev = i;
5405 				found = 1;
5406 				physical_of_found = bbio->stripes[i].physical;
5407 			}
5408 		}
5409 		if (found) {
5410 			u64 length = map->stripe_len;
5411 
5412 			if (physical_of_found + length <=
5413 			    dev_replace->cursor_left) {
5414 				struct btrfs_bio_stripe *tgtdev_stripe =
5415 					bbio->stripes + num_stripes;
5416 
5417 				tgtdev_stripe->physical = physical_of_found;
5418 				tgtdev_stripe->length =
5419 					bbio->stripes[index_srcdev].length;
5420 				tgtdev_stripe->dev = dev_replace->tgtdev;
5421 				bbio->tgtdev_map[index_srcdev] = num_stripes;
5422 
5423 				tgtdev_indexes++;
5424 				num_stripes++;
5425 			}
5426 		}
5427 	}
5428 
5429 	*bbio_ret = bbio;
5430 	bbio->num_stripes = num_stripes;
5431 	bbio->max_errors = max_errors;
5432 	bbio->mirror_num = mirror_num;
5433 	bbio->num_tgtdevs = tgtdev_indexes;
5434 
5435 	/*
5436 	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
5437 	 * mirror_num == num_stripes + 1 && dev_replace target drive is
5438 	 * available as a mirror
5439 	 */
5440 	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5441 		WARN_ON(num_stripes > 1);
5442 		bbio->stripes[0].dev = dev_replace->tgtdev;
5443 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5444 		bbio->mirror_num = map->num_stripes + 1;
5445 	}
5446 	if (raid_map) {
5447 		sort_parity_stripes(bbio, raid_map);
5448 		*raid_map_ret = raid_map;
5449 	}
5450 out:
5451 	if (dev_replace_is_ongoing)
5452 		btrfs_dev_replace_unlock(dev_replace);
5453 	free_extent_map(em);
5454 	return ret;
5455 }
5456 
5457 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5458 		      u64 logical, u64 *length,
5459 		      struct btrfs_bio **bbio_ret, int mirror_num)
5460 {
5461 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5462 				 mirror_num, NULL);
5463 }
5464 
5465 /* For Scrub/replace */
5466 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 		     u64 logical, u64 *length,
5468 		     struct btrfs_bio **bbio_ret, int mirror_num,
5469 		     u64 **raid_map_ret)
5470 {
5471 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 				 mirror_num, raid_map_ret);
5473 }
5474 
5475 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5476 		     u64 chunk_start, u64 physical, u64 devid,
5477 		     u64 **logical, int *naddrs, int *stripe_len)
5478 {
5479 	struct extent_map_tree *em_tree = &map_tree->map_tree;
5480 	struct extent_map *em;
5481 	struct map_lookup *map;
5482 	u64 *buf;
5483 	u64 bytenr;
5484 	u64 length;
5485 	u64 stripe_nr;
5486 	u64 rmap_len;
5487 	int i, j, nr = 0;
5488 
5489 	read_lock(&em_tree->lock);
5490 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
5491 	read_unlock(&em_tree->lock);
5492 
5493 	if (!em) {
5494 		printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5495 		       chunk_start);
5496 		return -EIO;
5497 	}
5498 
5499 	if (em->start != chunk_start) {
5500 		printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5501 		       em->start, chunk_start);
5502 		free_extent_map(em);
5503 		return -EIO;
5504 	}
5505 	map = (struct map_lookup *)em->bdev;
5506 
5507 	length = em->len;
5508 	rmap_len = map->stripe_len;
5509 
5510 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5511 		do_div(length, map->num_stripes / map->sub_stripes);
5512 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5513 		do_div(length, map->num_stripes);
5514 	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5515 			      BTRFS_BLOCK_GROUP_RAID6)) {
5516 		do_div(length, nr_data_stripes(map));
5517 		rmap_len = map->stripe_len * nr_data_stripes(map);
5518 	}
5519 
5520 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
5521 	BUG_ON(!buf); /* -ENOMEM */
5522 
5523 	for (i = 0; i < map->num_stripes; i++) {
5524 		if (devid && map->stripes[i].dev->devid != devid)
5525 			continue;
5526 		if (map->stripes[i].physical > physical ||
5527 		    map->stripes[i].physical + length <= physical)
5528 			continue;
5529 
5530 		stripe_nr = physical - map->stripes[i].physical;
5531 		do_div(stripe_nr, map->stripe_len);
5532 
5533 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5534 			stripe_nr = stripe_nr * map->num_stripes + i;
5535 			do_div(stripe_nr, map->sub_stripes);
5536 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5537 			stripe_nr = stripe_nr * map->num_stripes + i;
5538 		} /* else if RAID[56], multiply by nr_data_stripes().
5539 		   * Alternatively, just use rmap_len below instead of
5540 		   * map->stripe_len */
5541 
5542 		bytenr = chunk_start + stripe_nr * rmap_len;
5543 		WARN_ON(nr >= map->num_stripes);
5544 		for (j = 0; j < nr; j++) {
5545 			if (buf[j] == bytenr)
5546 				break;
5547 		}
5548 		if (j == nr) {
5549 			WARN_ON(nr >= map->num_stripes);
5550 			buf[nr++] = bytenr;
5551 		}
5552 	}
5553 
5554 	*logical = buf;
5555 	*naddrs = nr;
5556 	*stripe_len = rmap_len;
5557 
5558 	free_extent_map(em);
5559 	return 0;
5560 }
5561 
5562 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
5563 {
5564 	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
5565 		bio_endio_nodec(bio, err);
5566 	else
5567 		bio_endio(bio, err);
5568 	kfree(bbio);
5569 }
5570 
5571 static void btrfs_end_bio(struct bio *bio, int err)
5572 {
5573 	struct btrfs_bio *bbio = bio->bi_private;
5574 	struct btrfs_device *dev = bbio->stripes[0].dev;
5575 	int is_orig_bio = 0;
5576 
5577 	if (err) {
5578 		atomic_inc(&bbio->error);
5579 		if (err == -EIO || err == -EREMOTEIO) {
5580 			unsigned int stripe_index =
5581 				btrfs_io_bio(bio)->stripe_index;
5582 
5583 			BUG_ON(stripe_index >= bbio->num_stripes);
5584 			dev = bbio->stripes[stripe_index].dev;
5585 			if (dev->bdev) {
5586 				if (bio->bi_rw & WRITE)
5587 					btrfs_dev_stat_inc(dev,
5588 						BTRFS_DEV_STAT_WRITE_ERRS);
5589 				else
5590 					btrfs_dev_stat_inc(dev,
5591 						BTRFS_DEV_STAT_READ_ERRS);
5592 				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
5593 					btrfs_dev_stat_inc(dev,
5594 						BTRFS_DEV_STAT_FLUSH_ERRS);
5595 				btrfs_dev_stat_print_on_error(dev);
5596 			}
5597 		}
5598 	}
5599 
5600 	if (bio == bbio->orig_bio)
5601 		is_orig_bio = 1;
5602 
5603 	btrfs_bio_counter_dec(bbio->fs_info);
5604 
5605 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
5606 		if (!is_orig_bio) {
5607 			bio_put(bio);
5608 			bio = bbio->orig_bio;
5609 		}
5610 
5611 		bio->bi_private = bbio->private;
5612 		bio->bi_end_io = bbio->end_io;
5613 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5614 		/* only send an error to the higher layers if it is
5615 		 * beyond the tolerance of the btrfs bio
5616 		 */
5617 		if (atomic_read(&bbio->error) > bbio->max_errors) {
5618 			err = -EIO;
5619 		} else {
5620 			/*
5621 			 * this bio is actually up to date, we didn't
5622 			 * go over the max number of errors
5623 			 */
5624 			set_bit(BIO_UPTODATE, &bio->bi_flags);
5625 			err = 0;
5626 		}
5627 
5628 		btrfs_end_bbio(bbio, bio, err);
5629 	} else if (!is_orig_bio) {
5630 		bio_put(bio);
5631 	}
5632 }
5633 
5634 /*
5635  * see run_scheduled_bios for a description of why bios are collected for
5636  * async submit.
5637  *
5638  * This will add one bio to the pending list for a device and make sure
5639  * the work struct is scheduled.
5640  */
5641 static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5642 					struct btrfs_device *device,
5643 					int rw, struct bio *bio)
5644 {
5645 	int should_queue = 1;
5646 	struct btrfs_pending_bios *pending_bios;
5647 
5648 	if (device->missing || !device->bdev) {
5649 		bio_endio(bio, -EIO);
5650 		return;
5651 	}
5652 
5653 	/* don't bother with additional async steps for reads, right now */
5654 	if (!(rw & REQ_WRITE)) {
5655 		bio_get(bio);
5656 		btrfsic_submit_bio(rw, bio);
5657 		bio_put(bio);
5658 		return;
5659 	}
5660 
5661 	/*
5662 	 * nr_async_bios allows us to reliably return congestion to the
5663 	 * higher layers.  Otherwise, the async bio makes it appear we have
5664 	 * made progress against dirty pages when we've really just put it
5665 	 * on a queue for later
5666 	 */
5667 	atomic_inc(&root->fs_info->nr_async_bios);
5668 	WARN_ON(bio->bi_next);
5669 	bio->bi_next = NULL;
5670 	bio->bi_rw |= rw;
5671 
5672 	spin_lock(&device->io_lock);
5673 	if (bio->bi_rw & REQ_SYNC)
5674 		pending_bios = &device->pending_sync_bios;
5675 	else
5676 		pending_bios = &device->pending_bios;
5677 
5678 	if (pending_bios->tail)
5679 		pending_bios->tail->bi_next = bio;
5680 
5681 	pending_bios->tail = bio;
5682 	if (!pending_bios->head)
5683 		pending_bios->head = bio;
5684 	if (device->running_pending)
5685 		should_queue = 0;
5686 
5687 	spin_unlock(&device->io_lock);
5688 
5689 	if (should_queue)
5690 		btrfs_queue_work(root->fs_info->submit_workers,
5691 				 &device->work);
5692 }
5693 
5694 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5695 		       sector_t sector)
5696 {
5697 	struct bio_vec *prev;
5698 	struct request_queue *q = bdev_get_queue(bdev);
5699 	unsigned int max_sectors = queue_max_sectors(q);
5700 	struct bvec_merge_data bvm = {
5701 		.bi_bdev = bdev,
5702 		.bi_sector = sector,
5703 		.bi_rw = bio->bi_rw,
5704 	};
5705 
5706 	if (WARN_ON(bio->bi_vcnt == 0))
5707 		return 1;
5708 
5709 	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5710 	if (bio_sectors(bio) > max_sectors)
5711 		return 0;
5712 
5713 	if (!q->merge_bvec_fn)
5714 		return 1;
5715 
5716 	bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5717 	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5718 		return 0;
5719 	return 1;
5720 }
5721 
5722 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5723 			      struct bio *bio, u64 physical, int dev_nr,
5724 			      int rw, int async)
5725 {
5726 	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5727 
5728 	bio->bi_private = bbio;
5729 	btrfs_io_bio(bio)->stripe_index = dev_nr;
5730 	bio->bi_end_io = btrfs_end_bio;
5731 	bio->bi_iter.bi_sector = physical >> 9;
5732 #ifdef DEBUG
5733 	{
5734 		struct rcu_string *name;
5735 
5736 		rcu_read_lock();
5737 		name = rcu_dereference(dev->name);
5738 		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5739 			 "(%s id %llu), size=%u\n", rw,
5740 			 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5741 			 name->str, dev->devid, bio->bi_iter.bi_size);
5742 		rcu_read_unlock();
5743 	}
5744 #endif
5745 	bio->bi_bdev = dev->bdev;
5746 
5747 	btrfs_bio_counter_inc_noblocked(root->fs_info);
5748 
5749 	if (async)
5750 		btrfs_schedule_bio(root, dev, rw, bio);
5751 	else
5752 		btrfsic_submit_bio(rw, bio);
5753 }
5754 
5755 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5756 			      struct bio *first_bio, struct btrfs_device *dev,
5757 			      int dev_nr, int rw, int async)
5758 {
5759 	struct bio_vec *bvec = first_bio->bi_io_vec;
5760 	struct bio *bio;
5761 	int nr_vecs = bio_get_nr_vecs(dev->bdev);
5762 	u64 physical = bbio->stripes[dev_nr].physical;
5763 
5764 again:
5765 	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
5766 	if (!bio)
5767 		return -ENOMEM;
5768 
5769 	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5770 		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5771 				 bvec->bv_offset) < bvec->bv_len) {
5772 			u64 len = bio->bi_iter.bi_size;
5773 
5774 			atomic_inc(&bbio->stripes_pending);
5775 			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
5776 					  rw, async);
5777 			physical += len;
5778 			goto again;
5779 		}
5780 		bvec++;
5781 	}
5782 
5783 	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
5784 	return 0;
5785 }
5786 
5787 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5788 {
5789 	atomic_inc(&bbio->error);
5790 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
5791 		/* Shoud be the original bio. */
5792 		WARN_ON(bio != bbio->orig_bio);
5793 
5794 		bio->bi_private = bbio->private;
5795 		bio->bi_end_io = bbio->end_io;
5796 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5797 		bio->bi_iter.bi_sector = logical >> 9;
5798 
5799 		btrfs_end_bbio(bbio, bio, -EIO);
5800 	}
5801 }
5802 
5803 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5804 		  int mirror_num, int async_submit)
5805 {
5806 	struct btrfs_device *dev;
5807 	struct bio *first_bio = bio;
5808 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5809 	u64 length = 0;
5810 	u64 map_length;
5811 	u64 *raid_map = NULL;
5812 	int ret;
5813 	int dev_nr = 0;
5814 	int total_devs = 1;
5815 	struct btrfs_bio *bbio = NULL;
5816 
5817 	length = bio->bi_iter.bi_size;
5818 	map_length = length;
5819 
5820 	btrfs_bio_counter_inc_blocked(root->fs_info);
5821 	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5822 			      mirror_num, &raid_map);
5823 	if (ret) {
5824 		btrfs_bio_counter_dec(root->fs_info);
5825 		return ret;
5826 	}
5827 
5828 	total_devs = bbio->num_stripes;
5829 	bbio->orig_bio = first_bio;
5830 	bbio->private = first_bio->bi_private;
5831 	bbio->end_io = first_bio->bi_end_io;
5832 	bbio->fs_info = root->fs_info;
5833 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5834 
5835 	if (raid_map) {
5836 		/* In this case, map_length has been set to the length of
5837 		   a single stripe; not the whole write */
5838 		if (rw & WRITE) {
5839 			ret = raid56_parity_write(root, bio, bbio,
5840 						  raid_map, map_length);
5841 		} else {
5842 			ret = raid56_parity_recover(root, bio, bbio,
5843 						    raid_map, map_length,
5844 						    mirror_num, 1);
5845 		}
5846 
5847 		btrfs_bio_counter_dec(root->fs_info);
5848 		return ret;
5849 	}
5850 
5851 	if (map_length < length) {
5852 		btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5853 			logical, length, map_length);
5854 		BUG();
5855 	}
5856 
5857 	while (dev_nr < total_devs) {
5858 		dev = bbio->stripes[dev_nr].dev;
5859 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5860 			bbio_error(bbio, first_bio, logical);
5861 			dev_nr++;
5862 			continue;
5863 		}
5864 
5865 		/*
5866 		 * Check and see if we're ok with this bio based on it's size
5867 		 * and offset with the given device.
5868 		 */
5869 		if (!bio_size_ok(dev->bdev, first_bio,
5870 				 bbio->stripes[dev_nr].physical >> 9)) {
5871 			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5872 						 dev_nr, rw, async_submit);
5873 			BUG_ON(ret);
5874 			dev_nr++;
5875 			continue;
5876 		}
5877 
5878 		if (dev_nr < total_devs - 1) {
5879 			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5880 			BUG_ON(!bio); /* -ENOMEM */
5881 		} else {
5882 			bio = first_bio;
5883 			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
5884 		}
5885 
5886 		submit_stripe_bio(root, bbio, bio,
5887 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
5888 				  async_submit);
5889 		dev_nr++;
5890 	}
5891 	btrfs_bio_counter_dec(root->fs_info);
5892 	return 0;
5893 }
5894 
5895 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5896 				       u8 *uuid, u8 *fsid)
5897 {
5898 	struct btrfs_device *device;
5899 	struct btrfs_fs_devices *cur_devices;
5900 
5901 	cur_devices = fs_info->fs_devices;
5902 	while (cur_devices) {
5903 		if (!fsid ||
5904 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5905 			device = __find_device(&cur_devices->devices,
5906 					       devid, uuid);
5907 			if (device)
5908 				return device;
5909 		}
5910 		cur_devices = cur_devices->seed;
5911 	}
5912 	return NULL;
5913 }
5914 
5915 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5916 					    struct btrfs_fs_devices *fs_devices,
5917 					    u64 devid, u8 *dev_uuid)
5918 {
5919 	struct btrfs_device *device;
5920 
5921 	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5922 	if (IS_ERR(device))
5923 		return NULL;
5924 
5925 	list_add(&device->dev_list, &fs_devices->devices);
5926 	device->fs_devices = fs_devices;
5927 	fs_devices->num_devices++;
5928 
5929 	device->missing = 1;
5930 	fs_devices->missing_devices++;
5931 
5932 	return device;
5933 }
5934 
5935 /**
5936  * btrfs_alloc_device - allocate struct btrfs_device
5937  * @fs_info:	used only for generating a new devid, can be NULL if
5938  *		devid is provided (i.e. @devid != NULL).
5939  * @devid:	a pointer to devid for this device.  If NULL a new devid
5940  *		is generated.
5941  * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
5942  *		is generated.
5943  *
5944  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
5945  * on error.  Returned struct is not linked onto any lists and can be
5946  * destroyed with kfree() right away.
5947  */
5948 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5949 					const u64 *devid,
5950 					const u8 *uuid)
5951 {
5952 	struct btrfs_device *dev;
5953 	u64 tmp;
5954 
5955 	if (WARN_ON(!devid && !fs_info))
5956 		return ERR_PTR(-EINVAL);
5957 
5958 	dev = __alloc_device();
5959 	if (IS_ERR(dev))
5960 		return dev;
5961 
5962 	if (devid)
5963 		tmp = *devid;
5964 	else {
5965 		int ret;
5966 
5967 		ret = find_next_devid(fs_info, &tmp);
5968 		if (ret) {
5969 			kfree(dev);
5970 			return ERR_PTR(ret);
5971 		}
5972 	}
5973 	dev->devid = tmp;
5974 
5975 	if (uuid)
5976 		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
5977 	else
5978 		generate_random_uuid(dev->uuid);
5979 
5980 	btrfs_init_work(&dev->work, btrfs_submit_helper,
5981 			pending_bios_fn, NULL, NULL);
5982 
5983 	return dev;
5984 }
5985 
5986 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5987 			  struct extent_buffer *leaf,
5988 			  struct btrfs_chunk *chunk)
5989 {
5990 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5991 	struct map_lookup *map;
5992 	struct extent_map *em;
5993 	u64 logical;
5994 	u64 length;
5995 	u64 devid;
5996 	u8 uuid[BTRFS_UUID_SIZE];
5997 	int num_stripes;
5998 	int ret;
5999 	int i;
6000 
6001 	logical = key->offset;
6002 	length = btrfs_chunk_length(leaf, chunk);
6003 
6004 	read_lock(&map_tree->map_tree.lock);
6005 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6006 	read_unlock(&map_tree->map_tree.lock);
6007 
6008 	/* already mapped? */
6009 	if (em && em->start <= logical && em->start + em->len > logical) {
6010 		free_extent_map(em);
6011 		return 0;
6012 	} else if (em) {
6013 		free_extent_map(em);
6014 	}
6015 
6016 	em = alloc_extent_map();
6017 	if (!em)
6018 		return -ENOMEM;
6019 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6020 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6021 	if (!map) {
6022 		free_extent_map(em);
6023 		return -ENOMEM;
6024 	}
6025 
6026 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6027 	em->bdev = (struct block_device *)map;
6028 	em->start = logical;
6029 	em->len = length;
6030 	em->orig_start = 0;
6031 	em->block_start = 0;
6032 	em->block_len = em->len;
6033 
6034 	map->num_stripes = num_stripes;
6035 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
6036 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
6037 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
6038 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6039 	map->type = btrfs_chunk_type(leaf, chunk);
6040 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6041 	for (i = 0; i < num_stripes; i++) {
6042 		map->stripes[i].physical =
6043 			btrfs_stripe_offset_nr(leaf, chunk, i);
6044 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6045 		read_extent_buffer(leaf, uuid, (unsigned long)
6046 				   btrfs_stripe_dev_uuid_nr(chunk, i),
6047 				   BTRFS_UUID_SIZE);
6048 		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
6049 							uuid, NULL);
6050 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
6051 			free_extent_map(em);
6052 			return -EIO;
6053 		}
6054 		if (!map->stripes[i].dev) {
6055 			map->stripes[i].dev =
6056 				add_missing_dev(root, root->fs_info->fs_devices,
6057 						devid, uuid);
6058 			if (!map->stripes[i].dev) {
6059 				free_extent_map(em);
6060 				return -EIO;
6061 			}
6062 		}
6063 		map->stripes[i].dev->in_fs_metadata = 1;
6064 	}
6065 
6066 	write_lock(&map_tree->map_tree.lock);
6067 	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6068 	write_unlock(&map_tree->map_tree.lock);
6069 	BUG_ON(ret); /* Tree corruption */
6070 	free_extent_map(em);
6071 
6072 	return 0;
6073 }
6074 
6075 static void fill_device_from_item(struct extent_buffer *leaf,
6076 				 struct btrfs_dev_item *dev_item,
6077 				 struct btrfs_device *device)
6078 {
6079 	unsigned long ptr;
6080 
6081 	device->devid = btrfs_device_id(leaf, dev_item);
6082 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6083 	device->total_bytes = device->disk_total_bytes;
6084 	device->commit_total_bytes = device->disk_total_bytes;
6085 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6086 	device->commit_bytes_used = device->bytes_used;
6087 	device->type = btrfs_device_type(leaf, dev_item);
6088 	device->io_align = btrfs_device_io_align(leaf, dev_item);
6089 	device->io_width = btrfs_device_io_width(leaf, dev_item);
6090 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6091 	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6092 	device->is_tgtdev_for_dev_replace = 0;
6093 
6094 	ptr = btrfs_device_uuid(dev_item);
6095 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6096 }
6097 
6098 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6099 						  u8 *fsid)
6100 {
6101 	struct btrfs_fs_devices *fs_devices;
6102 	int ret;
6103 
6104 	BUG_ON(!mutex_is_locked(&uuid_mutex));
6105 
6106 	fs_devices = root->fs_info->fs_devices->seed;
6107 	while (fs_devices) {
6108 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
6109 			return fs_devices;
6110 
6111 		fs_devices = fs_devices->seed;
6112 	}
6113 
6114 	fs_devices = find_fsid(fsid);
6115 	if (!fs_devices) {
6116 		if (!btrfs_test_opt(root, DEGRADED))
6117 			return ERR_PTR(-ENOENT);
6118 
6119 		fs_devices = alloc_fs_devices(fsid);
6120 		if (IS_ERR(fs_devices))
6121 			return fs_devices;
6122 
6123 		fs_devices->seeding = 1;
6124 		fs_devices->opened = 1;
6125 		return fs_devices;
6126 	}
6127 
6128 	fs_devices = clone_fs_devices(fs_devices);
6129 	if (IS_ERR(fs_devices))
6130 		return fs_devices;
6131 
6132 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6133 				   root->fs_info->bdev_holder);
6134 	if (ret) {
6135 		free_fs_devices(fs_devices);
6136 		fs_devices = ERR_PTR(ret);
6137 		goto out;
6138 	}
6139 
6140 	if (!fs_devices->seeding) {
6141 		__btrfs_close_devices(fs_devices);
6142 		free_fs_devices(fs_devices);
6143 		fs_devices = ERR_PTR(-EINVAL);
6144 		goto out;
6145 	}
6146 
6147 	fs_devices->seed = root->fs_info->fs_devices->seed;
6148 	root->fs_info->fs_devices->seed = fs_devices;
6149 out:
6150 	return fs_devices;
6151 }
6152 
6153 static int read_one_dev(struct btrfs_root *root,
6154 			struct extent_buffer *leaf,
6155 			struct btrfs_dev_item *dev_item)
6156 {
6157 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6158 	struct btrfs_device *device;
6159 	u64 devid;
6160 	int ret;
6161 	u8 fs_uuid[BTRFS_UUID_SIZE];
6162 	u8 dev_uuid[BTRFS_UUID_SIZE];
6163 
6164 	devid = btrfs_device_id(leaf, dev_item);
6165 	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6166 			   BTRFS_UUID_SIZE);
6167 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6168 			   BTRFS_UUID_SIZE);
6169 
6170 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6171 		fs_devices = open_seed_devices(root, fs_uuid);
6172 		if (IS_ERR(fs_devices))
6173 			return PTR_ERR(fs_devices);
6174 	}
6175 
6176 	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6177 	if (!device) {
6178 		if (!btrfs_test_opt(root, DEGRADED))
6179 			return -EIO;
6180 
6181 		btrfs_warn(root->fs_info, "devid %llu missing", devid);
6182 		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
6183 		if (!device)
6184 			return -ENOMEM;
6185 	} else {
6186 		if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6187 			return -EIO;
6188 
6189 		if(!device->bdev && !device->missing) {
6190 			/*
6191 			 * this happens when a device that was properly setup
6192 			 * in the device info lists suddenly goes bad.
6193 			 * device->bdev is NULL, and so we have to set
6194 			 * device->missing to one here
6195 			 */
6196 			device->fs_devices->missing_devices++;
6197 			device->missing = 1;
6198 		}
6199 
6200 		/* Move the device to its own fs_devices */
6201 		if (device->fs_devices != fs_devices) {
6202 			ASSERT(device->missing);
6203 
6204 			list_move(&device->dev_list, &fs_devices->devices);
6205 			device->fs_devices->num_devices--;
6206 			fs_devices->num_devices++;
6207 
6208 			device->fs_devices->missing_devices--;
6209 			fs_devices->missing_devices++;
6210 
6211 			device->fs_devices = fs_devices;
6212 		}
6213 	}
6214 
6215 	if (device->fs_devices != root->fs_info->fs_devices) {
6216 		BUG_ON(device->writeable);
6217 		if (device->generation !=
6218 		    btrfs_device_generation(leaf, dev_item))
6219 			return -EINVAL;
6220 	}
6221 
6222 	fill_device_from_item(leaf, dev_item, device);
6223 	device->in_fs_metadata = 1;
6224 	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6225 		device->fs_devices->total_rw_bytes += device->total_bytes;
6226 		spin_lock(&root->fs_info->free_chunk_lock);
6227 		root->fs_info->free_chunk_space += device->total_bytes -
6228 			device->bytes_used;
6229 		spin_unlock(&root->fs_info->free_chunk_lock);
6230 	}
6231 	ret = 0;
6232 	return ret;
6233 }
6234 
6235 int btrfs_read_sys_array(struct btrfs_root *root)
6236 {
6237 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6238 	struct extent_buffer *sb;
6239 	struct btrfs_disk_key *disk_key;
6240 	struct btrfs_chunk *chunk;
6241 	u8 *ptr;
6242 	unsigned long sb_ptr;
6243 	int ret = 0;
6244 	u32 num_stripes;
6245 	u32 array_size;
6246 	u32 len = 0;
6247 	u32 cur;
6248 	struct btrfs_key key;
6249 
6250 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
6251 					  BTRFS_SUPER_INFO_SIZE);
6252 	if (!sb)
6253 		return -ENOMEM;
6254 	btrfs_set_buffer_uptodate(sb);
6255 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6256 	/*
6257 	 * The sb extent buffer is artifical and just used to read the system array.
6258 	 * btrfs_set_buffer_uptodate() call does not properly mark all it's
6259 	 * pages up-to-date when the page is larger: extent does not cover the
6260 	 * whole page and consequently check_page_uptodate does not find all
6261 	 * the page's extents up-to-date (the hole beyond sb),
6262 	 * write_extent_buffer then triggers a WARN_ON.
6263 	 *
6264 	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6265 	 * but sb spans only this function. Add an explicit SetPageUptodate call
6266 	 * to silence the warning eg. on PowerPC 64.
6267 	 */
6268 	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
6269 		SetPageUptodate(sb->pages[0]);
6270 
6271 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6272 	array_size = btrfs_super_sys_array_size(super_copy);
6273 
6274 	ptr = super_copy->sys_chunk_array;
6275 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
6276 	cur = 0;
6277 
6278 	while (cur < array_size) {
6279 		disk_key = (struct btrfs_disk_key *)ptr;
6280 		btrfs_disk_key_to_cpu(&key, disk_key);
6281 
6282 		len = sizeof(*disk_key); ptr += len;
6283 		sb_ptr += len;
6284 		cur += len;
6285 
6286 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6287 			chunk = (struct btrfs_chunk *)sb_ptr;
6288 			ret = read_one_chunk(root, &key, sb, chunk);
6289 			if (ret)
6290 				break;
6291 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6292 			len = btrfs_chunk_item_size(num_stripes);
6293 		} else {
6294 			ret = -EIO;
6295 			break;
6296 		}
6297 		ptr += len;
6298 		sb_ptr += len;
6299 		cur += len;
6300 	}
6301 	free_extent_buffer(sb);
6302 	return ret;
6303 }
6304 
6305 int btrfs_read_chunk_tree(struct btrfs_root *root)
6306 {
6307 	struct btrfs_path *path;
6308 	struct extent_buffer *leaf;
6309 	struct btrfs_key key;
6310 	struct btrfs_key found_key;
6311 	int ret;
6312 	int slot;
6313 
6314 	root = root->fs_info->chunk_root;
6315 
6316 	path = btrfs_alloc_path();
6317 	if (!path)
6318 		return -ENOMEM;
6319 
6320 	mutex_lock(&uuid_mutex);
6321 	lock_chunks(root);
6322 
6323 	/*
6324 	 * Read all device items, and then all the chunk items. All
6325 	 * device items are found before any chunk item (their object id
6326 	 * is smaller than the lowest possible object id for a chunk
6327 	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6328 	 */
6329 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6330 	key.offset = 0;
6331 	key.type = 0;
6332 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6333 	if (ret < 0)
6334 		goto error;
6335 	while (1) {
6336 		leaf = path->nodes[0];
6337 		slot = path->slots[0];
6338 		if (slot >= btrfs_header_nritems(leaf)) {
6339 			ret = btrfs_next_leaf(root, path);
6340 			if (ret == 0)
6341 				continue;
6342 			if (ret < 0)
6343 				goto error;
6344 			break;
6345 		}
6346 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6347 		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6348 			struct btrfs_dev_item *dev_item;
6349 			dev_item = btrfs_item_ptr(leaf, slot,
6350 						  struct btrfs_dev_item);
6351 			ret = read_one_dev(root, leaf, dev_item);
6352 			if (ret)
6353 				goto error;
6354 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6355 			struct btrfs_chunk *chunk;
6356 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6357 			ret = read_one_chunk(root, &found_key, leaf, chunk);
6358 			if (ret)
6359 				goto error;
6360 		}
6361 		path->slots[0]++;
6362 	}
6363 	ret = 0;
6364 error:
6365 	unlock_chunks(root);
6366 	mutex_unlock(&uuid_mutex);
6367 
6368 	btrfs_free_path(path);
6369 	return ret;
6370 }
6371 
6372 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6373 {
6374 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6375 	struct btrfs_device *device;
6376 
6377 	while (fs_devices) {
6378 		mutex_lock(&fs_devices->device_list_mutex);
6379 		list_for_each_entry(device, &fs_devices->devices, dev_list)
6380 			device->dev_root = fs_info->dev_root;
6381 		mutex_unlock(&fs_devices->device_list_mutex);
6382 
6383 		fs_devices = fs_devices->seed;
6384 	}
6385 }
6386 
6387 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6388 {
6389 	int i;
6390 
6391 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6392 		btrfs_dev_stat_reset(dev, i);
6393 }
6394 
6395 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6396 {
6397 	struct btrfs_key key;
6398 	struct btrfs_key found_key;
6399 	struct btrfs_root *dev_root = fs_info->dev_root;
6400 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6401 	struct extent_buffer *eb;
6402 	int slot;
6403 	int ret = 0;
6404 	struct btrfs_device *device;
6405 	struct btrfs_path *path = NULL;
6406 	int i;
6407 
6408 	path = btrfs_alloc_path();
6409 	if (!path) {
6410 		ret = -ENOMEM;
6411 		goto out;
6412 	}
6413 
6414 	mutex_lock(&fs_devices->device_list_mutex);
6415 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6416 		int item_size;
6417 		struct btrfs_dev_stats_item *ptr;
6418 
6419 		key.objectid = 0;
6420 		key.type = BTRFS_DEV_STATS_KEY;
6421 		key.offset = device->devid;
6422 		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6423 		if (ret) {
6424 			__btrfs_reset_dev_stats(device);
6425 			device->dev_stats_valid = 1;
6426 			btrfs_release_path(path);
6427 			continue;
6428 		}
6429 		slot = path->slots[0];
6430 		eb = path->nodes[0];
6431 		btrfs_item_key_to_cpu(eb, &found_key, slot);
6432 		item_size = btrfs_item_size_nr(eb, slot);
6433 
6434 		ptr = btrfs_item_ptr(eb, slot,
6435 				     struct btrfs_dev_stats_item);
6436 
6437 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6438 			if (item_size >= (1 + i) * sizeof(__le64))
6439 				btrfs_dev_stat_set(device, i,
6440 					btrfs_dev_stats_value(eb, ptr, i));
6441 			else
6442 				btrfs_dev_stat_reset(device, i);
6443 		}
6444 
6445 		device->dev_stats_valid = 1;
6446 		btrfs_dev_stat_print_on_load(device);
6447 		btrfs_release_path(path);
6448 	}
6449 	mutex_unlock(&fs_devices->device_list_mutex);
6450 
6451 out:
6452 	btrfs_free_path(path);
6453 	return ret < 0 ? ret : 0;
6454 }
6455 
6456 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6457 				struct btrfs_root *dev_root,
6458 				struct btrfs_device *device)
6459 {
6460 	struct btrfs_path *path;
6461 	struct btrfs_key key;
6462 	struct extent_buffer *eb;
6463 	struct btrfs_dev_stats_item *ptr;
6464 	int ret;
6465 	int i;
6466 
6467 	key.objectid = 0;
6468 	key.type = BTRFS_DEV_STATS_KEY;
6469 	key.offset = device->devid;
6470 
6471 	path = btrfs_alloc_path();
6472 	BUG_ON(!path);
6473 	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6474 	if (ret < 0) {
6475 		printk_in_rcu(KERN_WARNING "BTRFS: "
6476 			"error %d while searching for dev_stats item for device %s!\n",
6477 			      ret, rcu_str_deref(device->name));
6478 		goto out;
6479 	}
6480 
6481 	if (ret == 0 &&
6482 	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
6483 		/* need to delete old one and insert a new one */
6484 		ret = btrfs_del_item(trans, dev_root, path);
6485 		if (ret != 0) {
6486 			printk_in_rcu(KERN_WARNING "BTRFS: "
6487 				"delete too small dev_stats item for device %s failed %d!\n",
6488 				      rcu_str_deref(device->name), ret);
6489 			goto out;
6490 		}
6491 		ret = 1;
6492 	}
6493 
6494 	if (ret == 1) {
6495 		/* need to insert a new item */
6496 		btrfs_release_path(path);
6497 		ret = btrfs_insert_empty_item(trans, dev_root, path,
6498 					      &key, sizeof(*ptr));
6499 		if (ret < 0) {
6500 			printk_in_rcu(KERN_WARNING "BTRFS: "
6501 					  "insert dev_stats item for device %s failed %d!\n",
6502 				      rcu_str_deref(device->name), ret);
6503 			goto out;
6504 		}
6505 	}
6506 
6507 	eb = path->nodes[0];
6508 	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
6509 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6510 		btrfs_set_dev_stats_value(eb, ptr, i,
6511 					  btrfs_dev_stat_read(device, i));
6512 	btrfs_mark_buffer_dirty(eb);
6513 
6514 out:
6515 	btrfs_free_path(path);
6516 	return ret;
6517 }
6518 
6519 /*
6520  * called from commit_transaction. Writes all changed device stats to disk.
6521  */
6522 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6523 			struct btrfs_fs_info *fs_info)
6524 {
6525 	struct btrfs_root *dev_root = fs_info->dev_root;
6526 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6527 	struct btrfs_device *device;
6528 	int stats_cnt;
6529 	int ret = 0;
6530 
6531 	mutex_lock(&fs_devices->device_list_mutex);
6532 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6533 		if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6534 			continue;
6535 
6536 		stats_cnt = atomic_read(&device->dev_stats_ccnt);
6537 		ret = update_dev_stat_item(trans, dev_root, device);
6538 		if (!ret)
6539 			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6540 	}
6541 	mutex_unlock(&fs_devices->device_list_mutex);
6542 
6543 	return ret;
6544 }
6545 
6546 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
6547 {
6548 	btrfs_dev_stat_inc(dev, index);
6549 	btrfs_dev_stat_print_on_error(dev);
6550 }
6551 
6552 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6553 {
6554 	if (!dev->dev_stats_valid)
6555 		return;
6556 	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6557 			   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6558 			   rcu_str_deref(dev->name),
6559 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6560 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6561 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6562 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6563 			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6564 }
6565 
6566 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6567 {
6568 	int i;
6569 
6570 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6571 		if (btrfs_dev_stat_read(dev, i) != 0)
6572 			break;
6573 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
6574 		return; /* all values == 0, suppress message */
6575 
6576 	printk_in_rcu(KERN_INFO "BTRFS: "
6577 		   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6578 	       rcu_str_deref(dev->name),
6579 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6580 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6581 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6582 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6583 	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6584 }
6585 
6586 int btrfs_get_dev_stats(struct btrfs_root *root,
6587 			struct btrfs_ioctl_get_dev_stats *stats)
6588 {
6589 	struct btrfs_device *dev;
6590 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6591 	int i;
6592 
6593 	mutex_lock(&fs_devices->device_list_mutex);
6594 	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
6595 	mutex_unlock(&fs_devices->device_list_mutex);
6596 
6597 	if (!dev) {
6598 		btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6599 		return -ENODEV;
6600 	} else if (!dev->dev_stats_valid) {
6601 		btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6602 		return -ENODEV;
6603 	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6604 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6605 			if (stats->nr_items > i)
6606 				stats->values[i] =
6607 					btrfs_dev_stat_read_and_reset(dev, i);
6608 			else
6609 				btrfs_dev_stat_reset(dev, i);
6610 		}
6611 	} else {
6612 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6613 			if (stats->nr_items > i)
6614 				stats->values[i] = btrfs_dev_stat_read(dev, i);
6615 	}
6616 	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
6617 		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
6618 	return 0;
6619 }
6620 
6621 int btrfs_scratch_superblock(struct btrfs_device *device)
6622 {
6623 	struct buffer_head *bh;
6624 	struct btrfs_super_block *disk_super;
6625 
6626 	bh = btrfs_read_dev_super(device->bdev);
6627 	if (!bh)
6628 		return -EINVAL;
6629 	disk_super = (struct btrfs_super_block *)bh->b_data;
6630 
6631 	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6632 	set_buffer_dirty(bh);
6633 	sync_dirty_buffer(bh);
6634 	brelse(bh);
6635 
6636 	return 0;
6637 }
6638 
6639 /*
6640  * Update the size of all devices, which is used for writing out the
6641  * super blocks.
6642  */
6643 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6644 {
6645 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6646 	struct btrfs_device *curr, *next;
6647 
6648 	if (list_empty(&fs_devices->resized_devices))
6649 		return;
6650 
6651 	mutex_lock(&fs_devices->device_list_mutex);
6652 	lock_chunks(fs_info->dev_root);
6653 	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6654 				 resized_list) {
6655 		list_del_init(&curr->resized_list);
6656 		curr->commit_total_bytes = curr->disk_total_bytes;
6657 	}
6658 	unlock_chunks(fs_info->dev_root);
6659 	mutex_unlock(&fs_devices->device_list_mutex);
6660 }
6661 
6662 /* Must be invoked during the transaction commit */
6663 void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6664 					struct btrfs_transaction *transaction)
6665 {
6666 	struct extent_map *em;
6667 	struct map_lookup *map;
6668 	struct btrfs_device *dev;
6669 	int i;
6670 
6671 	if (list_empty(&transaction->pending_chunks))
6672 		return;
6673 
6674 	/* In order to kick the device replace finish process */
6675 	lock_chunks(root);
6676 	list_for_each_entry(em, &transaction->pending_chunks, list) {
6677 		map = (struct map_lookup *)em->bdev;
6678 
6679 		for (i = 0; i < map->num_stripes; i++) {
6680 			dev = map->stripes[i].dev;
6681 			dev->commit_bytes_used = dev->bytes_used;
6682 		}
6683 	}
6684 	unlock_chunks(root);
6685 }
6686