xref: /linux/fs/btrfs/volumes.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <asm/div64.h>
27 #include "compat.h"
28 #include "ctree.h"
29 #include "extent_map.h"
30 #include "disk-io.h"
31 #include "transaction.h"
32 #include "print-tree.h"
33 #include "volumes.h"
34 #include "async-thread.h"
35 
36 static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 				struct btrfs_root *root,
38 				struct btrfs_device *device);
39 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
40 
41 static DEFINE_MUTEX(uuid_mutex);
42 static LIST_HEAD(fs_uuids);
43 
44 static void lock_chunks(struct btrfs_root *root)
45 {
46 	mutex_lock(&root->fs_info->chunk_mutex);
47 }
48 
49 static void unlock_chunks(struct btrfs_root *root)
50 {
51 	mutex_unlock(&root->fs_info->chunk_mutex);
52 }
53 
54 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
55 {
56 	struct btrfs_device *device;
57 	WARN_ON(fs_devices->opened);
58 	while (!list_empty(&fs_devices->devices)) {
59 		device = list_entry(fs_devices->devices.next,
60 				    struct btrfs_device, dev_list);
61 		list_del(&device->dev_list);
62 		kfree(device->name);
63 		kfree(device);
64 	}
65 	kfree(fs_devices);
66 }
67 
68 int btrfs_cleanup_fs_uuids(void)
69 {
70 	struct btrfs_fs_devices *fs_devices;
71 
72 	while (!list_empty(&fs_uuids)) {
73 		fs_devices = list_entry(fs_uuids.next,
74 					struct btrfs_fs_devices, list);
75 		list_del(&fs_devices->list);
76 		free_fs_devices(fs_devices);
77 	}
78 	return 0;
79 }
80 
81 static noinline struct btrfs_device *__find_device(struct list_head *head,
82 						   u64 devid, u8 *uuid)
83 {
84 	struct btrfs_device *dev;
85 
86 	list_for_each_entry(dev, head, dev_list) {
87 		if (dev->devid == devid &&
88 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
89 			return dev;
90 		}
91 	}
92 	return NULL;
93 }
94 
95 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
96 {
97 	struct btrfs_fs_devices *fs_devices;
98 
99 	list_for_each_entry(fs_devices, &fs_uuids, list) {
100 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
101 			return fs_devices;
102 	}
103 	return NULL;
104 }
105 
106 static void requeue_list(struct btrfs_pending_bios *pending_bios,
107 			struct bio *head, struct bio *tail)
108 {
109 
110 	struct bio *old_head;
111 
112 	old_head = pending_bios->head;
113 	pending_bios->head = head;
114 	if (pending_bios->tail)
115 		tail->bi_next = old_head;
116 	else
117 		pending_bios->tail = tail;
118 }
119 
120 /*
121  * we try to collect pending bios for a device so we don't get a large
122  * number of procs sending bios down to the same device.  This greatly
123  * improves the scheduler's ability to collect and merge the bios.
124  *
125  * But, it also turns into a long list of bios to process and that is sure
126  * to eventually make the worker thread block.  The solution here is to
127  * make some progress and then put this work struct back at the end of
128  * the list if the block device is congested.  This way, multiple devices
129  * can make progress from a single worker thread.
130  */
131 static noinline int run_scheduled_bios(struct btrfs_device *device)
132 {
133 	struct bio *pending;
134 	struct backing_dev_info *bdi;
135 	struct btrfs_fs_info *fs_info;
136 	struct btrfs_pending_bios *pending_bios;
137 	struct bio *tail;
138 	struct bio *cur;
139 	int again = 0;
140 	unsigned long num_run;
141 	unsigned long batch_run = 0;
142 	unsigned long limit;
143 	unsigned long last_waited = 0;
144 	int force_reg = 0;
145 	int sync_pending = 0;
146 	struct blk_plug plug;
147 
148 	/*
149 	 * this function runs all the bios we've collected for
150 	 * a particular device.  We don't want to wander off to
151 	 * another device without first sending all of these down.
152 	 * So, set up a plug here and finish it off before we return
153 	 */
154 	blk_start_plug(&plug);
155 
156 	bdi = blk_get_backing_dev_info(device->bdev);
157 	fs_info = device->dev_root->fs_info;
158 	limit = btrfs_async_submit_limit(fs_info);
159 	limit = limit * 2 / 3;
160 
161 loop:
162 	spin_lock(&device->io_lock);
163 
164 loop_lock:
165 	num_run = 0;
166 
167 	/* take all the bios off the list at once and process them
168 	 * later on (without the lock held).  But, remember the
169 	 * tail and other pointers so the bios can be properly reinserted
170 	 * into the list if we hit congestion
171 	 */
172 	if (!force_reg && device->pending_sync_bios.head) {
173 		pending_bios = &device->pending_sync_bios;
174 		force_reg = 1;
175 	} else {
176 		pending_bios = &device->pending_bios;
177 		force_reg = 0;
178 	}
179 
180 	pending = pending_bios->head;
181 	tail = pending_bios->tail;
182 	WARN_ON(pending && !tail);
183 
184 	/*
185 	 * if pending was null this time around, no bios need processing
186 	 * at all and we can stop.  Otherwise it'll loop back up again
187 	 * and do an additional check so no bios are missed.
188 	 *
189 	 * device->running_pending is used to synchronize with the
190 	 * schedule_bio code.
191 	 */
192 	if (device->pending_sync_bios.head == NULL &&
193 	    device->pending_bios.head == NULL) {
194 		again = 0;
195 		device->running_pending = 0;
196 	} else {
197 		again = 1;
198 		device->running_pending = 1;
199 	}
200 
201 	pending_bios->head = NULL;
202 	pending_bios->tail = NULL;
203 
204 	spin_unlock(&device->io_lock);
205 
206 	while (pending) {
207 
208 		rmb();
209 		/* we want to work on both lists, but do more bios on the
210 		 * sync list than the regular list
211 		 */
212 		if ((num_run > 32 &&
213 		    pending_bios != &device->pending_sync_bios &&
214 		    device->pending_sync_bios.head) ||
215 		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
216 		    device->pending_bios.head)) {
217 			spin_lock(&device->io_lock);
218 			requeue_list(pending_bios, pending, tail);
219 			goto loop_lock;
220 		}
221 
222 		cur = pending;
223 		pending = pending->bi_next;
224 		cur->bi_next = NULL;
225 		atomic_dec(&fs_info->nr_async_bios);
226 
227 		if (atomic_read(&fs_info->nr_async_bios) < limit &&
228 		    waitqueue_active(&fs_info->async_submit_wait))
229 			wake_up(&fs_info->async_submit_wait);
230 
231 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
232 
233 		/*
234 		 * if we're doing the sync list, record that our
235 		 * plug has some sync requests on it
236 		 *
237 		 * If we're doing the regular list and there are
238 		 * sync requests sitting around, unplug before
239 		 * we add more
240 		 */
241 		if (pending_bios == &device->pending_sync_bios) {
242 			sync_pending = 1;
243 		} else if (sync_pending) {
244 			blk_finish_plug(&plug);
245 			blk_start_plug(&plug);
246 			sync_pending = 0;
247 		}
248 
249 		submit_bio(cur->bi_rw, cur);
250 		num_run++;
251 		batch_run++;
252 		if (need_resched())
253 			cond_resched();
254 
255 		/*
256 		 * we made progress, there is more work to do and the bdi
257 		 * is now congested.  Back off and let other work structs
258 		 * run instead
259 		 */
260 		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
261 		    fs_info->fs_devices->open_devices > 1) {
262 			struct io_context *ioc;
263 
264 			ioc = current->io_context;
265 
266 			/*
267 			 * the main goal here is that we don't want to
268 			 * block if we're going to be able to submit
269 			 * more requests without blocking.
270 			 *
271 			 * This code does two great things, it pokes into
272 			 * the elevator code from a filesystem _and_
273 			 * it makes assumptions about how batching works.
274 			 */
275 			if (ioc && ioc->nr_batch_requests > 0 &&
276 			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
277 			    (last_waited == 0 ||
278 			     ioc->last_waited == last_waited)) {
279 				/*
280 				 * we want to go through our batch of
281 				 * requests and stop.  So, we copy out
282 				 * the ioc->last_waited time and test
283 				 * against it before looping
284 				 */
285 				last_waited = ioc->last_waited;
286 				if (need_resched())
287 					cond_resched();
288 				continue;
289 			}
290 			spin_lock(&device->io_lock);
291 			requeue_list(pending_bios, pending, tail);
292 			device->running_pending = 1;
293 
294 			spin_unlock(&device->io_lock);
295 			btrfs_requeue_work(&device->work);
296 			goto done;
297 		}
298 	}
299 
300 	cond_resched();
301 	if (again)
302 		goto loop;
303 
304 	spin_lock(&device->io_lock);
305 	if (device->pending_bios.head || device->pending_sync_bios.head)
306 		goto loop_lock;
307 	spin_unlock(&device->io_lock);
308 
309 done:
310 	blk_finish_plug(&plug);
311 	return 0;
312 }
313 
314 static void pending_bios_fn(struct btrfs_work *work)
315 {
316 	struct btrfs_device *device;
317 
318 	device = container_of(work, struct btrfs_device, work);
319 	run_scheduled_bios(device);
320 }
321 
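/*
 * Record a device discovered by a scan: create a new btrfs_fs_devices
 * entry for an unknown fsid, add an unknown devid to an existing entry,
 * or simply refresh the stored path of a device we have seen before.
 * The newest generation seen so far is tracked in latest_trans/latest_devid.
 */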
322 static noinline int device_list_add(const char *path,
323 			   struct btrfs_super_block *disk_super,
324 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
325 {
326 	struct btrfs_device *device;
327 	struct btrfs_fs_devices *fs_devices;
328 	u64 found_transid = btrfs_super_generation(disk_super);
329 	char *name;
330 
331 	fs_devices = find_fsid(disk_super->fsid);
332 	if (!fs_devices) {
333 		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
334 		if (!fs_devices)
335 			return -ENOMEM;
336 		INIT_LIST_HEAD(&fs_devices->devices);
337 		INIT_LIST_HEAD(&fs_devices->alloc_list);
338 		list_add(&fs_devices->list, &fs_uuids);
339 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
340 		fs_devices->latest_devid = devid;
341 		fs_devices->latest_trans = found_transid;
342 		mutex_init(&fs_devices->device_list_mutex);
343 		device = NULL;
344 	} else {
345 		device = __find_device(&fs_devices->devices, devid,
346 				       disk_super->dev_item.uuid);
347 	}
348 	if (!device) {
349 		if (fs_devices->opened)
350 			return -EBUSY;
351 
352 		device = kzalloc(sizeof(*device), GFP_NOFS);
353 		if (!device) {
354 			/* we can safely leave the fs_devices entry around */
355 			return -ENOMEM;
356 		}
357 		device->devid = devid;
358 		device->work.func = pending_bios_fn;
359 		memcpy(device->uuid, disk_super->dev_item.uuid,
360 		       BTRFS_UUID_SIZE);
361 		spin_lock_init(&device->io_lock);
362 		device->name = kstrdup(path, GFP_NOFS);
363 		if (!device->name) {
364 			kfree(device);
365 			return -ENOMEM;
366 		}
367 		INIT_LIST_HEAD(&device->dev_alloc_list);
368 
369 		/* init readahead state */
370 		spin_lock_init(&device->reada_lock);
371 		device->reada_curr_zone = NULL;
372 		atomic_set(&device->reada_in_flight, 0);
373 		device->reada_next = 0;
374 		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376 
377 		mutex_lock(&fs_devices->device_list_mutex);
378 		list_add_rcu(&device->dev_list, &fs_devices->devices);
379 		mutex_unlock(&fs_devices->device_list_mutex);
380 
381 		device->fs_devices = fs_devices;
382 		fs_devices->num_devices++;
383 	} else if (!device->name || strcmp(device->name, path)) {
384 		name = kstrdup(path, GFP_NOFS);
385 		if (!name)
386 			return -ENOMEM;
387 		kfree(device->name);
388 		device->name = name;
389 		if (device->missing) {
390 			fs_devices->missing_devices--;
391 			device->missing = 0;
392 		}
393 	}
394 
395 	if (found_transid > fs_devices->latest_trans) {
396 		fs_devices->latest_devid = devid;
397 		fs_devices->latest_trans = found_transid;
398 	}
399 	*fs_devices_ret = fs_devices;
400 	return 0;
401 }
402 
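/*
 * Make an in-memory copy of an fs_devices list.  The copied devices keep
 * the same devid/uuid/name but are not opened (no bdev); used, for
 * example, by btrfs_prepare_sprout() below.
 */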
403 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
404 {
405 	struct btrfs_fs_devices *fs_devices;
406 	struct btrfs_device *device;
407 	struct btrfs_device *orig_dev;
408 
409 	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
410 	if (!fs_devices)
411 		return ERR_PTR(-ENOMEM);
412 
413 	INIT_LIST_HEAD(&fs_devices->devices);
414 	INIT_LIST_HEAD(&fs_devices->alloc_list);
415 	INIT_LIST_HEAD(&fs_devices->list);
416 	mutex_init(&fs_devices->device_list_mutex);
417 	fs_devices->latest_devid = orig->latest_devid;
418 	fs_devices->latest_trans = orig->latest_trans;
419 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
420 
421 	/* We hold the volume lock, so it is safe to walk the device list. */
422 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
423 		device = kzalloc(sizeof(*device), GFP_NOFS);
424 		if (!device)
425 			goto error;
426 
427 		device->name = kstrdup(orig_dev->name, GFP_NOFS);
428 		if (!device->name) {
429 			kfree(device);
430 			goto error;
431 		}
432 
433 		device->devid = orig_dev->devid;
434 		device->work.func = pending_bios_fn;
435 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
436 		spin_lock_init(&device->io_lock);
437 		INIT_LIST_HEAD(&device->dev_list);
438 		INIT_LIST_HEAD(&device->dev_alloc_list);
439 
440 		list_add(&device->dev_list, &fs_devices->devices);
441 		device->fs_devices = fs_devices;
442 		fs_devices->num_devices++;
443 	}
444 	return fs_devices;
445 error:
446 	free_fs_devices(fs_devices);
447 	return ERR_PTR(-ENOMEM);
448 }
449 
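/*
 * Drop every scanned device that the filesystem metadata does not
 * reference (!in_fs_metadata), closing its block device if it had been
 * opened.  Seed device lists are walked as well.
 */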
450 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
451 {
452 	struct btrfs_device *device, *next;
453 
454 	mutex_lock(&uuid_mutex);
455 again:
456 	/* This is the initialization path, so it is safe to release the devices. */
457 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
458 		if (device->in_fs_metadata)
459 			continue;
460 
461 		if (device->bdev) {
462 			blkdev_put(device->bdev, device->mode);
463 			device->bdev = NULL;
464 			fs_devices->open_devices--;
465 		}
466 		if (device->writeable) {
467 			list_del_init(&device->dev_alloc_list);
468 			device->writeable = 0;
469 			fs_devices->rw_devices--;
470 		}
471 		list_del_init(&device->dev_list);
472 		fs_devices->num_devices--;
473 		kfree(device->name);
474 		kfree(device);
475 	}
476 
477 	if (fs_devices->seed) {
478 		fs_devices = fs_devices->seed;
479 		goto again;
480 	}
481 
482 	mutex_unlock(&uuid_mutex);
483 	return 0;
484 }
485 
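/*
 * Device teardown happens in two stages: free_device() is the RCU
 * callback and only queues __free_device() on a workqueue, because
 * blkdev_put() may sleep and RCU callbacks must not.
 */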
486 static void __free_device(struct work_struct *work)
487 {
488 	struct btrfs_device *device;
489 
490 	device = container_of(work, struct btrfs_device, rcu_work);
491 
492 	if (device->bdev)
493 		blkdev_put(device->bdev, device->mode);
494 
495 	kfree(device->name);
496 	kfree(device);
497 }
498 
499 static void free_device(struct rcu_head *head)
500 {
501 	struct btrfs_device *device;
502 
503 	device = container_of(head, struct btrfs_device, rcu);
504 
505 	INIT_WORK(&device->rcu_work, __free_device);
506 	schedule_work(&device->rcu_work);
507 }
508 
509 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
510 {
511 	struct btrfs_device *device;
512 
513 	if (--fs_devices->opened > 0)
514 		return 0;
515 
516 	mutex_lock(&fs_devices->device_list_mutex);
517 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
518 		struct btrfs_device *new_device;
519 
520 		if (device->bdev)
521 			fs_devices->open_devices--;
522 
523 		if (device->writeable) {
524 			list_del_init(&device->dev_alloc_list);
525 			fs_devices->rw_devices--;
526 		}
527 
528 		if (device->can_discard)
529 			fs_devices->num_can_discard--;
530 
531 		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
532 		BUG_ON(!new_device);
533 		memcpy(new_device, device, sizeof(*new_device));
534 		new_device->name = kstrdup(device->name, GFP_NOFS);
535 		BUG_ON(device->name && !new_device->name);
536 		new_device->bdev = NULL;
537 		new_device->writeable = 0;
538 		new_device->in_fs_metadata = 0;
539 		new_device->can_discard = 0;
540 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
541 
542 		call_rcu(&device->rcu, free_device);
543 	}
544 	mutex_unlock(&fs_devices->device_list_mutex);
545 
546 	WARN_ON(fs_devices->open_devices);
547 	WARN_ON(fs_devices->rw_devices);
548 	fs_devices->opened = 0;
549 	fs_devices->seeding = 0;
550 
551 	return 0;
552 }
553 
554 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 {
556 	struct btrfs_fs_devices *seed_devices = NULL;
557 	int ret;
558 
559 	mutex_lock(&uuid_mutex);
560 	ret = __btrfs_close_devices(fs_devices);
561 	if (!fs_devices->opened) {
562 		seed_devices = fs_devices->seed;
563 		fs_devices->seed = NULL;
564 	}
565 	mutex_unlock(&uuid_mutex);
566 
567 	while (seed_devices) {
568 		fs_devices = seed_devices;
569 		seed_devices = fs_devices->seed;
570 		__btrfs_close_devices(fs_devices);
571 		free_fs_devices(fs_devices);
572 	}
573 	return ret;
574 }
575 
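/*
 * Open every scanned device of an fs_devices set, verify its devid/uuid
 * against the super block on disk, and remember the device with the
 * highest generation as latest_bdev.  Devices that fail to open or to
 * validate are simply skipped.
 */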
576 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
577 				fmode_t flags, void *holder)
578 {
579 	struct request_queue *q;
580 	struct block_device *bdev;
581 	struct list_head *head = &fs_devices->devices;
582 	struct btrfs_device *device;
583 	struct block_device *latest_bdev = NULL;
584 	struct buffer_head *bh;
585 	struct btrfs_super_block *disk_super;
586 	u64 latest_devid = 0;
587 	u64 latest_transid = 0;
588 	u64 devid;
589 	int seeding = 1;
590 	int ret = 0;
591 
592 	flags |= FMODE_EXCL;
593 
594 	list_for_each_entry(device, head, dev_list) {
595 		if (device->bdev)
596 			continue;
597 		if (!device->name)
598 			continue;
599 
600 		bdev = blkdev_get_by_path(device->name, flags, holder);
601 		if (IS_ERR(bdev)) {
602 			printk(KERN_INFO "open %s failed\n", device->name);
603 			goto error;
604 		}
605 		set_blocksize(bdev, 4096);
606 
607 		bh = btrfs_read_dev_super(bdev);
608 		if (!bh)
609 			goto error_close;
610 
611 		disk_super = (struct btrfs_super_block *)bh->b_data;
612 		devid = btrfs_stack_device_id(&disk_super->dev_item);
613 		if (devid != device->devid)
614 			goto error_brelse;
615 
616 		if (memcmp(device->uuid, disk_super->dev_item.uuid,
617 			   BTRFS_UUID_SIZE))
618 			goto error_brelse;
619 
620 		device->generation = btrfs_super_generation(disk_super);
621 		if (!latest_transid || device->generation > latest_transid) {
622 			latest_devid = devid;
623 			latest_transid = device->generation;
624 			latest_bdev = bdev;
625 		}
626 
627 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
628 			device->writeable = 0;
629 		} else {
630 			device->writeable = !bdev_read_only(bdev);
631 			seeding = 0;
632 		}
633 
634 		q = bdev_get_queue(bdev);
635 		if (blk_queue_discard(q)) {
636 			device->can_discard = 1;
637 			fs_devices->num_can_discard++;
638 		}
639 
640 		device->bdev = bdev;
641 		device->in_fs_metadata = 0;
642 		device->mode = flags;
643 
644 		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
645 			fs_devices->rotating = 1;
646 
647 		fs_devices->open_devices++;
648 		if (device->writeable) {
649 			fs_devices->rw_devices++;
650 			list_add(&device->dev_alloc_list,
651 				 &fs_devices->alloc_list);
652 		}
653 		brelse(bh);
654 		continue;
655 
656 error_brelse:
657 		brelse(bh);
658 error_close:
659 		blkdev_put(bdev, flags);
660 error:
661 		continue;
662 	}
663 	if (fs_devices->open_devices == 0) {
664 		ret = -EINVAL;
665 		goto out;
666 	}
667 	fs_devices->seeding = seeding;
668 	fs_devices->opened = 1;
669 	fs_devices->latest_bdev = latest_bdev;
670 	fs_devices->latest_devid = latest_devid;
671 	fs_devices->latest_trans = latest_transid;
672 	fs_devices->total_rw_bytes = 0;
673 out:
674 	return ret;
675 }
676 
677 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
678 		       fmode_t flags, void *holder)
679 {
680 	int ret;
681 
682 	mutex_lock(&uuid_mutex);
683 	if (fs_devices->opened) {
684 		fs_devices->opened++;
685 		ret = 0;
686 	} else {
687 		ret = __btrfs_open_devices(fs_devices, flags, holder);
688 	}
689 	mutex_unlock(&uuid_mutex);
690 	return ret;
691 }
692 
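/*
 * Read the super block of a single device, print its label/fsid and
 * register it via device_list_add().  The device is closed again before
 * returning; only the in-memory bookkeeping is kept.
 */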
693 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 			  struct btrfs_fs_devices **fs_devices_ret)
695 {
696 	struct btrfs_super_block *disk_super;
697 	struct block_device *bdev;
698 	struct buffer_head *bh;
699 	int ret;
700 	u64 devid;
701 	u64 transid;
702 
703 	mutex_lock(&uuid_mutex);
704 
705 	flags |= FMODE_EXCL;
706 	bdev = blkdev_get_by_path(path, flags, holder);
707 
708 	if (IS_ERR(bdev)) {
709 		ret = PTR_ERR(bdev);
710 		goto error;
711 	}
712 
713 	ret = set_blocksize(bdev, 4096);
714 	if (ret)
715 		goto error_close;
716 	bh = btrfs_read_dev_super(bdev);
717 	if (!bh) {
718 		ret = -EINVAL;
719 		goto error_close;
720 	}
721 	disk_super = (struct btrfs_super_block *)bh->b_data;
722 	devid = btrfs_stack_device_id(&disk_super->dev_item);
723 	transid = btrfs_super_generation(disk_super);
724 	if (disk_super->label[0])
725 		printk(KERN_INFO "device label %s ", disk_super->label);
726 	else
727 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
728 	printk(KERN_CONT "devid %llu transid %llu %s\n",
729 	       (unsigned long long)devid, (unsigned long long)transid, path);
730 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
731 
732 	brelse(bh);
733 error_close:
734 	blkdev_put(bdev, flags);
735 error:
736 	mutex_unlock(&uuid_mutex);
737 	return ret;
738 }
739 
740 /* helper to account the used device space in the range */
741 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
742 				   u64 end, u64 *length)
743 {
744 	struct btrfs_key key;
745 	struct btrfs_root *root = device->dev_root;
746 	struct btrfs_dev_extent *dev_extent;
747 	struct btrfs_path *path;
748 	u64 extent_end;
749 	int ret;
750 	int slot;
751 	struct extent_buffer *l;
752 
753 	*length = 0;
754 
755 	if (start >= device->total_bytes)
756 		return 0;
757 
758 	path = btrfs_alloc_path();
759 	if (!path)
760 		return -ENOMEM;
761 	path->reada = 2;
762 
763 	key.objectid = device->devid;
764 	key.offset = start;
765 	key.type = BTRFS_DEV_EXTENT_KEY;
766 
767 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
768 	if (ret < 0)
769 		goto out;
770 	if (ret > 0) {
771 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
772 		if (ret < 0)
773 			goto out;
774 	}
775 
776 	while (1) {
777 		l = path->nodes[0];
778 		slot = path->slots[0];
779 		if (slot >= btrfs_header_nritems(l)) {
780 			ret = btrfs_next_leaf(root, path);
781 			if (ret == 0)
782 				continue;
783 			if (ret < 0)
784 				goto out;
785 
786 			break;
787 		}
788 		btrfs_item_key_to_cpu(l, &key, slot);
789 
790 		if (key.objectid < device->devid)
791 			goto next;
792 
793 		if (key.objectid > device->devid)
794 			break;
795 
796 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
797 			goto next;
798 
799 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
800 		extent_end = key.offset + btrfs_dev_extent_length(l,
801 								  dev_extent);
802 		if (key.offset <= start && extent_end > end) {
803 			*length = end - start + 1;
804 			break;
805 		} else if (key.offset <= start && extent_end > start)
806 			*length += extent_end - start;
807 		else if (key.offset > start && extent_end <= end)
808 			*length += extent_end - key.offset;
809 		else if (key.offset > start && key.offset <= end) {
810 			*length += end - key.offset + 1;
811 			break;
812 		} else if (key.offset > end)
813 			break;
814 
815 next:
816 		path->slots[0]++;
817 	}
818 	ret = 0;
819 out:
820 	btrfs_free_path(path);
821 	return ret;
822 }
823 
824 /*
825  * find_free_dev_extent - find free space in the specified device
826  * @trans:	transaction handle
827  * @device:	the device which we search the free space in
828  * @num_bytes:	the size of the free space that we need
829  * @start:	store the start of the free space.
830  * @len:	the size of the free space that we find, or the size of the max
831  * 		free space if we don't find suitable free space
832  *
833  * this uses a pretty simple search, the expectation is that it is
834  * called very infrequently and that a given device has a small number
835  * of extents
836  *
837  * @start is used to store the start of the free space if we find it. But if we
838  * don't find suitable free space, it will be used to store the start position
839  * of the max free space.
840  *
841  * @len is used to store the size of the free space that we find.
842  * But if we don't find suitable free space, it is used to store the size of
843  * the max free space.
844  */
845 int find_free_dev_extent(struct btrfs_trans_handle *trans,
846 			 struct btrfs_device *device, u64 num_bytes,
847 			 u64 *start, u64 *len)
848 {
849 	struct btrfs_key key;
850 	struct btrfs_root *root = device->dev_root;
851 	struct btrfs_dev_extent *dev_extent;
852 	struct btrfs_path *path;
853 	u64 hole_size;
854 	u64 max_hole_start;
855 	u64 max_hole_size;
856 	u64 extent_end;
857 	u64 search_start;
858 	u64 search_end = device->total_bytes;
859 	int ret;
860 	int slot;
861 	struct extent_buffer *l;
862 
863 	/* FIXME use last free of some kind */
864 
865 	/* we don't want to overwrite the superblock on the drive,
866 	 * so we make sure to start at an offset of at least 1MB
867 	 */
868 	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
869 
870 	max_hole_start = search_start;
871 	max_hole_size = 0;
872 	hole_size = 0;
873 
874 	if (search_start >= search_end) {
875 		ret = -ENOSPC;
876 		goto error;
877 	}
878 
879 	path = btrfs_alloc_path();
880 	if (!path) {
881 		ret = -ENOMEM;
882 		goto error;
883 	}
884 	path->reada = 2;
885 
886 	key.objectid = device->devid;
887 	key.offset = search_start;
888 	key.type = BTRFS_DEV_EXTENT_KEY;
889 
890 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
891 	if (ret < 0)
892 		goto out;
893 	if (ret > 0) {
894 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
895 		if (ret < 0)
896 			goto out;
897 	}
898 
899 	while (1) {
900 		l = path->nodes[0];
901 		slot = path->slots[0];
902 		if (slot >= btrfs_header_nritems(l)) {
903 			ret = btrfs_next_leaf(root, path);
904 			if (ret == 0)
905 				continue;
906 			if (ret < 0)
907 				goto out;
908 
909 			break;
910 		}
911 		btrfs_item_key_to_cpu(l, &key, slot);
912 
913 		if (key.objectid < device->devid)
914 			goto next;
915 
916 		if (key.objectid > device->devid)
917 			break;
918 
919 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
920 			goto next;
921 
922 		if (key.offset > search_start) {
923 			hole_size = key.offset - search_start;
924 
925 			if (hole_size > max_hole_size) {
926 				max_hole_start = search_start;
927 				max_hole_size = hole_size;
928 			}
929 
930 			/*
931 			 * If this free space is greater than what we need,
932 			 * it must be the max free space that we have found
933 			 * until now, so max_hole_start must point to the start
934 			 * of this free space and the length of this free space
935 			 * is stored in max_hole_size. Thus, we return
936 			 * max_hole_start and max_hole_size and go back to the
937 			 * caller.
938 			 */
939 			if (hole_size >= num_bytes) {
940 				ret = 0;
941 				goto out;
942 			}
943 		}
944 
945 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
946 		extent_end = key.offset + btrfs_dev_extent_length(l,
947 								  dev_extent);
948 		if (extent_end > search_start)
949 			search_start = extent_end;
950 next:
951 		path->slots[0]++;
952 		cond_resched();
953 	}
954 
955 	/*
956 	 * At this point, search_start should be the end of
957 	 * allocated dev extents, and when shrinking the device,
958 	 * search_end may be smaller than search_start.
959 	 */
960 	if (search_end > search_start)
961 		hole_size = search_end - search_start;
962 
963 	if (hole_size > max_hole_size) {
964 		max_hole_start = search_start;
965 		max_hole_size = hole_size;
966 	}
967 
968 	/* See above. */
969 	if (hole_size < num_bytes)
970 		ret = -ENOSPC;
971 	else
972 		ret = 0;
973 
974 out:
975 	btrfs_free_path(path);
976 error:
977 	*start = max_hole_start;
978 	if (len)
979 		*len = max_hole_size;
980 	return ret;
981 }
982 
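/*
 * Remove the dev extent item that covers @start on @device and give its
 * length back to the bytes_used/free_chunk_space accounting.
 */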
983 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
984 			  struct btrfs_device *device,
985 			  u64 start)
986 {
987 	int ret;
988 	struct btrfs_path *path;
989 	struct btrfs_root *root = device->dev_root;
990 	struct btrfs_key key;
991 	struct btrfs_key found_key;
992 	struct extent_buffer *leaf = NULL;
993 	struct btrfs_dev_extent *extent = NULL;
994 
995 	path = btrfs_alloc_path();
996 	if (!path)
997 		return -ENOMEM;
998 
999 	key.objectid = device->devid;
1000 	key.offset = start;
1001 	key.type = BTRFS_DEV_EXTENT_KEY;
1002 again:
1003 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1004 	if (ret > 0) {
1005 		ret = btrfs_previous_item(root, path, key.objectid,
1006 					  BTRFS_DEV_EXTENT_KEY);
1007 		if (ret)
1008 			goto out;
1009 		leaf = path->nodes[0];
1010 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1011 		extent = btrfs_item_ptr(leaf, path->slots[0],
1012 					struct btrfs_dev_extent);
1013 		BUG_ON(found_key.offset > start || found_key.offset +
1014 		       btrfs_dev_extent_length(leaf, extent) < start);
1015 		key = found_key;
1016 		btrfs_release_path(path);
1017 		goto again;
1018 	} else if (ret == 0) {
1019 		leaf = path->nodes[0];
1020 		extent = btrfs_item_ptr(leaf, path->slots[0],
1021 					struct btrfs_dev_extent);
1022 	}
1023 	BUG_ON(ret);
1024 
1025 	if (device->bytes_used > 0) {
1026 		u64 len = btrfs_dev_extent_length(leaf, extent);
1027 		device->bytes_used -= len;
1028 		spin_lock(&root->fs_info->free_chunk_lock);
1029 		root->fs_info->free_chunk_space += len;
1030 		spin_unlock(&root->fs_info->free_chunk_lock);
1031 	}
1032 	ret = btrfs_del_item(trans, root, path);
1033 
1034 out:
1035 	btrfs_free_path(path);
1036 	return ret;
1037 }
1038 
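/*
 * Insert a dev extent item describing the [start, start + num_bytes)
 * range on @device that backs the given chunk.
 */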
1039 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1040 			   struct btrfs_device *device,
1041 			   u64 chunk_tree, u64 chunk_objectid,
1042 			   u64 chunk_offset, u64 start, u64 num_bytes)
1043 {
1044 	int ret;
1045 	struct btrfs_path *path;
1046 	struct btrfs_root *root = device->dev_root;
1047 	struct btrfs_dev_extent *extent;
1048 	struct extent_buffer *leaf;
1049 	struct btrfs_key key;
1050 
1051 	WARN_ON(!device->in_fs_metadata);
1052 	path = btrfs_alloc_path();
1053 	if (!path)
1054 		return -ENOMEM;
1055 
1056 	key.objectid = device->devid;
1057 	key.offset = start;
1058 	key.type = BTRFS_DEV_EXTENT_KEY;
1059 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1060 				      sizeof(*extent));
1061 	BUG_ON(ret);
1062 
1063 	leaf = path->nodes[0];
1064 	extent = btrfs_item_ptr(leaf, path->slots[0],
1065 				struct btrfs_dev_extent);
1066 	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1067 	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1068 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1069 
1070 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1071 		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1072 		    BTRFS_UUID_SIZE);
1073 
1074 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1075 	btrfs_mark_buffer_dirty(leaf);
1076 	btrfs_free_path(path);
1077 	return ret;
1078 }
1079 
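/*
 * Return in @offset the logical offset just past the last chunk item for
 * @objectid, i.e. where the next chunk can be placed.
 */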
1080 static noinline int find_next_chunk(struct btrfs_root *root,
1081 				    u64 objectid, u64 *offset)
1082 {
1083 	struct btrfs_path *path;
1084 	int ret;
1085 	struct btrfs_key key;
1086 	struct btrfs_chunk *chunk;
1087 	struct btrfs_key found_key;
1088 
1089 	path = btrfs_alloc_path();
1090 	if (!path)
1091 		return -ENOMEM;
1092 
1093 	key.objectid = objectid;
1094 	key.offset = (u64)-1;
1095 	key.type = BTRFS_CHUNK_ITEM_KEY;
1096 
1097 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1098 	if (ret < 0)
1099 		goto error;
1100 
1101 	BUG_ON(ret == 0);
1102 
1103 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1104 	if (ret) {
1105 		*offset = 0;
1106 	} else {
1107 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1108 				      path->slots[0]);
1109 		if (found_key.objectid != objectid)
1110 			*offset = 0;
1111 		else {
1112 			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1113 					       struct btrfs_chunk);
1114 			*offset = found_key.offset +
1115 				btrfs_chunk_length(path->nodes[0], chunk);
1116 		}
1117 	}
1118 	ret = 0;
1119 error:
1120 	btrfs_free_path(path);
1121 	return ret;
1122 }
1123 
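/*
 * Find the next free device id by looking at the last dev item in the
 * chunk tree; device ids start at 1.
 */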
1124 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1125 {
1126 	int ret;
1127 	struct btrfs_key key;
1128 	struct btrfs_key found_key;
1129 	struct btrfs_path *path;
1130 
1131 	root = root->fs_info->chunk_root;
1132 
1133 	path = btrfs_alloc_path();
1134 	if (!path)
1135 		return -ENOMEM;
1136 
1137 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1138 	key.type = BTRFS_DEV_ITEM_KEY;
1139 	key.offset = (u64)-1;
1140 
1141 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1142 	if (ret < 0)
1143 		goto error;
1144 
1145 	BUG_ON(ret == 0);
1146 
1147 	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1148 				  BTRFS_DEV_ITEM_KEY);
1149 	if (ret) {
1150 		*objectid = 1;
1151 	} else {
1152 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1153 				      path->slots[0]);
1154 		*objectid = found_key.offset + 1;
1155 	}
1156 	ret = 0;
1157 error:
1158 	btrfs_free_path(path);
1159 	return ret;
1160 }
1161 
1162 /*
1163  * the device information is stored in the chunk root
1164  * the btrfs_device struct should be fully filled in
1165  */
1166 int btrfs_add_device(struct btrfs_trans_handle *trans,
1167 		     struct btrfs_root *root,
1168 		     struct btrfs_device *device)
1169 {
1170 	int ret;
1171 	struct btrfs_path *path;
1172 	struct btrfs_dev_item *dev_item;
1173 	struct extent_buffer *leaf;
1174 	struct btrfs_key key;
1175 	unsigned long ptr;
1176 
1177 	root = root->fs_info->chunk_root;
1178 
1179 	path = btrfs_alloc_path();
1180 	if (!path)
1181 		return -ENOMEM;
1182 
1183 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1184 	key.type = BTRFS_DEV_ITEM_KEY;
1185 	key.offset = device->devid;
1186 
1187 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1188 				      sizeof(*dev_item));
1189 	if (ret)
1190 		goto out;
1191 
1192 	leaf = path->nodes[0];
1193 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1194 
1195 	btrfs_set_device_id(leaf, dev_item, device->devid);
1196 	btrfs_set_device_generation(leaf, dev_item, 0);
1197 	btrfs_set_device_type(leaf, dev_item, device->type);
1198 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1199 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1200 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1201 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1202 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1203 	btrfs_set_device_group(leaf, dev_item, 0);
1204 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1205 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1206 	btrfs_set_device_start_offset(leaf, dev_item, 0);
1207 
1208 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
1209 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1210 	ptr = (unsigned long)btrfs_device_fsid(dev_item);
1211 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1212 	btrfs_mark_buffer_dirty(leaf);
1213 
1214 	ret = 0;
1215 out:
1216 	btrfs_free_path(path);
1217 	return ret;
1218 }
1219 
1220 static int btrfs_rm_dev_item(struct btrfs_root *root,
1221 			     struct btrfs_device *device)
1222 {
1223 	int ret;
1224 	struct btrfs_path *path;
1225 	struct btrfs_key key;
1226 	struct btrfs_trans_handle *trans;
1227 
1228 	root = root->fs_info->chunk_root;
1229 
1230 	path = btrfs_alloc_path();
1231 	if (!path)
1232 		return -ENOMEM;
1233 
1234 	trans = btrfs_start_transaction(root, 0);
1235 	if (IS_ERR(trans)) {
1236 		btrfs_free_path(path);
1237 		return PTR_ERR(trans);
1238 	}
1239 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1240 	key.type = BTRFS_DEV_ITEM_KEY;
1241 	key.offset = device->devid;
1242 	lock_chunks(root);
1243 
1244 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1245 	if (ret < 0)
1246 		goto out;
1247 
1248 	if (ret > 0) {
1249 		ret = -ENOENT;
1250 		goto out;
1251 	}
1252 
1253 	ret = btrfs_del_item(trans, root, path);
1254 	if (ret)
1255 		goto out;
1256 out:
1257 	btrfs_free_path(path);
1258 	unlock_chunks(root);
1259 	btrfs_commit_transaction(trans, root);
1260 	return ret;
1261 }
1262 
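/*
 * Remove a device from a mounted filesystem ("btrfs device delete"):
 * relocate all of its chunks by shrinking it to zero, delete its dev
 * item, drop it from the in-memory lists and, when it was writeable,
 * wipe the btrfs magic in its super block so it is no longer detected
 * as part of the fs.  The string "missing" selects a device that is
 * present in the metadata but has no block device.
 */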
1263 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1264 {
1265 	struct btrfs_device *device;
1266 	struct btrfs_device *next_device;
1267 	struct block_device *bdev;
1268 	struct buffer_head *bh = NULL;
1269 	struct btrfs_super_block *disk_super;
1270 	struct btrfs_fs_devices *cur_devices;
1271 	u64 all_avail;
1272 	u64 devid;
1273 	u64 num_devices;
1274 	u8 *dev_uuid;
1275 	int ret = 0;
1276 	bool clear_super = false;
1277 
1278 	mutex_lock(&uuid_mutex);
1279 	mutex_lock(&root->fs_info->volume_mutex);
1280 
1281 	all_avail = root->fs_info->avail_data_alloc_bits |
1282 		root->fs_info->avail_system_alloc_bits |
1283 		root->fs_info->avail_metadata_alloc_bits;
1284 
1285 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1286 	    root->fs_info->fs_devices->num_devices <= 4) {
1287 		printk(KERN_ERR "btrfs: unable to go below four devices "
1288 		       "on raid10\n");
1289 		ret = -EINVAL;
1290 		goto out;
1291 	}
1292 
1293 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1294 	    root->fs_info->fs_devices->num_devices <= 2) {
1295 		printk(KERN_ERR "btrfs: unable to go below two "
1296 		       "devices on raid1\n");
1297 		ret = -EINVAL;
1298 		goto out;
1299 	}
1300 
1301 	if (strcmp(device_path, "missing") == 0) {
1302 		struct list_head *devices;
1303 		struct btrfs_device *tmp;
1304 
1305 		device = NULL;
1306 		devices = &root->fs_info->fs_devices->devices;
1307 		/*
1308 		 * It is safe to read the devices since the volume_mutex
1309 		 * is held.
1310 		 */
1311 		list_for_each_entry(tmp, devices, dev_list) {
1312 			if (tmp->in_fs_metadata && !tmp->bdev) {
1313 				device = tmp;
1314 				break;
1315 			}
1316 		}
1317 		bdev = NULL;
1318 		bh = NULL;
1319 		disk_super = NULL;
1320 		if (!device) {
1321 			printk(KERN_ERR "btrfs: no missing devices found to "
1322 			       "remove\n");
1323 			goto out;
1324 		}
1325 	} else {
1326 		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1327 					  root->fs_info->bdev_holder);
1328 		if (IS_ERR(bdev)) {
1329 			ret = PTR_ERR(bdev);
1330 			goto out;
1331 		}
1332 
1333 		set_blocksize(bdev, 4096);
1334 		bh = btrfs_read_dev_super(bdev);
1335 		if (!bh) {
1336 			ret = -EINVAL;
1337 			goto error_close;
1338 		}
1339 		disk_super = (struct btrfs_super_block *)bh->b_data;
1340 		devid = btrfs_stack_device_id(&disk_super->dev_item);
1341 		dev_uuid = disk_super->dev_item.uuid;
1342 		device = btrfs_find_device(root, devid, dev_uuid,
1343 					   disk_super->fsid);
1344 		if (!device) {
1345 			ret = -ENOENT;
1346 			goto error_brelse;
1347 		}
1348 	}
1349 
1350 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1351 		printk(KERN_ERR "btrfs: unable to remove the only writeable "
1352 		       "device\n");
1353 		ret = -EINVAL;
1354 		goto error_brelse;
1355 	}
1356 
1357 	if (device->writeable) {
1358 		lock_chunks(root);
1359 		list_del_init(&device->dev_alloc_list);
1360 		unlock_chunks(root);
1361 		root->fs_info->fs_devices->rw_devices--;
1362 		clear_super = true;
1363 	}
1364 
1365 	ret = btrfs_shrink_device(device, 0);
1366 	if (ret)
1367 		goto error_undo;
1368 
1369 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1370 	if (ret)
1371 		goto error_undo;
1372 
1373 	spin_lock(&root->fs_info->free_chunk_lock);
1374 	root->fs_info->free_chunk_space = device->total_bytes -
1375 		device->bytes_used;
1376 	spin_unlock(&root->fs_info->free_chunk_lock);
1377 
1378 	device->in_fs_metadata = 0;
1379 	btrfs_scrub_cancel_dev(root, device);
1380 
1381 	/*
1382 	 * the device list mutex makes sure that we don't change
1383 	 * the device list while someone else is writing out all
1384 	 * the device supers.
1385 	 */
1386 
1387 	cur_devices = device->fs_devices;
1388 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1389 	list_del_rcu(&device->dev_list);
1390 
1391 	device->fs_devices->num_devices--;
1392 
1393 	if (device->missing)
1394 		root->fs_info->fs_devices->missing_devices--;
1395 
1396 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
1397 				 struct btrfs_device, dev_list);
1398 	if (device->bdev == root->fs_info->sb->s_bdev)
1399 		root->fs_info->sb->s_bdev = next_device->bdev;
1400 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1401 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1402 
1403 	if (device->bdev)
1404 		device->fs_devices->open_devices--;
1405 
1406 	call_rcu(&device->rcu, free_device);
1407 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1408 
1409 	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1410 	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1411 
1412 	if (cur_devices->open_devices == 0) {
1413 		struct btrfs_fs_devices *fs_devices;
1414 		fs_devices = root->fs_info->fs_devices;
1415 		while (fs_devices) {
1416 			if (fs_devices->seed == cur_devices)
1417 				break;
1418 			fs_devices = fs_devices->seed;
1419 		}
1420 		fs_devices->seed = cur_devices->seed;
1421 		cur_devices->seed = NULL;
1422 		lock_chunks(root);
1423 		__btrfs_close_devices(cur_devices);
1424 		unlock_chunks(root);
1425 		free_fs_devices(cur_devices);
1426 	}
1427 
1428 	/*
1429 	 * at this point, the device is zero sized.  We want to
1430 	 * remove it from the devices list and zero out the old super
1431 	 */
1432 	if (clear_super) {
1433 		/* make sure this device isn't detected as part of
1434 		 * the FS anymore
1435 		 */
1436 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1437 		set_buffer_dirty(bh);
1438 		sync_dirty_buffer(bh);
1439 	}
1440 
1441 	ret = 0;
1442 
1443 error_brelse:
1444 	brelse(bh);
1445 error_close:
1446 	if (bdev)
1447 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1448 out:
1449 	mutex_unlock(&root->fs_info->volume_mutex);
1450 	mutex_unlock(&uuid_mutex);
1451 	return ret;
1452 error_undo:
1453 	if (device->writeable) {
1454 		lock_chunks(root);
1455 		list_add(&device->dev_alloc_list,
1456 			 &root->fs_info->fs_devices->alloc_list);
1457 		unlock_chunks(root);
1458 		root->fs_info->fs_devices->rw_devices++;
1459 	}
1460 	goto error_brelse;
1461 }
1462 
1463 /*
1464  * does all the dirty work required for changing the file system's UUID.
1465  */
1466 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1467 				struct btrfs_root *root)
1468 {
1469 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1470 	struct btrfs_fs_devices *old_devices;
1471 	struct btrfs_fs_devices *seed_devices;
1472 	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1473 	struct btrfs_device *device;
1474 	u64 super_flags;
1475 
1476 	BUG_ON(!mutex_is_locked(&uuid_mutex));
1477 	if (!fs_devices->seeding)
1478 		return -EINVAL;
1479 
1480 	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1481 	if (!seed_devices)
1482 		return -ENOMEM;
1483 
1484 	old_devices = clone_fs_devices(fs_devices);
1485 	if (IS_ERR(old_devices)) {
1486 		kfree(seed_devices);
1487 		return PTR_ERR(old_devices);
1488 	}
1489 
1490 	list_add(&old_devices->list, &fs_uuids);
1491 
1492 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1493 	seed_devices->opened = 1;
1494 	INIT_LIST_HEAD(&seed_devices->devices);
1495 	INIT_LIST_HEAD(&seed_devices->alloc_list);
1496 	mutex_init(&seed_devices->device_list_mutex);
1497 
1498 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1499 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1500 			      synchronize_rcu);
1501 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1502 
1503 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1504 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
1505 		device->fs_devices = seed_devices;
1506 	}
1507 
1508 	fs_devices->seeding = 0;
1509 	fs_devices->num_devices = 0;
1510 	fs_devices->open_devices = 0;
1511 	fs_devices->seed = seed_devices;
1512 
1513 	generate_random_uuid(fs_devices->fsid);
1514 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1515 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1516 	super_flags = btrfs_super_flags(disk_super) &
1517 		      ~BTRFS_SUPER_FLAG_SEEDING;
1518 	btrfs_set_super_flags(disk_super, super_flags);
1519 
1520 	return 0;
1521 }
1522 
1523 /*
1524  * store the expected generation for seed devices in device items.
1525  */
1526 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1527 			       struct btrfs_root *root)
1528 {
1529 	struct btrfs_path *path;
1530 	struct extent_buffer *leaf;
1531 	struct btrfs_dev_item *dev_item;
1532 	struct btrfs_device *device;
1533 	struct btrfs_key key;
1534 	u8 fs_uuid[BTRFS_UUID_SIZE];
1535 	u8 dev_uuid[BTRFS_UUID_SIZE];
1536 	u64 devid;
1537 	int ret;
1538 
1539 	path = btrfs_alloc_path();
1540 	if (!path)
1541 		return -ENOMEM;
1542 
1543 	root = root->fs_info->chunk_root;
1544 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1545 	key.offset = 0;
1546 	key.type = BTRFS_DEV_ITEM_KEY;
1547 
1548 	while (1) {
1549 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1550 		if (ret < 0)
1551 			goto error;
1552 
1553 		leaf = path->nodes[0];
1554 next_slot:
1555 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1556 			ret = btrfs_next_leaf(root, path);
1557 			if (ret > 0)
1558 				break;
1559 			if (ret < 0)
1560 				goto error;
1561 			leaf = path->nodes[0];
1562 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1563 			btrfs_release_path(path);
1564 			continue;
1565 		}
1566 
1567 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1568 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1569 		    key.type != BTRFS_DEV_ITEM_KEY)
1570 			break;
1571 
1572 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
1573 					  struct btrfs_dev_item);
1574 		devid = btrfs_device_id(leaf, dev_item);
1575 		read_extent_buffer(leaf, dev_uuid,
1576 				   (unsigned long)btrfs_device_uuid(dev_item),
1577 				   BTRFS_UUID_SIZE);
1578 		read_extent_buffer(leaf, fs_uuid,
1579 				   (unsigned long)btrfs_device_fsid(dev_item),
1580 				   BTRFS_UUID_SIZE);
1581 		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1582 		BUG_ON(!device);
1583 
1584 		if (device->fs_devices->seeding) {
1585 			btrfs_set_device_generation(leaf, dev_item,
1586 						    device->generation);
1587 			btrfs_mark_buffer_dirty(leaf);
1588 		}
1589 
1590 		path->slots[0]++;
1591 		goto next_slot;
1592 	}
1593 	ret = 0;
1594 error:
1595 	btrfs_free_path(path);
1596 	return ret;
1597 }
1598 
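/*
 * Add a new device to a mounted filesystem ("btrfs device add"): allocate
 * a devid, set up the in-memory btrfs_device and, for a seeding
 * filesystem, sprout a new fsid and relocate the system chunks once the
 * transaction commits; otherwise just insert the new dev item.
 */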
1599 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1600 {
1601 	struct request_queue *q;
1602 	struct btrfs_trans_handle *trans;
1603 	struct btrfs_device *device;
1604 	struct block_device *bdev;
1605 	struct list_head *devices;
1606 	struct super_block *sb = root->fs_info->sb;
1607 	u64 total_bytes;
1608 	int seeding_dev = 0;
1609 	int ret = 0;
1610 
1611 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1612 		return -EINVAL;
1613 
1614 	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1615 				  root->fs_info->bdev_holder);
1616 	if (IS_ERR(bdev))
1617 		return PTR_ERR(bdev);
1618 
1619 	if (root->fs_info->fs_devices->seeding) {
1620 		seeding_dev = 1;
1621 		down_write(&sb->s_umount);
1622 		mutex_lock(&uuid_mutex);
1623 	}
1624 
1625 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
1626 	mutex_lock(&root->fs_info->volume_mutex);
1627 
1628 	devices = &root->fs_info->fs_devices->devices;
1629 	/*
1630 	 * we have the volume lock, so we don't need the extra
1631 	 * device list mutex while reading the list here.
1632 	 */
1633 	list_for_each_entry(device, devices, dev_list) {
1634 		if (device->bdev == bdev) {
1635 			ret = -EEXIST;
1636 			goto error;
1637 		}
1638 	}
1639 
1640 	device = kzalloc(sizeof(*device), GFP_NOFS);
1641 	if (!device) {
1642 		/* we can safely leave the fs_devices entry around */
1643 		ret = -ENOMEM;
1644 		goto error;
1645 	}
1646 
1647 	device->name = kstrdup(device_path, GFP_NOFS);
1648 	if (!device->name) {
1649 		kfree(device);
1650 		ret = -ENOMEM;
1651 		goto error;
1652 	}
1653 
1654 	ret = find_next_devid(root, &device->devid);
1655 	if (ret) {
1656 		kfree(device->name);
1657 		kfree(device);
1658 		goto error;
1659 	}
1660 
1661 	trans = btrfs_start_transaction(root, 0);
1662 	if (IS_ERR(trans)) {
1663 		kfree(device->name);
1664 		kfree(device);
1665 		ret = PTR_ERR(trans);
1666 		goto error;
1667 	}
1668 
1669 	lock_chunks(root);
1670 
1671 	q = bdev_get_queue(bdev);
1672 	if (blk_queue_discard(q))
1673 		device->can_discard = 1;
1674 	device->writeable = 1;
1675 	device->work.func = pending_bios_fn;
1676 	generate_random_uuid(device->uuid);
1677 	spin_lock_init(&device->io_lock);
1678 	device->generation = trans->transid;
1679 	device->io_width = root->sectorsize;
1680 	device->io_align = root->sectorsize;
1681 	device->sector_size = root->sectorsize;
1682 	device->total_bytes = i_size_read(bdev->bd_inode);
1683 	device->disk_total_bytes = device->total_bytes;
1684 	device->dev_root = root->fs_info->dev_root;
1685 	device->bdev = bdev;
1686 	device->in_fs_metadata = 1;
1687 	device->mode = FMODE_EXCL;
1688 	set_blocksize(device->bdev, 4096);
1689 
1690 	if (seeding_dev) {
1691 		sb->s_flags &= ~MS_RDONLY;
1692 		ret = btrfs_prepare_sprout(trans, root);
1693 		BUG_ON(ret);
1694 	}
1695 
1696 	device->fs_devices = root->fs_info->fs_devices;
1697 
1698 	/*
1699 	 * we don't want write_supers to jump in here with our device
1700 	 * half setup
1701 	 */
1702 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1703 	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1704 	list_add(&device->dev_alloc_list,
1705 		 &root->fs_info->fs_devices->alloc_list);
1706 	root->fs_info->fs_devices->num_devices++;
1707 	root->fs_info->fs_devices->open_devices++;
1708 	root->fs_info->fs_devices->rw_devices++;
1709 	if (device->can_discard)
1710 		root->fs_info->fs_devices->num_can_discard++;
1711 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1712 
1713 	spin_lock(&root->fs_info->free_chunk_lock);
1714 	root->fs_info->free_chunk_space += device->total_bytes;
1715 	spin_unlock(&root->fs_info->free_chunk_lock);
1716 
1717 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1718 		root->fs_info->fs_devices->rotating = 1;
1719 
1720 	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1721 	btrfs_set_super_total_bytes(root->fs_info->super_copy,
1722 				    total_bytes + device->total_bytes);
1723 
1724 	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1725 	btrfs_set_super_num_devices(root->fs_info->super_copy,
1726 				    total_bytes + 1);
1727 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1728 
1729 	if (seeding_dev) {
1730 		ret = init_first_rw_device(trans, root, device);
1731 		BUG_ON(ret);
1732 		ret = btrfs_finish_sprout(trans, root);
1733 		BUG_ON(ret);
1734 	} else {
1735 		ret = btrfs_add_device(trans, root, device);
1736 	}
1737 
1738 	/*
1739 	 * we've got more storage, clear any full flags on the space
1740 	 * infos
1741 	 */
1742 	btrfs_clear_space_info_full(root->fs_info);
1743 
1744 	unlock_chunks(root);
1745 	btrfs_commit_transaction(trans, root);
1746 
1747 	if (seeding_dev) {
1748 		mutex_unlock(&uuid_mutex);
1749 		up_write(&sb->s_umount);
1750 
1751 		ret = btrfs_relocate_sys_chunks(root);
1752 		BUG_ON(ret);
1753 	}
1754 out:
1755 	mutex_unlock(&root->fs_info->volume_mutex);
1756 	return ret;
1757 error:
1758 	blkdev_put(bdev, FMODE_EXCL);
1759 	if (seeding_dev) {
1760 		mutex_unlock(&uuid_mutex);
1761 		up_write(&sb->s_umount);
1762 	}
1763 	goto out;
1764 }
1765 
1766 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1767 					struct btrfs_device *device)
1768 {
1769 	int ret;
1770 	struct btrfs_path *path;
1771 	struct btrfs_root *root;
1772 	struct btrfs_dev_item *dev_item;
1773 	struct extent_buffer *leaf;
1774 	struct btrfs_key key;
1775 
1776 	root = device->dev_root->fs_info->chunk_root;
1777 
1778 	path = btrfs_alloc_path();
1779 	if (!path)
1780 		return -ENOMEM;
1781 
1782 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1783 	key.type = BTRFS_DEV_ITEM_KEY;
1784 	key.offset = device->devid;
1785 
1786 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1787 	if (ret < 0)
1788 		goto out;
1789 
1790 	if (ret > 0) {
1791 		ret = -ENOENT;
1792 		goto out;
1793 	}
1794 
1795 	leaf = path->nodes[0];
1796 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1797 
1798 	btrfs_set_device_id(leaf, dev_item, device->devid);
1799 	btrfs_set_device_type(leaf, dev_item, device->type);
1800 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1801 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1802 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1803 	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1804 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1805 	btrfs_mark_buffer_dirty(leaf);
1806 
1807 out:
1808 	btrfs_free_path(path);
1809 	return ret;
1810 }
1811 
1812 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1813 		      struct btrfs_device *device, u64 new_size)
1814 {
1815 	struct btrfs_super_block *super_copy =
1816 		device->dev_root->fs_info->super_copy;
1817 	u64 old_total = btrfs_super_total_bytes(super_copy);
1818 	u64 diff = new_size - device->total_bytes;
1819 
1820 	if (!device->writeable)
1821 		return -EACCES;
1822 	if (new_size <= device->total_bytes)
1823 		return -EINVAL;
1824 
1825 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
1826 	device->fs_devices->total_rw_bytes += diff;
1827 
1828 	device->total_bytes = new_size;
1829 	device->disk_total_bytes = new_size;
1830 	btrfs_clear_space_info_full(device->dev_root->fs_info);
1831 
1832 	return btrfs_update_device(trans, device);
1833 }
1834 
1835 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1836 		      struct btrfs_device *device, u64 new_size)
1837 {
1838 	int ret;
1839 	lock_chunks(device->dev_root);
1840 	ret = __btrfs_grow_device(trans, device, new_size);
1841 	unlock_chunks(device->dev_root);
1842 	return ret;
1843 }
1844 
1845 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1846 			    struct btrfs_root *root,
1847 			    u64 chunk_tree, u64 chunk_objectid,
1848 			    u64 chunk_offset)
1849 {
1850 	int ret;
1851 	struct btrfs_path *path;
1852 	struct btrfs_key key;
1853 
1854 	root = root->fs_info->chunk_root;
1855 	path = btrfs_alloc_path();
1856 	if (!path)
1857 		return -ENOMEM;
1858 
1859 	key.objectid = chunk_objectid;
1860 	key.offset = chunk_offset;
1861 	key.type = BTRFS_CHUNK_ITEM_KEY;
1862 
1863 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1864 	BUG_ON(ret);
1865 
1866 	ret = btrfs_del_item(trans, root, path);
1867 
1868 	btrfs_free_path(path);
1869 	return ret;
1870 }
1871 
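/*
 * Remove the entry for a chunk from the sys_chunk_array embedded in the
 * in-memory super block copy.
 */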
1872 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1873 			chunk_offset)
1874 {
1875 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1876 	struct btrfs_disk_key *disk_key;
1877 	struct btrfs_chunk *chunk;
1878 	u8 *ptr;
1879 	int ret = 0;
1880 	u32 num_stripes;
1881 	u32 array_size;
1882 	u32 len = 0;
1883 	u32 cur;
1884 	struct btrfs_key key;
1885 
1886 	array_size = btrfs_super_sys_array_size(super_copy);
1887 
1888 	ptr = super_copy->sys_chunk_array;
1889 	cur = 0;
1890 
1891 	while (cur < array_size) {
1892 		disk_key = (struct btrfs_disk_key *)ptr;
1893 		btrfs_disk_key_to_cpu(&key, disk_key);
1894 
1895 		len = sizeof(*disk_key);
1896 
1897 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1898 			chunk = (struct btrfs_chunk *)(ptr + len);
1899 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1900 			len += btrfs_chunk_item_size(num_stripes);
1901 		} else {
1902 			ret = -EIO;
1903 			break;
1904 		}
1905 		if (key.objectid == chunk_objectid &&
1906 		    key.offset == chunk_offset) {
1907 			memmove(ptr, ptr + len, array_size - (cur + len));
1908 			array_size -= len;
1909 			btrfs_set_super_sys_array_size(super_copy, array_size);
1910 		} else {
1911 			ptr += len;
1912 			cur += len;
1913 		}
1914 	}
1915 	return ret;
1916 }
1917 
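/*
 * Move everything out of a chunk and delete it: relocate the block
 * group's extents, then drop the device extents, the chunk item (plus
 * its sys_chunk_array copy for SYSTEM chunks), the block group and the
 * in-memory extent mapping.
 */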
1918 static int btrfs_relocate_chunk(struct btrfs_root *root,
1919 			 u64 chunk_tree, u64 chunk_objectid,
1920 			 u64 chunk_offset)
1921 {
1922 	struct extent_map_tree *em_tree;
1923 	struct btrfs_root *extent_root;
1924 	struct btrfs_trans_handle *trans;
1925 	struct extent_map *em;
1926 	struct map_lookup *map;
1927 	int ret;
1928 	int i;
1929 
1930 	root = root->fs_info->chunk_root;
1931 	extent_root = root->fs_info->extent_root;
1932 	em_tree = &root->fs_info->mapping_tree.map_tree;
1933 
1934 	ret = btrfs_can_relocate(extent_root, chunk_offset);
1935 	if (ret)
1936 		return -ENOSPC;
1937 
1938 	/* step one, relocate all the extents inside this chunk */
1939 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1940 	if (ret)
1941 		return ret;
1942 
1943 	trans = btrfs_start_transaction(root, 0);
1944 	BUG_ON(IS_ERR(trans));
1945 
1946 	lock_chunks(root);
1947 
1948 	/*
1949 	 * step two, delete the device extents and the
1950 	 * chunk tree entries
1951 	 */
1952 	read_lock(&em_tree->lock);
1953 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1954 	read_unlock(&em_tree->lock);
1955 
1956 	BUG_ON(em->start > chunk_offset ||
1957 	       em->start + em->len < chunk_offset);
1958 	map = (struct map_lookup *)em->bdev;
1959 
1960 	for (i = 0; i < map->num_stripes; i++) {
1961 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1962 					    map->stripes[i].physical);
1963 		BUG_ON(ret);
1964 
1965 		if (map->stripes[i].dev) {
1966 			ret = btrfs_update_device(trans, map->stripes[i].dev);
1967 			BUG_ON(ret);
1968 		}
1969 	}
1970 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1971 			       chunk_offset);
1972 
1973 	BUG_ON(ret);
1974 
1975 	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1976 
1977 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1978 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1979 		BUG_ON(ret);
1980 	}
1981 
1982 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1983 	BUG_ON(ret);
1984 
1985 	write_lock(&em_tree->lock);
1986 	remove_extent_mapping(em_tree, em);
1987 	write_unlock(&em_tree->lock);
1988 
1989 	kfree(map);
1990 	em->bdev = NULL;
1991 
1992 	/* once for the tree */
1993 	free_extent_map(em);
1994 	/* once for us */
1995 	free_extent_map(em);
1996 
1997 	unlock_chunks(root);
1998 	btrfs_end_transaction(trans, root);
1999 	return 0;
2000 }
2001 
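/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.
 * Chunks that fail with -ENOSPC are retried once after the rest have
 * been moved.
 */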
2002 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2003 {
2004 	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2005 	struct btrfs_path *path;
2006 	struct extent_buffer *leaf;
2007 	struct btrfs_chunk *chunk;
2008 	struct btrfs_key key;
2009 	struct btrfs_key found_key;
2010 	u64 chunk_tree = chunk_root->root_key.objectid;
2011 	u64 chunk_type;
2012 	bool retried = false;
2013 	int failed = 0;
2014 	int ret;
2015 
2016 	path = btrfs_alloc_path();
2017 	if (!path)
2018 		return -ENOMEM;
2019 
2020 again:
2021 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2022 	key.offset = (u64)-1;
2023 	key.type = BTRFS_CHUNK_ITEM_KEY;
2024 
2025 	while (1) {
2026 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2027 		if (ret < 0)
2028 			goto error;
2029 		BUG_ON(ret == 0);
2030 
2031 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
2032 					  key.type);
2033 		if (ret < 0)
2034 			goto error;
2035 		if (ret > 0)
2036 			break;
2037 
2038 		leaf = path->nodes[0];
2039 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2040 
2041 		chunk = btrfs_item_ptr(leaf, path->slots[0],
2042 				       struct btrfs_chunk);
2043 		chunk_type = btrfs_chunk_type(leaf, chunk);
2044 		btrfs_release_path(path);
2045 
2046 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2047 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2048 						   found_key.objectid,
2049 						   found_key.offset);
2050 			if (ret == -ENOSPC)
2051 				failed++;
2052 			else if (ret)
2053 				BUG();
2054 		}
2055 
2056 		if (found_key.offset == 0)
2057 			break;
2058 		key.offset = found_key.offset - 1;
2059 	}
2060 	ret = 0;
2061 	if (failed && !retried) {
2062 		failed = 0;
2063 		retried = true;
2064 		goto again;
2065 	} else if (failed && retried) {
2066 		WARN_ON(1);
2067 		ret = -ENOSPC;
2068 	}
2069 error:
2070 	btrfs_free_path(path);
2071 	return ret;
2072 }
2073 
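/* Return @num scaled by @factor tenths; div_factor(x, 1) is 10% of x. */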
2074 static u64 div_factor(u64 num, int factor)
2075 {
2076 	if (factor == 10)
2077 		return num;
2078 	num *= factor;
2079 	do_div(num, 10);
2080 	return num;
2081 }
2082 
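/*
 * Simple balance: step one shrinks and immediately re-grows each nearly
 * full writeable device to force a little free space at its end, step
 * two relocates every chunk from the highest offset downwards so the
 * data is rewritten into freshly allocated chunks.
 */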
2083 int btrfs_balance(struct btrfs_root *dev_root)
2084 {
2085 	int ret;
2086 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
2087 	struct btrfs_device *device;
2088 	u64 old_size;
2089 	u64 size_to_free;
2090 	struct btrfs_path *path;
2091 	struct btrfs_key key;
2092 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2093 	struct btrfs_trans_handle *trans;
2094 	struct btrfs_key found_key;
2095 
2096 	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2097 		return -EROFS;
2098 
2099 	if (!capable(CAP_SYS_ADMIN))
2100 		return -EPERM;
2101 
2102 	mutex_lock(&dev_root->fs_info->volume_mutex);
2103 	dev_root = dev_root->fs_info->dev_root;
2104 
2105 	/* step one, make some room on all the devices */
2106 	list_for_each_entry(device, devices, dev_list) {
2107 		old_size = device->total_bytes;
2108 		size_to_free = div_factor(old_size, 1);
2109 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2110 		if (!device->writeable ||
2111 		    device->total_bytes - device->bytes_used > size_to_free)
2112 			continue;
2113 
2114 		ret = btrfs_shrink_device(device, old_size - size_to_free);
2115 		if (ret == -ENOSPC)
2116 			break;
2117 		BUG_ON(ret);
2118 
2119 		trans = btrfs_start_transaction(dev_root, 0);
2120 		BUG_ON(IS_ERR(trans));
2121 
2122 		ret = btrfs_grow_device(trans, device, old_size);
2123 		BUG_ON(ret);
2124 
2125 		btrfs_end_transaction(trans, dev_root);
2126 	}
2127 
2128 	/* step two, relocate all the chunks */
2129 	path = btrfs_alloc_path();
2130 	if (!path) {
2131 		ret = -ENOMEM;
2132 		goto error;
2133 	}
2134 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2135 	key.offset = (u64)-1;
2136 	key.type = BTRFS_CHUNK_ITEM_KEY;
2137 
2138 	while (1) {
2139 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2140 		if (ret < 0)
2141 			goto error;
2142 
2143 		/*
2144 		 * this shouldn't happen, it means the last relocate
2145 		 * failed
2146 		 */
2147 		if (ret == 0)
2148 			break;
2149 
2150 		ret = btrfs_previous_item(chunk_root, path, 0,
2151 					  BTRFS_CHUNK_ITEM_KEY);
2152 		if (ret)
2153 			break;
2154 
2155 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2156 				      path->slots[0]);
2157 		if (found_key.objectid != key.objectid)
2158 			break;
2159 
2160 		/* chunk zero is special */
2161 		if (found_key.offset == 0)
2162 			break;
2163 
2164 		btrfs_release_path(path);
2165 		ret = btrfs_relocate_chunk(chunk_root,
2166 					   chunk_root->root_key.objectid,
2167 					   found_key.objectid,
2168 					   found_key.offset);
2169 		if (ret && ret != -ENOSPC)
2170 			goto error;
2171 		key.offset = found_key.offset - 1;
2172 	}
2173 	ret = 0;
2174 error:
2175 	btrfs_free_path(path);
2176 	mutex_unlock(&dev_root->fs_info->volume_mutex);
2177 	return ret;
2178 }
2179 
2180 /*
2181  * shrinking a device means finding all of the device extents past
2182  * the new size, and then following the back refs to the chunks.
2183  * The chunk relocation code actually frees the device extents.
2184  */
2185 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2186 {
2187 	struct btrfs_trans_handle *trans;
2188 	struct btrfs_root *root = device->dev_root;
2189 	struct btrfs_dev_extent *dev_extent = NULL;
2190 	struct btrfs_path *path;
2191 	u64 length;
2192 	u64 chunk_tree;
2193 	u64 chunk_objectid;
2194 	u64 chunk_offset;
2195 	int ret;
2196 	int slot;
2197 	int failed = 0;
2198 	bool retried = false;
2199 	struct extent_buffer *l;
2200 	struct btrfs_key key;
2201 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2202 	u64 old_total = btrfs_super_total_bytes(super_copy);
2203 	u64 old_size = device->total_bytes;
2204 	u64 diff = device->total_bytes - new_size;
2205 
2206 	if (new_size >= device->total_bytes)
2207 		return -EINVAL;
2208 
2209 	path = btrfs_alloc_path();
2210 	if (!path)
2211 		return -ENOMEM;
2212 
2213 	path->reada = 2;
2214 
2215 	lock_chunks(root);
2216 
2217 	device->total_bytes = new_size;
2218 	if (device->writeable) {
2219 		device->fs_devices->total_rw_bytes -= diff;
2220 		spin_lock(&root->fs_info->free_chunk_lock);
2221 		root->fs_info->free_chunk_space -= diff;
2222 		spin_unlock(&root->fs_info->free_chunk_lock);
2223 	}
2224 	unlock_chunks(root);
2225 
2226 again:
2227 	key.objectid = device->devid;
2228 	key.offset = (u64)-1;
2229 	key.type = BTRFS_DEV_EXTENT_KEY;
2230 
2231 	while (1) {
2232 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2233 		if (ret < 0)
2234 			goto done;
2235 
2236 		ret = btrfs_previous_item(root, path, 0, key.type);
2237 		if (ret < 0)
2238 			goto done;
2239 		if (ret) {
2240 			ret = 0;
2241 			btrfs_release_path(path);
2242 			break;
2243 		}
2244 
2245 		l = path->nodes[0];
2246 		slot = path->slots[0];
2247 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2248 
2249 		if (key.objectid != device->devid) {
2250 			btrfs_release_path(path);
2251 			break;
2252 		}
2253 
2254 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2255 		length = btrfs_dev_extent_length(l, dev_extent);
2256 
2257 		if (key.offset + length <= new_size) {
2258 			btrfs_release_path(path);
2259 			break;
2260 		}
2261 
2262 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2263 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2264 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2265 		btrfs_release_path(path);
2266 
2267 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2268 					   chunk_offset);
2269 		if (ret && ret != -ENOSPC)
2270 			goto done;
2271 		if (ret == -ENOSPC)
2272 			failed++;
2273 		key.offset -= 1;
2274 	}
2275 
2276 	if (failed && !retried) {
2277 		failed = 0;
2278 		retried = true;
2279 		goto again;
2280 	} else if (failed && retried) {
2281 		ret = -ENOSPC;
2282 		lock_chunks(root);
2283 
2284 		device->total_bytes = old_size;
2285 		if (device->writeable)
2286 			device->fs_devices->total_rw_bytes += diff;
2287 		spin_lock(&root->fs_info->free_chunk_lock);
2288 		root->fs_info->free_chunk_space += diff;
2289 		spin_unlock(&root->fs_info->free_chunk_lock);
2290 		unlock_chunks(root);
2291 		goto done;
2292 	}
2293 
2294 	/* Shrinking succeeded, else we would be at "done". */
2295 	trans = btrfs_start_transaction(root, 0);
2296 	if (IS_ERR(trans)) {
2297 		ret = PTR_ERR(trans);
2298 		goto done;
2299 	}
2300 
2301 	lock_chunks(root);
2302 
2303 	device->disk_total_bytes = new_size;
2304 	/* Now btrfs_update_device() will change the on-disk size. */
2305 	ret = btrfs_update_device(trans, device);
2306 	if (ret) {
2307 		unlock_chunks(root);
2308 		btrfs_end_transaction(trans, root);
2309 		goto done;
2310 	}
2311 	WARN_ON(diff > old_total);
2312 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
2313 	unlock_chunks(root);
2314 	btrfs_end_transaction(trans, root);
2315 done:
2316 	btrfs_free_path(path);
2317 	return ret;
2318 }
2319 
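/*
 * Append @key and @chunk to the superblock's sys_chunk_array so this
 * SYSTEM chunk can be found at mount time before the chunk tree is
 * readable.
 */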
2320 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2321 			   struct btrfs_root *root,
2322 			   struct btrfs_key *key,
2323 			   struct btrfs_chunk *chunk, int item_size)
2324 {
2325 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2326 	struct btrfs_disk_key disk_key;
2327 	u32 array_size;
2328 	u8 *ptr;
2329 
2330 	array_size = btrfs_super_sys_array_size(super_copy);
2331 	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2332 		return -EFBIG;
2333 
2334 	ptr = super_copy->sys_chunk_array + array_size;
2335 	btrfs_cpu_key_to_disk(&disk_key, key);
2336 	memcpy(ptr, &disk_key, sizeof(disk_key));
2337 	ptr += sizeof(disk_key);
2338 	memcpy(ptr, chunk, item_size);
2339 	item_size += sizeof(disk_key);
2340 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2341 	return 0;
2342 }
2343 
2344 /*
2345  * sort the devices in descending order by max_avail, total_avail
2346  */
2347 static int btrfs_cmp_device_info(const void *a, const void *b)
2348 {
2349 	const struct btrfs_device_info *di_a = a;
2350 	const struct btrfs_device_info *di_b = b;
2351 
2352 	if (di_a->max_avail > di_b->max_avail)
2353 		return -1;
2354 	if (di_a->max_avail < di_b->max_avail)
2355 		return 1;
2356 	if (di_a->total_avail > di_b->total_avail)
2357 		return -1;
2358 	if (di_a->total_avail < di_b->total_avail)
2359 		return 1;
2360 	return 0;
2361 }
2362 
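/*
 * Pick devices and stripe geometry for a new chunk of the given @type,
 * fill in *map_ret, *num_bytes_out and *stripe_size_out, insert the
 * extent mapping and create the block group and device extents.  The
 * chunk tree itself is not modified here; see __finish_chunk_alloc().
 */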
2363 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2364 			       struct btrfs_root *extent_root,
2365 			       struct map_lookup **map_ret,
2366 			       u64 *num_bytes_out, u64 *stripe_size_out,
2367 			       u64 start, u64 type)
2368 {
2369 	struct btrfs_fs_info *info = extent_root->fs_info;
2370 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
2371 	struct list_head *cur;
2372 	struct map_lookup *map = NULL;
2373 	struct extent_map_tree *em_tree;
2374 	struct extent_map *em;
2375 	struct btrfs_device_info *devices_info = NULL;
2376 	u64 total_avail;
2377 	int num_stripes;	/* total number of stripes to allocate */
2378 	int sub_stripes;	/* sub_stripes info for map */
2379 	int dev_stripes;	/* stripes per dev */
2380 	int devs_max;		/* max devs to use */
2381 	int devs_min;		/* min devs needed */
2382 	int devs_increment;	/* ndevs has to be a multiple of this */
2383 	int ncopies;		/* how many copies of the data */
2384 	int ret;
2385 	u64 max_stripe_size;
2386 	u64 max_chunk_size;
2387 	u64 stripe_size;
2388 	u64 num_bytes;
2389 	int ndevs;
2390 	int i;
2391 	int j;
2392 
2393 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2394 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
2395 		WARN_ON(1);
2396 		type &= ~BTRFS_BLOCK_GROUP_DUP;
2397 	}
2398 
2399 	if (list_empty(&fs_devices->alloc_list))
2400 		return -ENOSPC;
2401 
2402 	sub_stripes = 1;
2403 	dev_stripes = 1;
2404 	devs_increment = 1;
2405 	ncopies = 1;
2406 	devs_max = 0;	/* 0 == as many as possible */
2407 	devs_min = 1;
2408 
2409 	/*
2410 	 * define the properties of each RAID type.
2411 	 * FIXME: move this to a global table and use it in all RAID
2412 	 * calculation code
2413 	 */
2414 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2415 		dev_stripes = 2;
2416 		ncopies = 2;
2417 		devs_max = 1;
2418 	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2419 		devs_min = 2;
2420 	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2421 		devs_increment = 2;
2422 		ncopies = 2;
2423 		devs_max = 2;
2424 		devs_min = 2;
2425 	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2426 		sub_stripes = 2;
2427 		devs_increment = 2;
2428 		ncopies = 2;
2429 		devs_min = 4;
2430 	} else {
2431 		devs_max = 1;
2432 	}
2433 
2434 	if (type & BTRFS_BLOCK_GROUP_DATA) {
2435 		max_stripe_size = 1024 * 1024 * 1024;
2436 		max_chunk_size = 10 * max_stripe_size;
2437 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2438 		max_stripe_size = 256 * 1024 * 1024;
2439 		max_chunk_size = max_stripe_size;
2440 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2441 		max_stripe_size = 8 * 1024 * 1024;
2442 		max_chunk_size = 2 * max_stripe_size;
2443 	} else {
2444 		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2445 		       type);
2446 		BUG_ON(1);
2447 	}
2448 
2449 	/* we don't want a chunk larger than 10% of writeable space */
2450 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2451 			     max_chunk_size);
2452 
2453 	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2454 			       GFP_NOFS);
2455 	if (!devices_info)
2456 		return -ENOMEM;
2457 
2458 	cur = fs_devices->alloc_list.next;
2459 
2460 	/*
2461 	 * in the first pass through the devices list, we gather information
2462 	 * about the available holes on each device.
2463 	 */
2464 	ndevs = 0;
2465 	while (cur != &fs_devices->alloc_list) {
2466 		struct btrfs_device *device;
2467 		u64 max_avail;
2468 		u64 dev_offset;
2469 
2470 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2471 
2472 		cur = cur->next;
2473 
2474 		if (!device->writeable) {
2475 			printk(KERN_ERR
2476 			       "btrfs: read-only device in alloc_list\n");
2477 			WARN_ON(1);
2478 			continue;
2479 		}
2480 
2481 		if (!device->in_fs_metadata)
2482 			continue;
2483 
2484 		if (device->total_bytes > device->bytes_used)
2485 			total_avail = device->total_bytes - device->bytes_used;
2486 		else
2487 			total_avail = 0;
2488 
2489 		/* If there is no space on this device, skip it. */
2490 		if (total_avail == 0)
2491 			continue;
2492 
2493 		ret = find_free_dev_extent(trans, device,
2494 					   max_stripe_size * dev_stripes,
2495 					   &dev_offset, &max_avail);
2496 		if (ret && ret != -ENOSPC)
2497 			goto error;
2498 
2499 		if (ret == 0)
2500 			max_avail = max_stripe_size * dev_stripes;
2501 
2502 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2503 			continue;
2504 
2505 		devices_info[ndevs].dev_offset = dev_offset;
2506 		devices_info[ndevs].max_avail = max_avail;
2507 		devices_info[ndevs].total_avail = total_avail;
2508 		devices_info[ndevs].dev = device;
2509 		++ndevs;
2510 	}
2511 
2512 	/*
2513 	 * now sort the devices by hole size / available space
2514 	 */
2515 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2516 	     btrfs_cmp_device_info, NULL);
2517 
2518 	/* round down to number of usable stripes */
2519 	ndevs -= ndevs % devs_increment;
2520 
2521 	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2522 		ret = -ENOSPC;
2523 		goto error;
2524 	}
2525 
2526 	if (devs_max && ndevs > devs_max)
2527 		ndevs = devs_max;
2528 	/*
2529 	 * the primary goal is to maximize the number of stripes, so use as many
2530 	 * devices as possible, even if the stripes are not maximum sized.
2531 	 */
2532 	stripe_size = devices_info[ndevs-1].max_avail;
2533 	num_stripes = ndevs * dev_stripes;
2534 
2535 	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2536 		stripe_size = max_chunk_size * ncopies;
2537 		do_div(stripe_size, num_stripes);
2538 	}
2539 
2540 	do_div(stripe_size, dev_stripes);
2541 	do_div(stripe_size, BTRFS_STRIPE_LEN);
2542 	stripe_size *= BTRFS_STRIPE_LEN;
2543 
2544 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2545 	if (!map) {
2546 		ret = -ENOMEM;
2547 		goto error;
2548 	}
2549 	map->num_stripes = num_stripes;
2550 
2551 	for (i = 0; i < ndevs; ++i) {
2552 		for (j = 0; j < dev_stripes; ++j) {
2553 			int s = i * dev_stripes + j;
2554 			map->stripes[s].dev = devices_info[i].dev;
2555 			map->stripes[s].physical = devices_info[i].dev_offset +
2556 						   j * stripe_size;
2557 		}
2558 	}
2559 	map->sector_size = extent_root->sectorsize;
2560 	map->stripe_len = BTRFS_STRIPE_LEN;
2561 	map->io_align = BTRFS_STRIPE_LEN;
2562 	map->io_width = BTRFS_STRIPE_LEN;
2563 	map->type = type;
2564 	map->sub_stripes = sub_stripes;
2565 
2566 	*map_ret = map;
2567 	num_bytes = stripe_size * (num_stripes / ncopies);
2568 
2569 	*stripe_size_out = stripe_size;
2570 	*num_bytes_out = num_bytes;
2571 
2572 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2573 
2574 	em = alloc_extent_map();
2575 	if (!em) {
2576 		ret = -ENOMEM;
2577 		goto error;
2578 	}
2579 	em->bdev = (struct block_device *)map;
2580 	em->start = start;
2581 	em->len = num_bytes;
2582 	em->block_start = 0;
2583 	em->block_len = em->len;
2584 
2585 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2586 	write_lock(&em_tree->lock);
2587 	ret = add_extent_mapping(em_tree, em);
2588 	write_unlock(&em_tree->lock);
2589 	BUG_ON(ret);
2590 	free_extent_map(em);
2591 
2592 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
2593 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2594 				     start, num_bytes);
2595 	BUG_ON(ret);
2596 
2597 	for (i = 0; i < map->num_stripes; ++i) {
2598 		struct btrfs_device *device;
2599 		u64 dev_offset;
2600 
2601 		device = map->stripes[i].dev;
2602 		dev_offset = map->stripes[i].physical;
2603 
2604 		ret = btrfs_alloc_dev_extent(trans, device,
2605 				info->chunk_root->root_key.objectid,
2606 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2607 				start, dev_offset, stripe_size);
2608 		BUG_ON(ret);
2609 	}
2610 
2611 	kfree(devices_info);
2612 	return 0;
2613 
2614 error:
2615 	kfree(map);
2616 	kfree(devices_info);
2617 	return ret;
2618 }
2619 
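/*
 * Second half of chunk allocation: account the new stripes against each
 * device, build the on-disk chunk item and insert it into the chunk
 * tree (and into the sys_chunk_array for SYSTEM chunks).
 */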
2620 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2621 				struct btrfs_root *extent_root,
2622 				struct map_lookup *map, u64 chunk_offset,
2623 				u64 chunk_size, u64 stripe_size)
2624 {
2625 	u64 dev_offset;
2626 	struct btrfs_key key;
2627 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2628 	struct btrfs_device *device;
2629 	struct btrfs_chunk *chunk;
2630 	struct btrfs_stripe *stripe;
2631 	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2632 	int index = 0;
2633 	int ret;
2634 
2635 	chunk = kzalloc(item_size, GFP_NOFS);
2636 	if (!chunk)
2637 		return -ENOMEM;
2638 
2639 	index = 0;
2640 	while (index < map->num_stripes) {
2641 		device = map->stripes[index].dev;
2642 		device->bytes_used += stripe_size;
2643 		ret = btrfs_update_device(trans, device);
2644 		BUG_ON(ret);
2645 		index++;
2646 	}
2647 
2648 	spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 	extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 						   map->num_stripes);
2651 	spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652 
2653 	index = 0;
2654 	stripe = &chunk->stripe;
2655 	while (index < map->num_stripes) {
2656 		device = map->stripes[index].dev;
2657 		dev_offset = map->stripes[index].physical;
2658 
2659 		btrfs_set_stack_stripe_devid(stripe, device->devid);
2660 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
2661 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2662 		stripe++;
2663 		index++;
2664 	}
2665 
2666 	btrfs_set_stack_chunk_length(chunk, chunk_size);
2667 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2668 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2669 	btrfs_set_stack_chunk_type(chunk, map->type);
2670 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2671 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2672 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2673 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2674 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2675 
2676 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2677 	key.type = BTRFS_CHUNK_ITEM_KEY;
2678 	key.offset = chunk_offset;
2679 
2680 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2681 	BUG_ON(ret);
2682 
2683 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2684 		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2685 					     item_size);
2686 		BUG_ON(ret);
2687 	}
2688 
2689 	kfree(chunk);
2690 	return 0;
2691 }
2692 
2693 /*
2694  * Chunk allocation falls into two parts. The first part does the work
2695  * that makes the newly allocated chunk usable, but does not do any
2696  * operation that modifies the chunk tree. The second part does the work
2697  * that requires modifying the chunk tree. This division is important for
2698  * the bootstrap process of adding storage to a seed btrfs.
2699  */
2700 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2701 		      struct btrfs_root *extent_root, u64 type)
2702 {
2703 	u64 chunk_offset;
2704 	u64 chunk_size;
2705 	u64 stripe_size;
2706 	struct map_lookup *map;
2707 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2708 	int ret;
2709 
2710 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2711 			      &chunk_offset);
2712 	if (ret)
2713 		return ret;
2714 
2715 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2716 				  &stripe_size, chunk_offset, type);
2717 	if (ret)
2718 		return ret;
2719 
2720 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2721 				   chunk_size, stripe_size);
2722 	BUG_ON(ret);
2723 	return 0;
2724 }
2725 
2726 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2727 					 struct btrfs_root *root,
2728 					 struct btrfs_device *device)
2729 {
2730 	u64 chunk_offset;
2731 	u64 sys_chunk_offset;
2732 	u64 chunk_size;
2733 	u64 sys_chunk_size;
2734 	u64 stripe_size;
2735 	u64 sys_stripe_size;
2736 	u64 alloc_profile;
2737 	struct map_lookup *map;
2738 	struct map_lookup *sys_map;
2739 	struct btrfs_fs_info *fs_info = root->fs_info;
2740 	struct btrfs_root *extent_root = fs_info->extent_root;
2741 	int ret;
2742 
2743 	ret = find_next_chunk(fs_info->chunk_root,
2744 			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2745 	if (ret)
2746 		return ret;
2747 
2748 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2749 			(fs_info->metadata_alloc_profile &
2750 			 fs_info->avail_metadata_alloc_bits);
2751 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2752 
2753 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2754 				  &stripe_size, chunk_offset, alloc_profile);
2755 	BUG_ON(ret);
2756 
2757 	sys_chunk_offset = chunk_offset + chunk_size;
2758 
2759 	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2760 			(fs_info->system_alloc_profile &
2761 			 fs_info->avail_system_alloc_bits);
2762 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2763 
2764 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2765 				  &sys_chunk_size, &sys_stripe_size,
2766 				  sys_chunk_offset, alloc_profile);
2767 	BUG_ON(ret);
2768 
2769 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2770 	BUG_ON(ret);
2771 
2772 	/*
2773 	 * Modifying the chunk tree requires allocating new blocks from both
2774 	 * the system block group and the metadata block group. So we can only
2775 	 * do operations that require modifying the chunk tree after both
2776 	 * block groups have been created.
2777 	 */
2778 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2779 				   chunk_size, stripe_size);
2780 	BUG_ON(ret);
2781 
2782 	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2783 				   sys_chunk_offset, sys_chunk_size,
2784 				   sys_stripe_size);
2785 	BUG_ON(ret);
2786 	return 0;
2787 }
2788 
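/*
 * Return 1 if the chunk at @chunk_offset has to be treated as read-only
 * because its mapping is missing or one of its stripes sits on a device
 * that is not writeable (unless mounted with -o degraded).
 */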
2789 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2790 {
2791 	struct extent_map *em;
2792 	struct map_lookup *map;
2793 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2794 	int readonly = 0;
2795 	int i;
2796 
2797 	read_lock(&map_tree->map_tree.lock);
2798 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2799 	read_unlock(&map_tree->map_tree.lock);
2800 	if (!em)
2801 		return 1;
2802 
2803 	if (btrfs_test_opt(root, DEGRADED)) {
2804 		free_extent_map(em);
2805 		return 0;
2806 	}
2807 
2808 	map = (struct map_lookup *)em->bdev;
2809 	for (i = 0; i < map->num_stripes; i++) {
2810 		if (!map->stripes[i].dev->writeable) {
2811 			readonly = 1;
2812 			break;
2813 		}
2814 	}
2815 	free_extent_map(em);
2816 	return readonly;
2817 }
2818 
2819 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2820 {
2821 	extent_map_tree_init(&tree->map_tree);
2822 }
2823 
2824 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2825 {
2826 	struct extent_map *em;
2827 
2828 	while (1) {
2829 		write_lock(&tree->map_tree.lock);
2830 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2831 		if (em)
2832 			remove_extent_mapping(&tree->map_tree, em);
2833 		write_unlock(&tree->map_tree.lock);
2834 		if (!em)
2835 			break;
2836 		kfree(em->bdev);
2837 		/* once for us */
2838 		free_extent_map(em);
2839 		/* once for the tree */
2840 		free_extent_map(em);
2841 	}
2842 }
2843 
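/*
 * Return how many copies exist of the data at @logical: num_stripes for
 * DUP/RAID1 chunks, sub_stripes for RAID10 and 1 for everything else.
 */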
2844 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2845 {
2846 	struct extent_map *em;
2847 	struct map_lookup *map;
2848 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2849 	int ret;
2850 
2851 	read_lock(&em_tree->lock);
2852 	em = lookup_extent_mapping(em_tree, logical, len);
2853 	read_unlock(&em_tree->lock);
2854 	BUG_ON(!em);
2855 
2856 	BUG_ON(em->start > logical || em->start + em->len < logical);
2857 	map = (struct map_lookup *)em->bdev;
2858 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2859 		ret = map->num_stripes;
2860 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2861 		ret = map->sub_stripes;
2862 	else
2863 		ret = 1;
2864 	free_extent_map(em);
2865 	return ret;
2866 }
2867 
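/*
 * Pick a mirror whose device still has a bdev, preferring @optimal and
 * otherwise scanning stripes [first, first + num).
 */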
2868 static int find_live_mirror(struct map_lookup *map, int first, int num,
2869 			    int optimal)
2870 {
2871 	int i;
2872 	if (map->stripes[optimal].dev->bdev)
2873 		return optimal;
2874 	for (i = first; i < first + num; i++) {
2875 		if (map->stripes[i].dev->bdev)
2876 			return i;
2877 	}
2878 	/* we couldn't find one that doesn't fail.  Just return something
2879 	 * and the io error handling code will clean up eventually
2880 	 */
2881 	return optimal;
2882 }
2883 
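/*
 * Map a logical byte range onto the physical stripes backing it.
 * *length is trimmed to what can be handled in one go, and when
 * @bbio_ret is given, the btrfs_bio is filled with one entry per stripe
 * a write or discard has to touch, or with the single mirror chosen for
 * a read.
 */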
2884 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2885 			     u64 logical, u64 *length,
2886 			     struct btrfs_bio **bbio_ret,
2887 			     int mirror_num)
2888 {
2889 	struct extent_map *em;
2890 	struct map_lookup *map;
2891 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2892 	u64 offset;
2893 	u64 stripe_offset;
2894 	u64 stripe_end_offset;
2895 	u64 stripe_nr;
2896 	u64 stripe_nr_orig;
2897 	u64 stripe_nr_end;
2898 	int stripes_allocated = 8;
2899 	int stripes_required = 1;
2900 	int stripe_index;
2901 	int i;
2902 	int num_stripes;
2903 	int max_errors = 0;
2904 	struct btrfs_bio *bbio = NULL;
2905 
2906 	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2907 		stripes_allocated = 1;
2908 again:
2909 	if (bbio_ret) {
2910 		bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2911 				GFP_NOFS);
2912 		if (!bbio)
2913 			return -ENOMEM;
2914 
2915 		atomic_set(&bbio->error, 0);
2916 	}
2917 
2918 	read_lock(&em_tree->lock);
2919 	em = lookup_extent_mapping(em_tree, logical, *length);
2920 	read_unlock(&em_tree->lock);
2921 
2922 	if (!em) {
2923 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2924 		       (unsigned long long)logical,
2925 		       (unsigned long long)*length);
2926 		BUG();
2927 	}
2928 
2929 	BUG_ON(em->start > logical || em->start + em->len < logical);
2930 	map = (struct map_lookup *)em->bdev;
2931 	offset = logical - em->start;
2932 
2933 	if (mirror_num > map->num_stripes)
2934 		mirror_num = 0;
2935 
2936 	/* if our btrfs_bio struct is too small, back off and try again */
2937 	if (rw & REQ_WRITE) {
2938 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2939 				 BTRFS_BLOCK_GROUP_DUP)) {
2940 			stripes_required = map->num_stripes;
2941 			max_errors = 1;
2942 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2943 			stripes_required = map->sub_stripes;
2944 			max_errors = 1;
2945 		}
2946 	}
2947 	if (rw & REQ_DISCARD) {
2948 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2949 				 BTRFS_BLOCK_GROUP_RAID1 |
2950 				 BTRFS_BLOCK_GROUP_DUP |
2951 				 BTRFS_BLOCK_GROUP_RAID10)) {
2952 			stripes_required = map->num_stripes;
2953 		}
2954 	}
2955 	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2956 	    stripes_allocated < stripes_required) {
2957 		stripes_allocated = map->num_stripes;
2958 		free_extent_map(em);
2959 		kfree(bbio);
2960 		goto again;
2961 	}
2962 	stripe_nr = offset;
2963 	/*
2964 	 * stripe_nr counts the total number of stripes we have to stride
2965 	 * to get to this block
2966 	 */
2967 	do_div(stripe_nr, map->stripe_len);
2968 
2969 	stripe_offset = stripe_nr * map->stripe_len;
2970 	BUG_ON(offset < stripe_offset);
2971 
2972 	/* stripe_offset is the offset of this block in its stripe */
2973 	stripe_offset = offset - stripe_offset;
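	/*
	 * e.g. with a 64K stripe_len and an offset of 200K: stripe_nr is 3
	 * and stripe_offset is 8K into that stripe.
	 */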
2974 
2975 	if (rw & REQ_DISCARD)
2976 		*length = min_t(u64, em->len - offset, *length);
2977 	else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2978 			      BTRFS_BLOCK_GROUP_RAID1 |
2979 			      BTRFS_BLOCK_GROUP_RAID10 |
2980 			      BTRFS_BLOCK_GROUP_DUP)) {
2981 		/* we limit the length of each bio to what fits in a stripe */
2982 		*length = min_t(u64, em->len - offset,
2983 				map->stripe_len - stripe_offset);
2984 	} else {
2985 		*length = em->len - offset;
2986 	}
2987 
2988 	if (!bbio_ret)
2989 		goto out;
2990 
2991 	num_stripes = 1;
2992 	stripe_index = 0;
2993 	stripe_nr_orig = stripe_nr;
2994 	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2995 			(~(map->stripe_len - 1));
2996 	do_div(stripe_nr_end, map->stripe_len);
2997 	stripe_end_offset = stripe_nr_end * map->stripe_len -
2998 			    (offset + *length);
2999 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3000 		if (rw & REQ_DISCARD)
3001 			num_stripes = min_t(u64, map->num_stripes,
3002 					    stripe_nr_end - stripe_nr_orig);
3003 		stripe_index = do_div(stripe_nr, map->num_stripes);
3004 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3005 		if (rw & (REQ_WRITE | REQ_DISCARD))
3006 			num_stripes = map->num_stripes;
3007 		else if (mirror_num)
3008 			stripe_index = mirror_num - 1;
3009 		else {
3010 			stripe_index = find_live_mirror(map, 0,
3011 					    map->num_stripes,
3012 					    current->pid % map->num_stripes);
3013 			mirror_num = stripe_index + 1;
3014 		}
3015 
3016 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3017 		if (rw & (REQ_WRITE | REQ_DISCARD)) {
3018 			num_stripes = map->num_stripes;
3019 		} else if (mirror_num) {
3020 			stripe_index = mirror_num - 1;
3021 		} else {
3022 			mirror_num = 1;
3023 		}
3024 
3025 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3026 		int factor = map->num_stripes / map->sub_stripes;
3027 
3028 		stripe_index = do_div(stripe_nr, factor);
3029 		stripe_index *= map->sub_stripes;
3030 
3031 		if (rw & REQ_WRITE)
3032 			num_stripes = map->sub_stripes;
3033 		else if (rw & REQ_DISCARD)
3034 			num_stripes = min_t(u64, map->sub_stripes *
3035 					    (stripe_nr_end - stripe_nr_orig),
3036 					    map->num_stripes);
3037 		else if (mirror_num)
3038 			stripe_index += mirror_num - 1;
3039 		else {
3040 			stripe_index = find_live_mirror(map, stripe_index,
3041 					      map->sub_stripes, stripe_index +
3042 					      current->pid % map->sub_stripes);
3043 			mirror_num = stripe_index + 1;
3044 		}
3045 	} else {
3046 		/*
3047 		 * after this do_div call, stripe_nr is the number of stripes
3048 		 * on this device we have to walk to find the data, and
3049 		 * stripe_index is the number of our device in the stripe array
3050 		 */
3051 		stripe_index = do_div(stripe_nr, map->num_stripes);
3052 		mirror_num = stripe_index + 1;
3053 	}
3054 	BUG_ON(stripe_index >= map->num_stripes);
3055 
3056 	if (rw & REQ_DISCARD) {
3057 		for (i = 0; i < num_stripes; i++) {
3058 			bbio->stripes[i].physical =
3059 				map->stripes[stripe_index].physical +
3060 				stripe_offset + stripe_nr * map->stripe_len;
3061 			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3062 
3063 			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3064 				u64 stripes;
3065 				u32 last_stripe = 0;
3066 				int j;
3067 
3068 				div_u64_rem(stripe_nr_end - 1,
3069 					    map->num_stripes,
3070 					    &last_stripe);
3071 
3072 				for (j = 0; j < map->num_stripes; j++) {
3073 					u32 test;
3074 
3075 					div_u64_rem(stripe_nr_end - 1 - j,
3076 						    map->num_stripes, &test);
3077 					if (test == stripe_index)
3078 						break;
3079 				}
3080 				stripes = stripe_nr_end - 1 - j;
3081 				do_div(stripes, map->num_stripes);
3082 				bbio->stripes[i].length = map->stripe_len *
3083 					(stripes - stripe_nr + 1);
3084 
3085 				if (i == 0) {
3086 					bbio->stripes[i].length -=
3087 						stripe_offset;
3088 					stripe_offset = 0;
3089 				}
3090 				if (stripe_index == last_stripe)
3091 					bbio->stripes[i].length -=
3092 						stripe_end_offset;
3093 			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3094 				u64 stripes;
3095 				int j;
3096 				int factor = map->num_stripes /
3097 					     map->sub_stripes;
3098 				u32 last_stripe = 0;
3099 
3100 				div_u64_rem(stripe_nr_end - 1,
3101 					    factor, &last_stripe);
3102 				last_stripe *= map->sub_stripes;
3103 
3104 				for (j = 0; j < factor; j++) {
3105 					u32 test;
3106 
3107 					div_u64_rem(stripe_nr_end - 1 - j,
3108 						    factor, &test);
3109 
3110 					if (test ==
3111 					    stripe_index / map->sub_stripes)
3112 						break;
3113 				}
3114 				stripes = stripe_nr_end - 1 - j;
3115 				do_div(stripes, factor);
3116 				bbio->stripes[i].length = map->stripe_len *
3117 					(stripes - stripe_nr + 1);
3118 
3119 				if (i < map->sub_stripes) {
3120 					bbio->stripes[i].length -=
3121 						stripe_offset;
3122 					if (i == map->sub_stripes - 1)
3123 						stripe_offset = 0;
3124 				}
3125 				if (stripe_index >= last_stripe &&
3126 				    stripe_index <= (last_stripe +
3127 						     map->sub_stripes - 1)) {
3128 					bbio->stripes[i].length -=
3129 						stripe_end_offset;
3130 				}
3131 			} else
3132 				bbio->stripes[i].length = *length;
3133 
3134 			stripe_index++;
3135 			if (stripe_index == map->num_stripes) {
3136 				/* This could only happen for RAID0/10 */
3137 				stripe_index = 0;
3138 				stripe_nr++;
3139 			}
3140 		}
3141 	} else {
3142 		for (i = 0; i < num_stripes; i++) {
3143 			bbio->stripes[i].physical =
3144 				map->stripes[stripe_index].physical +
3145 				stripe_offset +
3146 				stripe_nr * map->stripe_len;
3147 			bbio->stripes[i].dev =
3148 				map->stripes[stripe_index].dev;
3149 			stripe_index++;
3150 		}
3151 	}
3152 	if (bbio_ret) {
3153 		*bbio_ret = bbio;
3154 		bbio->num_stripes = num_stripes;
3155 		bbio->max_errors = max_errors;
3156 		bbio->mirror_num = mirror_num;
3157 	}
3158 out:
3159 	free_extent_map(em);
3160 	return 0;
3161 }
3162 
3163 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3164 		      u64 logical, u64 *length,
3165 		      struct btrfs_bio **bbio_ret, int mirror_num)
3166 {
3167 	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3168 				 mirror_num);
3169 }
3170 
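/*
 * Reverse mapping: given a @physical offset (optionally limited to
 * @devid) inside the chunk at @chunk_start, return every logical
 * address in that chunk which maps onto it.
 */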
3171 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3172 		     u64 chunk_start, u64 physical, u64 devid,
3173 		     u64 **logical, int *naddrs, int *stripe_len)
3174 {
3175 	struct extent_map_tree *em_tree = &map_tree->map_tree;
3176 	struct extent_map *em;
3177 	struct map_lookup *map;
3178 	u64 *buf;
3179 	u64 bytenr;
3180 	u64 length;
3181 	u64 stripe_nr;
3182 	int i, j, nr = 0;
3183 
3184 	read_lock(&em_tree->lock);
3185 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
3186 	read_unlock(&em_tree->lock);
3187 
3188 	BUG_ON(!em || em->start != chunk_start);
3189 	map = (struct map_lookup *)em->bdev;
3190 
3191 	length = em->len;
3192 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3193 		do_div(length, map->num_stripes / map->sub_stripes);
3194 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3195 		do_div(length, map->num_stripes);
3196 
3197 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3198 	BUG_ON(!buf);
3199 
3200 	for (i = 0; i < map->num_stripes; i++) {
3201 		if (devid && map->stripes[i].dev->devid != devid)
3202 			continue;
3203 		if (map->stripes[i].physical > physical ||
3204 		    map->stripes[i].physical + length <= physical)
3205 			continue;
3206 
3207 		stripe_nr = physical - map->stripes[i].physical;
3208 		do_div(stripe_nr, map->stripe_len);
3209 
3210 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3211 			stripe_nr = stripe_nr * map->num_stripes + i;
3212 			do_div(stripe_nr, map->sub_stripes);
3213 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3214 			stripe_nr = stripe_nr * map->num_stripes + i;
3215 		}
3216 		bytenr = chunk_start + stripe_nr * map->stripe_len;
3217 		WARN_ON(nr >= map->num_stripes);
3218 		for (j = 0; j < nr; j++) {
3219 			if (buf[j] == bytenr)
3220 				break;
3221 		}
3222 		if (j == nr) {
3223 			WARN_ON(nr >= map->num_stripes);
3224 			buf[nr++] = bytenr;
3225 		}
3226 	}
3227 
3228 	*logical = buf;
3229 	*naddrs = nr;
3230 	*stripe_len = map->stripe_len;
3231 
3232 	free_extent_map(em);
3233 	return 0;
3234 }
3235 
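/*
 * Completion handler for the per-stripe bios.  Errors are counted and,
 * once the last stripe finishes, the original bio is completed; an
 * error is only passed up if more stripes failed than the profile's
 * max_errors allows.
 */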
3236 static void btrfs_end_bio(struct bio *bio, int err)
3237 {
3238 	struct btrfs_bio *bbio = bio->bi_private;
3239 	int is_orig_bio = 0;
3240 
3241 	if (err)
3242 		atomic_inc(&bbio->error);
3243 
3244 	if (bio == bbio->orig_bio)
3245 		is_orig_bio = 1;
3246 
3247 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
3248 		if (!is_orig_bio) {
3249 			bio_put(bio);
3250 			bio = bbio->orig_bio;
3251 		}
3252 		bio->bi_private = bbio->private;
3253 		bio->bi_end_io = bbio->end_io;
3254 		bio->bi_bdev = (struct block_device *)
3255 					(unsigned long)bbio->mirror_num;
3256 		/* only send an error to the higher layers if it is
3257 		 * beyond the tolerance of the multi-bio
3258 		 */
3259 		if (atomic_read(&bbio->error) > bbio->max_errors) {
3260 			err = -EIO;
3261 		} else if (err) {
3262 			/*
3263 			 * this bio is actually up to date, we didn't
3264 			 * go over the max number of errors
3265 			 */
3266 			set_bit(BIO_UPTODATE, &bio->bi_flags);
3267 			err = 0;
3268 		}
3269 		kfree(bbio);
3270 
3271 		bio_endio(bio, err);
3272 	} else if (!is_orig_bio) {
3273 		bio_put(bio);
3274 	}
3275 }
3276 
3277 struct async_sched {
3278 	struct bio *bio;
3279 	int rw;
3280 	struct btrfs_fs_info *info;
3281 	struct btrfs_work work;
3282 };
3283 
3284 /*
3285  * see run_scheduled_bios for a description of why bios are collected for
3286  * async submit.
3287  *
3288  * This will add one bio to the pending list for a device and make sure
3289  * the work struct is scheduled.
3290  */
3291 static noinline int schedule_bio(struct btrfs_root *root,
3292 				 struct btrfs_device *device,
3293 				 int rw, struct bio *bio)
3294 {
3295 	int should_queue = 1;
3296 	struct btrfs_pending_bios *pending_bios;
3297 
3298 	/* don't bother with additional async steps for reads, right now */
3299 	if (!(rw & REQ_WRITE)) {
3300 		bio_get(bio);
3301 		submit_bio(rw, bio);
3302 		bio_put(bio);
3303 		return 0;
3304 	}
3305 
3306 	/*
3307 	 * nr_async_bios allows us to reliably return congestion to the
3308 	 * higher layers.  Otherwise, the async bio makes it appear we have
3309 	 * made progress against dirty pages when we've really just put it
3310 	 * on a queue for later
3311 	 */
3312 	atomic_inc(&root->fs_info->nr_async_bios);
3313 	WARN_ON(bio->bi_next);
3314 	bio->bi_next = NULL;
3315 	bio->bi_rw |= rw;
3316 
3317 	spin_lock(&device->io_lock);
3318 	if (bio->bi_rw & REQ_SYNC)
3319 		pending_bios = &device->pending_sync_bios;
3320 	else
3321 		pending_bios = &device->pending_bios;
3322 
3323 	if (pending_bios->tail)
3324 		pending_bios->tail->bi_next = bio;
3325 
3326 	pending_bios->tail = bio;
3327 	if (!pending_bios->head)
3328 		pending_bios->head = bio;
3329 	if (device->running_pending)
3330 		should_queue = 0;
3331 
3332 	spin_unlock(&device->io_lock);
3333 
3334 	if (should_queue)
3335 		btrfs_queue_worker(&root->fs_info->submit_workers,
3336 				   &device->work);
3337 	return 0;
3338 }
3339 
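/*
 * Map @bio with btrfs_map_block() and submit one (cloned) bio per
 * stripe, either directly or through the async submit workers.
 */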
3340 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3341 		  int mirror_num, int async_submit)
3342 {
3343 	struct btrfs_mapping_tree *map_tree;
3344 	struct btrfs_device *dev;
3345 	struct bio *first_bio = bio;
3346 	u64 logical = (u64)bio->bi_sector << 9;
3347 	u64 length = 0;
3348 	u64 map_length;
3349 	int ret;
3350 	int dev_nr = 0;
3351 	int total_devs = 1;
3352 	struct btrfs_bio *bbio = NULL;
3353 
3354 	length = bio->bi_size;
3355 	map_tree = &root->fs_info->mapping_tree;
3356 	map_length = length;
3357 
3358 	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3359 			      mirror_num);
3360 	BUG_ON(ret);
3361 
3362 	total_devs = bbio->num_stripes;
3363 	if (map_length < length) {
3364 		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3365 		       "len %llu\n", (unsigned long long)logical,
3366 		       (unsigned long long)length,
3367 		       (unsigned long long)map_length);
3368 		BUG();
3369 	}
3370 
3371 	bbio->orig_bio = first_bio;
3372 	bbio->private = first_bio->bi_private;
3373 	bbio->end_io = first_bio->bi_end_io;
3374 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3375 
3376 	while (dev_nr < total_devs) {
3377 		if (dev_nr < total_devs - 1) {
3378 			bio = bio_clone(first_bio, GFP_NOFS);
3379 			BUG_ON(!bio);
3380 		} else {
3381 			bio = first_bio;
3382 		}
3383 		bio->bi_private = bbio;
3384 		bio->bi_end_io = btrfs_end_bio;
3385 		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 		dev = bbio->stripes[dev_nr].dev;
3387 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3388 			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
3389 				 "(%s id %llu), size=%u\n", rw,
3390 				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 				 dev->name, dev->devid, bio->bi_size);
3392 			bio->bi_bdev = dev->bdev;
3393 			if (async_submit)
3394 				schedule_bio(root, dev, rw, bio);
3395 			else
3396 				submit_bio(rw, bio);
3397 		} else {
3398 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3399 			bio->bi_sector = logical >> 9;
3400 			bio_endio(bio, -EIO);
3401 		}
3402 		dev_nr++;
3403 	}
3404 	return 0;
3405 }
3406 
3407 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3408 				       u8 *uuid, u8 *fsid)
3409 {
3410 	struct btrfs_device *device;
3411 	struct btrfs_fs_devices *cur_devices;
3412 
3413 	cur_devices = root->fs_info->fs_devices;
3414 	while (cur_devices) {
3415 		if (!fsid ||
3416 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3417 			device = __find_device(&cur_devices->devices,
3418 					       devid, uuid);
3419 			if (device)
3420 				return device;
3421 		}
3422 		cur_devices = cur_devices->seed;
3423 	}
3424 	return NULL;
3425 }
3426 
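/*
 * Create a placeholder btrfs_device for a devid that the metadata
 * references but that was not present when the filesystem was opened.
 */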
3427 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3428 					    u64 devid, u8 *dev_uuid)
3429 {
3430 	struct btrfs_device *device;
3431 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3432 
3433 	device = kzalloc(sizeof(*device), GFP_NOFS);
3434 	if (!device)
3435 		return NULL;
3436 	list_add(&device->dev_list,
3437 		 &fs_devices->devices);
3438 	device->dev_root = root->fs_info->dev_root;
3439 	device->devid = devid;
3440 	device->work.func = pending_bios_fn;
3441 	device->fs_devices = fs_devices;
3442 	device->missing = 1;
3443 	fs_devices->num_devices++;
3444 	fs_devices->missing_devices++;
3445 	spin_lock_init(&device->io_lock);
3446 	INIT_LIST_HEAD(&device->dev_alloc_list);
3447 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3448 	return device;
3449 }
3450 
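/*
 * Turn an on-disk chunk item into a map_lookup and add it to the
 * in-memory mapping tree, creating missing-device placeholders when
 * mounted degraded.
 */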
3451 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3452 			  struct extent_buffer *leaf,
3453 			  struct btrfs_chunk *chunk)
3454 {
3455 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3456 	struct map_lookup *map;
3457 	struct extent_map *em;
3458 	u64 logical;
3459 	u64 length;
3460 	u64 devid;
3461 	u8 uuid[BTRFS_UUID_SIZE];
3462 	int num_stripes;
3463 	int ret;
3464 	int i;
3465 
3466 	logical = key->offset;
3467 	length = btrfs_chunk_length(leaf, chunk);
3468 
3469 	read_lock(&map_tree->map_tree.lock);
3470 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3471 	read_unlock(&map_tree->map_tree.lock);
3472 
3473 	/* already mapped? */
3474 	if (em && em->start <= logical && em->start + em->len > logical) {
3475 		free_extent_map(em);
3476 		return 0;
3477 	} else if (em) {
3478 		free_extent_map(em);
3479 	}
3480 
3481 	em = alloc_extent_map();
3482 	if (!em)
3483 		return -ENOMEM;
3484 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3485 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3486 	if (!map) {
3487 		free_extent_map(em);
3488 		return -ENOMEM;
3489 	}
3490 
3491 	em->bdev = (struct block_device *)map;
3492 	em->start = logical;
3493 	em->len = length;
3494 	em->block_start = 0;
3495 	em->block_len = em->len;
3496 
3497 	map->num_stripes = num_stripes;
3498 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
3499 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
3500 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3501 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3502 	map->type = btrfs_chunk_type(leaf, chunk);
3503 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3504 	for (i = 0; i < num_stripes; i++) {
3505 		map->stripes[i].physical =
3506 			btrfs_stripe_offset_nr(leaf, chunk, i);
3507 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3508 		read_extent_buffer(leaf, uuid, (unsigned long)
3509 				   btrfs_stripe_dev_uuid_nr(chunk, i),
3510 				   BTRFS_UUID_SIZE);
3511 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3512 							NULL);
3513 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3514 			kfree(map);
3515 			free_extent_map(em);
3516 			return -EIO;
3517 		}
3518 		if (!map->stripes[i].dev) {
3519 			map->stripes[i].dev =
3520 				add_missing_dev(root, devid, uuid);
3521 			if (!map->stripes[i].dev) {
3522 				kfree(map);
3523 				free_extent_map(em);
3524 				return -EIO;
3525 			}
3526 		}
3527 		map->stripes[i].dev->in_fs_metadata = 1;
3528 	}
3529 
3530 	write_lock(&map_tree->map_tree.lock);
3531 	ret = add_extent_mapping(&map_tree->map_tree, em);
3532 	write_unlock(&map_tree->map_tree.lock);
3533 	BUG_ON(ret);
3534 	free_extent_map(em);
3535 
3536 	return 0;
3537 }
3538 
3539 static int fill_device_from_item(struct extent_buffer *leaf,
3540 				 struct btrfs_dev_item *dev_item,
3541 				 struct btrfs_device *device)
3542 {
3543 	unsigned long ptr;
3544 
3545 	device->devid = btrfs_device_id(leaf, dev_item);
3546 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3547 	device->total_bytes = device->disk_total_bytes;
3548 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3549 	device->type = btrfs_device_type(leaf, dev_item);
3550 	device->io_align = btrfs_device_io_align(leaf, dev_item);
3551 	device->io_width = btrfs_device_io_width(leaf, dev_item);
3552 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3553 
3554 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
3555 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3556 
3557 	return 0;
3558 }
3559 
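/*
 * Make sure the seed filesystem identified by @fsid is open and chained
 * onto fs_devices->seed so its devices can be looked up.
 */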
3560 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3561 {
3562 	struct btrfs_fs_devices *fs_devices;
3563 	int ret;
3564 
3565 	mutex_lock(&uuid_mutex);
3566 
3567 	fs_devices = root->fs_info->fs_devices->seed;
3568 	while (fs_devices) {
3569 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3570 			ret = 0;
3571 			goto out;
3572 		}
3573 		fs_devices = fs_devices->seed;
3574 	}
3575 
3576 	fs_devices = find_fsid(fsid);
3577 	if (!fs_devices) {
3578 		ret = -ENOENT;
3579 		goto out;
3580 	}
3581 
3582 	fs_devices = clone_fs_devices(fs_devices);
3583 	if (IS_ERR(fs_devices)) {
3584 		ret = PTR_ERR(fs_devices);
3585 		goto out;
3586 	}
3587 
3588 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3589 				   root->fs_info->bdev_holder);
3590 	if (ret)
3591 		goto out;
3592 
3593 	if (!fs_devices->seeding) {
3594 		__btrfs_close_devices(fs_devices);
3595 		free_fs_devices(fs_devices);
3596 		ret = -EINVAL;
3597 		goto out;
3598 	}
3599 
3600 	fs_devices->seed = root->fs_info->fs_devices->seed;
3601 	root->fs_info->fs_devices->seed = fs_devices;
3602 out:
3603 	mutex_unlock(&uuid_mutex);
3604 	return ret;
3605 }
3606 
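/*
 * Process one dev item from the chunk tree: find (or, when degraded,
 * fake up) the matching btrfs_device, fill it from the item and account
 * its free space.
 */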
3607 static int read_one_dev(struct btrfs_root *root,
3608 			struct extent_buffer *leaf,
3609 			struct btrfs_dev_item *dev_item)
3610 {
3611 	struct btrfs_device *device;
3612 	u64 devid;
3613 	int ret;
3614 	u8 fs_uuid[BTRFS_UUID_SIZE];
3615 	u8 dev_uuid[BTRFS_UUID_SIZE];
3616 
3617 	devid = btrfs_device_id(leaf, dev_item);
3618 	read_extent_buffer(leaf, dev_uuid,
3619 			   (unsigned long)btrfs_device_uuid(dev_item),
3620 			   BTRFS_UUID_SIZE);
3621 	read_extent_buffer(leaf, fs_uuid,
3622 			   (unsigned long)btrfs_device_fsid(dev_item),
3623 			   BTRFS_UUID_SIZE);
3624 
3625 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3626 		ret = open_seed_devices(root, fs_uuid);
3627 		if (ret && !btrfs_test_opt(root, DEGRADED))
3628 			return ret;
3629 	}
3630 
3631 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3632 	if (!device || !device->bdev) {
3633 		if (!btrfs_test_opt(root, DEGRADED))
3634 			return -EIO;
3635 
3636 		if (!device) {
3637 			printk(KERN_WARNING "warning devid %llu missing\n",
3638 			       (unsigned long long)devid);
3639 			device = add_missing_dev(root, devid, dev_uuid);
3640 			if (!device)
3641 				return -ENOMEM;
3642 		} else if (!device->missing) {
3643 			/*
3644 			 * this happens when a device that was properly set up
3645 			 * in the device info lists suddenly goes bad.
3646 			 * device->bdev is NULL, and so we have to set
3647 			 * device->missing to one here
3648 			 */
3649 			root->fs_info->fs_devices->missing_devices++;
3650 			device->missing = 1;
3651 		}
3652 	}
3653 
3654 	if (device->fs_devices != root->fs_info->fs_devices) {
3655 		BUG_ON(device->writeable);
3656 		if (device->generation !=
3657 		    btrfs_device_generation(leaf, dev_item))
3658 			return -EINVAL;
3659 	}
3660 
3661 	fill_device_from_item(leaf, dev_item, device);
3662 	device->dev_root = root->fs_info->dev_root;
3663 	device->in_fs_metadata = 1;
3664 	if (device->writeable) {
3665 		device->fs_devices->total_rw_bytes += device->total_bytes;
3666 		spin_lock(&root->fs_info->free_chunk_lock);
3667 		root->fs_info->free_chunk_space += device->total_bytes -
3668 			device->bytes_used;
3669 		spin_unlock(&root->fs_info->free_chunk_lock);
3670 	}
3671 	ret = 0;
3672 	return ret;
3673 }
3674 
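/*
 * Bootstrap the chunk mappings from the sys_chunk_array embedded in the
 * superblock so that the chunk tree itself can be read.
 */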
3675 int btrfs_read_sys_array(struct btrfs_root *root)
3676 {
3677 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3678 	struct extent_buffer *sb;
3679 	struct btrfs_disk_key *disk_key;
3680 	struct btrfs_chunk *chunk;
3681 	u8 *ptr;
3682 	unsigned long sb_ptr;
3683 	int ret = 0;
3684 	u32 num_stripes;
3685 	u32 array_size;
3686 	u32 len = 0;
3687 	u32 cur;
3688 	struct btrfs_key key;
3689 
3690 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3691 					  BTRFS_SUPER_INFO_SIZE);
3692 	if (!sb)
3693 		return -ENOMEM;
3694 	btrfs_set_buffer_uptodate(sb);
3695 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
3696 
3697 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3698 	array_size = btrfs_super_sys_array_size(super_copy);
3699 
3700 	ptr = super_copy->sys_chunk_array;
3701 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3702 	cur = 0;
3703 
3704 	while (cur < array_size) {
3705 		disk_key = (struct btrfs_disk_key *)ptr;
3706 		btrfs_disk_key_to_cpu(&key, disk_key);
3707 
3708 		len = sizeof(*disk_key); ptr += len;
3709 		sb_ptr += len;
3710 		cur += len;
3711 
3712 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3713 			chunk = (struct btrfs_chunk *)sb_ptr;
3714 			ret = read_one_chunk(root, &key, sb, chunk);
3715 			if (ret)
3716 				break;
3717 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3718 			len = btrfs_chunk_item_size(num_stripes);
3719 		} else {
3720 			ret = -EIO;
3721 			break;
3722 		}
3723 		ptr += len;
3724 		sb_ptr += len;
3725 		cur += len;
3726 	}
3727 	free_extent_buffer(sb);
3728 	return ret;
3729 }
3730 
3731 int btrfs_read_chunk_tree(struct btrfs_root *root)
3732 {
3733 	struct btrfs_path *path;
3734 	struct extent_buffer *leaf;
3735 	struct btrfs_key key;
3736 	struct btrfs_key found_key;
3737 	int ret;
3738 	int slot;
3739 
3740 	root = root->fs_info->chunk_root;
3741 
3742 	path = btrfs_alloc_path();
3743 	if (!path)
3744 		return -ENOMEM;
3745 
3746 	/* first we search for all of the device items, and then we
3747 	 * read in all of the chunk items.  This way we can create chunk
3748 	 * mappings that reference all of the devices that are around
3749 	 */
3750 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3751 	key.offset = 0;
3752 	key.type = 0;
3753 again:
3754 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3755 	if (ret < 0)
3756 		goto error;
3757 	while (1) {
3758 		leaf = path->nodes[0];
3759 		slot = path->slots[0];
3760 		if (slot >= btrfs_header_nritems(leaf)) {
3761 			ret = btrfs_next_leaf(root, path);
3762 			if (ret == 0)
3763 				continue;
3764 			if (ret < 0)
3765 				goto error;
3766 			break;
3767 		}
3768 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3769 		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3770 			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3771 				break;
3772 			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3773 				struct btrfs_dev_item *dev_item;
3774 				dev_item = btrfs_item_ptr(leaf, slot,
3775 						  struct btrfs_dev_item);
3776 				ret = read_one_dev(root, leaf, dev_item);
3777 				if (ret)
3778 					goto error;
3779 			}
3780 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3781 			struct btrfs_chunk *chunk;
3782 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3783 			ret = read_one_chunk(root, &found_key, leaf, chunk);
3784 			if (ret)
3785 				goto error;
3786 		}
3787 		path->slots[0]++;
3788 	}
3789 	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3790 		key.objectid = 0;
3791 		btrfs_release_path(path);
3792 		goto again;
3793 	}
3794 	ret = 0;
3795 error:
3796 	btrfs_free_path(path);
3797 	return ret;
3798 }
3799