/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{

	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
	return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	u64 found_transid = btrfs_super_generation(disk_super);
	char *name;

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		spin_lock_init(&device->io_lock);
		device->name = kstrdup(path, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&device->dev_alloc_list);

		/* init readahead state */
		spin_lock_init(&device->reada_lock);
		device->reada_curr_zone = NULL;
		atomic_set(&device->reada_in_flight, 0);
		device->reada_next = 0;
		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (!device->name || strcmp(device->name, path)) {
		name = kstrdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		kfree(device->name);
		device->name = name;
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

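/*
 * Make an in-memory copy of an fs_devices list.  The copy duplicates each
 * device's name, devid and uuid but is not opened; the caller holds the
 * volume lock (used e.g. by btrfs_prepare_sprout()).
 */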
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		device->name = kstrdup(orig_dev->name, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			goto error;
		}

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		kfree(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	mutex_unlock(&uuid_mutex);
	return 0;
}

static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	kfree(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_device *new_device;

		if (device->bdev)
			fs_devices->open_devices--;

		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		if (device->can_discard)
			fs_devices->num_can_discard--;

		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
		BUG_ON(!new_device);
		memcpy(new_device, device, sizeof(*new_device));
		new_device->name = kstrdup(device->name, GFP_NOFS);
		BUG_ON(device->name && !new_device->name);
		new_device->bdev = NULL;
		new_device->writeable = 0;
		new_device->in_fs_metadata = 0;
		new_device->can_discard = 0;
		list_replace_rcu(&device->dev_list, &new_device->dev_list);

		call_rcu(&device->rcu, free_device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = blkdev_get_by_path(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q)) {
			device->can_discard = 1;
			fs_devices->num_can_discard++;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
error_close:
		blkdev_put(bdev, flags);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

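/*
 * Open all the devices for this filesystem, or just bump the open count
 * if they are already open.  Serialized by the uuid_mutex.
 */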
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	flags |= FMODE_EXCL;
	bdev = blkdev_get_by_path(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EINVAL;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else
		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:	transaction handle
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space.
 * @len:	the size of the free space that we find, or the size of the max
 *		free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;
	hole_size = 0;

	if (search_start >= search_end) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size.  Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start)
		hole_size = search_end - search_start;

	if (hole_size > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole_size;
	}

	/* See above. */
	if (hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
error:
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	}
	BUG_ON(ret);

	if (device->bytes_used > 0) {
		u64 len = btrfs_dev_extent_length(leaf, extent);
		device->bytes_used -= len;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += len;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = btrfs_del_item(trans, root, path);

out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);
	return ret;
}

static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}

int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_devices *cur_devices;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	int ret = 0;
	bool clear_super = false;

	mutex_lock(&uuid_mutex);
	mutex_lock(&root->fs_info->volume_mutex);

	all_avail = root->fs_info->avail_data_alloc_bits |
		    root->fs_info->avail_system_alloc_bits |
		    root->fs_info->avail_metadata_alloc_bits;

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
	    root->fs_info->fs_devices->num_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
	    root->fs_info->fs_devices->num_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
					  root->fs_info->bdev_holder);
		if (IS_ERR(bdev)) {
			ret = PTR_ERR(bdev);
			goto out;
		}

		set_blocksize(bdev, 4096);
		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EINVAL;
			goto error_close;
		}
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		lock_chunks(root);
		list_del_init(&device->dev_alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices--;
		clear_super = true;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_undo;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space = device->total_bytes -
		device->bytes_used;
	spin_unlock(&root->fs_info->free_chunk_lock);

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(root, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;

	if (device->missing)
		root->fs_info->fs_devices->missing_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev)
		device->fs_devices->open_devices--;

	call_rcu(&device->rcu, free_device);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = cur_devices->seed;
		cur_devices->seed = NULL;
		lock_chunks(root);
		__btrfs_close_devices(cur_devices);
		unlock_chunks(root);
		free_fs_devices(cur_devices);
	}

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	root = root->fs_info->chunk_root;
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid,
				   (unsigned long)btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid,
				   (unsigned long)btrfs_device_fsid(dev_item),
				   BTRFS_UUID_SIZE);
		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
		BUG_ON(!device);

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = root->fs_info->sb;
	u64 total_bytes;
	int seeding_dev = 0;
	int ret = 0;

	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
		return -EINVAL;

	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
				  root->fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (root->fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);
	mutex_lock(&root->fs_info->volume_mutex);

	devices = &root->fs_info->fs_devices->devices;
	/*
	 * we have the volume lock, so we don't need the extra
	 * device list mutex while reading the list here.
	 */
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			goto error;
		}
	}

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device) {
		/* we can safely leave the fs_devices entry around */
		ret = -ENOMEM;
		goto error;
	}

	device->name = kstrdup(device_path, GFP_NOFS);
	if (!device->name) {
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}

	ret = find_next_devid(root, &device->devid);
	if (ret) {
		kfree(device->name);
		kfree(device);
		goto error;
	}

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		kfree(device->name);
		kfree(device);
		ret = PTR_ERR(trans);
		goto error;
	}

	lock_chunks(root);

	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
	device->writeable = 1;
	device->work.func = pending_bios_fn;
	generate_random_uuid(device->uuid);
	spin_lock_init(&device->io_lock);
	device->generation = trans->transid;
	device->io_width = root->sectorsize;
	device->io_align = root->sectorsize;
	device->sector_size = root->sectorsize;
	device->total_bytes = i_size_read(bdev->bd_inode);
	device->disk_total_bytes = device->total_bytes;
	device->dev_root = root->fs_info->dev_root;
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->mode = FMODE_EXCL;
	set_blocksize(device->bdev, 4096);

	if (seeding_dev) {
		sb->s_flags &= ~MS_RDONLY;
		ret = btrfs_prepare_sprout(trans, root);
		BUG_ON(ret);
	}

	device->fs_devices = root->fs_info->fs_devices;

	/*
	 * we don't want write_supers to jump in here with our device
	 * half setup
	 */
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &root->fs_info->fs_devices->alloc_list);
	root->fs_info->fs_devices->num_devices++;
	root->fs_info->fs_devices->open_devices++;
	root->fs_info->fs_devices->rw_devices++;
	if (device->can_discard)
		root->fs_info->fs_devices->num_can_discard++;
	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space += device->total_bytes;
	spin_unlock(&root->fs_info->free_chunk_lock);

	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
		root->fs_info->fs_devices->rotating = 1;

	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
	btrfs_set_super_total_bytes(root->fs_info->super_copy,
				    total_bytes + device->total_bytes);

	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
	btrfs_set_super_num_devices(root->fs_info->super_copy,
				    total_bytes + 1);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		ret = init_first_rw_device(trans, root, device);
		BUG_ON(ret);
		ret = btrfs_finish_sprout(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_add_device(trans, root, device);
	}

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(root->fs_info);

	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);

		ret = btrfs_relocate_sys_chunks(root);
		BUG_ON(ret);
	}
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	return ret;
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	goto out;
}

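/*
 * Write the in-memory device fields (type, io parameters, sizes) back into
 * the device's dev item in the chunk tree.
 */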
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	root = device->dev_root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
			       struct btrfs_device *device, u64 new_size)
{
	struct btrfs_super_block *super_copy =
		device->dev_root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 diff = new_size - device->total_bytes;

	if (!device->writeable)
		return -EACCES;
	if (new_size <= device->total_bytes)
		return -EINVAL;

	btrfs_set_super_total_bytes(super_copy, old_total + diff);
	device->fs_devices->total_rw_bytes += diff;

	device->total_bytes = new_size;
	device->disk_total_bytes = new_size;
	btrfs_clear_space_info_full(device->dev_root->fs_info);

	return btrfs_update_device(trans, device);
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	int ret;
	lock_chunks(device->dev_root);
	ret = __btrfs_grow_device(trans, device, new_size);
	unlock_chunks(device->dev_root);
	return ret;
}

static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 chunk_tree, u64 chunk_objectid,
			    u64 chunk_offset)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	root = root->fs_info->chunk_root;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = chunk_objectid;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	BUG_ON(ret);

	ret = btrfs_del_item(trans, root, path);

	btrfs_free_path(path);
	return ret;
}

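/*
 * Remove a chunk's entry from the sys_chunk_array kept in the in-memory
 * copy of the super block.
 */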
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
			       chunk_offset)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == chunk_objectid &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

static int btrfs_relocate_chunk(struct btrfs_root *root,
				u64 chunk_tree, u64 chunk_objectid,
				u64 chunk_offset)
{
	struct extent_map_tree *em_tree;
	struct btrfs_root *extent_root;
	struct btrfs_trans_handle *trans;
	struct extent_map *em;
	struct map_lookup *map;
	int ret;
	int i;

	root = root->fs_info->chunk_root;
	extent_root = root->fs_info->extent_root;
	em_tree = &root->fs_info->mapping_tree.map_tree;

	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	BUG_ON(IS_ERR(trans));

	lock_chunks(root);

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	BUG_ON(em->start > chunk_offset ||
	       em->start + em->len < chunk_offset);
	map = (struct map_lookup *)em->bdev;

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
					    map->stripes[i].physical);
		BUG_ON(ret);

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			BUG_ON(ret);
		}
	}
	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
			       chunk_offset);

	BUG_ON(ret);

	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
		BUG_ON(ret);
	}

	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
	BUG_ON(ret);

	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	kfree(map);
	em->bdev = NULL;

	/* once for the tree */
	free_extent_map(em);
	/* once for us */
	free_extent_map(em);

	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	return 0;
}

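/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.  Chunks
 * that fail with -ENOSPC are retried once after the others have been moved.
 */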
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_tree = chunk_root->root_key.objectid;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;
		BUG_ON(ret == 0);

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
						   found_key.objectid,
						   found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else if (ret)
				BUG();
		}

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		WARN_ON(1);
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

static u64 div_factor(u64 num, int factor)
{
	if (factor == 10)
		return num;
	num *= factor;
	do_div(num, 10);
	return num;
}

int btrfs_balance(struct btrfs_root *dev_root)
{
	int ret;
	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_key found_key;

	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&dev_root->fs_info->volume_mutex);
	dev_root = dev_root->fs_info->dev_root;

	/* step one make some room on all the devices */
	list_for_each_entry(device, devices, dev_list) {
		old_size = device->total_bytes;
		size_to_free = div_factor(old_size, 1);
		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
		if (!device->writeable ||
		    device->total_bytes - device->bytes_used > size_to_free)
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		BUG_ON(ret);

		trans = btrfs_start_transaction(dev_root, 0);
		BUG_ON(IS_ERR(trans));

		ret = btrfs_grow_device(trans, device, old_size);
		BUG_ON(ret);

		btrfs_end_transaction(trans, dev_root);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;

		/*
		 * this shouldn't happen, it means the last relocate
shouldn't happen, it means the last relocate 2145 * failed 2146 */ 2147 if (ret == 0) 2148 break; 2149 2150 ret = btrfs_previous_item(chunk_root, path, 0, 2151 BTRFS_CHUNK_ITEM_KEY); 2152 if (ret) 2153 break; 2154 2155 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2156 path->slots[0]); 2157 if (found_key.objectid != key.objectid) 2158 break; 2159 2160 /* chunk zero is special */ 2161 if (found_key.offset == 0) 2162 break; 2163 2164 btrfs_release_path(path); 2165 ret = btrfs_relocate_chunk(chunk_root, 2166 chunk_root->root_key.objectid, 2167 found_key.objectid, 2168 found_key.offset); 2169 if (ret && ret != -ENOSPC) 2170 goto error; 2171 key.offset = found_key.offset - 1; 2172 } 2173 ret = 0; 2174 error: 2175 btrfs_free_path(path); 2176 mutex_unlock(&dev_root->fs_info->volume_mutex); 2177 return ret; 2178 } 2179 2180 /* 2181 * shrinking a device means finding all of the device extents past 2182 * the new size, and then following the back refs to the chunks. 2183 * The chunk relocation code actually frees the device extent 2184 */ 2185 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2186 { 2187 struct btrfs_trans_handle *trans; 2188 struct btrfs_root *root = device->dev_root; 2189 struct btrfs_dev_extent *dev_extent = NULL; 2190 struct btrfs_path *path; 2191 u64 length; 2192 u64 chunk_tree; 2193 u64 chunk_objectid; 2194 u64 chunk_offset; 2195 int ret; 2196 int slot; 2197 int failed = 0; 2198 bool retried = false; 2199 struct extent_buffer *l; 2200 struct btrfs_key key; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2202 u64 old_total = btrfs_super_total_bytes(super_copy); 2203 u64 old_size = device->total_bytes; 2204 u64 diff = device->total_bytes - new_size; 2205 2206 if (new_size >= device->total_bytes) 2207 return -EINVAL; 2208 2209 path = btrfs_alloc_path(); 2210 if (!path) 2211 return -ENOMEM; 2212 2213 path->reada = 2; 2214 2215 lock_chunks(root); 2216 2217 device->total_bytes = new_size; 2218 if (device->writeable) { 2219 device->fs_devices->total_rw_bytes -= diff; 2220 spin_lock(&root->fs_info->free_chunk_lock); 2221 root->fs_info->free_chunk_space -= diff; 2222 spin_unlock(&root->fs_info->free_chunk_lock); 2223 } 2224 unlock_chunks(root); 2225 2226 again: 2227 key.objectid = device->devid; 2228 key.offset = (u64)-1; 2229 key.type = BTRFS_DEV_EXTENT_KEY; 2230 2231 while (1) { 2232 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2233 if (ret < 0) 2234 goto done; 2235 2236 ret = btrfs_previous_item(root, path, 0, key.type); 2237 if (ret < 0) 2238 goto done; 2239 if (ret) { 2240 ret = 0; 2241 btrfs_release_path(path); 2242 break; 2243 } 2244 2245 l = path->nodes[0]; 2246 slot = path->slots[0]; 2247 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2248 2249 if (key.objectid != device->devid) { 2250 btrfs_release_path(path); 2251 break; 2252 } 2253 2254 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2255 length = btrfs_dev_extent_length(l, dev_extent); 2256 2257 if (key.offset + length <= new_size) { 2258 btrfs_release_path(path); 2259 break; 2260 } 2261 2262 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2263 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2264 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2265 btrfs_release_path(path); 2266 2267 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2268 chunk_offset); 2269 if (ret && ret != -ENOSPC) 2270 goto done; 2271 if (ret == -ENOSPC) 2272 failed++; 2273 key.offset -= 1; 2274 } 2275 2276 if (failed && !retried) { 
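/* retry the whole scan once: a chunk that failed with -ENOSPC earlier may relocate successfully now that other chunks have been moved out of the way */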
2277 failed = 0; 2278 retried = true; 2279 goto again; 2280 } else if (failed && retried) { 2281 ret = -ENOSPC; 2282 lock_chunks(root); 2283 2284 device->total_bytes = old_size; 2285 if (device->writeable) 2286 device->fs_devices->total_rw_bytes += diff; 2287 spin_lock(&root->fs_info->free_chunk_lock); 2288 root->fs_info->free_chunk_space += diff; 2289 spin_unlock(&root->fs_info->free_chunk_lock); 2290 unlock_chunks(root); 2291 goto done; 2292 } 2293 2294 /* Shrinking succeeded, else we would be at "done". */ 2295 trans = btrfs_start_transaction(root, 0); 2296 if (IS_ERR(trans)) { 2297 ret = PTR_ERR(trans); 2298 goto done; 2299 } 2300 2301 lock_chunks(root); 2302 2303 device->disk_total_bytes = new_size; 2304 /* Now btrfs_update_device() will change the on-disk size. */ 2305 ret = btrfs_update_device(trans, device); 2306 if (ret) { 2307 unlock_chunks(root); 2308 btrfs_end_transaction(trans, root); 2309 goto done; 2310 } 2311 WARN_ON(diff > old_total); 2312 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2313 unlock_chunks(root); 2314 btrfs_end_transaction(trans, root); 2315 done: 2316 btrfs_free_path(path); 2317 return ret; 2318 } 2319 2320 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2321 struct btrfs_root *root, 2322 struct btrfs_key *key, 2323 struct btrfs_chunk *chunk, int item_size) 2324 { 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2326 struct btrfs_disk_key disk_key; 2327 u32 array_size; 2328 u8 *ptr; 2329 2330 array_size = btrfs_super_sys_array_size(super_copy); 2331 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2332 return -EFBIG; 2333 2334 ptr = super_copy->sys_chunk_array + array_size; 2335 btrfs_cpu_key_to_disk(&disk_key, key); 2336 memcpy(ptr, &disk_key, sizeof(disk_key)); 2337 ptr += sizeof(disk_key); 2338 memcpy(ptr, chunk, item_size); 2339 item_size += sizeof(disk_key); 2340 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2341 return 0; 2342 } 2343 2344 /* 2345 * sort the devices in descending order by max_avail, total_avail 2346 */ 2347 static int btrfs_cmp_device_info(const void *a, const void *b) 2348 { 2349 const struct btrfs_device_info *di_a = a; 2350 const struct btrfs_device_info *di_b = b; 2351 2352 if (di_a->max_avail > di_b->max_avail) 2353 return -1; 2354 if (di_a->max_avail < di_b->max_avail) 2355 return 1; 2356 if (di_a->total_avail > di_b->total_avail) 2357 return -1; 2358 if (di_a->total_avail < di_b->total_avail) 2359 return 1; 2360 return 0; 2361 } 2362 2363 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2364 struct btrfs_root *extent_root, 2365 struct map_lookup **map_ret, 2366 u64 *num_bytes_out, u64 *stripe_size_out, 2367 u64 start, u64 type) 2368 { 2369 struct btrfs_fs_info *info = extent_root->fs_info; 2370 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2371 struct list_head *cur; 2372 struct map_lookup *map = NULL; 2373 struct extent_map_tree *em_tree; 2374 struct extent_map *em; 2375 struct btrfs_device_info *devices_info = NULL; 2376 u64 total_avail; 2377 int num_stripes; /* total number of stripes to allocate */ 2378 int sub_stripes; /* sub_stripes info for map */ 2379 int dev_stripes; /* stripes per dev */ 2380 int devs_max; /* max devs to use */ 2381 int devs_min; /* min devs needed */ 2382 int devs_increment; /* ndevs has to be a multiple of this */ 2383 int ncopies; /* how many copies to data has */ 2384 int ret; 2385 u64 max_stripe_size; 2386 u64 max_chunk_size; 2387 u64 stripe_size; 2388 u64 num_bytes; 2389 int ndevs; 
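/* ndevs counts the devices that contribute a usable hole; it is filled in by the device scan below */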
2390 int i; 2391 int j; 2392 2393 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2394 (type & BTRFS_BLOCK_GROUP_DUP)) { 2395 WARN_ON(1); 2396 type &= ~BTRFS_BLOCK_GROUP_DUP; 2397 } 2398 2399 if (list_empty(&fs_devices->alloc_list)) 2400 return -ENOSPC; 2401 2402 sub_stripes = 1; 2403 dev_stripes = 1; 2404 devs_increment = 1; 2405 ncopies = 1; 2406 devs_max = 0; /* 0 == as many as possible */ 2407 devs_min = 1; 2408 2409 /* 2410 * define the properties of each RAID type. 2411 * FIXME: move this to a global table and use it in all RAID 2412 * calculation code 2413 */ 2414 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2415 dev_stripes = 2; 2416 ncopies = 2; 2417 devs_max = 1; 2418 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2419 devs_min = 2; 2420 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2421 devs_increment = 2; 2422 ncopies = 2; 2423 devs_max = 2; 2424 devs_min = 2; 2425 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2426 sub_stripes = 2; 2427 devs_increment = 2; 2428 ncopies = 2; 2429 devs_min = 4; 2430 } else { 2431 devs_max = 1; 2432 } 2433 2434 if (type & BTRFS_BLOCK_GROUP_DATA) { 2435 max_stripe_size = 1024 * 1024 * 1024; 2436 max_chunk_size = 10 * max_stripe_size; 2437 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2438 max_stripe_size = 256 * 1024 * 1024; 2439 max_chunk_size = max_stripe_size; 2440 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2441 max_stripe_size = 8 * 1024 * 1024; 2442 max_chunk_size = 2 * max_stripe_size; 2443 } else { 2444 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 2445 type); 2446 BUG_ON(1); 2447 } 2448 2449 /* we don't want a chunk larger than 10% of writeable space */ 2450 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2451 max_chunk_size); 2452 2453 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 2454 GFP_NOFS); 2455 if (!devices_info) 2456 return -ENOMEM; 2457 2458 cur = fs_devices->alloc_list.next; 2459 2460 /* 2461 * in the first pass through the devices list, we gather information 2462 * about the available holes on each device. 2463 */ 2464 ndevs = 0; 2465 while (cur != &fs_devices->alloc_list) { 2466 struct btrfs_device *device; 2467 u64 max_avail; 2468 u64 dev_offset; 2469 2470 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2471 2472 cur = cur->next; 2473 2474 if (!device->writeable) { 2475 printk(KERN_ERR 2476 "btrfs: read-only device in alloc_list\n"); 2477 WARN_ON(1); 2478 continue; 2479 } 2480 2481 if (!device->in_fs_metadata) 2482 continue; 2483 2484 if (device->total_bytes > device->bytes_used) 2485 total_avail = device->total_bytes - device->bytes_used; 2486 else 2487 total_avail = 0; 2488 2489 /* If there is no space on this device, skip it. 
*/ 2490 if (total_avail == 0) 2491 continue; 2492 2493 ret = find_free_dev_extent(trans, device, 2494 max_stripe_size * dev_stripes, 2495 &dev_offset, &max_avail); 2496 if (ret && ret != -ENOSPC) 2497 goto error; 2498 2499 if (ret == 0) 2500 max_avail = max_stripe_size * dev_stripes; 2501 2502 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 2503 continue; 2504 2505 devices_info[ndevs].dev_offset = dev_offset; 2506 devices_info[ndevs].max_avail = max_avail; 2507 devices_info[ndevs].total_avail = total_avail; 2508 devices_info[ndevs].dev = device; 2509 ++ndevs; 2510 } 2511 2512 /* 2513 * now sort the devices by hole size / available space 2514 */ 2515 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 2516 btrfs_cmp_device_info, NULL); 2517 2518 /* round down to number of usable stripes */ 2519 ndevs -= ndevs % devs_increment; 2520 2521 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 2522 ret = -ENOSPC; 2523 goto error; 2524 } 2525 2526 if (devs_max && ndevs > devs_max) 2527 ndevs = devs_max; 2528 /* 2529 * the primary goal is to maximize the number of stripes, so use as many 2530 * devices as possible, even if the stripes are not maximum sized. 2531 */ 2532 stripe_size = devices_info[ndevs-1].max_avail; 2533 num_stripes = ndevs * dev_stripes; 2534 2535 if (stripe_size * num_stripes > max_chunk_size * ncopies) { 2536 stripe_size = max_chunk_size * ncopies; 2537 do_div(stripe_size, num_stripes); 2538 } 2539 2540 do_div(stripe_size, dev_stripes); 2541 do_div(stripe_size, BTRFS_STRIPE_LEN); 2542 stripe_size *= BTRFS_STRIPE_LEN; 2543 2544 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2545 if (!map) { 2546 ret = -ENOMEM; 2547 goto error; 2548 } 2549 map->num_stripes = num_stripes; 2550 2551 for (i = 0; i < ndevs; ++i) { 2552 for (j = 0; j < dev_stripes; ++j) { 2553 int s = i * dev_stripes + j; 2554 map->stripes[s].dev = devices_info[i].dev; 2555 map->stripes[s].physical = devices_info[i].dev_offset + 2556 j * stripe_size; 2557 } 2558 } 2559 map->sector_size = extent_root->sectorsize; 2560 map->stripe_len = BTRFS_STRIPE_LEN; 2561 map->io_align = BTRFS_STRIPE_LEN; 2562 map->io_width = BTRFS_STRIPE_LEN; 2563 map->type = type; 2564 map->sub_stripes = sub_stripes; 2565 2566 *map_ret = map; 2567 num_bytes = stripe_size * (num_stripes / ncopies); 2568 2569 *stripe_size_out = stripe_size; 2570 *num_bytes_out = num_bytes; 2571 2572 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 2573 2574 em = alloc_extent_map(); 2575 if (!em) { 2576 ret = -ENOMEM; 2577 goto error; 2578 } 2579 em->bdev = (struct block_device *)map; 2580 em->start = start; 2581 em->len = num_bytes; 2582 em->block_start = 0; 2583 em->block_len = em->len; 2584 2585 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2586 write_lock(&em_tree->lock); 2587 ret = add_extent_mapping(em_tree, em); 2588 write_unlock(&em_tree->lock); 2589 BUG_ON(ret); 2590 free_extent_map(em); 2591 2592 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2593 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2594 start, num_bytes); 2595 BUG_ON(ret); 2596 2597 for (i = 0; i < map->num_stripes; ++i) { 2598 struct btrfs_device *device; 2599 u64 dev_offset; 2600 2601 device = map->stripes[i].dev; 2602 dev_offset = map->stripes[i].physical; 2603 2604 ret = btrfs_alloc_dev_extent(trans, device, 2605 info->chunk_root->root_key.objectid, 2606 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2607 start, dev_offset, stripe_size); 2608 BUG_ON(ret); 2609 } 2610 2611 kfree(devices_info); 2612 return 0; 2613 2614 error: 2615 kfree(map); 2616 
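/* error path: map may still be NULL at this point; kfree(NULL) is a no-op */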
kfree(devices_info); 2617 return ret; 2618 } 2619 2620 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2621 struct btrfs_root *extent_root, 2622 struct map_lookup *map, u64 chunk_offset, 2623 u64 chunk_size, u64 stripe_size) 2624 { 2625 u64 dev_offset; 2626 struct btrfs_key key; 2627 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2628 struct btrfs_device *device; 2629 struct btrfs_chunk *chunk; 2630 struct btrfs_stripe *stripe; 2631 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2632 int index = 0; 2633 int ret; 2634 2635 chunk = kzalloc(item_size, GFP_NOFS); 2636 if (!chunk) 2637 return -ENOMEM; 2638 2639 index = 0; 2640 while (index < map->num_stripes) { 2641 device = map->stripes[index].dev; 2642 device->bytes_used += stripe_size; 2643 ret = btrfs_update_device(trans, device); 2644 BUG_ON(ret); 2645 index++; 2646 } 2647 2648 spin_lock(&extent_root->fs_info->free_chunk_lock); 2649 extent_root->fs_info->free_chunk_space -= (stripe_size * 2650 map->num_stripes); 2651 spin_unlock(&extent_root->fs_info->free_chunk_lock); 2652 2653 index = 0; 2654 stripe = &chunk->stripe; 2655 while (index < map->num_stripes) { 2656 device = map->stripes[index].dev; 2657 dev_offset = map->stripes[index].physical; 2658 2659 btrfs_set_stack_stripe_devid(stripe, device->devid); 2660 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2661 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2662 stripe++; 2663 index++; 2664 } 2665 2666 btrfs_set_stack_chunk_length(chunk, chunk_size); 2667 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2668 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2669 btrfs_set_stack_chunk_type(chunk, map->type); 2670 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2671 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2672 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2673 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2674 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2675 2676 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2677 key.type = BTRFS_CHUNK_ITEM_KEY; 2678 key.offset = chunk_offset; 2679 2680 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2681 BUG_ON(ret); 2682 2683 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2684 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2685 item_size); 2686 BUG_ON(ret); 2687 } 2688 2689 kfree(chunk); 2690 return 0; 2691 } 2692 2693 /* 2694 * Chunk allocation falls into two parts. The first part does the work 2695 * that makes the newly allocated chunk usable, but does not do any operation 2696 * that modifies the chunk tree. The second part does the work that 2697 * requires modifying the chunk tree. This division is important for the 2698 * bootstrap process of adding storage to a seed btrfs.
2699 */ 2700 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2701 struct btrfs_root *extent_root, u64 type) 2702 { 2703 u64 chunk_offset; 2704 u64 chunk_size; 2705 u64 stripe_size; 2706 struct map_lookup *map; 2707 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2708 int ret; 2709 2710 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2711 &chunk_offset); 2712 if (ret) 2713 return ret; 2714 2715 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2716 &stripe_size, chunk_offset, type); 2717 if (ret) 2718 return ret; 2719 2720 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2721 chunk_size, stripe_size); 2722 BUG_ON(ret); 2723 return 0; 2724 } 2725 2726 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2727 struct btrfs_root *root, 2728 struct btrfs_device *device) 2729 { 2730 u64 chunk_offset; 2731 u64 sys_chunk_offset; 2732 u64 chunk_size; 2733 u64 sys_chunk_size; 2734 u64 stripe_size; 2735 u64 sys_stripe_size; 2736 u64 alloc_profile; 2737 struct map_lookup *map; 2738 struct map_lookup *sys_map; 2739 struct btrfs_fs_info *fs_info = root->fs_info; 2740 struct btrfs_root *extent_root = fs_info->extent_root; 2741 int ret; 2742 2743 ret = find_next_chunk(fs_info->chunk_root, 2744 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2745 if (ret) 2746 return ret; 2747 2748 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2749 (fs_info->metadata_alloc_profile & 2750 fs_info->avail_metadata_alloc_bits); 2751 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2752 2753 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2754 &stripe_size, chunk_offset, alloc_profile); 2755 BUG_ON(ret); 2756 2757 sys_chunk_offset = chunk_offset + chunk_size; 2758 2759 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2760 (fs_info->system_alloc_profile & 2761 fs_info->avail_system_alloc_bits); 2762 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2763 2764 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2765 &sys_chunk_size, &sys_stripe_size, 2766 sys_chunk_offset, alloc_profile); 2767 BUG_ON(ret); 2768 2769 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2770 BUG_ON(ret); 2771 2772 /* 2773 * Modifying the chunk tree requires allocating new blocks from both 2774 * the system block group and the metadata block group, so we can 2775 * only do operations that modify the chunk tree after both 2776 * block groups have been created.
2777 */ 2778 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2779 chunk_size, stripe_size); 2780 BUG_ON(ret); 2781 2782 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2783 sys_chunk_offset, sys_chunk_size, 2784 sys_stripe_size); 2785 BUG_ON(ret); 2786 return 0; 2787 } 2788 2789 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2790 { 2791 struct extent_map *em; 2792 struct map_lookup *map; 2793 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2794 int readonly = 0; 2795 int i; 2796 2797 read_lock(&map_tree->map_tree.lock); 2798 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2799 read_unlock(&map_tree->map_tree.lock); 2800 if (!em) 2801 return 1; 2802 2803 if (btrfs_test_opt(root, DEGRADED)) { 2804 free_extent_map(em); 2805 return 0; 2806 } 2807 2808 map = (struct map_lookup *)em->bdev; 2809 for (i = 0; i < map->num_stripes; i++) { 2810 if (!map->stripes[i].dev->writeable) { 2811 readonly = 1; 2812 break; 2813 } 2814 } 2815 free_extent_map(em); 2816 return readonly; 2817 } 2818 2819 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2820 { 2821 extent_map_tree_init(&tree->map_tree); 2822 } 2823 2824 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2825 { 2826 struct extent_map *em; 2827 2828 while (1) { 2829 write_lock(&tree->map_tree.lock); 2830 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2831 if (em) 2832 remove_extent_mapping(&tree->map_tree, em); 2833 write_unlock(&tree->map_tree.lock); 2834 if (!em) 2835 break; 2836 kfree(em->bdev); 2837 /* once for us */ 2838 free_extent_map(em); 2839 /* once for the tree */ 2840 free_extent_map(em); 2841 } 2842 } 2843 2844 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2845 { 2846 struct extent_map *em; 2847 struct map_lookup *map; 2848 struct extent_map_tree *em_tree = &map_tree->map_tree; 2849 int ret; 2850 2851 read_lock(&em_tree->lock); 2852 em = lookup_extent_mapping(em_tree, logical, len); 2853 read_unlock(&em_tree->lock); 2854 BUG_ON(!em); 2855 2856 BUG_ON(em->start > logical || em->start + em->len < logical); 2857 map = (struct map_lookup *)em->bdev; 2858 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2859 ret = map->num_stripes; 2860 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2861 ret = map->sub_stripes; 2862 else 2863 ret = 1; 2864 free_extent_map(em); 2865 return ret; 2866 } 2867 2868 static int find_live_mirror(struct map_lookup *map, int first, int num, 2869 int optimal) 2870 { 2871 int i; 2872 if (map->stripes[optimal].dev->bdev) 2873 return optimal; 2874 for (i = first; i < first + num; i++) { 2875 if (map->stripes[i].dev->bdev) 2876 return i; 2877 } 2878 /* we couldn't find one that doesn't fail. 
Just return something 2879 * and the io error handling code will clean up eventually 2880 */ 2881 return optimal; 2882 } 2883 2884 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2885 u64 logical, u64 *length, 2886 struct btrfs_bio **bbio_ret, 2887 int mirror_num) 2888 { 2889 struct extent_map *em; 2890 struct map_lookup *map; 2891 struct extent_map_tree *em_tree = &map_tree->map_tree; 2892 u64 offset; 2893 u64 stripe_offset; 2894 u64 stripe_end_offset; 2895 u64 stripe_nr; 2896 u64 stripe_nr_orig; 2897 u64 stripe_nr_end; 2898 int stripes_allocated = 8; 2899 int stripes_required = 1; 2900 int stripe_index; 2901 int i; 2902 int num_stripes; 2903 int max_errors = 0; 2904 struct btrfs_bio *bbio = NULL; 2905 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2907 stripes_allocated = 1; 2908 again: 2909 if (bbio_ret) { 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated), 2911 GFP_NOFS); 2912 if (!bbio) 2913 return -ENOMEM; 2914 2915 atomic_set(&bbio->error, 0); 2916 } 2917 2918 read_lock(&em_tree->lock); 2919 em = lookup_extent_mapping(em_tree, logical, *length); 2920 read_unlock(&em_tree->lock); 2921 2922 if (!em) { 2923 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2924 (unsigned long long)logical, 2925 (unsigned long long)*length); 2926 BUG(); 2927 } 2928 2929 BUG_ON(em->start > logical || em->start + em->len < logical); 2930 map = (struct map_lookup *)em->bdev; 2931 offset = logical - em->start; 2932 2933 if (mirror_num > map->num_stripes) 2934 mirror_num = 0; 2935 2936 /* if our btrfs_bio struct is too small, back off and try again */ 2937 if (rw & REQ_WRITE) { 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2939 BTRFS_BLOCK_GROUP_DUP)) { 2940 stripes_required = map->num_stripes; 2941 max_errors = 1; 2942 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2943 stripes_required = map->sub_stripes; 2944 max_errors = 1; 2945 } 2946 } 2947 if (rw & REQ_DISCARD) { 2948 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2949 BTRFS_BLOCK_GROUP_RAID1 | 2950 BTRFS_BLOCK_GROUP_DUP | 2951 BTRFS_BLOCK_GROUP_RAID10)) { 2952 stripes_required = map->num_stripes; 2953 } 2954 } 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2956 stripes_allocated < stripes_required) { 2957 stripes_allocated = map->num_stripes; 2958 free_extent_map(em); 2959 kfree(bbio); 2960 goto again; 2961 } 2962 stripe_nr = offset; 2963 /* 2964 * stripe_nr counts the total number of stripes we have to stride 2965 * to get to this block 2966 */ 2967 do_div(stripe_nr, map->stripe_len); 2968 2969 stripe_offset = stripe_nr * map->stripe_len; 2970 BUG_ON(offset < stripe_offset); 2971 2972 /* stripe_offset is the offset of this block in its stripe*/ 2973 stripe_offset = offset - stripe_offset; 2974 2975 if (rw & REQ_DISCARD) 2976 *length = min_t(u64, em->len - offset, *length); 2977 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2978 BTRFS_BLOCK_GROUP_RAID1 | 2979 BTRFS_BLOCK_GROUP_RAID10 | 2980 BTRFS_BLOCK_GROUP_DUP)) { 2981 /* we limit the length of each bio to what fits in a stripe */ 2982 *length = min_t(u64, em->len - offset, 2983 map->stripe_len - stripe_offset); 2984 } else { 2985 *length = em->len - offset; 2986 } 2987 2988 if (!bbio_ret) 2989 goto out; 2990 2991 num_stripes = 1; 2992 stripe_index = 0; 2993 stripe_nr_orig = stripe_nr; 2994 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 2995 (~(map->stripe_len - 1)); 2996 do_div(stripe_nr_end, map->stripe_len); 2997 stripe_end_offset = stripe_nr_end * map->stripe_len - 2998 (offset + *length); 2999 if (map->type & 
BTRFS_BLOCK_GROUP_RAID0) { 3000 if (rw & REQ_DISCARD) 3001 num_stripes = min_t(u64, map->num_stripes, 3002 stripe_nr_end - stripe_nr_orig); 3003 stripe_index = do_div(stripe_nr, map->num_stripes); 3004 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3005 if (rw & (REQ_WRITE | REQ_DISCARD)) 3006 num_stripes = map->num_stripes; 3007 else if (mirror_num) 3008 stripe_index = mirror_num - 1; 3009 else { 3010 stripe_index = find_live_mirror(map, 0, 3011 map->num_stripes, 3012 current->pid % map->num_stripes); 3013 mirror_num = stripe_index + 1; 3014 } 3015 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) { 3018 num_stripes = map->num_stripes; 3019 } else if (mirror_num) { 3020 stripe_index = mirror_num - 1; 3021 } else { 3022 mirror_num = 1; 3023 } 3024 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3026 int factor = map->num_stripes / map->sub_stripes; 3027 3028 stripe_index = do_div(stripe_nr, factor); 3029 stripe_index *= map->sub_stripes; 3030 3031 if (rw & REQ_WRITE) 3032 num_stripes = map->sub_stripes; 3033 else if (rw & REQ_DISCARD) 3034 num_stripes = min_t(u64, map->sub_stripes * 3035 (stripe_nr_end - stripe_nr_orig), 3036 map->num_stripes); 3037 else if (mirror_num) 3038 stripe_index += mirror_num - 1; 3039 else { 3040 stripe_index = find_live_mirror(map, stripe_index, 3041 map->sub_stripes, stripe_index + 3042 current->pid % map->sub_stripes); 3043 mirror_num = stripe_index + 1; 3044 } 3045 } else { 3046 /* 3047 * after this do_div call, stripe_nr is the number of stripes 3048 * on this device we have to walk to find the data, and 3049 * stripe_index is the number of our device in the stripe array 3050 */ 3051 stripe_index = do_div(stripe_nr, map->num_stripes); 3052 mirror_num = stripe_index + 1; 3053 } 3054 BUG_ON(stripe_index >= map->num_stripes); 3055 3056 if (rw & REQ_DISCARD) { 3057 for (i = 0; i < num_stripes; i++) { 3058 bbio->stripes[i].physical = 3059 map->stripes[stripe_index].physical + 3060 stripe_offset + stripe_nr * map->stripe_len; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3062 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3064 u64 stripes; 3065 u32 last_stripe = 0; 3066 int j; 3067 3068 div_u64_rem(stripe_nr_end - 1, 3069 map->num_stripes, 3070 &last_stripe); 3071 3072 for (j = 0; j < map->num_stripes; j++) { 3073 u32 test; 3074 3075 div_u64_rem(stripe_nr_end - 1 - j, 3076 map->num_stripes, &test); 3077 if (test == stripe_index) 3078 break; 3079 } 3080 stripes = stripe_nr_end - 1 - j; 3081 do_div(stripes, map->num_stripes); 3082 bbio->stripes[i].length = map->stripe_len * 3083 (stripes - stripe_nr + 1); 3084 3085 if (i == 0) { 3086 bbio->stripes[i].length -= 3087 stripe_offset; 3088 stripe_offset = 0; 3089 } 3090 if (stripe_index == last_stripe) 3091 bbio->stripes[i].length -= 3092 stripe_end_offset; 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3094 u64 stripes; 3095 int j; 3096 int factor = map->num_stripes / 3097 map->sub_stripes; 3098 u32 last_stripe = 0; 3099 3100 div_u64_rem(stripe_nr_end - 1, 3101 factor, &last_stripe); 3102 last_stripe *= map->sub_stripes; 3103 3104 for (j = 0; j < factor; j++) { 3105 u32 test; 3106 3107 div_u64_rem(stripe_nr_end - 1 - j, 3108 factor, &test); 3109 3110 if (test == 3111 stripe_index / map->sub_stripes) 3112 break; 3113 } 3114 stripes = stripe_nr_end - 1 - j; 3115 do_div(stripes, factor); 3116 bbio->stripes[i].length = map->stripe_len * 3117 (stripes - stripe_nr + 1); 3118 3119 if (i < map->sub_stripes) { 3120 bbio->stripes[i].length -= 3121 
stripe_offset; 3122 if (i == map->sub_stripes - 1) 3123 stripe_offset = 0; 3124 } 3125 if (stripe_index >= last_stripe && 3126 stripe_index <= (last_stripe + 3127 map->sub_stripes - 1)) { 3128 bbio->stripes[i].length -= 3129 stripe_end_offset; 3130 } 3131 } else 3132 bbio->stripes[i].length = *length; 3133 3134 stripe_index++; 3135 if (stripe_index == map->num_stripes) { 3136 /* This could only happen for RAID0/10 */ 3137 stripe_index = 0; 3138 stripe_nr++; 3139 } 3140 } 3141 } else { 3142 for (i = 0; i < num_stripes; i++) { 3143 bbio->stripes[i].physical = 3144 map->stripes[stripe_index].physical + 3145 stripe_offset + 3146 stripe_nr * map->stripe_len; 3147 bbio->stripes[i].dev = 3148 map->stripes[stripe_index].dev; 3149 stripe_index++; 3150 } 3151 } 3152 if (bbio_ret) { 3153 *bbio_ret = bbio; 3154 bbio->num_stripes = num_stripes; 3155 bbio->max_errors = max_errors; 3156 bbio->mirror_num = mirror_num; 3157 } 3158 out: 3159 free_extent_map(em); 3160 return 0; 3161 } 3162 3163 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3164 u64 logical, u64 *length, 3165 struct btrfs_bio **bbio_ret, int mirror_num) 3166 { 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 3168 mirror_num); 3169 } 3170 3171 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3172 u64 chunk_start, u64 physical, u64 devid, 3173 u64 **logical, int *naddrs, int *stripe_len) 3174 { 3175 struct extent_map_tree *em_tree = &map_tree->map_tree; 3176 struct extent_map *em; 3177 struct map_lookup *map; 3178 u64 *buf; 3179 u64 bytenr; 3180 u64 length; 3181 u64 stripe_nr; 3182 int i, j, nr = 0; 3183 3184 read_lock(&em_tree->lock); 3185 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3186 read_unlock(&em_tree->lock); 3187 3188 BUG_ON(!em || em->start != chunk_start); 3189 map = (struct map_lookup *)em->bdev; 3190 3191 length = em->len; 3192 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3193 do_div(length, map->num_stripes / map->sub_stripes); 3194 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3195 do_div(length, map->num_stripes); 3196 3197 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3198 BUG_ON(!buf); 3199 3200 for (i = 0; i < map->num_stripes; i++) { 3201 if (devid && map->stripes[i].dev->devid != devid) 3202 continue; 3203 if (map->stripes[i].physical > physical || 3204 map->stripes[i].physical + length <= physical) 3205 continue; 3206 3207 stripe_nr = physical - map->stripes[i].physical; 3208 do_div(stripe_nr, map->stripe_len); 3209 3210 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3211 stripe_nr = stripe_nr * map->num_stripes + i; 3212 do_div(stripe_nr, map->sub_stripes); 3213 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3214 stripe_nr = stripe_nr * map->num_stripes + i; 3215 } 3216 bytenr = chunk_start + stripe_nr * map->stripe_len; 3217 WARN_ON(nr >= map->num_stripes); 3218 for (j = 0; j < nr; j++) { 3219 if (buf[j] == bytenr) 3220 break; 3221 } 3222 if (j == nr) { 3223 WARN_ON(nr >= map->num_stripes); 3224 buf[nr++] = bytenr; 3225 } 3226 } 3227 3228 *logical = buf; 3229 *naddrs = nr; 3230 *stripe_len = map->stripe_len; 3231 3232 free_extent_map(em); 3233 return 0; 3234 } 3235 3236 static void btrfs_end_bio(struct bio *bio, int err) 3237 { 3238 struct btrfs_bio *bbio = bio->bi_private; 3239 int is_orig_bio = 0; 3240 3241 if (err) 3242 atomic_inc(&bbio->error); 3243 3244 if (bio == bbio->orig_bio) 3245 is_orig_bio = 1; 3246 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) { 3248 if (!is_orig_bio) { 3249 bio_put(bio); 3250 bio = bbio->orig_bio; 3251 } 3252 
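/* this was the last pending stripe; restore the original submitter's private data and end_io before completing the bio */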
bio->bi_private = bbio->private; 3253 bio->bi_end_io = bbio->end_io; 3254 bio->bi_bdev = (struct block_device *) 3255 (unsigned long)bbio->mirror_num; 3256 /* only send an error to the higher layers if it is 3257 * beyond the tolerance of the multi-bio 3258 */ 3259 if (atomic_read(&bbio->error) > bbio->max_errors) { 3260 err = -EIO; 3261 } else if (err) { 3262 /* 3263 * this bio is actually up to date, we didn't 3264 * go over the max number of errors 3265 */ 3266 set_bit(BIO_UPTODATE, &bio->bi_flags); 3267 err = 0; 3268 } 3269 kfree(bbio); 3270 3271 bio_endio(bio, err); 3272 } else if (!is_orig_bio) { 3273 bio_put(bio); 3274 } 3275 } 3276 3277 struct async_sched { 3278 struct bio *bio; 3279 int rw; 3280 struct btrfs_fs_info *info; 3281 struct btrfs_work work; 3282 }; 3283 3284 /* 3285 * see run_scheduled_bios for a description of why bios are collected for 3286 * async submit. 3287 * 3288 * This will add one bio to the pending list for a device and make sure 3289 * the work struct is scheduled. 3290 */ 3291 static noinline int schedule_bio(struct btrfs_root *root, 3292 struct btrfs_device *device, 3293 int rw, struct bio *bio) 3294 { 3295 int should_queue = 1; 3296 struct btrfs_pending_bios *pending_bios; 3297 3298 /* don't bother with additional async steps for reads, right now */ 3299 if (!(rw & REQ_WRITE)) { 3300 bio_get(bio); 3301 submit_bio(rw, bio); 3302 bio_put(bio); 3303 return 0; 3304 } 3305 3306 /* 3307 * nr_async_bios allows us to reliably return congestion to the 3308 * higher layers. Otherwise, the async bio makes it appear we have 3309 * made progress against dirty pages when we've really just put it 3310 * on a queue for later 3311 */ 3312 atomic_inc(&root->fs_info->nr_async_bios); 3313 WARN_ON(bio->bi_next); 3314 bio->bi_next = NULL; 3315 bio->bi_rw |= rw; 3316 3317 spin_lock(&device->io_lock); 3318 if (bio->bi_rw & REQ_SYNC) 3319 pending_bios = &device->pending_sync_bios; 3320 else 3321 pending_bios = &device->pending_bios; 3322 3323 if (pending_bios->tail) 3324 pending_bios->tail->bi_next = bio; 3325 3326 pending_bios->tail = bio; 3327 if (!pending_bios->head) 3328 pending_bios->head = bio; 3329 if (device->running_pending) 3330 should_queue = 0; 3331 3332 spin_unlock(&device->io_lock); 3333 3334 if (should_queue) 3335 btrfs_queue_worker(&root->fs_info->submit_workers, 3336 &device->work); 3337 return 0; 3338 } 3339 3340 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 3341 int mirror_num, int async_submit) 3342 { 3343 struct btrfs_mapping_tree *map_tree; 3344 struct btrfs_device *dev; 3345 struct bio *first_bio = bio; 3346 u64 logical = (u64)bio->bi_sector << 9; 3347 u64 length = 0; 3348 u64 map_length; 3349 int ret; 3350 int dev_nr = 0; 3351 int total_devs = 1; 3352 struct btrfs_bio *bbio = NULL; 3353 3354 length = bio->bi_size; 3355 map_tree = &root->fs_info->mapping_tree; 3356 map_length = length; 3357 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 3359 mirror_num); 3360 BUG_ON(ret); 3361 3362 total_devs = bbio->num_stripes; 3363 if (map_length < length) { 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3365 "len %llu\n", (unsigned long long)logical, 3366 (unsigned long long)length, 3367 (unsigned long long)map_length); 3368 BUG(); 3369 } 3370 3371 bbio->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private; 3373 bbio->end_io = first_bio->bi_end_io; 3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 3375 3376 while (dev_nr < total_devs) { 3377 if (dev_nr < total_devs - 1) { 3378 bio = 
bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio); 3380 } else { 3381 bio = first_bio; 3382 } 3383 bio->bi_private = bbio; 3384 bio->bi_end_io = btrfs_end_bio; 3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 3386 dev = bbio->stripes[dev_nr].dev; 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3388 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 3389 "(%s id %llu), size=%u\n", rw, 3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 3391 dev->name, dev->devid, bio->bi_size); 3392 bio->bi_bdev = dev->bdev; 3393 if (async_submit) 3394 schedule_bio(root, dev, rw, bio); 3395 else 3396 submit_bio(rw, bio); 3397 } else { 3398 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 3399 bio->bi_sector = logical >> 9; 3400 bio_endio(bio, -EIO); 3401 } 3402 dev_nr++; 3403 } 3404 return 0; 3405 } 3406 3407 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3408 u8 *uuid, u8 *fsid) 3409 { 3410 struct btrfs_device *device; 3411 struct btrfs_fs_devices *cur_devices; 3412 3413 cur_devices = root->fs_info->fs_devices; 3414 while (cur_devices) { 3415 if (!fsid || 3416 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3417 device = __find_device(&cur_devices->devices, 3418 devid, uuid); 3419 if (device) 3420 return device; 3421 } 3422 cur_devices = cur_devices->seed; 3423 } 3424 return NULL; 3425 } 3426 3427 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3428 u64 devid, u8 *dev_uuid) 3429 { 3430 struct btrfs_device *device; 3431 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3432 3433 device = kzalloc(sizeof(*device), GFP_NOFS); 3434 if (!device) 3435 return NULL; 3436 list_add(&device->dev_list, 3437 &fs_devices->devices); 3438 device->dev_root = root->fs_info->dev_root; 3439 device->devid = devid; 3440 device->work.func = pending_bios_fn; 3441 device->fs_devices = fs_devices; 3442 device->missing = 1; 3443 fs_devices->num_devices++; 3444 fs_devices->missing_devices++; 3445 spin_lock_init(&device->io_lock); 3446 INIT_LIST_HEAD(&device->dev_alloc_list); 3447 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3448 return device; 3449 } 3450 3451 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3452 struct extent_buffer *leaf, 3453 struct btrfs_chunk *chunk) 3454 { 3455 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3456 struct map_lookup *map; 3457 struct extent_map *em; 3458 u64 logical; 3459 u64 length; 3460 u64 devid; 3461 u8 uuid[BTRFS_UUID_SIZE]; 3462 int num_stripes; 3463 int ret; 3464 int i; 3465 3466 logical = key->offset; 3467 length = btrfs_chunk_length(leaf, chunk); 3468 3469 read_lock(&map_tree->map_tree.lock); 3470 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3471 read_unlock(&map_tree->map_tree.lock); 3472 3473 /* already mapped?
*/ 3474 if (em && em->start <= logical && em->start + em->len > logical) { 3475 free_extent_map(em); 3476 return 0; 3477 } else if (em) { 3478 free_extent_map(em); 3479 } 3480 3481 em = alloc_extent_map(); 3482 if (!em) 3483 return -ENOMEM; 3484 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3485 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3486 if (!map) { 3487 free_extent_map(em); 3488 return -ENOMEM; 3489 } 3490 3491 em->bdev = (struct block_device *)map; 3492 em->start = logical; 3493 em->len = length; 3494 em->block_start = 0; 3495 em->block_len = em->len; 3496 3497 map->num_stripes = num_stripes; 3498 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3499 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3500 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3501 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3502 map->type = btrfs_chunk_type(leaf, chunk); 3503 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3504 for (i = 0; i < num_stripes; i++) { 3505 map->stripes[i].physical = 3506 btrfs_stripe_offset_nr(leaf, chunk, i); 3507 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3508 read_extent_buffer(leaf, uuid, (unsigned long) 3509 btrfs_stripe_dev_uuid_nr(chunk, i), 3510 BTRFS_UUID_SIZE); 3511 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3512 NULL); 3513 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3514 kfree(map); 3515 free_extent_map(em); 3516 return -EIO; 3517 } 3518 if (!map->stripes[i].dev) { 3519 map->stripes[i].dev = 3520 add_missing_dev(root, devid, uuid); 3521 if (!map->stripes[i].dev) { 3522 kfree(map); 3523 free_extent_map(em); 3524 return -EIO; 3525 } 3526 } 3527 map->stripes[i].dev->in_fs_metadata = 1; 3528 } 3529 3530 write_lock(&map_tree->map_tree.lock); 3531 ret = add_extent_mapping(&map_tree->map_tree, em); 3532 write_unlock(&map_tree->map_tree.lock); 3533 BUG_ON(ret); 3534 free_extent_map(em); 3535 3536 return 0; 3537 } 3538 3539 static int fill_device_from_item(struct extent_buffer *leaf, 3540 struct btrfs_dev_item *dev_item, 3541 struct btrfs_device *device) 3542 { 3543 unsigned long ptr; 3544 3545 device->devid = btrfs_device_id(leaf, dev_item); 3546 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3547 device->total_bytes = device->disk_total_bytes; 3548 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 3549 device->type = btrfs_device_type(leaf, dev_item); 3550 device->io_align = btrfs_device_io_align(leaf, dev_item); 3551 device->io_width = btrfs_device_io_width(leaf, dev_item); 3552 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3553 3554 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3555 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3556 3557 return 0; 3558 } 3559 3560 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3561 { 3562 struct btrfs_fs_devices *fs_devices; 3563 int ret; 3564 3565 mutex_lock(&uuid_mutex); 3566 3567 fs_devices = root->fs_info->fs_devices->seed; 3568 while (fs_devices) { 3569 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3570 ret = 0; 3571 goto out; 3572 } 3573 fs_devices = fs_devices->seed; 3574 } 3575 3576 fs_devices = find_fsid(fsid); 3577 if (!fs_devices) { 3578 ret = -ENOENT; 3579 goto out; 3580 } 3581 3582 fs_devices = clone_fs_devices(fs_devices); 3583 if (IS_ERR(fs_devices)) { 3584 ret = PTR_ERR(fs_devices); 3585 goto out; 3586 } 3587 3588 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3589 root->fs_info->bdev_holder); 3590 if (ret) 3591 goto out; 3592 3593 if 
(!fs_devices->seeding) { 3594 __btrfs_close_devices(fs_devices); 3595 free_fs_devices(fs_devices); 3596 ret = -EINVAL; 3597 goto out; 3598 } 3599 3600 fs_devices->seed = root->fs_info->fs_devices->seed; 3601 root->fs_info->fs_devices->seed = fs_devices; 3602 out: 3603 mutex_unlock(&uuid_mutex); 3604 return ret; 3605 } 3606 3607 static int read_one_dev(struct btrfs_root *root, 3608 struct extent_buffer *leaf, 3609 struct btrfs_dev_item *dev_item) 3610 { 3611 struct btrfs_device *device; 3612 u64 devid; 3613 int ret; 3614 u8 fs_uuid[BTRFS_UUID_SIZE]; 3615 u8 dev_uuid[BTRFS_UUID_SIZE]; 3616 3617 devid = btrfs_device_id(leaf, dev_item); 3618 read_extent_buffer(leaf, dev_uuid, 3619 (unsigned long)btrfs_device_uuid(dev_item), 3620 BTRFS_UUID_SIZE); 3621 read_extent_buffer(leaf, fs_uuid, 3622 (unsigned long)btrfs_device_fsid(dev_item), 3623 BTRFS_UUID_SIZE); 3624 3625 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3626 ret = open_seed_devices(root, fs_uuid); 3627 if (ret && !btrfs_test_opt(root, DEGRADED)) 3628 return ret; 3629 } 3630 3631 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3632 if (!device || !device->bdev) { 3633 if (!btrfs_test_opt(root, DEGRADED)) 3634 return -EIO; 3635 3636 if (!device) { 3637 printk(KERN_WARNING "warning devid %llu missing\n", 3638 (unsigned long long)devid); 3639 device = add_missing_dev(root, devid, dev_uuid); 3640 if (!device) 3641 return -ENOMEM; 3642 } else if (!device->missing) { 3643 /* 3644 * this happens when a device that was properly setup 3645 * in the device info lists suddenly goes bad. 3646 * device->bdev is NULL, and so we have to set 3647 * device->missing to one here 3648 */ 3649 root->fs_info->fs_devices->missing_devices++; 3650 device->missing = 1; 3651 } 3652 } 3653 3654 if (device->fs_devices != root->fs_info->fs_devices) { 3655 BUG_ON(device->writeable); 3656 if (device->generation != 3657 btrfs_device_generation(leaf, dev_item)) 3658 return -EINVAL; 3659 } 3660 3661 fill_device_from_item(leaf, dev_item, device); 3662 device->dev_root = root->fs_info->dev_root; 3663 device->in_fs_metadata = 1; 3664 if (device->writeable) { 3665 device->fs_devices->total_rw_bytes += device->total_bytes; 3666 spin_lock(&root->fs_info->free_chunk_lock); 3667 root->fs_info->free_chunk_space += device->total_bytes - 3668 device->bytes_used; 3669 spin_unlock(&root->fs_info->free_chunk_lock); 3670 } 3671 ret = 0; 3672 return ret; 3673 } 3674 3675 int btrfs_read_sys_array(struct btrfs_root *root) 3676 { 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3678 struct extent_buffer *sb; 3679 struct btrfs_disk_key *disk_key; 3680 struct btrfs_chunk *chunk; 3681 u8 *ptr; 3682 unsigned long sb_ptr; 3683 int ret = 0; 3684 u32 num_stripes; 3685 u32 array_size; 3686 u32 len = 0; 3687 u32 cur; 3688 struct btrfs_key key; 3689 3690 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3691 BTRFS_SUPER_INFO_SIZE); 3692 if (!sb) 3693 return -ENOMEM; 3694 btrfs_set_buffer_uptodate(sb); 3695 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 3696 3697 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3698 array_size = btrfs_super_sys_array_size(super_copy); 3699 3700 ptr = super_copy->sys_chunk_array; 3701 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3702 cur = 0; 3703 3704 while (cur < array_size) { 3705 disk_key = (struct btrfs_disk_key *)ptr; 3706 btrfs_disk_key_to_cpu(&key, disk_key); 3707 3708 len = sizeof(*disk_key); ptr += len; 3709 sb_ptr += len; 3710 cur += len; 3711 3712 
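/* the disk key has been consumed; in the packed sys_chunk_array a chunk item immediately follows its key */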
if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3713 chunk = (struct btrfs_chunk *)sb_ptr; 3714 ret = read_one_chunk(root, &key, sb, chunk); 3715 if (ret) 3716 break; 3717 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3718 len = btrfs_chunk_item_size(num_stripes); 3719 } else { 3720 ret = -EIO; 3721 break; 3722 } 3723 ptr += len; 3724 sb_ptr += len; 3725 cur += len; 3726 } 3727 free_extent_buffer(sb); 3728 return ret; 3729 } 3730 3731 int btrfs_read_chunk_tree(struct btrfs_root *root) 3732 { 3733 struct btrfs_path *path; 3734 struct extent_buffer *leaf; 3735 struct btrfs_key key; 3736 struct btrfs_key found_key; 3737 int ret; 3738 int slot; 3739 3740 root = root->fs_info->chunk_root; 3741 3742 path = btrfs_alloc_path(); 3743 if (!path) 3744 return -ENOMEM; 3745 3746 /* first we search for all of the device items, and then we 3747 * read in all of the chunk items. This way we can create chunk 3748 * mappings that reference all of the devices that are around 3749 */ 3750 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3751 key.offset = 0; 3752 key.type = 0; 3753 again: 3754 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3755 if (ret < 0) 3756 goto error; 3757 while (1) { 3758 leaf = path->nodes[0]; 3759 slot = path->slots[0]; 3760 if (slot >= btrfs_header_nritems(leaf)) { 3761 ret = btrfs_next_leaf(root, path); 3762 if (ret == 0) 3763 continue; 3764 if (ret < 0) 3765 goto error; 3766 break; 3767 } 3768 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3769 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3770 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3771 break; 3772 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3773 struct btrfs_dev_item *dev_item; 3774 dev_item = btrfs_item_ptr(leaf, slot, 3775 struct btrfs_dev_item); 3776 ret = read_one_dev(root, leaf, dev_item); 3777 if (ret) 3778 goto error; 3779 } 3780 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3781 struct btrfs_chunk *chunk; 3782 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3783 ret = read_one_chunk(root, &found_key, leaf, chunk); 3784 if (ret) 3785 goto error; 3786 } 3787 path->slots[0]++; 3788 } 3789 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3790 key.objectid = 0; 3791 btrfs_release_path(path); 3792 goto again; 3793 } 3794 ret = 0; 3795 error: 3796 btrfs_free_path(path); 3797 return ret; 3798 } 3799