1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <linux/kthread.h> 27 #include <asm/div64.h> 28 #include "compat.h" 29 #include "ctree.h" 30 #include "extent_map.h" 31 #include "disk-io.h" 32 #include "transaction.h" 33 #include "print-tree.h" 34 #include "volumes.h" 35 #include "async-thread.h" 36 #include "check-integrity.h" 37 38 static int init_first_rw_device(struct btrfs_trans_handle *trans, 39 struct btrfs_root *root, 40 struct btrfs_device *device); 41 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 42 43 static DEFINE_MUTEX(uuid_mutex); 44 static LIST_HEAD(fs_uuids); 45 46 static void lock_chunks(struct btrfs_root *root) 47 { 48 mutex_lock(&root->fs_info->chunk_mutex); 49 } 50 51 static void unlock_chunks(struct btrfs_root *root) 52 { 53 mutex_unlock(&root->fs_info->chunk_mutex); 54 } 55 56 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 57 { 58 struct btrfs_device *device; 59 WARN_ON(fs_devices->opened); 60 while (!list_empty(&fs_devices->devices)) { 61 device = list_entry(fs_devices->devices.next, 62 struct btrfs_device, dev_list); 63 list_del(&device->dev_list); 64 kfree(device->name); 65 kfree(device); 66 } 67 kfree(fs_devices); 68 } 69 70 int btrfs_cleanup_fs_uuids(void) 71 { 72 struct btrfs_fs_devices *fs_devices; 73 74 while (!list_empty(&fs_uuids)) { 75 fs_devices = list_entry(fs_uuids.next, 76 struct btrfs_fs_devices, list); 77 list_del(&fs_devices->list); 78 free_fs_devices(fs_devices); 79 } 80 return 0; 81 } 82 83 static noinline struct btrfs_device *__find_device(struct list_head *head, 84 u64 devid, u8 *uuid) 85 { 86 struct btrfs_device *dev; 87 88 list_for_each_entry(dev, head, dev_list) { 89 if (dev->devid == devid && 90 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 91 return dev; 92 } 93 } 94 return NULL; 95 } 96 97 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 98 { 99 struct btrfs_fs_devices *fs_devices; 100 101 list_for_each_entry(fs_devices, &fs_uuids, list) { 102 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 103 return fs_devices; 104 } 105 return NULL; 106 } 107 108 static void requeue_list(struct btrfs_pending_bios *pending_bios, 109 struct bio *head, struct bio *tail) 110 { 111 112 struct bio *old_head; 113 114 old_head = pending_bios->head; 115 pending_bios->head = head; 116 if (pending_bios->tail) 117 tail->bi_next = old_head; 118 else 119 pending_bios->tail = tail; 120 } 121 122 /* 123 * we try to collect pending bios for a device so we don't get a large 124 * number of procs sending bios down to the same device. 
This greatly 125 * improves the schedulers ability to collect and merge the bios. 126 * 127 * But, it also turns into a long list of bios to process and that is sure 128 * to eventually make the worker thread block. The solution here is to 129 * make some progress and then put this work struct back at the end of 130 * the list if the block device is congested. This way, multiple devices 131 * can make progress from a single worker thread. 132 */ 133 static noinline int run_scheduled_bios(struct btrfs_device *device) 134 { 135 struct bio *pending; 136 struct backing_dev_info *bdi; 137 struct btrfs_fs_info *fs_info; 138 struct btrfs_pending_bios *pending_bios; 139 struct bio *tail; 140 struct bio *cur; 141 int again = 0; 142 unsigned long num_run; 143 unsigned long batch_run = 0; 144 unsigned long limit; 145 unsigned long last_waited = 0; 146 int force_reg = 0; 147 int sync_pending = 0; 148 struct blk_plug plug; 149 150 /* 151 * this function runs all the bios we've collected for 152 * a particular device. We don't want to wander off to 153 * another device without first sending all of these down. 154 * So, setup a plug here and finish it off before we return 155 */ 156 blk_start_plug(&plug); 157 158 bdi = blk_get_backing_dev_info(device->bdev); 159 fs_info = device->dev_root->fs_info; 160 limit = btrfs_async_submit_limit(fs_info); 161 limit = limit * 2 / 3; 162 163 loop: 164 spin_lock(&device->io_lock); 165 166 loop_lock: 167 num_run = 0; 168 169 /* take all the bios off the list at once and process them 170 * later on (without the lock held). But, remember the 171 * tail and other pointers so the bios can be properly reinserted 172 * into the list if we hit congestion 173 */ 174 if (!force_reg && device->pending_sync_bios.head) { 175 pending_bios = &device->pending_sync_bios; 176 force_reg = 1; 177 } else { 178 pending_bios = &device->pending_bios; 179 force_reg = 0; 180 } 181 182 pending = pending_bios->head; 183 tail = pending_bios->tail; 184 WARN_ON(pending && !tail); 185 186 /* 187 * if pending was null this time around, no bios need processing 188 * at all and we can stop. Otherwise it'll loop back up again 189 * and do an additional check so no bios are missed. 190 * 191 * device->running_pending is used to synchronize with the 192 * schedule_bio code. 
193 */ 194 if (device->pending_sync_bios.head == NULL && 195 device->pending_bios.head == NULL) { 196 again = 0; 197 device->running_pending = 0; 198 } else { 199 again = 1; 200 device->running_pending = 1; 201 } 202 203 pending_bios->head = NULL; 204 pending_bios->tail = NULL; 205 206 spin_unlock(&device->io_lock); 207 208 while (pending) { 209 210 rmb(); 211 /* we want to work on both lists, but do more bios on the 212 * sync list than the regular list 213 */ 214 if ((num_run > 32 && 215 pending_bios != &device->pending_sync_bios && 216 device->pending_sync_bios.head) || 217 (num_run > 64 && pending_bios == &device->pending_sync_bios && 218 device->pending_bios.head)) { 219 spin_lock(&device->io_lock); 220 requeue_list(pending_bios, pending, tail); 221 goto loop_lock; 222 } 223 224 cur = pending; 225 pending = pending->bi_next; 226 cur->bi_next = NULL; 227 atomic_dec(&fs_info->nr_async_bios); 228 229 if (atomic_read(&fs_info->nr_async_bios) < limit && 230 waitqueue_active(&fs_info->async_submit_wait)) 231 wake_up(&fs_info->async_submit_wait); 232 233 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 234 235 /* 236 * if we're doing the sync list, record that our 237 * plug has some sync requests on it 238 * 239 * If we're doing the regular list and there are 240 * sync requests sitting around, unplug before 241 * we add more 242 */ 243 if (pending_bios == &device->pending_sync_bios) { 244 sync_pending = 1; 245 } else if (sync_pending) { 246 blk_finish_plug(&plug); 247 blk_start_plug(&plug); 248 sync_pending = 0; 249 } 250 251 btrfsic_submit_bio(cur->bi_rw, cur); 252 num_run++; 253 batch_run++; 254 if (need_resched()) 255 cond_resched(); 256 257 /* 258 * we made progress, there is more work to do and the bdi 259 * is now congested. Back off and let other work structs 260 * run instead 261 */ 262 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 263 fs_info->fs_devices->open_devices > 1) { 264 struct io_context *ioc; 265 266 ioc = current->io_context; 267 268 /* 269 * the main goal here is that we don't want to 270 * block if we're going to be able to submit 271 * more requests without blocking. 272 * 273 * This code does two great things, it pokes into 274 * the elevator code from a filesystem _and_ 275 * it makes assumptions about how batching works. 276 */ 277 if (ioc && ioc->nr_batch_requests > 0 && 278 time_before(jiffies, ioc->last_waited + HZ/50UL) && 279 (last_waited == 0 || 280 ioc->last_waited == last_waited)) { 281 /* 282 * we want to go through our batch of 283 * requests and stop. 
So, we copy out 284 * the ioc->last_waited time and test 285 * against it before looping 286 */ 287 last_waited = ioc->last_waited; 288 if (need_resched()) 289 cond_resched(); 290 continue; 291 } 292 spin_lock(&device->io_lock); 293 requeue_list(pending_bios, pending, tail); 294 device->running_pending = 1; 295 296 spin_unlock(&device->io_lock); 297 btrfs_requeue_work(&device->work); 298 goto done; 299 } 300 /* unplug every 64 requests just for good measure */ 301 if (batch_run % 64 == 0) { 302 blk_finish_plug(&plug); 303 blk_start_plug(&plug); 304 sync_pending = 0; 305 } 306 } 307 308 cond_resched(); 309 if (again) 310 goto loop; 311 312 spin_lock(&device->io_lock); 313 if (device->pending_bios.head || device->pending_sync_bios.head) 314 goto loop_lock; 315 spin_unlock(&device->io_lock); 316 317 done: 318 blk_finish_plug(&plug); 319 return 0; 320 } 321 322 static void pending_bios_fn(struct btrfs_work *work) 323 { 324 struct btrfs_device *device; 325 326 device = container_of(work, struct btrfs_device, work); 327 run_scheduled_bios(device); 328 } 329 330 static noinline int device_list_add(const char *path, 331 struct btrfs_super_block *disk_super, 332 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 333 { 334 struct btrfs_device *device; 335 struct btrfs_fs_devices *fs_devices; 336 u64 found_transid = btrfs_super_generation(disk_super); 337 char *name; 338 339 fs_devices = find_fsid(disk_super->fsid); 340 if (!fs_devices) { 341 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 342 if (!fs_devices) 343 return -ENOMEM; 344 INIT_LIST_HEAD(&fs_devices->devices); 345 INIT_LIST_HEAD(&fs_devices->alloc_list); 346 list_add(&fs_devices->list, &fs_uuids); 347 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 348 fs_devices->latest_devid = devid; 349 fs_devices->latest_trans = found_transid; 350 mutex_init(&fs_devices->device_list_mutex); 351 device = NULL; 352 } else { 353 device = __find_device(&fs_devices->devices, devid, 354 disk_super->dev_item.uuid); 355 } 356 if (!device) { 357 if (fs_devices->opened) 358 return -EBUSY; 359 360 device = kzalloc(sizeof(*device), GFP_NOFS); 361 if (!device) { 362 /* we can safely leave the fs_devices entry around */ 363 return -ENOMEM; 364 } 365 device->devid = devid; 366 device->work.func = pending_bios_fn; 367 memcpy(device->uuid, disk_super->dev_item.uuid, 368 BTRFS_UUID_SIZE); 369 spin_lock_init(&device->io_lock); 370 device->name = kstrdup(path, GFP_NOFS); 371 if (!device->name) { 372 kfree(device); 373 return -ENOMEM; 374 } 375 INIT_LIST_HEAD(&device->dev_alloc_list); 376 377 /* init readahead state */ 378 spin_lock_init(&device->reada_lock); 379 device->reada_curr_zone = NULL; 380 atomic_set(&device->reada_in_flight, 0); 381 device->reada_next = 0; 382 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); 383 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); 384 385 mutex_lock(&fs_devices->device_list_mutex); 386 list_add_rcu(&device->dev_list, &fs_devices->devices); 387 mutex_unlock(&fs_devices->device_list_mutex); 388 389 device->fs_devices = fs_devices; 390 fs_devices->num_devices++; 391 } else if (!device->name || strcmp(device->name, path)) { 392 name = kstrdup(path, GFP_NOFS); 393 if (!name) 394 return -ENOMEM; 395 kfree(device->name); 396 device->name = name; 397 if (device->missing) { 398 fs_devices->missing_devices--; 399 device->missing = 0; 400 } 401 } 402 403 if (found_transid > fs_devices->latest_trans) { 404 fs_devices->latest_devid = devid; 405 fs_devices->latest_trans = found_transid; 406 } 407 
*fs_devices_ret = fs_devices; 408 return 0; 409 } 410 411 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 412 { 413 struct btrfs_fs_devices *fs_devices; 414 struct btrfs_device *device; 415 struct btrfs_device *orig_dev; 416 417 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 418 if (!fs_devices) 419 return ERR_PTR(-ENOMEM); 420 421 INIT_LIST_HEAD(&fs_devices->devices); 422 INIT_LIST_HEAD(&fs_devices->alloc_list); 423 INIT_LIST_HEAD(&fs_devices->list); 424 mutex_init(&fs_devices->device_list_mutex); 425 fs_devices->latest_devid = orig->latest_devid; 426 fs_devices->latest_trans = orig->latest_trans; 427 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 428 429 /* We have held the volume lock, it is safe to get the devices. */ 430 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 431 device = kzalloc(sizeof(*device), GFP_NOFS); 432 if (!device) 433 goto error; 434 435 device->name = kstrdup(orig_dev->name, GFP_NOFS); 436 if (!device->name) { 437 kfree(device); 438 goto error; 439 } 440 441 device->devid = orig_dev->devid; 442 device->work.func = pending_bios_fn; 443 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 444 spin_lock_init(&device->io_lock); 445 INIT_LIST_HEAD(&device->dev_list); 446 INIT_LIST_HEAD(&device->dev_alloc_list); 447 448 list_add(&device->dev_list, &fs_devices->devices); 449 device->fs_devices = fs_devices; 450 fs_devices->num_devices++; 451 } 452 return fs_devices; 453 error: 454 free_fs_devices(fs_devices); 455 return ERR_PTR(-ENOMEM); 456 } 457 458 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 459 { 460 struct btrfs_device *device, *next; 461 462 struct block_device *latest_bdev = NULL; 463 u64 latest_devid = 0; 464 u64 latest_transid = 0; 465 466 mutex_lock(&uuid_mutex); 467 again: 468 /* This is the initialized path, it is safe to release the devices. 
*/ 469 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 470 if (device->in_fs_metadata) { 471 if (!latest_transid || 472 device->generation > latest_transid) { 473 latest_devid = device->devid; 474 latest_transid = device->generation; 475 latest_bdev = device->bdev; 476 } 477 continue; 478 } 479 480 if (device->bdev) { 481 blkdev_put(device->bdev, device->mode); 482 device->bdev = NULL; 483 fs_devices->open_devices--; 484 } 485 if (device->writeable) { 486 list_del_init(&device->dev_alloc_list); 487 device->writeable = 0; 488 fs_devices->rw_devices--; 489 } 490 list_del_init(&device->dev_list); 491 fs_devices->num_devices--; 492 kfree(device->name); 493 kfree(device); 494 } 495 496 if (fs_devices->seed) { 497 fs_devices = fs_devices->seed; 498 goto again; 499 } 500 501 fs_devices->latest_bdev = latest_bdev; 502 fs_devices->latest_devid = latest_devid; 503 fs_devices->latest_trans = latest_transid; 504 505 mutex_unlock(&uuid_mutex); 506 return 0; 507 } 508 509 static void __free_device(struct work_struct *work) 510 { 511 struct btrfs_device *device; 512 513 device = container_of(work, struct btrfs_device, rcu_work); 514 515 if (device->bdev) 516 blkdev_put(device->bdev, device->mode); 517 518 kfree(device->name); 519 kfree(device); 520 } 521 522 static void free_device(struct rcu_head *head) 523 { 524 struct btrfs_device *device; 525 526 device = container_of(head, struct btrfs_device, rcu); 527 528 INIT_WORK(&device->rcu_work, __free_device); 529 schedule_work(&device->rcu_work); 530 } 531 532 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 533 { 534 struct btrfs_device *device; 535 536 if (--fs_devices->opened > 0) 537 return 0; 538 539 mutex_lock(&fs_devices->device_list_mutex); 540 list_for_each_entry(device, &fs_devices->devices, dev_list) { 541 struct btrfs_device *new_device; 542 543 if (device->bdev) 544 fs_devices->open_devices--; 545 546 if (device->writeable) { 547 list_del_init(&device->dev_alloc_list); 548 fs_devices->rw_devices--; 549 } 550 551 if (device->can_discard) 552 fs_devices->num_can_discard--; 553 554 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 555 BUG_ON(!new_device); 556 memcpy(new_device, device, sizeof(*new_device)); 557 new_device->name = kstrdup(device->name, GFP_NOFS); 558 BUG_ON(device->name && !new_device->name); 559 new_device->bdev = NULL; 560 new_device->writeable = 0; 561 new_device->in_fs_metadata = 0; 562 new_device->can_discard = 0; 563 list_replace_rcu(&device->dev_list, &new_device->dev_list); 564 565 call_rcu(&device->rcu, free_device); 566 } 567 mutex_unlock(&fs_devices->device_list_mutex); 568 569 WARN_ON(fs_devices->open_devices); 570 WARN_ON(fs_devices->rw_devices); 571 fs_devices->opened = 0; 572 fs_devices->seeding = 0; 573 574 return 0; 575 } 576 577 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 578 { 579 struct btrfs_fs_devices *seed_devices = NULL; 580 int ret; 581 582 mutex_lock(&uuid_mutex); 583 ret = __btrfs_close_devices(fs_devices); 584 if (!fs_devices->opened) { 585 seed_devices = fs_devices->seed; 586 fs_devices->seed = NULL; 587 } 588 mutex_unlock(&uuid_mutex); 589 590 while (seed_devices) { 591 fs_devices = seed_devices; 592 seed_devices = fs_devices->seed; 593 __btrfs_close_devices(fs_devices); 594 free_fs_devices(fs_devices); 595 } 596 return ret; 597 } 598 599 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 600 fmode_t flags, void *holder) 601 { 602 struct request_queue *q; 603 struct block_device *bdev; 604 struct list_head *head = 
&fs_devices->devices; 605 struct btrfs_device *device; 606 struct block_device *latest_bdev = NULL; 607 struct buffer_head *bh; 608 struct btrfs_super_block *disk_super; 609 u64 latest_devid = 0; 610 u64 latest_transid = 0; 611 u64 devid; 612 int seeding = 1; 613 int ret = 0; 614 615 flags |= FMODE_EXCL; 616 617 list_for_each_entry(device, head, dev_list) { 618 if (device->bdev) 619 continue; 620 if (!device->name) 621 continue; 622 623 bdev = blkdev_get_by_path(device->name, flags, holder); 624 if (IS_ERR(bdev)) { 625 printk(KERN_INFO "open %s failed\n", device->name); 626 goto error; 627 } 628 set_blocksize(bdev, 4096); 629 630 bh = btrfs_read_dev_super(bdev); 631 if (!bh) 632 goto error_close; 633 634 disk_super = (struct btrfs_super_block *)bh->b_data; 635 devid = btrfs_stack_device_id(&disk_super->dev_item); 636 if (devid != device->devid) 637 goto error_brelse; 638 639 if (memcmp(device->uuid, disk_super->dev_item.uuid, 640 BTRFS_UUID_SIZE)) 641 goto error_brelse; 642 643 device->generation = btrfs_super_generation(disk_super); 644 if (!latest_transid || device->generation > latest_transid) { 645 latest_devid = devid; 646 latest_transid = device->generation; 647 latest_bdev = bdev; 648 } 649 650 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 651 device->writeable = 0; 652 } else { 653 device->writeable = !bdev_read_only(bdev); 654 seeding = 0; 655 } 656 657 q = bdev_get_queue(bdev); 658 if (blk_queue_discard(q)) { 659 device->can_discard = 1; 660 fs_devices->num_can_discard++; 661 } 662 663 device->bdev = bdev; 664 device->in_fs_metadata = 0; 665 device->mode = flags; 666 667 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 668 fs_devices->rotating = 1; 669 670 fs_devices->open_devices++; 671 if (device->writeable) { 672 fs_devices->rw_devices++; 673 list_add(&device->dev_alloc_list, 674 &fs_devices->alloc_list); 675 } 676 brelse(bh); 677 continue; 678 679 error_brelse: 680 brelse(bh); 681 error_close: 682 blkdev_put(bdev, flags); 683 error: 684 continue; 685 } 686 if (fs_devices->open_devices == 0) { 687 ret = -EINVAL; 688 goto out; 689 } 690 fs_devices->seeding = seeding; 691 fs_devices->opened = 1; 692 fs_devices->latest_bdev = latest_bdev; 693 fs_devices->latest_devid = latest_devid; 694 fs_devices->latest_trans = latest_transid; 695 fs_devices->total_rw_bytes = 0; 696 out: 697 return ret; 698 } 699 700 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 701 fmode_t flags, void *holder) 702 { 703 int ret; 704 705 mutex_lock(&uuid_mutex); 706 if (fs_devices->opened) { 707 fs_devices->opened++; 708 ret = 0; 709 } else { 710 ret = __btrfs_open_devices(fs_devices, flags, holder); 711 } 712 mutex_unlock(&uuid_mutex); 713 return ret; 714 } 715 716 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 717 struct btrfs_fs_devices **fs_devices_ret) 718 { 719 struct btrfs_super_block *disk_super; 720 struct block_device *bdev; 721 struct buffer_head *bh; 722 int ret; 723 u64 devid; 724 u64 transid; 725 726 flags |= FMODE_EXCL; 727 bdev = blkdev_get_by_path(path, flags, holder); 728 729 if (IS_ERR(bdev)) { 730 ret = PTR_ERR(bdev); 731 goto error; 732 } 733 734 mutex_lock(&uuid_mutex); 735 ret = set_blocksize(bdev, 4096); 736 if (ret) 737 goto error_close; 738 bh = btrfs_read_dev_super(bdev); 739 if (!bh) { 740 ret = -EINVAL; 741 goto error_close; 742 } 743 disk_super = (struct btrfs_super_block *)bh->b_data; 744 devid = btrfs_stack_device_id(&disk_super->dev_item); 745 transid = btrfs_super_generation(disk_super); 746 if (disk_super->label[0]) 747 
printk(KERN_INFO "device label %s ", disk_super->label); 748 else 749 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 750 printk(KERN_CONT "devid %llu transid %llu %s\n", 751 (unsigned long long)devid, (unsigned long long)transid, path); 752 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 753 754 brelse(bh); 755 error_close: 756 mutex_unlock(&uuid_mutex); 757 blkdev_put(bdev, flags); 758 error: 759 return ret; 760 } 761 762 /* helper to account the used device space in the range */ 763 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 764 u64 end, u64 *length) 765 { 766 struct btrfs_key key; 767 struct btrfs_root *root = device->dev_root; 768 struct btrfs_dev_extent *dev_extent; 769 struct btrfs_path *path; 770 u64 extent_end; 771 int ret; 772 int slot; 773 struct extent_buffer *l; 774 775 *length = 0; 776 777 if (start >= device->total_bytes) 778 return 0; 779 780 path = btrfs_alloc_path(); 781 if (!path) 782 return -ENOMEM; 783 path->reada = 2; 784 785 key.objectid = device->devid; 786 key.offset = start; 787 key.type = BTRFS_DEV_EXTENT_KEY; 788 789 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 790 if (ret < 0) 791 goto out; 792 if (ret > 0) { 793 ret = btrfs_previous_item(root, path, key.objectid, key.type); 794 if (ret < 0) 795 goto out; 796 } 797 798 while (1) { 799 l = path->nodes[0]; 800 slot = path->slots[0]; 801 if (slot >= btrfs_header_nritems(l)) { 802 ret = btrfs_next_leaf(root, path); 803 if (ret == 0) 804 continue; 805 if (ret < 0) 806 goto out; 807 808 break; 809 } 810 btrfs_item_key_to_cpu(l, &key, slot); 811 812 if (key.objectid < device->devid) 813 goto next; 814 815 if (key.objectid > device->devid) 816 break; 817 818 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 819 goto next; 820 821 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 822 extent_end = key.offset + btrfs_dev_extent_length(l, 823 dev_extent); 824 if (key.offset <= start && extent_end > end) { 825 *length = end - start + 1; 826 break; 827 } else if (key.offset <= start && extent_end > start) 828 *length += extent_end - start; 829 else if (key.offset > start && extent_end <= end) 830 *length += extent_end - key.offset; 831 else if (key.offset > start && key.offset <= end) { 832 *length += end - key.offset + 1; 833 break; 834 } else if (key.offset > end) 835 break; 836 837 next: 838 path->slots[0]++; 839 } 840 ret = 0; 841 out: 842 btrfs_free_path(path); 843 return ret; 844 } 845 846 /* 847 * find_free_dev_extent - find free space in the specified device 848 * @device: the device which we search the free space in 849 * @num_bytes: the size of the free space that we need 850 * @start: store the start of the free space. 851 * @len: the size of the free space. that we find, or the size of the max 852 * free space if we don't find suitable free space 853 * 854 * this uses a pretty simple search, the expectation is that it is 855 * called very infrequently and that a given device has a small number 856 * of extents 857 * 858 * @start is used to store the start of the free space if we find. But if we 859 * don't find suitable free space, it will be used to store the start position 860 * of the max free space. 861 * 862 * @len is used to store the size of the free space that we find. 863 * But if we don't find suitable free space, it is used to store the size of 864 * the max free space. 
865 */ 866 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 867 u64 *start, u64 *len) 868 { 869 struct btrfs_key key; 870 struct btrfs_root *root = device->dev_root; 871 struct btrfs_dev_extent *dev_extent; 872 struct btrfs_path *path; 873 u64 hole_size; 874 u64 max_hole_start; 875 u64 max_hole_size; 876 u64 extent_end; 877 u64 search_start; 878 u64 search_end = device->total_bytes; 879 int ret; 880 int slot; 881 struct extent_buffer *l; 882 883 /* FIXME use last free of some kind */ 884 885 /* we don't want to overwrite the superblock on the drive, 886 * so we make sure to start at an offset of at least 1MB 887 */ 888 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 889 890 max_hole_start = search_start; 891 max_hole_size = 0; 892 hole_size = 0; 893 894 if (search_start >= search_end) { 895 ret = -ENOSPC; 896 goto error; 897 } 898 899 path = btrfs_alloc_path(); 900 if (!path) { 901 ret = -ENOMEM; 902 goto error; 903 } 904 path->reada = 2; 905 906 key.objectid = device->devid; 907 key.offset = search_start; 908 key.type = BTRFS_DEV_EXTENT_KEY; 909 910 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 911 if (ret < 0) 912 goto out; 913 if (ret > 0) { 914 ret = btrfs_previous_item(root, path, key.objectid, key.type); 915 if (ret < 0) 916 goto out; 917 } 918 919 while (1) { 920 l = path->nodes[0]; 921 slot = path->slots[0]; 922 if (slot >= btrfs_header_nritems(l)) { 923 ret = btrfs_next_leaf(root, path); 924 if (ret == 0) 925 continue; 926 if (ret < 0) 927 goto out; 928 929 break; 930 } 931 btrfs_item_key_to_cpu(l, &key, slot); 932 933 if (key.objectid < device->devid) 934 goto next; 935 936 if (key.objectid > device->devid) 937 break; 938 939 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 940 goto next; 941 942 if (key.offset > search_start) { 943 hole_size = key.offset - search_start; 944 945 if (hole_size > max_hole_size) { 946 max_hole_start = search_start; 947 max_hole_size = hole_size; 948 } 949 950 /* 951 * If this free space is greater than which we need, 952 * it must be the max free space that we have found 953 * until now, so max_hole_start must point to the start 954 * of this free space and the length of this free space 955 * is stored in max_hole_size. Thus, we return 956 * max_hole_start and max_hole_size and go back to the 957 * caller. 958 */ 959 if (hole_size >= num_bytes) { 960 ret = 0; 961 goto out; 962 } 963 } 964 965 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 966 extent_end = key.offset + btrfs_dev_extent_length(l, 967 dev_extent); 968 if (extent_end > search_start) 969 search_start = extent_end; 970 next: 971 path->slots[0]++; 972 cond_resched(); 973 } 974 975 /* 976 * At this point, search_start should be the end of 977 * allocated dev extents, and when shrinking the device, 978 * search_end may be smaller than search_start. 979 */ 980 if (search_end > search_start) 981 hole_size = search_end - search_start; 982 983 if (hole_size > max_hole_size) { 984 max_hole_start = search_start; 985 max_hole_size = hole_size; 986 } 987 988 /* See above. 
*/ 989 if (hole_size < num_bytes) 990 ret = -ENOSPC; 991 else 992 ret = 0; 993 994 out: 995 btrfs_free_path(path); 996 error: 997 *start = max_hole_start; 998 if (len) 999 *len = max_hole_size; 1000 return ret; 1001 } 1002 1003 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1004 struct btrfs_device *device, 1005 u64 start) 1006 { 1007 int ret; 1008 struct btrfs_path *path; 1009 struct btrfs_root *root = device->dev_root; 1010 struct btrfs_key key; 1011 struct btrfs_key found_key; 1012 struct extent_buffer *leaf = NULL; 1013 struct btrfs_dev_extent *extent = NULL; 1014 1015 path = btrfs_alloc_path(); 1016 if (!path) 1017 return -ENOMEM; 1018 1019 key.objectid = device->devid; 1020 key.offset = start; 1021 key.type = BTRFS_DEV_EXTENT_KEY; 1022 again: 1023 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1024 if (ret > 0) { 1025 ret = btrfs_previous_item(root, path, key.objectid, 1026 BTRFS_DEV_EXTENT_KEY); 1027 if (ret) 1028 goto out; 1029 leaf = path->nodes[0]; 1030 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1031 extent = btrfs_item_ptr(leaf, path->slots[0], 1032 struct btrfs_dev_extent); 1033 BUG_ON(found_key.offset > start || found_key.offset + 1034 btrfs_dev_extent_length(leaf, extent) < start); 1035 key = found_key; 1036 btrfs_release_path(path); 1037 goto again; 1038 } else if (ret == 0) { 1039 leaf = path->nodes[0]; 1040 extent = btrfs_item_ptr(leaf, path->slots[0], 1041 struct btrfs_dev_extent); 1042 } 1043 BUG_ON(ret); 1044 1045 if (device->bytes_used > 0) { 1046 u64 len = btrfs_dev_extent_length(leaf, extent); 1047 device->bytes_used -= len; 1048 spin_lock(&root->fs_info->free_chunk_lock); 1049 root->fs_info->free_chunk_space += len; 1050 spin_unlock(&root->fs_info->free_chunk_lock); 1051 } 1052 ret = btrfs_del_item(trans, root, path); 1053 1054 out: 1055 btrfs_free_path(path); 1056 return ret; 1057 } 1058 1059 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1060 struct btrfs_device *device, 1061 u64 chunk_tree, u64 chunk_objectid, 1062 u64 chunk_offset, u64 start, u64 num_bytes) 1063 { 1064 int ret; 1065 struct btrfs_path *path; 1066 struct btrfs_root *root = device->dev_root; 1067 struct btrfs_dev_extent *extent; 1068 struct extent_buffer *leaf; 1069 struct btrfs_key key; 1070 1071 WARN_ON(!device->in_fs_metadata); 1072 path = btrfs_alloc_path(); 1073 if (!path) 1074 return -ENOMEM; 1075 1076 key.objectid = device->devid; 1077 key.offset = start; 1078 key.type = BTRFS_DEV_EXTENT_KEY; 1079 ret = btrfs_insert_empty_item(trans, root, path, &key, 1080 sizeof(*extent)); 1081 BUG_ON(ret); 1082 1083 leaf = path->nodes[0]; 1084 extent = btrfs_item_ptr(leaf, path->slots[0], 1085 struct btrfs_dev_extent); 1086 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1087 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1088 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1089 1090 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1091 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1092 BTRFS_UUID_SIZE); 1093 1094 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1095 btrfs_mark_buffer_dirty(leaf); 1096 btrfs_free_path(path); 1097 return ret; 1098 } 1099 1100 static noinline int find_next_chunk(struct btrfs_root *root, 1101 u64 objectid, u64 *offset) 1102 { 1103 struct btrfs_path *path; 1104 int ret; 1105 struct btrfs_key key; 1106 struct btrfs_chunk *chunk; 1107 struct btrfs_key found_key; 1108 1109 path = btrfs_alloc_path(); 1110 if (!path) 1111 return -ENOMEM; 1112 1113 
key.objectid = objectid; 1114 key.offset = (u64)-1; 1115 key.type = BTRFS_CHUNK_ITEM_KEY; 1116 1117 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1118 if (ret < 0) 1119 goto error; 1120 1121 BUG_ON(ret == 0); 1122 1123 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1124 if (ret) { 1125 *offset = 0; 1126 } else { 1127 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1128 path->slots[0]); 1129 if (found_key.objectid != objectid) 1130 *offset = 0; 1131 else { 1132 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1133 struct btrfs_chunk); 1134 *offset = found_key.offset + 1135 btrfs_chunk_length(path->nodes[0], chunk); 1136 } 1137 } 1138 ret = 0; 1139 error: 1140 btrfs_free_path(path); 1141 return ret; 1142 } 1143 1144 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1145 { 1146 int ret; 1147 struct btrfs_key key; 1148 struct btrfs_key found_key; 1149 struct btrfs_path *path; 1150 1151 root = root->fs_info->chunk_root; 1152 1153 path = btrfs_alloc_path(); 1154 if (!path) 1155 return -ENOMEM; 1156 1157 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1158 key.type = BTRFS_DEV_ITEM_KEY; 1159 key.offset = (u64)-1; 1160 1161 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1162 if (ret < 0) 1163 goto error; 1164 1165 BUG_ON(ret == 0); 1166 1167 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1168 BTRFS_DEV_ITEM_KEY); 1169 if (ret) { 1170 *objectid = 1; 1171 } else { 1172 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1173 path->slots[0]); 1174 *objectid = found_key.offset + 1; 1175 } 1176 ret = 0; 1177 error: 1178 btrfs_free_path(path); 1179 return ret; 1180 } 1181 1182 /* 1183 * the device information is stored in the chunk root 1184 * the btrfs_device struct should be fully filled in 1185 */ 1186 int btrfs_add_device(struct btrfs_trans_handle *trans, 1187 struct btrfs_root *root, 1188 struct btrfs_device *device) 1189 { 1190 int ret; 1191 struct btrfs_path *path; 1192 struct btrfs_dev_item *dev_item; 1193 struct extent_buffer *leaf; 1194 struct btrfs_key key; 1195 unsigned long ptr; 1196 1197 root = root->fs_info->chunk_root; 1198 1199 path = btrfs_alloc_path(); 1200 if (!path) 1201 return -ENOMEM; 1202 1203 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1204 key.type = BTRFS_DEV_ITEM_KEY; 1205 key.offset = device->devid; 1206 1207 ret = btrfs_insert_empty_item(trans, root, path, &key, 1208 sizeof(*dev_item)); 1209 if (ret) 1210 goto out; 1211 1212 leaf = path->nodes[0]; 1213 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1214 1215 btrfs_set_device_id(leaf, dev_item, device->devid); 1216 btrfs_set_device_generation(leaf, dev_item, 0); 1217 btrfs_set_device_type(leaf, dev_item, device->type); 1218 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1219 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1220 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1221 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1222 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1223 btrfs_set_device_group(leaf, dev_item, 0); 1224 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1225 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1226 btrfs_set_device_start_offset(leaf, dev_item, 0); 1227 1228 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1229 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1230 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1231 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1232 
btrfs_mark_buffer_dirty(leaf); 1233 1234 ret = 0; 1235 out: 1236 btrfs_free_path(path); 1237 return ret; 1238 } 1239 1240 static int btrfs_rm_dev_item(struct btrfs_root *root, 1241 struct btrfs_device *device) 1242 { 1243 int ret; 1244 struct btrfs_path *path; 1245 struct btrfs_key key; 1246 struct btrfs_trans_handle *trans; 1247 1248 root = root->fs_info->chunk_root; 1249 1250 path = btrfs_alloc_path(); 1251 if (!path) 1252 return -ENOMEM; 1253 1254 trans = btrfs_start_transaction(root, 0); 1255 if (IS_ERR(trans)) { 1256 btrfs_free_path(path); 1257 return PTR_ERR(trans); 1258 } 1259 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1260 key.type = BTRFS_DEV_ITEM_KEY; 1261 key.offset = device->devid; 1262 lock_chunks(root); 1263 1264 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1265 if (ret < 0) 1266 goto out; 1267 1268 if (ret > 0) { 1269 ret = -ENOENT; 1270 goto out; 1271 } 1272 1273 ret = btrfs_del_item(trans, root, path); 1274 if (ret) 1275 goto out; 1276 out: 1277 btrfs_free_path(path); 1278 unlock_chunks(root); 1279 btrfs_commit_transaction(trans, root); 1280 return ret; 1281 } 1282 1283 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1284 { 1285 struct btrfs_device *device; 1286 struct btrfs_device *next_device; 1287 struct block_device *bdev; 1288 struct buffer_head *bh = NULL; 1289 struct btrfs_super_block *disk_super; 1290 struct btrfs_fs_devices *cur_devices; 1291 u64 all_avail; 1292 u64 devid; 1293 u64 num_devices; 1294 u8 *dev_uuid; 1295 int ret = 0; 1296 bool clear_super = false; 1297 1298 mutex_lock(&uuid_mutex); 1299 1300 all_avail = root->fs_info->avail_data_alloc_bits | 1301 root->fs_info->avail_system_alloc_bits | 1302 root->fs_info->avail_metadata_alloc_bits; 1303 1304 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1305 root->fs_info->fs_devices->num_devices <= 4) { 1306 printk(KERN_ERR "btrfs: unable to go below four devices " 1307 "on raid10\n"); 1308 ret = -EINVAL; 1309 goto out; 1310 } 1311 1312 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1313 root->fs_info->fs_devices->num_devices <= 2) { 1314 printk(KERN_ERR "btrfs: unable to go below two " 1315 "devices on raid1\n"); 1316 ret = -EINVAL; 1317 goto out; 1318 } 1319 1320 if (strcmp(device_path, "missing") == 0) { 1321 struct list_head *devices; 1322 struct btrfs_device *tmp; 1323 1324 device = NULL; 1325 devices = &root->fs_info->fs_devices->devices; 1326 /* 1327 * It is safe to read the devices since the volume_mutex 1328 * is held. 
1329 */ 1330 list_for_each_entry(tmp, devices, dev_list) { 1331 if (tmp->in_fs_metadata && !tmp->bdev) { 1332 device = tmp; 1333 break; 1334 } 1335 } 1336 bdev = NULL; 1337 bh = NULL; 1338 disk_super = NULL; 1339 if (!device) { 1340 printk(KERN_ERR "btrfs: no missing devices found to " 1341 "remove\n"); 1342 goto out; 1343 } 1344 } else { 1345 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1346 root->fs_info->bdev_holder); 1347 if (IS_ERR(bdev)) { 1348 ret = PTR_ERR(bdev); 1349 goto out; 1350 } 1351 1352 set_blocksize(bdev, 4096); 1353 bh = btrfs_read_dev_super(bdev); 1354 if (!bh) { 1355 ret = -EINVAL; 1356 goto error_close; 1357 } 1358 disk_super = (struct btrfs_super_block *)bh->b_data; 1359 devid = btrfs_stack_device_id(&disk_super->dev_item); 1360 dev_uuid = disk_super->dev_item.uuid; 1361 device = btrfs_find_device(root, devid, dev_uuid, 1362 disk_super->fsid); 1363 if (!device) { 1364 ret = -ENOENT; 1365 goto error_brelse; 1366 } 1367 } 1368 1369 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1370 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1371 "device\n"); 1372 ret = -EINVAL; 1373 goto error_brelse; 1374 } 1375 1376 if (device->writeable) { 1377 lock_chunks(root); 1378 list_del_init(&device->dev_alloc_list); 1379 unlock_chunks(root); 1380 root->fs_info->fs_devices->rw_devices--; 1381 clear_super = true; 1382 } 1383 1384 ret = btrfs_shrink_device(device, 0); 1385 if (ret) 1386 goto error_undo; 1387 1388 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1389 if (ret) 1390 goto error_undo; 1391 1392 spin_lock(&root->fs_info->free_chunk_lock); 1393 root->fs_info->free_chunk_space = device->total_bytes - 1394 device->bytes_used; 1395 spin_unlock(&root->fs_info->free_chunk_lock); 1396 1397 device->in_fs_metadata = 0; 1398 btrfs_scrub_cancel_dev(root, device); 1399 1400 /* 1401 * the device list mutex makes sure that we don't change 1402 * the device list while someone else is writing out all 1403 * the device supers. 1404 */ 1405 1406 cur_devices = device->fs_devices; 1407 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1408 list_del_rcu(&device->dev_list); 1409 1410 device->fs_devices->num_devices--; 1411 1412 if (device->missing) 1413 root->fs_info->fs_devices->missing_devices--; 1414 1415 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1416 struct btrfs_device, dev_list); 1417 if (device->bdev == root->fs_info->sb->s_bdev) 1418 root->fs_info->sb->s_bdev = next_device->bdev; 1419 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1420 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1421 1422 if (device->bdev) 1423 device->fs_devices->open_devices--; 1424 1425 call_rcu(&device->rcu, free_device); 1426 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1427 1428 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1429 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1430 1431 if (cur_devices->open_devices == 0) { 1432 struct btrfs_fs_devices *fs_devices; 1433 fs_devices = root->fs_info->fs_devices; 1434 while (fs_devices) { 1435 if (fs_devices->seed == cur_devices) 1436 break; 1437 fs_devices = fs_devices->seed; 1438 } 1439 fs_devices->seed = cur_devices->seed; 1440 cur_devices->seed = NULL; 1441 lock_chunks(root); 1442 __btrfs_close_devices(cur_devices); 1443 unlock_chunks(root); 1444 free_fs_devices(cur_devices); 1445 } 1446 1447 /* 1448 * at this point, the device is zero sized. 
We want to 1449 * remove it from the devices list and zero out the old super 1450 */ 1451 if (clear_super) { 1452 /* make sure this device isn't detected as part of 1453 * the FS anymore 1454 */ 1455 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1456 set_buffer_dirty(bh); 1457 sync_dirty_buffer(bh); 1458 } 1459 1460 ret = 0; 1461 1462 error_brelse: 1463 brelse(bh); 1464 error_close: 1465 if (bdev) 1466 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1467 out: 1468 mutex_unlock(&uuid_mutex); 1469 return ret; 1470 error_undo: 1471 if (device->writeable) { 1472 lock_chunks(root); 1473 list_add(&device->dev_alloc_list, 1474 &root->fs_info->fs_devices->alloc_list); 1475 unlock_chunks(root); 1476 root->fs_info->fs_devices->rw_devices++; 1477 } 1478 goto error_brelse; 1479 } 1480 1481 /* 1482 * does all the dirty work required for changing file system's UUID. 1483 */ 1484 static int btrfs_prepare_sprout(struct btrfs_root *root) 1485 { 1486 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1487 struct btrfs_fs_devices *old_devices; 1488 struct btrfs_fs_devices *seed_devices; 1489 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1490 struct btrfs_device *device; 1491 u64 super_flags; 1492 1493 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1494 if (!fs_devices->seeding) 1495 return -EINVAL; 1496 1497 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 1498 if (!seed_devices) 1499 return -ENOMEM; 1500 1501 old_devices = clone_fs_devices(fs_devices); 1502 if (IS_ERR(old_devices)) { 1503 kfree(seed_devices); 1504 return PTR_ERR(old_devices); 1505 } 1506 1507 list_add(&old_devices->list, &fs_uuids); 1508 1509 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1510 seed_devices->opened = 1; 1511 INIT_LIST_HEAD(&seed_devices->devices); 1512 INIT_LIST_HEAD(&seed_devices->alloc_list); 1513 mutex_init(&seed_devices->device_list_mutex); 1514 1515 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1516 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1517 synchronize_rcu); 1518 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1519 1520 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1521 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1522 device->fs_devices = seed_devices; 1523 } 1524 1525 fs_devices->seeding = 0; 1526 fs_devices->num_devices = 0; 1527 fs_devices->open_devices = 0; 1528 fs_devices->seed = seed_devices; 1529 1530 generate_random_uuid(fs_devices->fsid); 1531 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1532 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1533 super_flags = btrfs_super_flags(disk_super) & 1534 ~BTRFS_SUPER_FLAG_SEEDING; 1535 btrfs_set_super_flags(disk_super, super_flags); 1536 1537 return 0; 1538 } 1539 1540 /* 1541 * strore the expected generation for seed devices in device items. 
1542 */ 1543 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1544 struct btrfs_root *root) 1545 { 1546 struct btrfs_path *path; 1547 struct extent_buffer *leaf; 1548 struct btrfs_dev_item *dev_item; 1549 struct btrfs_device *device; 1550 struct btrfs_key key; 1551 u8 fs_uuid[BTRFS_UUID_SIZE]; 1552 u8 dev_uuid[BTRFS_UUID_SIZE]; 1553 u64 devid; 1554 int ret; 1555 1556 path = btrfs_alloc_path(); 1557 if (!path) 1558 return -ENOMEM; 1559 1560 root = root->fs_info->chunk_root; 1561 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1562 key.offset = 0; 1563 key.type = BTRFS_DEV_ITEM_KEY; 1564 1565 while (1) { 1566 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1567 if (ret < 0) 1568 goto error; 1569 1570 leaf = path->nodes[0]; 1571 next_slot: 1572 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1573 ret = btrfs_next_leaf(root, path); 1574 if (ret > 0) 1575 break; 1576 if (ret < 0) 1577 goto error; 1578 leaf = path->nodes[0]; 1579 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1580 btrfs_release_path(path); 1581 continue; 1582 } 1583 1584 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1585 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1586 key.type != BTRFS_DEV_ITEM_KEY) 1587 break; 1588 1589 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1590 struct btrfs_dev_item); 1591 devid = btrfs_device_id(leaf, dev_item); 1592 read_extent_buffer(leaf, dev_uuid, 1593 (unsigned long)btrfs_device_uuid(dev_item), 1594 BTRFS_UUID_SIZE); 1595 read_extent_buffer(leaf, fs_uuid, 1596 (unsigned long)btrfs_device_fsid(dev_item), 1597 BTRFS_UUID_SIZE); 1598 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1599 BUG_ON(!device); 1600 1601 if (device->fs_devices->seeding) { 1602 btrfs_set_device_generation(leaf, dev_item, 1603 device->generation); 1604 btrfs_mark_buffer_dirty(leaf); 1605 } 1606 1607 path->slots[0]++; 1608 goto next_slot; 1609 } 1610 ret = 0; 1611 error: 1612 btrfs_free_path(path); 1613 return ret; 1614 } 1615 1616 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1617 { 1618 struct request_queue *q; 1619 struct btrfs_trans_handle *trans; 1620 struct btrfs_device *device; 1621 struct block_device *bdev; 1622 struct list_head *devices; 1623 struct super_block *sb = root->fs_info->sb; 1624 u64 total_bytes; 1625 int seeding_dev = 0; 1626 int ret = 0; 1627 1628 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1629 return -EINVAL; 1630 1631 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1632 root->fs_info->bdev_holder); 1633 if (IS_ERR(bdev)) 1634 return PTR_ERR(bdev); 1635 1636 if (root->fs_info->fs_devices->seeding) { 1637 seeding_dev = 1; 1638 down_write(&sb->s_umount); 1639 mutex_lock(&uuid_mutex); 1640 } 1641 1642 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1643 1644 devices = &root->fs_info->fs_devices->devices; 1645 /* 1646 * we have the volume lock, so we don't need the extra 1647 * device list mutex while reading the list here. 
1648 */ 1649 list_for_each_entry(device, devices, dev_list) { 1650 if (device->bdev == bdev) { 1651 ret = -EEXIST; 1652 goto error; 1653 } 1654 } 1655 1656 device = kzalloc(sizeof(*device), GFP_NOFS); 1657 if (!device) { 1658 /* we can safely leave the fs_devices entry around */ 1659 ret = -ENOMEM; 1660 goto error; 1661 } 1662 1663 device->name = kstrdup(device_path, GFP_NOFS); 1664 if (!device->name) { 1665 kfree(device); 1666 ret = -ENOMEM; 1667 goto error; 1668 } 1669 1670 ret = find_next_devid(root, &device->devid); 1671 if (ret) { 1672 kfree(device->name); 1673 kfree(device); 1674 goto error; 1675 } 1676 1677 trans = btrfs_start_transaction(root, 0); 1678 if (IS_ERR(trans)) { 1679 kfree(device->name); 1680 kfree(device); 1681 ret = PTR_ERR(trans); 1682 goto error; 1683 } 1684 1685 lock_chunks(root); 1686 1687 q = bdev_get_queue(bdev); 1688 if (blk_queue_discard(q)) 1689 device->can_discard = 1; 1690 device->writeable = 1; 1691 device->work.func = pending_bios_fn; 1692 generate_random_uuid(device->uuid); 1693 spin_lock_init(&device->io_lock); 1694 device->generation = trans->transid; 1695 device->io_width = root->sectorsize; 1696 device->io_align = root->sectorsize; 1697 device->sector_size = root->sectorsize; 1698 device->total_bytes = i_size_read(bdev->bd_inode); 1699 device->disk_total_bytes = device->total_bytes; 1700 device->dev_root = root->fs_info->dev_root; 1701 device->bdev = bdev; 1702 device->in_fs_metadata = 1; 1703 device->mode = FMODE_EXCL; 1704 set_blocksize(device->bdev, 4096); 1705 1706 if (seeding_dev) { 1707 sb->s_flags &= ~MS_RDONLY; 1708 ret = btrfs_prepare_sprout(root); 1709 BUG_ON(ret); 1710 } 1711 1712 device->fs_devices = root->fs_info->fs_devices; 1713 1714 /* 1715 * we don't want write_supers to jump in here with our device 1716 * half setup 1717 */ 1718 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1719 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1720 list_add(&device->dev_alloc_list, 1721 &root->fs_info->fs_devices->alloc_list); 1722 root->fs_info->fs_devices->num_devices++; 1723 root->fs_info->fs_devices->open_devices++; 1724 root->fs_info->fs_devices->rw_devices++; 1725 if (device->can_discard) 1726 root->fs_info->fs_devices->num_can_discard++; 1727 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1728 1729 spin_lock(&root->fs_info->free_chunk_lock); 1730 root->fs_info->free_chunk_space += device->total_bytes; 1731 spin_unlock(&root->fs_info->free_chunk_lock); 1732 1733 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1734 root->fs_info->fs_devices->rotating = 1; 1735 1736 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 1737 btrfs_set_super_total_bytes(root->fs_info->super_copy, 1738 total_bytes + device->total_bytes); 1739 1740 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 1741 btrfs_set_super_num_devices(root->fs_info->super_copy, 1742 total_bytes + 1); 1743 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1744 1745 if (seeding_dev) { 1746 ret = init_first_rw_device(trans, root, device); 1747 BUG_ON(ret); 1748 ret = btrfs_finish_sprout(trans, root); 1749 BUG_ON(ret); 1750 } else { 1751 ret = btrfs_add_device(trans, root, device); 1752 } 1753 1754 /* 1755 * we've got more storage, clear any full flags on the space 1756 * infos 1757 */ 1758 btrfs_clear_space_info_full(root->fs_info); 1759 1760 unlock_chunks(root); 1761 btrfs_commit_transaction(trans, root); 1762 1763 if (seeding_dev) { 1764 mutex_unlock(&uuid_mutex); 1765 up_write(&sb->s_umount); 1766 
1767 ret = btrfs_relocate_sys_chunks(root); 1768 BUG_ON(ret); 1769 } 1770 1771 return ret; 1772 error: 1773 blkdev_put(bdev, FMODE_EXCL); 1774 if (seeding_dev) { 1775 mutex_unlock(&uuid_mutex); 1776 up_write(&sb->s_umount); 1777 } 1778 return ret; 1779 } 1780 1781 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1782 struct btrfs_device *device) 1783 { 1784 int ret; 1785 struct btrfs_path *path; 1786 struct btrfs_root *root; 1787 struct btrfs_dev_item *dev_item; 1788 struct extent_buffer *leaf; 1789 struct btrfs_key key; 1790 1791 root = device->dev_root->fs_info->chunk_root; 1792 1793 path = btrfs_alloc_path(); 1794 if (!path) 1795 return -ENOMEM; 1796 1797 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1798 key.type = BTRFS_DEV_ITEM_KEY; 1799 key.offset = device->devid; 1800 1801 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1802 if (ret < 0) 1803 goto out; 1804 1805 if (ret > 0) { 1806 ret = -ENOENT; 1807 goto out; 1808 } 1809 1810 leaf = path->nodes[0]; 1811 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1812 1813 btrfs_set_device_id(leaf, dev_item, device->devid); 1814 btrfs_set_device_type(leaf, dev_item, device->type); 1815 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1816 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1817 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1818 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1819 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1820 btrfs_mark_buffer_dirty(leaf); 1821 1822 out: 1823 btrfs_free_path(path); 1824 return ret; 1825 } 1826 1827 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1828 struct btrfs_device *device, u64 new_size) 1829 { 1830 struct btrfs_super_block *super_copy = 1831 device->dev_root->fs_info->super_copy; 1832 u64 old_total = btrfs_super_total_bytes(super_copy); 1833 u64 diff = new_size - device->total_bytes; 1834 1835 if (!device->writeable) 1836 return -EACCES; 1837 if (new_size <= device->total_bytes) 1838 return -EINVAL; 1839 1840 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1841 device->fs_devices->total_rw_bytes += diff; 1842 1843 device->total_bytes = new_size; 1844 device->disk_total_bytes = new_size; 1845 btrfs_clear_space_info_full(device->dev_root->fs_info); 1846 1847 return btrfs_update_device(trans, device); 1848 } 1849 1850 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1851 struct btrfs_device *device, u64 new_size) 1852 { 1853 int ret; 1854 lock_chunks(device->dev_root); 1855 ret = __btrfs_grow_device(trans, device, new_size); 1856 unlock_chunks(device->dev_root); 1857 return ret; 1858 } 1859 1860 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1861 struct btrfs_root *root, 1862 u64 chunk_tree, u64 chunk_objectid, 1863 u64 chunk_offset) 1864 { 1865 int ret; 1866 struct btrfs_path *path; 1867 struct btrfs_key key; 1868 1869 root = root->fs_info->chunk_root; 1870 path = btrfs_alloc_path(); 1871 if (!path) 1872 return -ENOMEM; 1873 1874 key.objectid = chunk_objectid; 1875 key.offset = chunk_offset; 1876 key.type = BTRFS_CHUNK_ITEM_KEY; 1877 1878 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1879 BUG_ON(ret); 1880 1881 ret = btrfs_del_item(trans, root, path); 1882 1883 btrfs_free_path(path); 1884 return ret; 1885 } 1886 1887 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1888 chunk_offset) 1889 { 1890 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 
1891 struct btrfs_disk_key *disk_key; 1892 struct btrfs_chunk *chunk; 1893 u8 *ptr; 1894 int ret = 0; 1895 u32 num_stripes; 1896 u32 array_size; 1897 u32 len = 0; 1898 u32 cur; 1899 struct btrfs_key key; 1900 1901 array_size = btrfs_super_sys_array_size(super_copy); 1902 1903 ptr = super_copy->sys_chunk_array; 1904 cur = 0; 1905 1906 while (cur < array_size) { 1907 disk_key = (struct btrfs_disk_key *)ptr; 1908 btrfs_disk_key_to_cpu(&key, disk_key); 1909 1910 len = sizeof(*disk_key); 1911 1912 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1913 chunk = (struct btrfs_chunk *)(ptr + len); 1914 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1915 len += btrfs_chunk_item_size(num_stripes); 1916 } else { 1917 ret = -EIO; 1918 break; 1919 } 1920 if (key.objectid == chunk_objectid && 1921 key.offset == chunk_offset) { 1922 memmove(ptr, ptr + len, array_size - (cur + len)); 1923 array_size -= len; 1924 btrfs_set_super_sys_array_size(super_copy, array_size); 1925 } else { 1926 ptr += len; 1927 cur += len; 1928 } 1929 } 1930 return ret; 1931 } 1932 1933 static int btrfs_relocate_chunk(struct btrfs_root *root, 1934 u64 chunk_tree, u64 chunk_objectid, 1935 u64 chunk_offset) 1936 { 1937 struct extent_map_tree *em_tree; 1938 struct btrfs_root *extent_root; 1939 struct btrfs_trans_handle *trans; 1940 struct extent_map *em; 1941 struct map_lookup *map; 1942 int ret; 1943 int i; 1944 1945 root = root->fs_info->chunk_root; 1946 extent_root = root->fs_info->extent_root; 1947 em_tree = &root->fs_info->mapping_tree.map_tree; 1948 1949 ret = btrfs_can_relocate(extent_root, chunk_offset); 1950 if (ret) 1951 return -ENOSPC; 1952 1953 /* step one, relocate all the extents inside this chunk */ 1954 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1955 if (ret) 1956 return ret; 1957 1958 trans = btrfs_start_transaction(root, 0); 1959 BUG_ON(IS_ERR(trans)); 1960 1961 lock_chunks(root); 1962 1963 /* 1964 * step two, delete the device extents and the 1965 * chunk tree entries 1966 */ 1967 read_lock(&em_tree->lock); 1968 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1969 read_unlock(&em_tree->lock); 1970 1971 BUG_ON(!em || em->start > chunk_offset || 1972 em->start + em->len < chunk_offset); 1973 map = (struct map_lookup *)em->bdev; 1974 1975 for (i = 0; i < map->num_stripes; i++) { 1976 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1977 map->stripes[i].physical); 1978 BUG_ON(ret); 1979 1980 if (map->stripes[i].dev) { 1981 ret = btrfs_update_device(trans, map->stripes[i].dev); 1982 BUG_ON(ret); 1983 } 1984 } 1985 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1986 chunk_offset); 1987 1988 BUG_ON(ret); 1989 1990 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 1991 1992 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1993 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1994 BUG_ON(ret); 1995 } 1996 1997 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1998 BUG_ON(ret); 1999 2000 write_lock(&em_tree->lock); 2001 remove_extent_mapping(em_tree, em); 2002 write_unlock(&em_tree->lock); 2003 2004 kfree(map); 2005 em->bdev = NULL; 2006 2007 /* once for the tree */ 2008 free_extent_map(em); 2009 /* once for us */ 2010 free_extent_map(em); 2011 2012 unlock_chunks(root); 2013 btrfs_end_transaction(trans, root); 2014 return 0; 2015 } 2016 2017 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2018 { 2019 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2020 struct btrfs_path *path; 2021 struct extent_buffer *leaf; 2022 struct 
btrfs_chunk *chunk; 2023 struct btrfs_key key; 2024 struct btrfs_key found_key; 2025 u64 chunk_tree = chunk_root->root_key.objectid; 2026 u64 chunk_type; 2027 bool retried = false; 2028 int failed = 0; 2029 int ret; 2030 2031 path = btrfs_alloc_path(); 2032 if (!path) 2033 return -ENOMEM; 2034 2035 again: 2036 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2037 key.offset = (u64)-1; 2038 key.type = BTRFS_CHUNK_ITEM_KEY; 2039 2040 while (1) { 2041 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2042 if (ret < 0) 2043 goto error; 2044 BUG_ON(ret == 0); 2045 2046 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2047 key.type); 2048 if (ret < 0) 2049 goto error; 2050 if (ret > 0) 2051 break; 2052 2053 leaf = path->nodes[0]; 2054 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2055 2056 chunk = btrfs_item_ptr(leaf, path->slots[0], 2057 struct btrfs_chunk); 2058 chunk_type = btrfs_chunk_type(leaf, chunk); 2059 btrfs_release_path(path); 2060 2061 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2062 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2063 found_key.objectid, 2064 found_key.offset); 2065 if (ret == -ENOSPC) 2066 failed++; 2067 else if (ret) 2068 BUG(); 2069 } 2070 2071 if (found_key.offset == 0) 2072 break; 2073 key.offset = found_key.offset - 1; 2074 } 2075 ret = 0; 2076 if (failed && !retried) { 2077 failed = 0; 2078 retried = true; 2079 goto again; 2080 } else if (failed && retried) { 2081 WARN_ON(1); 2082 ret = -ENOSPC; 2083 } 2084 error: 2085 btrfs_free_path(path); 2086 return ret; 2087 } 2088 2089 static int insert_balance_item(struct btrfs_root *root, 2090 struct btrfs_balance_control *bctl) 2091 { 2092 struct btrfs_trans_handle *trans; 2093 struct btrfs_balance_item *item; 2094 struct btrfs_disk_balance_args disk_bargs; 2095 struct btrfs_path *path; 2096 struct extent_buffer *leaf; 2097 struct btrfs_key key; 2098 int ret, err; 2099 2100 path = btrfs_alloc_path(); 2101 if (!path) 2102 return -ENOMEM; 2103 2104 trans = btrfs_start_transaction(root, 0); 2105 if (IS_ERR(trans)) { 2106 btrfs_free_path(path); 2107 return PTR_ERR(trans); 2108 } 2109 2110 key.objectid = BTRFS_BALANCE_OBJECTID; 2111 key.type = BTRFS_BALANCE_ITEM_KEY; 2112 key.offset = 0; 2113 2114 ret = btrfs_insert_empty_item(trans, root, path, &key, 2115 sizeof(*item)); 2116 if (ret) 2117 goto out; 2118 2119 leaf = path->nodes[0]; 2120 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2121 2122 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2123 2124 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2125 btrfs_set_balance_data(leaf, item, &disk_bargs); 2126 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2127 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2128 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2129 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2130 2131 btrfs_set_balance_flags(leaf, item, bctl->flags); 2132 2133 btrfs_mark_buffer_dirty(leaf); 2134 out: 2135 btrfs_free_path(path); 2136 err = btrfs_commit_transaction(trans, root); 2137 if (err && !ret) 2138 ret = err; 2139 return ret; 2140 } 2141 2142 static int del_balance_item(struct btrfs_root *root) 2143 { 2144 struct btrfs_trans_handle *trans; 2145 struct btrfs_path *path; 2146 struct btrfs_key key; 2147 int ret, err; 2148 2149 path = btrfs_alloc_path(); 2150 if (!path) 2151 return -ENOMEM; 2152 2153 trans = btrfs_start_transaction(root, 0); 2154 if (IS_ERR(trans)) { 2155 btrfs_free_path(path); 2156 return PTR_ERR(trans); 2157 } 2158 2159 
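	/*
	 * The balance item lives at a fixed key
	 * (BTRFS_BALANCE_OBJECTID, BTRFS_BALANCE_ITEM_KEY, 0) in the tree
	 * root, so deleting it is a plain search followed by btrfs_del_item().
	 */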
key.objectid = BTRFS_BALANCE_OBJECTID;
2160 key.type = BTRFS_BALANCE_ITEM_KEY;
2161 key.offset = 0;
2162
2163 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2164 if (ret < 0)
2165 goto out;
2166 if (ret > 0) {
2167 ret = -ENOENT;
2168 goto out;
2169 }
2170
2171 ret = btrfs_del_item(trans, root, path);
2172 out:
2173 btrfs_free_path(path);
2174 err = btrfs_commit_transaction(trans, root);
2175 if (err && !ret)
2176 ret = err;
2177 return ret;
2178 }
2179
2180 /*
2181 * This is a heuristic used to reduce the number of chunks balanced on
2182 * resume after a balance was interrupted.
2183 */
2184 static void update_balance_args(struct btrfs_balance_control *bctl)
2185 {
2186 /*
2187 * Turn on soft mode for chunk types that were being converted.
2188 */
2189 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2191 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2192 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2193 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2194 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2195
2196 /*
2197 * Turn on the usage filter if it is not already in use. The idea
2198 * is that chunks that we have already balanced should be
2199 * reasonably full. Don't do it for chunks that are being
2200 * converted - that will keep us from relocating unconverted
2201 * (albeit full) chunks.
2202 */
2203 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2204 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2205 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2206 bctl->data.usage = 90;
2207 }
2208 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2209 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2210 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2211 bctl->sys.usage = 90;
2212 }
2213 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2214 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2215 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2216 bctl->meta.usage = 90;
2217 }
2218 }
2219
2220 /*
2221 * Should be called with both balance and volume mutexes held to
2222 * serialize other volume operations (add_dev/rm_dev/resize) with
2223 * the restriper. Same goes for unset_balance_control.
2224 */
2225 static void set_balance_control(struct btrfs_balance_control *bctl)
2226 {
2227 struct btrfs_fs_info *fs_info = bctl->fs_info;
2228
2229 BUG_ON(fs_info->balance_ctl);
2230
2231 spin_lock(&fs_info->balance_lock);
2232 fs_info->balance_ctl = bctl;
2233 spin_unlock(&fs_info->balance_lock);
2234 }
2235
2236 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2237 {
2238 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2239
2240 BUG_ON(!fs_info->balance_ctl);
2241
2242 spin_lock(&fs_info->balance_lock);
2243 fs_info->balance_ctl = NULL;
2244 spin_unlock(&fs_info->balance_lock);
2245
2246 kfree(bctl);
2247 }
2248
2249 /*
2250 * Balance filters. Return 1 if chunk should be filtered out
2251 * (should not be balanced).
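 *
 * Each filter below returns 1 when the chunk fails its check (skip it)
 * and 0 when the chunk may be relocated; should_balance_chunk() applies
 * only the filters whose flags are set in the balance args.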
2252 */ 2253 static int chunk_profiles_filter(u64 chunk_profile, 2254 struct btrfs_balance_args *bargs) 2255 { 2256 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; 2257 2258 if (chunk_profile == 0) 2259 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 2260 2261 if (bargs->profiles & chunk_profile) 2262 return 0; 2263 2264 return 1; 2265 } 2266 2267 static u64 div_factor_fine(u64 num, int factor) 2268 { 2269 if (factor <= 0) 2270 return 0; 2271 if (factor >= 100) 2272 return num; 2273 2274 num *= factor; 2275 do_div(num, 100); 2276 return num; 2277 } 2278 2279 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2280 struct btrfs_balance_args *bargs) 2281 { 2282 struct btrfs_block_group_cache *cache; 2283 u64 chunk_used, user_thresh; 2284 int ret = 1; 2285 2286 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2287 chunk_used = btrfs_block_group_used(&cache->item); 2288 2289 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2290 if (chunk_used < user_thresh) 2291 ret = 0; 2292 2293 btrfs_put_block_group(cache); 2294 return ret; 2295 } 2296 2297 static int chunk_devid_filter(struct extent_buffer *leaf, 2298 struct btrfs_chunk *chunk, 2299 struct btrfs_balance_args *bargs) 2300 { 2301 struct btrfs_stripe *stripe; 2302 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2303 int i; 2304 2305 for (i = 0; i < num_stripes; i++) { 2306 stripe = btrfs_stripe_nr(chunk, i); 2307 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2308 return 0; 2309 } 2310 2311 return 1; 2312 } 2313 2314 /* [pstart, pend) */ 2315 static int chunk_drange_filter(struct extent_buffer *leaf, 2316 struct btrfs_chunk *chunk, 2317 u64 chunk_offset, 2318 struct btrfs_balance_args *bargs) 2319 { 2320 struct btrfs_stripe *stripe; 2321 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2322 u64 stripe_offset; 2323 u64 stripe_length; 2324 int factor; 2325 int i; 2326 2327 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2328 return 0; 2329 2330 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2331 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2332 factor = 2; 2333 else 2334 factor = 1; 2335 factor = num_stripes / factor; 2336 2337 for (i = 0; i < num_stripes; i++) { 2338 stripe = btrfs_stripe_nr(chunk, i); 2339 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2340 continue; 2341 2342 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2343 stripe_length = btrfs_chunk_length(leaf, chunk); 2344 do_div(stripe_length, factor); 2345 2346 if (stripe_offset < bargs->pend && 2347 stripe_offset + stripe_length > bargs->pstart) 2348 return 0; 2349 } 2350 2351 return 1; 2352 } 2353 2354 /* [vstart, vend) */ 2355 static int chunk_vrange_filter(struct extent_buffer *leaf, 2356 struct btrfs_chunk *chunk, 2357 u64 chunk_offset, 2358 struct btrfs_balance_args *bargs) 2359 { 2360 if (chunk_offset < bargs->vend && 2361 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2362 /* at least part of the chunk is inside this vrange */ 2363 return 0; 2364 2365 return 1; 2366 } 2367 2368 static int chunk_soft_convert_filter(u64 chunk_profile, 2369 struct btrfs_balance_args *bargs) 2370 { 2371 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2372 return 0; 2373 2374 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; 2375 2376 if (chunk_profile == 0) 2377 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 2378 2379 if (bargs->target & chunk_profile) 2380 return 1; 2381 2382 return 0; 2383 } 2384 2385 static int should_balance_chunk(struct btrfs_root *root, 2386 struct 
extent_buffer *leaf, 2387 struct btrfs_chunk *chunk, u64 chunk_offset) 2388 { 2389 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2390 struct btrfs_balance_args *bargs = NULL; 2391 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2392 2393 /* type filter */ 2394 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2395 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2396 return 0; 2397 } 2398 2399 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2400 bargs = &bctl->data; 2401 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2402 bargs = &bctl->sys; 2403 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2404 bargs = &bctl->meta; 2405 2406 /* profiles filter */ 2407 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2408 chunk_profiles_filter(chunk_type, bargs)) { 2409 return 0; 2410 } 2411 2412 /* usage filter */ 2413 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2414 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2415 return 0; 2416 } 2417 2418 /* devid filter */ 2419 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2420 chunk_devid_filter(leaf, chunk, bargs)) { 2421 return 0; 2422 } 2423 2424 /* drange filter, makes sense only with devid filter */ 2425 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2426 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2427 return 0; 2428 } 2429 2430 /* vrange filter */ 2431 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2432 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2433 return 0; 2434 } 2435 2436 /* soft profile changing mode */ 2437 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2438 chunk_soft_convert_filter(chunk_type, bargs)) { 2439 return 0; 2440 } 2441 2442 return 1; 2443 } 2444 2445 static u64 div_factor(u64 num, int factor) 2446 { 2447 if (factor == 10) 2448 return num; 2449 num *= factor; 2450 do_div(num, 10); 2451 return num; 2452 } 2453 2454 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2455 { 2456 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2457 struct btrfs_root *chunk_root = fs_info->chunk_root; 2458 struct btrfs_root *dev_root = fs_info->dev_root; 2459 struct list_head *devices; 2460 struct btrfs_device *device; 2461 u64 old_size; 2462 u64 size_to_free; 2463 struct btrfs_chunk *chunk; 2464 struct btrfs_path *path; 2465 struct btrfs_key key; 2466 struct btrfs_key found_key; 2467 struct btrfs_trans_handle *trans; 2468 struct extent_buffer *leaf; 2469 int slot; 2470 int ret; 2471 int enospc_errors = 0; 2472 bool counting = true; 2473 2474 /* step one make some room on all the devices */ 2475 devices = &fs_info->fs_devices->devices; 2476 list_for_each_entry(device, devices, dev_list) { 2477 old_size = device->total_bytes; 2478 size_to_free = div_factor(old_size, 1); 2479 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2480 if (!device->writeable || 2481 device->total_bytes - device->bytes_used > size_to_free) 2482 continue; 2483 2484 ret = btrfs_shrink_device(device, old_size - size_to_free); 2485 if (ret == -ENOSPC) 2486 break; 2487 BUG_ON(ret); 2488 2489 trans = btrfs_start_transaction(dev_root, 0); 2490 BUG_ON(IS_ERR(trans)); 2491 2492 ret = btrfs_grow_device(trans, device, old_size); 2493 BUG_ON(ret); 2494 2495 btrfs_end_transaction(trans, dev_root); 2496 } 2497 2498 /* step two, relocate all the chunks */ 2499 path = btrfs_alloc_path(); 2500 if (!path) { 2501 ret = -ENOMEM; 2502 goto error; 2503 } 2504 2505 /* zero out stat counters */ 2506 spin_lock(&fs_info->balance_lock); 2507 memset(&bctl->stat, 0, sizeof(bctl->stat)); 2508 
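	/*
	 * Note: the chunk loop below runs twice.  The first pass, with
	 * counting == true, only counts the chunks that pass the filters
	 * (bctl->stat.expected); the second pass walks the chunk tree again
	 * and actually relocates them.
	 */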
spin_unlock(&fs_info->balance_lock); 2509 again: 2510 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2511 key.offset = (u64)-1; 2512 key.type = BTRFS_CHUNK_ITEM_KEY; 2513 2514 while (1) { 2515 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 2516 atomic_read(&fs_info->balance_cancel_req)) { 2517 ret = -ECANCELED; 2518 goto error; 2519 } 2520 2521 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2522 if (ret < 0) 2523 goto error; 2524 2525 /* 2526 * this shouldn't happen, it means the last relocate 2527 * failed 2528 */ 2529 if (ret == 0) 2530 BUG(); /* FIXME break ? */ 2531 2532 ret = btrfs_previous_item(chunk_root, path, 0, 2533 BTRFS_CHUNK_ITEM_KEY); 2534 if (ret) { 2535 ret = 0; 2536 break; 2537 } 2538 2539 leaf = path->nodes[0]; 2540 slot = path->slots[0]; 2541 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2542 2543 if (found_key.objectid != key.objectid) 2544 break; 2545 2546 /* chunk zero is special */ 2547 if (found_key.offset == 0) 2548 break; 2549 2550 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 2551 2552 if (!counting) { 2553 spin_lock(&fs_info->balance_lock); 2554 bctl->stat.considered++; 2555 spin_unlock(&fs_info->balance_lock); 2556 } 2557 2558 ret = should_balance_chunk(chunk_root, leaf, chunk, 2559 found_key.offset); 2560 btrfs_release_path(path); 2561 if (!ret) 2562 goto loop; 2563 2564 if (counting) { 2565 spin_lock(&fs_info->balance_lock); 2566 bctl->stat.expected++; 2567 spin_unlock(&fs_info->balance_lock); 2568 goto loop; 2569 } 2570 2571 ret = btrfs_relocate_chunk(chunk_root, 2572 chunk_root->root_key.objectid, 2573 found_key.objectid, 2574 found_key.offset); 2575 if (ret && ret != -ENOSPC) 2576 goto error; 2577 if (ret == -ENOSPC) { 2578 enospc_errors++; 2579 } else { 2580 spin_lock(&fs_info->balance_lock); 2581 bctl->stat.completed++; 2582 spin_unlock(&fs_info->balance_lock); 2583 } 2584 loop: 2585 key.offset = found_key.offset - 1; 2586 } 2587 2588 if (counting) { 2589 btrfs_release_path(path); 2590 counting = false; 2591 goto again; 2592 } 2593 error: 2594 btrfs_free_path(path); 2595 if (enospc_errors) { 2596 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 2597 enospc_errors); 2598 if (!ret) 2599 ret = -ENOSPC; 2600 } 2601 2602 return ret; 2603 } 2604 2605 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 2606 { 2607 /* cancel requested || normal exit path */ 2608 return atomic_read(&fs_info->balance_cancel_req) || 2609 (atomic_read(&fs_info->balance_pause_req) == 0 && 2610 atomic_read(&fs_info->balance_cancel_req) == 0); 2611 } 2612 2613 static void __cancel_balance(struct btrfs_fs_info *fs_info) 2614 { 2615 int ret; 2616 2617 unset_balance_control(fs_info); 2618 ret = del_balance_item(fs_info->tree_root); 2619 BUG_ON(ret); 2620 } 2621 2622 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2623 struct btrfs_ioctl_balance_args *bargs); 2624 2625 /* 2626 * Should be called with both balance and volume mutexes held 2627 */ 2628 int btrfs_balance(struct btrfs_balance_control *bctl, 2629 struct btrfs_ioctl_balance_args *bargs) 2630 { 2631 struct btrfs_fs_info *fs_info = bctl->fs_info; 2632 u64 allowed; 2633 int ret; 2634 2635 if (btrfs_fs_closing(fs_info) || 2636 atomic_read(&fs_info->balance_pause_req) || 2637 atomic_read(&fs_info->balance_cancel_req)) { 2638 ret = -EINVAL; 2639 goto out; 2640 } 2641 2642 /* 2643 * In case of mixed groups both data and meta should be picked, 2644 * and identical options should be given for both of them. 
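 * With the MIXED_GROUPS incompat bit set, data and metadata live in the
 * same chunks, so balancing one of them with options that differ from
 * the other would be ambiguous, hence the -EINVAL below.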
2645 */ 2646 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 2647 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 2648 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { 2649 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 2650 !(bctl->flags & BTRFS_BALANCE_METADATA) || 2651 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 2652 printk(KERN_ERR "btrfs: with mixed groups data and " 2653 "metadata balance options must be the same\n"); 2654 ret = -EINVAL; 2655 goto out; 2656 } 2657 } 2658 2659 /* 2660 * Profile changing sanity checks. Skip them if a simple 2661 * balance is requested. 2662 */ 2663 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & 2664 BTRFS_BALANCE_ARGS_CONVERT)) 2665 goto do_balance; 2666 2667 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 2668 if (fs_info->fs_devices->num_devices == 1) 2669 allowed |= BTRFS_BLOCK_GROUP_DUP; 2670 else if (fs_info->fs_devices->num_devices < 4) 2671 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 2672 else 2673 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2674 BTRFS_BLOCK_GROUP_RAID10); 2675 2676 if (!profile_is_valid(bctl->data.target, 1) || 2677 bctl->data.target & ~allowed) { 2678 printk(KERN_ERR "btrfs: unable to start balance with target " 2679 "data profile %llu\n", 2680 (unsigned long long)bctl->data.target); 2681 ret = -EINVAL; 2682 goto out; 2683 } 2684 if (!profile_is_valid(bctl->meta.target, 1) || 2685 bctl->meta.target & ~allowed) { 2686 printk(KERN_ERR "btrfs: unable to start balance with target " 2687 "metadata profile %llu\n", 2688 (unsigned long long)bctl->meta.target); 2689 ret = -EINVAL; 2690 goto out; 2691 } 2692 if (!profile_is_valid(bctl->sys.target, 1) || 2693 bctl->sys.target & ~allowed) { 2694 printk(KERN_ERR "btrfs: unable to start balance with target " 2695 "system profile %llu\n", 2696 (unsigned long long)bctl->sys.target); 2697 ret = -EINVAL; 2698 goto out; 2699 } 2700 2701 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { 2702 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 2703 ret = -EINVAL; 2704 goto out; 2705 } 2706 2707 /* allow to reduce meta or sys integrity only if force set */ 2708 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 2709 BTRFS_BLOCK_GROUP_RAID10; 2710 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2711 (fs_info->avail_system_alloc_bits & allowed) && 2712 !(bctl->sys.target & allowed)) || 2713 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2714 (fs_info->avail_metadata_alloc_bits & allowed) && 2715 !(bctl->meta.target & allowed))) { 2716 if (bctl->flags & BTRFS_BALANCE_FORCE) { 2717 printk(KERN_INFO "btrfs: force reducing metadata " 2718 "integrity\n"); 2719 } else { 2720 printk(KERN_ERR "btrfs: balance will reduce metadata " 2721 "integrity, use force if you want this\n"); 2722 ret = -EINVAL; 2723 goto out; 2724 } 2725 } 2726 2727 do_balance: 2728 ret = insert_balance_item(fs_info->tree_root, bctl); 2729 if (ret && ret != -EEXIST) 2730 goto out; 2731 2732 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 2733 BUG_ON(ret == -EEXIST); 2734 set_balance_control(bctl); 2735 } else { 2736 BUG_ON(ret != -EEXIST); 2737 spin_lock(&fs_info->balance_lock); 2738 update_balance_args(bctl); 2739 spin_unlock(&fs_info->balance_lock); 2740 } 2741 2742 atomic_inc(&fs_info->balance_running); 2743 mutex_unlock(&fs_info->balance_mutex); 2744 2745 ret = __btrfs_balance(fs_info); 2746 2747 mutex_lock(&fs_info->balance_mutex); 2748 atomic_dec(&fs_info->balance_running); 2749 2750 if (bargs) { 2751 memset(bargs, 0, 
sizeof(*bargs)); 2752 update_ioctl_balance_args(fs_info, 0, bargs); 2753 } 2754 2755 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 2756 balance_need_close(fs_info)) { 2757 __cancel_balance(fs_info); 2758 } 2759 2760 wake_up(&fs_info->balance_wait_q); 2761 2762 return ret; 2763 out: 2764 if (bctl->flags & BTRFS_BALANCE_RESUME) 2765 __cancel_balance(fs_info); 2766 else 2767 kfree(bctl); 2768 return ret; 2769 } 2770 2771 static int balance_kthread(void *data) 2772 { 2773 struct btrfs_balance_control *bctl = 2774 (struct btrfs_balance_control *)data; 2775 struct btrfs_fs_info *fs_info = bctl->fs_info; 2776 int ret = 0; 2777 2778 mutex_lock(&fs_info->volume_mutex); 2779 mutex_lock(&fs_info->balance_mutex); 2780 2781 set_balance_control(bctl); 2782 2783 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 2784 printk(KERN_INFO "btrfs: force skipping balance\n"); 2785 } else { 2786 printk(KERN_INFO "btrfs: continuing balance\n"); 2787 ret = btrfs_balance(bctl, NULL); 2788 } 2789 2790 mutex_unlock(&fs_info->balance_mutex); 2791 mutex_unlock(&fs_info->volume_mutex); 2792 return ret; 2793 } 2794 2795 int btrfs_recover_balance(struct btrfs_root *tree_root) 2796 { 2797 struct task_struct *tsk; 2798 struct btrfs_balance_control *bctl; 2799 struct btrfs_balance_item *item; 2800 struct btrfs_disk_balance_args disk_bargs; 2801 struct btrfs_path *path; 2802 struct extent_buffer *leaf; 2803 struct btrfs_key key; 2804 int ret; 2805 2806 path = btrfs_alloc_path(); 2807 if (!path) 2808 return -ENOMEM; 2809 2810 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 2811 if (!bctl) { 2812 ret = -ENOMEM; 2813 goto out; 2814 } 2815 2816 key.objectid = BTRFS_BALANCE_OBJECTID; 2817 key.type = BTRFS_BALANCE_ITEM_KEY; 2818 key.offset = 0; 2819 2820 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 2821 if (ret < 0) 2822 goto out_bctl; 2823 if (ret > 0) { /* ret = -ENOENT; */ 2824 ret = 0; 2825 goto out_bctl; 2826 } 2827 2828 leaf = path->nodes[0]; 2829 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2830 2831 bctl->fs_info = tree_root->fs_info; 2832 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; 2833 2834 btrfs_balance_data(leaf, item, &disk_bargs); 2835 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 2836 btrfs_balance_meta(leaf, item, &disk_bargs); 2837 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 2838 btrfs_balance_sys(leaf, item, &disk_bargs); 2839 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 2840 2841 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); 2842 if (IS_ERR(tsk)) 2843 ret = PTR_ERR(tsk); 2844 else 2845 goto out; 2846 2847 out_bctl: 2848 kfree(bctl); 2849 out: 2850 btrfs_free_path(path); 2851 return ret; 2852 } 2853 2854 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 2855 { 2856 int ret = 0; 2857 2858 mutex_lock(&fs_info->balance_mutex); 2859 if (!fs_info->balance_ctl) { 2860 mutex_unlock(&fs_info->balance_mutex); 2861 return -ENOTCONN; 2862 } 2863 2864 if (atomic_read(&fs_info->balance_running)) { 2865 atomic_inc(&fs_info->balance_pause_req); 2866 mutex_unlock(&fs_info->balance_mutex); 2867 2868 wait_event(fs_info->balance_wait_q, 2869 atomic_read(&fs_info->balance_running) == 0); 2870 2871 mutex_lock(&fs_info->balance_mutex); 2872 /* we are good with balance_ctl ripped off from under us */ 2873 BUG_ON(atomic_read(&fs_info->balance_running)); 2874 atomic_dec(&fs_info->balance_pause_req); 2875 } else { 2876 ret = -ENOTCONN; 2877 } 2878 2879 mutex_unlock(&fs_info->balance_mutex); 2880 return ret; 2881 } 
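/*
 * Cancel a running or paused balance.  Bumping balance_cancel_req makes a
 * running balance bail out with -ECANCELED, and btrfs_balance() then
 * removes the balance item itself; if nothing is running we take
 * volume_mutex and delete the item here via __cancel_balance().
 */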
2882 2883 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 2884 { 2885 mutex_lock(&fs_info->balance_mutex); 2886 if (!fs_info->balance_ctl) { 2887 mutex_unlock(&fs_info->balance_mutex); 2888 return -ENOTCONN; 2889 } 2890 2891 atomic_inc(&fs_info->balance_cancel_req); 2892 /* 2893 * if we are running just wait and return, balance item is 2894 * deleted in btrfs_balance in this case 2895 */ 2896 if (atomic_read(&fs_info->balance_running)) { 2897 mutex_unlock(&fs_info->balance_mutex); 2898 wait_event(fs_info->balance_wait_q, 2899 atomic_read(&fs_info->balance_running) == 0); 2900 mutex_lock(&fs_info->balance_mutex); 2901 } else { 2902 /* __cancel_balance needs volume_mutex */ 2903 mutex_unlock(&fs_info->balance_mutex); 2904 mutex_lock(&fs_info->volume_mutex); 2905 mutex_lock(&fs_info->balance_mutex); 2906 2907 if (fs_info->balance_ctl) 2908 __cancel_balance(fs_info); 2909 2910 mutex_unlock(&fs_info->volume_mutex); 2911 } 2912 2913 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 2914 atomic_dec(&fs_info->balance_cancel_req); 2915 mutex_unlock(&fs_info->balance_mutex); 2916 return 0; 2917 } 2918 2919 /* 2920 * shrinking a device means finding all of the device extents past 2921 * the new size, and then following the back refs to the chunks. 2922 * The chunk relocation code actually frees the device extent 2923 */ 2924 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 2925 { 2926 struct btrfs_trans_handle *trans; 2927 struct btrfs_root *root = device->dev_root; 2928 struct btrfs_dev_extent *dev_extent = NULL; 2929 struct btrfs_path *path; 2930 u64 length; 2931 u64 chunk_tree; 2932 u64 chunk_objectid; 2933 u64 chunk_offset; 2934 int ret; 2935 int slot; 2936 int failed = 0; 2937 bool retried = false; 2938 struct extent_buffer *l; 2939 struct btrfs_key key; 2940 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2941 u64 old_total = btrfs_super_total_bytes(super_copy); 2942 u64 old_size = device->total_bytes; 2943 u64 diff = device->total_bytes - new_size; 2944 2945 if (new_size >= device->total_bytes) 2946 return -EINVAL; 2947 2948 path = btrfs_alloc_path(); 2949 if (!path) 2950 return -ENOMEM; 2951 2952 path->reada = 2; 2953 2954 lock_chunks(root); 2955 2956 device->total_bytes = new_size; 2957 if (device->writeable) { 2958 device->fs_devices->total_rw_bytes -= diff; 2959 spin_lock(&root->fs_info->free_chunk_lock); 2960 root->fs_info->free_chunk_space -= diff; 2961 spin_unlock(&root->fs_info->free_chunk_lock); 2962 } 2963 unlock_chunks(root); 2964 2965 again: 2966 key.objectid = device->devid; 2967 key.offset = (u64)-1; 2968 key.type = BTRFS_DEV_EXTENT_KEY; 2969 2970 while (1) { 2971 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2972 if (ret < 0) 2973 goto done; 2974 2975 ret = btrfs_previous_item(root, path, 0, key.type); 2976 if (ret < 0) 2977 goto done; 2978 if (ret) { 2979 ret = 0; 2980 btrfs_release_path(path); 2981 break; 2982 } 2983 2984 l = path->nodes[0]; 2985 slot = path->slots[0]; 2986 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2987 2988 if (key.objectid != device->devid) { 2989 btrfs_release_path(path); 2990 break; 2991 } 2992 2993 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2994 length = btrfs_dev_extent_length(l, dev_extent); 2995 2996 if (key.offset + length <= new_size) { 2997 btrfs_release_path(path); 2998 break; 2999 } 3000 3001 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3002 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3003 chunk_offset = 
btrfs_dev_extent_chunk_offset(l, dev_extent); 3004 btrfs_release_path(path); 3005 3006 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3007 chunk_offset); 3008 if (ret && ret != -ENOSPC) 3009 goto done; 3010 if (ret == -ENOSPC) 3011 failed++; 3012 key.offset -= 1; 3013 } 3014 3015 if (failed && !retried) { 3016 failed = 0; 3017 retried = true; 3018 goto again; 3019 } else if (failed && retried) { 3020 ret = -ENOSPC; 3021 lock_chunks(root); 3022 3023 device->total_bytes = old_size; 3024 if (device->writeable) 3025 device->fs_devices->total_rw_bytes += diff; 3026 spin_lock(&root->fs_info->free_chunk_lock); 3027 root->fs_info->free_chunk_space += diff; 3028 spin_unlock(&root->fs_info->free_chunk_lock); 3029 unlock_chunks(root); 3030 goto done; 3031 } 3032 3033 /* Shrinking succeeded, else we would be at "done". */ 3034 trans = btrfs_start_transaction(root, 0); 3035 if (IS_ERR(trans)) { 3036 ret = PTR_ERR(trans); 3037 goto done; 3038 } 3039 3040 lock_chunks(root); 3041 3042 device->disk_total_bytes = new_size; 3043 /* Now btrfs_update_device() will change the on-disk size. */ 3044 ret = btrfs_update_device(trans, device); 3045 if (ret) { 3046 unlock_chunks(root); 3047 btrfs_end_transaction(trans, root); 3048 goto done; 3049 } 3050 WARN_ON(diff > old_total); 3051 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3052 unlock_chunks(root); 3053 btrfs_end_transaction(trans, root); 3054 done: 3055 btrfs_free_path(path); 3056 return ret; 3057 } 3058 3059 static int btrfs_add_system_chunk(struct btrfs_root *root, 3060 struct btrfs_key *key, 3061 struct btrfs_chunk *chunk, int item_size) 3062 { 3063 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3064 struct btrfs_disk_key disk_key; 3065 u32 array_size; 3066 u8 *ptr; 3067 3068 array_size = btrfs_super_sys_array_size(super_copy); 3069 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3070 return -EFBIG; 3071 3072 ptr = super_copy->sys_chunk_array + array_size; 3073 btrfs_cpu_key_to_disk(&disk_key, key); 3074 memcpy(ptr, &disk_key, sizeof(disk_key)); 3075 ptr += sizeof(disk_key); 3076 memcpy(ptr, chunk, item_size); 3077 item_size += sizeof(disk_key); 3078 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3079 return 0; 3080 } 3081 3082 /* 3083 * sort the devices in descending order by max_avail, total_avail 3084 */ 3085 static int btrfs_cmp_device_info(const void *a, const void *b) 3086 { 3087 const struct btrfs_device_info *di_a = a; 3088 const struct btrfs_device_info *di_b = b; 3089 3090 if (di_a->max_avail > di_b->max_avail) 3091 return -1; 3092 if (di_a->max_avail < di_b->max_avail) 3093 return 1; 3094 if (di_a->total_avail > di_b->total_avail) 3095 return -1; 3096 if (di_a->total_avail < di_b->total_avail) 3097 return 1; 3098 return 0; 3099 } 3100 3101 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3102 struct btrfs_root *extent_root, 3103 struct map_lookup **map_ret, 3104 u64 *num_bytes_out, u64 *stripe_size_out, 3105 u64 start, u64 type) 3106 { 3107 struct btrfs_fs_info *info = extent_root->fs_info; 3108 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3109 struct list_head *cur; 3110 struct map_lookup *map = NULL; 3111 struct extent_map_tree *em_tree; 3112 struct extent_map *em; 3113 struct btrfs_device_info *devices_info = NULL; 3114 u64 total_avail; 3115 int num_stripes; /* total number of stripes to allocate */ 3116 int sub_stripes; /* sub_stripes info for map */ 3117 int dev_stripes; /* stripes per dev */ 3118 int devs_max; /* max devs to use */ 
3119 int devs_min; /* min devs needed */ 3120 int devs_increment; /* ndevs has to be a multiple of this */ 3121 int ncopies; /* how many copies to data has */ 3122 int ret; 3123 u64 max_stripe_size; 3124 u64 max_chunk_size; 3125 u64 stripe_size; 3126 u64 num_bytes; 3127 int ndevs; 3128 int i; 3129 int j; 3130 3131 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 3132 (type & BTRFS_BLOCK_GROUP_DUP)) { 3133 WARN_ON(1); 3134 type &= ~BTRFS_BLOCK_GROUP_DUP; 3135 } 3136 3137 if (list_empty(&fs_devices->alloc_list)) 3138 return -ENOSPC; 3139 3140 sub_stripes = 1; 3141 dev_stripes = 1; 3142 devs_increment = 1; 3143 ncopies = 1; 3144 devs_max = 0; /* 0 == as many as possible */ 3145 devs_min = 1; 3146 3147 /* 3148 * define the properties of each RAID type. 3149 * FIXME: move this to a global table and use it in all RAID 3150 * calculation code 3151 */ 3152 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3153 dev_stripes = 2; 3154 ncopies = 2; 3155 devs_max = 1; 3156 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 3157 devs_min = 2; 3158 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 3159 devs_increment = 2; 3160 ncopies = 2; 3161 devs_max = 2; 3162 devs_min = 2; 3163 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 3164 sub_stripes = 2; 3165 devs_increment = 2; 3166 ncopies = 2; 3167 devs_min = 4; 3168 } else { 3169 devs_max = 1; 3170 } 3171 3172 if (type & BTRFS_BLOCK_GROUP_DATA) { 3173 max_stripe_size = 1024 * 1024 * 1024; 3174 max_chunk_size = 10 * max_stripe_size; 3175 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3176 /* for larger filesystems, use larger metadata chunks */ 3177 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 3178 max_stripe_size = 1024 * 1024 * 1024; 3179 else 3180 max_stripe_size = 256 * 1024 * 1024; 3181 max_chunk_size = max_stripe_size; 3182 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3183 max_stripe_size = 32 * 1024 * 1024; 3184 max_chunk_size = 2 * max_stripe_size; 3185 } else { 3186 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3187 type); 3188 BUG_ON(1); 3189 } 3190 3191 /* we don't want a chunk larger than 10% of writeable space */ 3192 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 3193 max_chunk_size); 3194 3195 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 3196 GFP_NOFS); 3197 if (!devices_info) 3198 return -ENOMEM; 3199 3200 cur = fs_devices->alloc_list.next; 3201 3202 /* 3203 * in the first pass through the devices list, we gather information 3204 * about the available holes on each device. 3205 */ 3206 ndevs = 0; 3207 while (cur != &fs_devices->alloc_list) { 3208 struct btrfs_device *device; 3209 u64 max_avail; 3210 u64 dev_offset; 3211 3212 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 3213 3214 cur = cur->next; 3215 3216 if (!device->writeable) { 3217 printk(KERN_ERR 3218 "btrfs: read-only device in alloc_list\n"); 3219 WARN_ON(1); 3220 continue; 3221 } 3222 3223 if (!device->in_fs_metadata) 3224 continue; 3225 3226 if (device->total_bytes > device->bytes_used) 3227 total_avail = device->total_bytes - device->bytes_used; 3228 else 3229 total_avail = 0; 3230 3231 /* If there is no space on this device, skip it. 
*/ 3232 if (total_avail == 0) 3233 continue; 3234 3235 ret = find_free_dev_extent(device, 3236 max_stripe_size * dev_stripes, 3237 &dev_offset, &max_avail); 3238 if (ret && ret != -ENOSPC) 3239 goto error; 3240 3241 if (ret == 0) 3242 max_avail = max_stripe_size * dev_stripes; 3243 3244 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3245 continue; 3246 3247 devices_info[ndevs].dev_offset = dev_offset; 3248 devices_info[ndevs].max_avail = max_avail; 3249 devices_info[ndevs].total_avail = total_avail; 3250 devices_info[ndevs].dev = device; 3251 ++ndevs; 3252 } 3253 3254 /* 3255 * now sort the devices by hole size / available space 3256 */ 3257 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 3258 btrfs_cmp_device_info, NULL); 3259 3260 /* round down to number of usable stripes */ 3261 ndevs -= ndevs % devs_increment; 3262 3263 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 3264 ret = -ENOSPC; 3265 goto error; 3266 } 3267 3268 if (devs_max && ndevs > devs_max) 3269 ndevs = devs_max; 3270 /* 3271 * the primary goal is to maximize the number of stripes, so use as many 3272 * devices as possible, even if the stripes are not maximum sized. 3273 */ 3274 stripe_size = devices_info[ndevs-1].max_avail; 3275 num_stripes = ndevs * dev_stripes; 3276 3277 if (stripe_size * num_stripes > max_chunk_size * ncopies) { 3278 stripe_size = max_chunk_size * ncopies; 3279 do_div(stripe_size, num_stripes); 3280 } 3281 3282 do_div(stripe_size, dev_stripes); 3283 do_div(stripe_size, BTRFS_STRIPE_LEN); 3284 stripe_size *= BTRFS_STRIPE_LEN; 3285 3286 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3287 if (!map) { 3288 ret = -ENOMEM; 3289 goto error; 3290 } 3291 map->num_stripes = num_stripes; 3292 3293 for (i = 0; i < ndevs; ++i) { 3294 for (j = 0; j < dev_stripes; ++j) { 3295 int s = i * dev_stripes + j; 3296 map->stripes[s].dev = devices_info[i].dev; 3297 map->stripes[s].physical = devices_info[i].dev_offset + 3298 j * stripe_size; 3299 } 3300 } 3301 map->sector_size = extent_root->sectorsize; 3302 map->stripe_len = BTRFS_STRIPE_LEN; 3303 map->io_align = BTRFS_STRIPE_LEN; 3304 map->io_width = BTRFS_STRIPE_LEN; 3305 map->type = type; 3306 map->sub_stripes = sub_stripes; 3307 3308 *map_ret = map; 3309 num_bytes = stripe_size * (num_stripes / ncopies); 3310 3311 *stripe_size_out = stripe_size; 3312 *num_bytes_out = num_bytes; 3313 3314 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3315 3316 em = alloc_extent_map(); 3317 if (!em) { 3318 ret = -ENOMEM; 3319 goto error; 3320 } 3321 em->bdev = (struct block_device *)map; 3322 em->start = start; 3323 em->len = num_bytes; 3324 em->block_start = 0; 3325 em->block_len = em->len; 3326 3327 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3328 write_lock(&em_tree->lock); 3329 ret = add_extent_mapping(em_tree, em); 3330 write_unlock(&em_tree->lock); 3331 BUG_ON(ret); 3332 free_extent_map(em); 3333 3334 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3335 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3336 start, num_bytes); 3337 BUG_ON(ret); 3338 3339 for (i = 0; i < map->num_stripes; ++i) { 3340 struct btrfs_device *device; 3341 u64 dev_offset; 3342 3343 device = map->stripes[i].dev; 3344 dev_offset = map->stripes[i].physical; 3345 3346 ret = btrfs_alloc_dev_extent(trans, device, 3347 info->chunk_root->root_key.objectid, 3348 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3349 start, dev_offset, stripe_size); 3350 BUG_ON(ret); 3351 } 3352 3353 kfree(devices_info); 3354 return 0; 3355 3356 error: 3357 kfree(map); 3358 
kfree(devices_info); 3359 return ret; 3360 } 3361 3362 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3363 struct btrfs_root *extent_root, 3364 struct map_lookup *map, u64 chunk_offset, 3365 u64 chunk_size, u64 stripe_size) 3366 { 3367 u64 dev_offset; 3368 struct btrfs_key key; 3369 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3370 struct btrfs_device *device; 3371 struct btrfs_chunk *chunk; 3372 struct btrfs_stripe *stripe; 3373 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3374 int index = 0; 3375 int ret; 3376 3377 chunk = kzalloc(item_size, GFP_NOFS); 3378 if (!chunk) 3379 return -ENOMEM; 3380 3381 index = 0; 3382 while (index < map->num_stripes) { 3383 device = map->stripes[index].dev; 3384 device->bytes_used += stripe_size; 3385 ret = btrfs_update_device(trans, device); 3386 BUG_ON(ret); 3387 index++; 3388 } 3389 3390 spin_lock(&extent_root->fs_info->free_chunk_lock); 3391 extent_root->fs_info->free_chunk_space -= (stripe_size * 3392 map->num_stripes); 3393 spin_unlock(&extent_root->fs_info->free_chunk_lock); 3394 3395 index = 0; 3396 stripe = &chunk->stripe; 3397 while (index < map->num_stripes) { 3398 device = map->stripes[index].dev; 3399 dev_offset = map->stripes[index].physical; 3400 3401 btrfs_set_stack_stripe_devid(stripe, device->devid); 3402 btrfs_set_stack_stripe_offset(stripe, dev_offset); 3403 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 3404 stripe++; 3405 index++; 3406 } 3407 3408 btrfs_set_stack_chunk_length(chunk, chunk_size); 3409 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 3410 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 3411 btrfs_set_stack_chunk_type(chunk, map->type); 3412 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 3413 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 3414 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 3415 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 3416 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 3417 3418 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3419 key.type = BTRFS_CHUNK_ITEM_KEY; 3420 key.offset = chunk_offset; 3421 3422 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 3423 BUG_ON(ret); 3424 3425 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3426 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 3427 item_size); 3428 BUG_ON(ret); 3429 } 3430 3431 kfree(chunk); 3432 return 0; 3433 } 3434 3435 /* 3436 * Chunk allocation falls into two parts. The first part does works 3437 * that make the new allocated chunk useable, but not do any operation 3438 * that modifies the chunk tree. The second part does the works that 3439 * require modifying the chunk tree. This division is important for the 3440 * bootstrap process of adding storage to a seed btrfs. 
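 *
 * Concretely, __btrfs_alloc_chunk() chooses the devices, builds the
 * map_lookup/extent_map, creates the block group and the device extents,
 * while __finish_chunk_alloc() updates the device items, inserts the
 * chunk item into the chunk tree and, for SYSTEM chunks, appends the
 * entry to the superblock's sys_chunk_array.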
3441 */ 3442 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3443 struct btrfs_root *extent_root, u64 type) 3444 { 3445 u64 chunk_offset; 3446 u64 chunk_size; 3447 u64 stripe_size; 3448 struct map_lookup *map; 3449 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3450 int ret; 3451 3452 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3453 &chunk_offset); 3454 if (ret) 3455 return ret; 3456 3457 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3458 &stripe_size, chunk_offset, type); 3459 if (ret) 3460 return ret; 3461 3462 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3463 chunk_size, stripe_size); 3464 BUG_ON(ret); 3465 return 0; 3466 } 3467 3468 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 3469 struct btrfs_root *root, 3470 struct btrfs_device *device) 3471 { 3472 u64 chunk_offset; 3473 u64 sys_chunk_offset; 3474 u64 chunk_size; 3475 u64 sys_chunk_size; 3476 u64 stripe_size; 3477 u64 sys_stripe_size; 3478 u64 alloc_profile; 3479 struct map_lookup *map; 3480 struct map_lookup *sys_map; 3481 struct btrfs_fs_info *fs_info = root->fs_info; 3482 struct btrfs_root *extent_root = fs_info->extent_root; 3483 int ret; 3484 3485 ret = find_next_chunk(fs_info->chunk_root, 3486 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 3487 if (ret) 3488 return ret; 3489 3490 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3491 fs_info->avail_metadata_alloc_bits; 3492 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3493 3494 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3495 &stripe_size, chunk_offset, alloc_profile); 3496 BUG_ON(ret); 3497 3498 sys_chunk_offset = chunk_offset + chunk_size; 3499 3500 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3501 fs_info->avail_system_alloc_bits; 3502 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3503 3504 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3505 &sys_chunk_size, &sys_stripe_size, 3506 sys_chunk_offset, alloc_profile); 3507 BUG_ON(ret); 3508 3509 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3510 BUG_ON(ret); 3511 3512 /* 3513 * Modifying chunk tree needs allocating new blocks from both 3514 * system block group and metadata block group. So we only can 3515 * do operations require modifying the chunk tree after both 3516 * block groups were created. 
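 * (This is presumably the bootstrap path referred to above
 * btrfs_alloc_chunk(): both chunks are allocated first, and only then
 * are both finished.)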
3517 */ 3518 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3519 chunk_size, stripe_size); 3520 BUG_ON(ret); 3521 3522 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3523 sys_chunk_offset, sys_chunk_size, 3524 sys_stripe_size); 3525 BUG_ON(ret); 3526 return 0; 3527 } 3528 3529 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 3530 { 3531 struct extent_map *em; 3532 struct map_lookup *map; 3533 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3534 int readonly = 0; 3535 int i; 3536 3537 read_lock(&map_tree->map_tree.lock); 3538 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3539 read_unlock(&map_tree->map_tree.lock); 3540 if (!em) 3541 return 1; 3542 3543 if (btrfs_test_opt(root, DEGRADED)) { 3544 free_extent_map(em); 3545 return 0; 3546 } 3547 3548 map = (struct map_lookup *)em->bdev; 3549 for (i = 0; i < map->num_stripes; i++) { 3550 if (!map->stripes[i].dev->writeable) { 3551 readonly = 1; 3552 break; 3553 } 3554 } 3555 free_extent_map(em); 3556 return readonly; 3557 } 3558 3559 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 3560 { 3561 extent_map_tree_init(&tree->map_tree); 3562 } 3563 3564 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 3565 { 3566 struct extent_map *em; 3567 3568 while (1) { 3569 write_lock(&tree->map_tree.lock); 3570 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 3571 if (em) 3572 remove_extent_mapping(&tree->map_tree, em); 3573 write_unlock(&tree->map_tree.lock); 3574 if (!em) 3575 break; 3576 kfree(em->bdev); 3577 /* once for us */ 3578 free_extent_map(em); 3579 /* once for the tree */ 3580 free_extent_map(em); 3581 } 3582 } 3583 3584 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3585 { 3586 struct extent_map *em; 3587 struct map_lookup *map; 3588 struct extent_map_tree *em_tree = &map_tree->map_tree; 3589 int ret; 3590 3591 read_lock(&em_tree->lock); 3592 em = lookup_extent_mapping(em_tree, logical, len); 3593 read_unlock(&em_tree->lock); 3594 BUG_ON(!em); 3595 3596 BUG_ON(em->start > logical || em->start + em->len < logical); 3597 map = (struct map_lookup *)em->bdev; 3598 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 3599 ret = map->num_stripes; 3600 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3601 ret = map->sub_stripes; 3602 else 3603 ret = 1; 3604 free_extent_map(em); 3605 return ret; 3606 } 3607 3608 static int find_live_mirror(struct map_lookup *map, int first, int num, 3609 int optimal) 3610 { 3611 int i; 3612 if (map->stripes[optimal].dev->bdev) 3613 return optimal; 3614 for (i = first; i < first + num; i++) { 3615 if (map->stripes[i].dev->bdev) 3616 return i; 3617 } 3618 /* we couldn't find one that doesn't fail. 
Just return something 3619 * and the io error handling code will clean up eventually 3620 */ 3621 return optimal; 3622 } 3623 3624 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3625 u64 logical, u64 *length, 3626 struct btrfs_bio **bbio_ret, 3627 int mirror_num) 3628 { 3629 struct extent_map *em; 3630 struct map_lookup *map; 3631 struct extent_map_tree *em_tree = &map_tree->map_tree; 3632 u64 offset; 3633 u64 stripe_offset; 3634 u64 stripe_end_offset; 3635 u64 stripe_nr; 3636 u64 stripe_nr_orig; 3637 u64 stripe_nr_end; 3638 int stripe_index; 3639 int i; 3640 int ret = 0; 3641 int num_stripes; 3642 int max_errors = 0; 3643 struct btrfs_bio *bbio = NULL; 3644 3645 read_lock(&em_tree->lock); 3646 em = lookup_extent_mapping(em_tree, logical, *length); 3647 read_unlock(&em_tree->lock); 3648 3649 if (!em) { 3650 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3651 (unsigned long long)logical, 3652 (unsigned long long)*length); 3653 BUG(); 3654 } 3655 3656 BUG_ON(em->start > logical || em->start + em->len < logical); 3657 map = (struct map_lookup *)em->bdev; 3658 offset = logical - em->start; 3659 3660 if (mirror_num > map->num_stripes) 3661 mirror_num = 0; 3662 3663 stripe_nr = offset; 3664 /* 3665 * stripe_nr counts the total number of stripes we have to stride 3666 * to get to this block 3667 */ 3668 do_div(stripe_nr, map->stripe_len); 3669 3670 stripe_offset = stripe_nr * map->stripe_len; 3671 BUG_ON(offset < stripe_offset); 3672 3673 /* stripe_offset is the offset of this block in its stripe*/ 3674 stripe_offset = offset - stripe_offset; 3675 3676 if (rw & REQ_DISCARD) 3677 *length = min_t(u64, em->len - offset, *length); 3678 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 3679 /* we limit the length of each bio to what fits in a stripe */ 3680 *length = min_t(u64, em->len - offset, 3681 map->stripe_len - stripe_offset); 3682 } else { 3683 *length = em->len - offset; 3684 } 3685 3686 if (!bbio_ret) 3687 goto out; 3688 3689 num_stripes = 1; 3690 stripe_index = 0; 3691 stripe_nr_orig = stripe_nr; 3692 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 3693 (~(map->stripe_len - 1)); 3694 do_div(stripe_nr_end, map->stripe_len); 3695 stripe_end_offset = stripe_nr_end * map->stripe_len - 3696 (offset + *length); 3697 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3698 if (rw & REQ_DISCARD) 3699 num_stripes = min_t(u64, map->num_stripes, 3700 stripe_nr_end - stripe_nr_orig); 3701 stripe_index = do_div(stripe_nr, map->num_stripes); 3702 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3703 if (rw & (REQ_WRITE | REQ_DISCARD)) 3704 num_stripes = map->num_stripes; 3705 else if (mirror_num) 3706 stripe_index = mirror_num - 1; 3707 else { 3708 stripe_index = find_live_mirror(map, 0, 3709 map->num_stripes, 3710 current->pid % map->num_stripes); 3711 mirror_num = stripe_index + 1; 3712 } 3713 3714 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3715 if (rw & (REQ_WRITE | REQ_DISCARD)) { 3716 num_stripes = map->num_stripes; 3717 } else if (mirror_num) { 3718 stripe_index = mirror_num - 1; 3719 } else { 3720 mirror_num = 1; 3721 } 3722 3723 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3724 int factor = map->num_stripes / map->sub_stripes; 3725 3726 stripe_index = do_div(stripe_nr, factor); 3727 stripe_index *= map->sub_stripes; 3728 3729 if (rw & REQ_WRITE) 3730 num_stripes = map->sub_stripes; 3731 else if (rw & REQ_DISCARD) 3732 num_stripes = min_t(u64, map->sub_stripes * 3733 (stripe_nr_end - stripe_nr_orig), 3734 map->num_stripes); 3735 else if 
(mirror_num) 3736 stripe_index += mirror_num - 1; 3737 else { 3738 stripe_index = find_live_mirror(map, stripe_index, 3739 map->sub_stripes, stripe_index + 3740 current->pid % map->sub_stripes); 3741 mirror_num = stripe_index + 1; 3742 } 3743 } else { 3744 /* 3745 * after this do_div call, stripe_nr is the number of stripes 3746 * on this device we have to walk to find the data, and 3747 * stripe_index is the number of our device in the stripe array 3748 */ 3749 stripe_index = do_div(stripe_nr, map->num_stripes); 3750 mirror_num = stripe_index + 1; 3751 } 3752 BUG_ON(stripe_index >= map->num_stripes); 3753 3754 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 3755 if (!bbio) { 3756 ret = -ENOMEM; 3757 goto out; 3758 } 3759 atomic_set(&bbio->error, 0); 3760 3761 if (rw & REQ_DISCARD) { 3762 int factor = 0; 3763 int sub_stripes = 0; 3764 u64 stripes_per_dev = 0; 3765 u32 remaining_stripes = 0; 3766 3767 if (map->type & 3768 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 3769 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3770 sub_stripes = 1; 3771 else 3772 sub_stripes = map->sub_stripes; 3773 3774 factor = map->num_stripes / sub_stripes; 3775 stripes_per_dev = div_u64_rem(stripe_nr_end - 3776 stripe_nr_orig, 3777 factor, 3778 &remaining_stripes); 3779 } 3780 3781 for (i = 0; i < num_stripes; i++) { 3782 bbio->stripes[i].physical = 3783 map->stripes[stripe_index].physical + 3784 stripe_offset + stripe_nr * map->stripe_len; 3785 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3786 3787 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3788 BTRFS_BLOCK_GROUP_RAID10)) { 3789 bbio->stripes[i].length = stripes_per_dev * 3790 map->stripe_len; 3791 if (i / sub_stripes < remaining_stripes) 3792 bbio->stripes[i].length += 3793 map->stripe_len; 3794 if (i < sub_stripes) 3795 bbio->stripes[i].length -= 3796 stripe_offset; 3797 if ((i / sub_stripes + 1) % 3798 sub_stripes == remaining_stripes) 3799 bbio->stripes[i].length -= 3800 stripe_end_offset; 3801 if (i == sub_stripes - 1) 3802 stripe_offset = 0; 3803 } else 3804 bbio->stripes[i].length = *length; 3805 3806 stripe_index++; 3807 if (stripe_index == map->num_stripes) { 3808 /* This could only happen for RAID0/10 */ 3809 stripe_index = 0; 3810 stripe_nr++; 3811 } 3812 } 3813 } else { 3814 for (i = 0; i < num_stripes; i++) { 3815 bbio->stripes[i].physical = 3816 map->stripes[stripe_index].physical + 3817 stripe_offset + 3818 stripe_nr * map->stripe_len; 3819 bbio->stripes[i].dev = 3820 map->stripes[stripe_index].dev; 3821 stripe_index++; 3822 } 3823 } 3824 3825 if (rw & REQ_WRITE) { 3826 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 3827 BTRFS_BLOCK_GROUP_RAID10 | 3828 BTRFS_BLOCK_GROUP_DUP)) { 3829 max_errors = 1; 3830 } 3831 } 3832 3833 *bbio_ret = bbio; 3834 bbio->num_stripes = num_stripes; 3835 bbio->max_errors = max_errors; 3836 bbio->mirror_num = mirror_num; 3837 out: 3838 free_extent_map(em); 3839 return ret; 3840 } 3841 3842 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3843 u64 logical, u64 *length, 3844 struct btrfs_bio **bbio_ret, int mirror_num) 3845 { 3846 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 3847 mirror_num); 3848 } 3849 3850 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3851 u64 chunk_start, u64 physical, u64 devid, 3852 u64 **logical, int *naddrs, int *stripe_len) 3853 { 3854 struct extent_map_tree *em_tree = &map_tree->map_tree; 3855 struct extent_map *em; 3856 struct map_lookup *map; 3857 u64 *buf; 3858 u64 bytenr; 3859 u64 length; 3860 u64 stripe_nr; 3861 int i, j, nr 
= 0; 3862 3863 read_lock(&em_tree->lock); 3864 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3865 read_unlock(&em_tree->lock); 3866 3867 BUG_ON(!em || em->start != chunk_start); 3868 map = (struct map_lookup *)em->bdev; 3869 3870 length = em->len; 3871 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3872 do_div(length, map->num_stripes / map->sub_stripes); 3873 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3874 do_div(length, map->num_stripes); 3875 3876 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 3877 BUG_ON(!buf); 3878 3879 for (i = 0; i < map->num_stripes; i++) { 3880 if (devid && map->stripes[i].dev->devid != devid) 3881 continue; 3882 if (map->stripes[i].physical > physical || 3883 map->stripes[i].physical + length <= physical) 3884 continue; 3885 3886 stripe_nr = physical - map->stripes[i].physical; 3887 do_div(stripe_nr, map->stripe_len); 3888 3889 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3890 stripe_nr = stripe_nr * map->num_stripes + i; 3891 do_div(stripe_nr, map->sub_stripes); 3892 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3893 stripe_nr = stripe_nr * map->num_stripes + i; 3894 } 3895 bytenr = chunk_start + stripe_nr * map->stripe_len; 3896 WARN_ON(nr >= map->num_stripes); 3897 for (j = 0; j < nr; j++) { 3898 if (buf[j] == bytenr) 3899 break; 3900 } 3901 if (j == nr) { 3902 WARN_ON(nr >= map->num_stripes); 3903 buf[nr++] = bytenr; 3904 } 3905 } 3906 3907 *logical = buf; 3908 *naddrs = nr; 3909 *stripe_len = map->stripe_len; 3910 3911 free_extent_map(em); 3912 return 0; 3913 } 3914 3915 static void btrfs_end_bio(struct bio *bio, int err) 3916 { 3917 struct btrfs_bio *bbio = bio->bi_private; 3918 int is_orig_bio = 0; 3919 3920 if (err) 3921 atomic_inc(&bbio->error); 3922 3923 if (bio == bbio->orig_bio) 3924 is_orig_bio = 1; 3925 3926 if (atomic_dec_and_test(&bbio->stripes_pending)) { 3927 if (!is_orig_bio) { 3928 bio_put(bio); 3929 bio = bbio->orig_bio; 3930 } 3931 bio->bi_private = bbio->private; 3932 bio->bi_end_io = bbio->end_io; 3933 bio->bi_bdev = (struct block_device *) 3934 (unsigned long)bbio->mirror_num; 3935 /* only send an error to the higher layers if it is 3936 * beyond the tolerance of the multi-bio 3937 */ 3938 if (atomic_read(&bbio->error) > bbio->max_errors) { 3939 err = -EIO; 3940 } else { 3941 /* 3942 * this bio is actually up to date, we didn't 3943 * go over the max number of errors 3944 */ 3945 set_bit(BIO_UPTODATE, &bio->bi_flags); 3946 err = 0; 3947 } 3948 kfree(bbio); 3949 3950 bio_endio(bio, err); 3951 } else if (!is_orig_bio) { 3952 bio_put(bio); 3953 } 3954 } 3955 3956 struct async_sched { 3957 struct bio *bio; 3958 int rw; 3959 struct btrfs_fs_info *info; 3960 struct btrfs_work work; 3961 }; 3962 3963 /* 3964 * see run_scheduled_bios for a description of why bios are collected for 3965 * async submit. 3966 * 3967 * This will add one bio to the pending list for a device and make sure 3968 * the work struct is scheduled. 3969 */ 3970 static noinline int schedule_bio(struct btrfs_root *root, 3971 struct btrfs_device *device, 3972 int rw, struct bio *bio) 3973 { 3974 int should_queue = 1; 3975 struct btrfs_pending_bios *pending_bios; 3976 3977 /* don't bother with additional async steps for reads, right now */ 3978 if (!(rw & REQ_WRITE)) { 3979 bio_get(bio); 3980 btrfsic_submit_bio(rw, bio); 3981 bio_put(bio); 3982 return 0; 3983 } 3984 3985 /* 3986 * nr_async_bios allows us to reliably return congestion to the 3987 * higher layers. 
Otherwise, the async bio makes it appear we have 3988 * made progress against dirty pages when we've really just put it 3989 * on a queue for later 3990 */ 3991 atomic_inc(&root->fs_info->nr_async_bios); 3992 WARN_ON(bio->bi_next); 3993 bio->bi_next = NULL; 3994 bio->bi_rw |= rw; 3995 3996 spin_lock(&device->io_lock); 3997 if (bio->bi_rw & REQ_SYNC) 3998 pending_bios = &device->pending_sync_bios; 3999 else 4000 pending_bios = &device->pending_bios; 4001 4002 if (pending_bios->tail) 4003 pending_bios->tail->bi_next = bio; 4004 4005 pending_bios->tail = bio; 4006 if (!pending_bios->head) 4007 pending_bios->head = bio; 4008 if (device->running_pending) 4009 should_queue = 0; 4010 4011 spin_unlock(&device->io_lock); 4012 4013 if (should_queue) 4014 btrfs_queue_worker(&root->fs_info->submit_workers, 4015 &device->work); 4016 return 0; 4017 } 4018 4019 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 4020 int mirror_num, int async_submit) 4021 { 4022 struct btrfs_mapping_tree *map_tree; 4023 struct btrfs_device *dev; 4024 struct bio *first_bio = bio; 4025 u64 logical = (u64)bio->bi_sector << 9; 4026 u64 length = 0; 4027 u64 map_length; 4028 int ret; 4029 int dev_nr = 0; 4030 int total_devs = 1; 4031 struct btrfs_bio *bbio = NULL; 4032 4033 length = bio->bi_size; 4034 map_tree = &root->fs_info->mapping_tree; 4035 map_length = length; 4036 4037 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, 4038 mirror_num); 4039 BUG_ON(ret); 4040 4041 total_devs = bbio->num_stripes; 4042 if (map_length < length) { 4043 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 4044 "len %llu\n", (unsigned long long)logical, 4045 (unsigned long long)length, 4046 (unsigned long long)map_length); 4047 BUG(); 4048 } 4049 4050 bbio->orig_bio = first_bio; 4051 bbio->private = first_bio->bi_private; 4052 bbio->end_io = first_bio->bi_end_io; 4053 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4054 4055 while (dev_nr < total_devs) { 4056 if (dev_nr < total_devs - 1) { 4057 bio = bio_clone(first_bio, GFP_NOFS); 4058 BUG_ON(!bio); 4059 } else { 4060 bio = first_bio; 4061 } 4062 bio->bi_private = bbio; 4063 bio->bi_end_io = btrfs_end_bio; 4064 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4065 dev = bbio->stripes[dev_nr].dev; 4066 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 4067 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " 4068 "(%s id %llu), size=%u\n", rw, 4069 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 4070 dev->name, dev->devid, bio->bi_size); 4071 bio->bi_bdev = dev->bdev; 4072 if (async_submit) 4073 schedule_bio(root, dev, rw, bio); 4074 else 4075 btrfsic_submit_bio(rw, bio); 4076 } else { 4077 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4078 bio->bi_sector = logical >> 9; 4079 bio_endio(bio, -EIO); 4080 } 4081 dev_nr++; 4082 } 4083 return 0; 4084 } 4085 4086 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 4087 u8 *uuid, u8 *fsid) 4088 { 4089 struct btrfs_device *device; 4090 struct btrfs_fs_devices *cur_devices; 4091 4092 cur_devices = root->fs_info->fs_devices; 4093 while (cur_devices) { 4094 if (!fsid || 4095 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4096 device = __find_device(&cur_devices->devices, 4097 devid, uuid); 4098 if (device) 4099 return device; 4100 } 4101 cur_devices = cur_devices->seed; 4102 } 4103 return NULL; 4104 } 4105 4106 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 4107 u64 devid, u8 *dev_uuid) 4108 { 4109 struct btrfs_device *device; 4110 
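	/*
	 * Build an in-memory stub for a device that the metadata references
	 * but that was not found; the callers only take this path when the
	 * DEGRADED mount option is set.
	 */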
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4111 4112 device = kzalloc(sizeof(*device), GFP_NOFS); 4113 if (!device) 4114 return NULL; 4115 list_add(&device->dev_list, 4116 &fs_devices->devices); 4117 device->dev_root = root->fs_info->dev_root; 4118 device->devid = devid; 4119 device->work.func = pending_bios_fn; 4120 device->fs_devices = fs_devices; 4121 device->missing = 1; 4122 fs_devices->num_devices++; 4123 fs_devices->missing_devices++; 4124 spin_lock_init(&device->io_lock); 4125 INIT_LIST_HEAD(&device->dev_alloc_list); 4126 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 4127 return device; 4128 } 4129 4130 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 4131 struct extent_buffer *leaf, 4132 struct btrfs_chunk *chunk) 4133 { 4134 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4135 struct map_lookup *map; 4136 struct extent_map *em; 4137 u64 logical; 4138 u64 length; 4139 u64 devid; 4140 u8 uuid[BTRFS_UUID_SIZE]; 4141 int num_stripes; 4142 int ret; 4143 int i; 4144 4145 logical = key->offset; 4146 length = btrfs_chunk_length(leaf, chunk); 4147 4148 read_lock(&map_tree->map_tree.lock); 4149 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 4150 read_unlock(&map_tree->map_tree.lock); 4151 4152 /* already mapped? */ 4153 if (em && em->start <= logical && em->start + em->len > logical) { 4154 free_extent_map(em); 4155 return 0; 4156 } else if (em) { 4157 free_extent_map(em); 4158 } 4159 4160 em = alloc_extent_map(); 4161 if (!em) 4162 return -ENOMEM; 4163 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 4164 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4165 if (!map) { 4166 free_extent_map(em); 4167 return -ENOMEM; 4168 } 4169 4170 em->bdev = (struct block_device *)map; 4171 em->start = logical; 4172 em->len = length; 4173 em->block_start = 0; 4174 em->block_len = em->len; 4175 4176 map->num_stripes = num_stripes; 4177 map->io_width = btrfs_chunk_io_width(leaf, chunk); 4178 map->io_align = btrfs_chunk_io_align(leaf, chunk); 4179 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 4180 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 4181 map->type = btrfs_chunk_type(leaf, chunk); 4182 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 4183 for (i = 0; i < num_stripes; i++) { 4184 map->stripes[i].physical = 4185 btrfs_stripe_offset_nr(leaf, chunk, i); 4186 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 4187 read_extent_buffer(leaf, uuid, (unsigned long) 4188 btrfs_stripe_dev_uuid_nr(chunk, i), 4189 BTRFS_UUID_SIZE); 4190 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4191 NULL); 4192 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4193 kfree(map); 4194 free_extent_map(em); 4195 return -EIO; 4196 } 4197 if (!map->stripes[i].dev) { 4198 map->stripes[i].dev = 4199 add_missing_dev(root, devid, uuid); 4200 if (!map->stripes[i].dev) { 4201 kfree(map); 4202 free_extent_map(em); 4203 return -EIO; 4204 } 4205 } 4206 map->stripes[i].dev->in_fs_metadata = 1; 4207 } 4208 4209 write_lock(&map_tree->map_tree.lock); 4210 ret = add_extent_mapping(&map_tree->map_tree, em); 4211 write_unlock(&map_tree->map_tree.lock); 4212 BUG_ON(ret); 4213 free_extent_map(em); 4214 4215 return 0; 4216 } 4217 4218 static int fill_device_from_item(struct extent_buffer *leaf, 4219 struct btrfs_dev_item *dev_item, 4220 struct btrfs_device *device) 4221 { 4222 unsigned long ptr; 4223 4224 device->devid = btrfs_device_id(leaf, dev_item); 4225 device->disk_total_bytes = 
btrfs_device_total_bytes(leaf, dev_item); 4226 device->total_bytes = device->disk_total_bytes; 4227 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 4228 device->type = btrfs_device_type(leaf, dev_item); 4229 device->io_align = btrfs_device_io_align(leaf, dev_item); 4230 device->io_width = btrfs_device_io_width(leaf, dev_item); 4231 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 4232 4233 ptr = (unsigned long)btrfs_device_uuid(dev_item); 4234 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 4235 4236 return 0; 4237 } 4238 4239 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 4240 { 4241 struct btrfs_fs_devices *fs_devices; 4242 int ret; 4243 4244 BUG_ON(!mutex_is_locked(&uuid_mutex)); 4245 4246 fs_devices = root->fs_info->fs_devices->seed; 4247 while (fs_devices) { 4248 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4249 ret = 0; 4250 goto out; 4251 } 4252 fs_devices = fs_devices->seed; 4253 } 4254 4255 fs_devices = find_fsid(fsid); 4256 if (!fs_devices) { 4257 ret = -ENOENT; 4258 goto out; 4259 } 4260 4261 fs_devices = clone_fs_devices(fs_devices); 4262 if (IS_ERR(fs_devices)) { 4263 ret = PTR_ERR(fs_devices); 4264 goto out; 4265 } 4266 4267 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 4268 root->fs_info->bdev_holder); 4269 if (ret) 4270 goto out; 4271 4272 if (!fs_devices->seeding) { 4273 __btrfs_close_devices(fs_devices); 4274 free_fs_devices(fs_devices); 4275 ret = -EINVAL; 4276 goto out; 4277 } 4278 4279 fs_devices->seed = root->fs_info->fs_devices->seed; 4280 root->fs_info->fs_devices->seed = fs_devices; 4281 out: 4282 return ret; 4283 } 4284 4285 static int read_one_dev(struct btrfs_root *root, 4286 struct extent_buffer *leaf, 4287 struct btrfs_dev_item *dev_item) 4288 { 4289 struct btrfs_device *device; 4290 u64 devid; 4291 int ret; 4292 u8 fs_uuid[BTRFS_UUID_SIZE]; 4293 u8 dev_uuid[BTRFS_UUID_SIZE]; 4294 4295 devid = btrfs_device_id(leaf, dev_item); 4296 read_extent_buffer(leaf, dev_uuid, 4297 (unsigned long)btrfs_device_uuid(dev_item), 4298 BTRFS_UUID_SIZE); 4299 read_extent_buffer(leaf, fs_uuid, 4300 (unsigned long)btrfs_device_fsid(dev_item), 4301 BTRFS_UUID_SIZE); 4302 4303 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 4304 ret = open_seed_devices(root, fs_uuid); 4305 if (ret && !btrfs_test_opt(root, DEGRADED)) 4306 return ret; 4307 } 4308 4309 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 4310 if (!device || !device->bdev) { 4311 if (!btrfs_test_opt(root, DEGRADED)) 4312 return -EIO; 4313 4314 if (!device) { 4315 printk(KERN_WARNING "warning devid %llu missing\n", 4316 (unsigned long long)devid); 4317 device = add_missing_dev(root, devid, dev_uuid); 4318 if (!device) 4319 return -ENOMEM; 4320 } else if (!device->missing) { 4321 /* 4322 * this happens when a device that was properly setup 4323 * in the device info lists suddenly goes bad. 
4324 * device->bdev is NULL, and so we have to set
4325 * device->missing to one here
4326 */
4327 root->fs_info->fs_devices->missing_devices++;
4328 device->missing = 1;
4329 }
4330 }
4331
4332 if (device->fs_devices != root->fs_info->fs_devices) {
4333 BUG_ON(device->writeable);
4334 if (device->generation !=
4335 btrfs_device_generation(leaf, dev_item))
4336 return -EINVAL;
4337 }
4338
4339 fill_device_from_item(leaf, dev_item, device);
4340 device->dev_root = root->fs_info->dev_root;
4341 device->in_fs_metadata = 1;
4342 if (device->writeable) {
4343 device->fs_devices->total_rw_bytes += device->total_bytes;
4344 spin_lock(&root->fs_info->free_chunk_lock);
4345 root->fs_info->free_chunk_space += device->total_bytes -
4346 device->bytes_used;
4347 spin_unlock(&root->fs_info->free_chunk_lock);
4348 }
4349 ret = 0;
4350 return ret;
4351 }
4352
4353 int btrfs_read_sys_array(struct btrfs_root *root)
4354 {
4355 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4356 struct extent_buffer *sb;
4357 struct btrfs_disk_key *disk_key;
4358 struct btrfs_chunk *chunk;
4359 u8 *ptr;
4360 unsigned long sb_ptr;
4361 int ret = 0;
4362 u32 num_stripes;
4363 u32 array_size;
4364 u32 len = 0;
4365 u32 cur;
4366 struct btrfs_key key;
4367
4368 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4369 BTRFS_SUPER_INFO_SIZE);
4370 if (!sb)
4371 return -ENOMEM;
4372 btrfs_set_buffer_uptodate(sb);
4373 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4374 /*
4375 * The sb extent buffer is artificial and just used to read the system array.
4376 * The btrfs_set_buffer_uptodate() call does not properly mark all its
4377 * pages up-to-date when the page is larger: the extent does not cover the
4378 * whole page, and consequently check_page_uptodate does not find all
4379 * the page's extents up-to-date (the hole beyond sb), so
4380 * write_extent_buffer then triggers a WARN_ON.
4381 *
4382 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle,
4383 * but sb spans only this function. Add an explicit SetPageUptodate call
4384 * to silence the warning, e.g. on PowerPC 64.
4385 */
4386 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
4387 SetPageUptodate(sb->first_page);
4388
4389 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4390 array_size = btrfs_super_sys_array_size(super_copy);
4391
4392 ptr = super_copy->sys_chunk_array;
4393 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4394 cur = 0;
4395
4396 while (cur < array_size) {
4397 disk_key = (struct btrfs_disk_key *)ptr;
4398 btrfs_disk_key_to_cpu(&key, disk_key);
4399
4400 len = sizeof(*disk_key); ptr += len;
4401 sb_ptr += len;
4402 cur += len;
4403
4404 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4405 chunk = (struct btrfs_chunk *)sb_ptr;
4406 ret = read_one_chunk(root, &key, sb, chunk);
4407 if (ret)
4408 break;
4409 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4410 len = btrfs_chunk_item_size(num_stripes);
4411 } else {
4412 ret = -EIO;
4413 break;
4414 }
4415 ptr += len;
4416 sb_ptr += len;
4417 cur += len;
4418 }
4419 free_extent_buffer(sb);
4420 return ret;
4421 }
4422
4423 int btrfs_read_chunk_tree(struct btrfs_root *root)
4424 {
4425 struct btrfs_path *path;
4426 struct extent_buffer *leaf;
4427 struct btrfs_key key;
4428 struct btrfs_key found_key;
4429 int ret;
4430 int slot;
4431
4432 root = root->fs_info->chunk_root;
4433
4434 path = btrfs_alloc_path();
4435 if (!path)
4436 return -ENOMEM;
4437
4438 mutex_lock(&uuid_mutex);
4439 lock_chunks(root);
4440
4441 /* first we search for all of the device items, and then we
4442 * read in all of the chunk items. This way we can create chunk
4443 * mappings that reference all of the devices that are found.
4444 */
4445 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4446 key.offset = 0;
4447 key.type = 0;
4448 again:
4449 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4450 if (ret < 0)
4451 goto error;
4452 while (1) {
4453 leaf = path->nodes[0];
4454 slot = path->slots[0];
4455 if (slot >= btrfs_header_nritems(leaf)) {
4456 ret = btrfs_next_leaf(root, path);
4457 if (ret == 0)
4458 continue;
4459 if (ret < 0)
4460 goto error;
4461 break;
4462 }
4463 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4464 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4465 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4466 break;
4467 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4468 struct btrfs_dev_item *dev_item;
4469 dev_item = btrfs_item_ptr(leaf, slot,
4470 struct btrfs_dev_item);
4471 ret = read_one_dev(root, leaf, dev_item);
4472 if (ret)
4473 goto error;
4474 }
4475 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4476 struct btrfs_chunk *chunk;
4477 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4478 ret = read_one_chunk(root, &found_key, leaf, chunk);
4479 if (ret)
4480 goto error;
4481 }
4482 path->slots[0]++;
4483 }
4484 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4485 key.objectid = 0;
4486 btrfs_release_path(path);
4487 goto again;
4488 }
4489 ret = 0;
4490 error:
4491 unlock_chunks(root);
4492 mutex_unlock(&uuid_mutex);
4493
4494 btrfs_free_path(path);
4495 return ret;
4496 }
4497
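/*
 * For reference, a sketch of the layout that btrfs_read_sys_array() above
 * walks: the superblock's sys_chunk_array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk) pairs, where each key has
 * type BTRFS_CHUNK_ITEM_KEY and each chunk item carries its own stripe
 * entries:
 *
 *   [ disk_key | chunk item (num_stripes stripes) | disk_key | chunk item | ... ]
 *     sizeof(*disk_key)          btrfs_chunk_item_size(num_stripes) bytes
 *
 * That is why the loop advances cur, ptr and sb_ptr first by
 * sizeof(*disk_key) and then by btrfs_chunk_item_size(num_stripes), until
 * cur reaches btrfs_super_sys_array_size().  btrfs_read_chunk_tree() then
 * makes two passes over the chunk root: the first, keyed at
 * BTRFS_DEV_ITEMS_OBJECTID, reads every dev item, and the second (after
 * resetting key.objectid to 0) reads the chunk items, so every chunk
 * mapping can reference devices that have already been set up.
 */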