1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/buffer_head.h> 21 #include <linux/blkdev.h> 22 #include <linux/random.h> 23 #include <linux/iocontext.h> 24 #include <asm/div64.h> 25 #include "compat.h" 26 #include "ctree.h" 27 #include "extent_map.h" 28 #include "disk-io.h" 29 #include "transaction.h" 30 #include "print-tree.h" 31 #include "volumes.h" 32 #include "async-thread.h" 33 34 struct map_lookup { 35 u64 type; 36 int io_align; 37 int io_width; 38 int stripe_len; 39 int sector_size; 40 int num_stripes; 41 int sub_stripes; 42 struct btrfs_bio_stripe stripes[]; 43 }; 44 45 static int init_first_rw_device(struct btrfs_trans_handle *trans, 46 struct btrfs_root *root, 47 struct btrfs_device *device); 48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 49 50 #define map_lookup_size(n) (sizeof(struct map_lookup) + \ 51 (sizeof(struct btrfs_bio_stripe) * (n))) 52 53 static DEFINE_MUTEX(uuid_mutex); 54 static LIST_HEAD(fs_uuids); 55 56 void btrfs_lock_volumes(void) 57 { 58 mutex_lock(&uuid_mutex); 59 } 60 61 void btrfs_unlock_volumes(void) 62 { 63 mutex_unlock(&uuid_mutex); 64 } 65 66 static void lock_chunks(struct btrfs_root *root) 67 { 68 mutex_lock(&root->fs_info->chunk_mutex); 69 } 70 71 static void unlock_chunks(struct btrfs_root *root) 72 { 73 mutex_unlock(&root->fs_info->chunk_mutex); 74 } 75 76 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 77 { 78 struct btrfs_device *device; 79 WARN_ON(fs_devices->opened); 80 while (!list_empty(&fs_devices->devices)) { 81 device = list_entry(fs_devices->devices.next, 82 struct btrfs_device, dev_list); 83 list_del(&device->dev_list); 84 kfree(device->name); 85 kfree(device); 86 } 87 kfree(fs_devices); 88 } 89 90 int btrfs_cleanup_fs_uuids(void) 91 { 92 struct btrfs_fs_devices *fs_devices; 93 94 while (!list_empty(&fs_uuids)) { 95 fs_devices = list_entry(fs_uuids.next, 96 struct btrfs_fs_devices, list); 97 list_del(&fs_devices->list); 98 free_fs_devices(fs_devices); 99 } 100 return 0; 101 } 102 103 static noinline struct btrfs_device *__find_device(struct list_head *head, 104 u64 devid, u8 *uuid) 105 { 106 struct btrfs_device *dev; 107 108 list_for_each_entry(dev, head, dev_list) { 109 if (dev->devid == devid && 110 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 111 return dev; 112 } 113 } 114 return NULL; 115 } 116 117 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 118 { 119 struct btrfs_fs_devices *fs_devices; 120 121 list_for_each_entry(fs_devices, &fs_uuids, list) { 122 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 123 return fs_devices; 124 } 125 return NULL; 126 } 127 128 static void requeue_list(struct btrfs_pending_bios *pending_bios, 129 struct bio *head, struct bio *tail) 130 { 131 132 struct bio *old_head; 133 134 old_head = 
pending_bios->head; 135 pending_bios->head = head; 136 if (pending_bios->tail) 137 tail->bi_next = old_head; 138 else 139 pending_bios->tail = tail; 140 } 141 142 /* 143 * we try to collect pending bios for a device so we don't get a large 144 * number of procs sending bios down to the same device. This greatly 145 * improves the schedulers ability to collect and merge the bios. 146 * 147 * But, it also turns into a long list of bios to process and that is sure 148 * to eventually make the worker thread block. The solution here is to 149 * make some progress and then put this work struct back at the end of 150 * the list if the block device is congested. This way, multiple devices 151 * can make progress from a single worker thread. 152 */ 153 static noinline int run_scheduled_bios(struct btrfs_device *device) 154 { 155 struct bio *pending; 156 struct backing_dev_info *bdi; 157 struct btrfs_fs_info *fs_info; 158 struct btrfs_pending_bios *pending_bios; 159 struct bio *tail; 160 struct bio *cur; 161 int again = 0; 162 unsigned long num_run; 163 unsigned long num_sync_run; 164 unsigned long batch_run = 0; 165 unsigned long limit; 166 unsigned long last_waited = 0; 167 int force_reg = 0; 168 169 bdi = blk_get_backing_dev_info(device->bdev); 170 fs_info = device->dev_root->fs_info; 171 limit = btrfs_async_submit_limit(fs_info); 172 limit = limit * 2 / 3; 173 174 /* we want to make sure that every time we switch from the sync 175 * list to the normal list, we unplug 176 */ 177 num_sync_run = 0; 178 179 loop: 180 spin_lock(&device->io_lock); 181 182 loop_lock: 183 num_run = 0; 184 185 /* take all the bios off the list at once and process them 186 * later on (without the lock held). But, remember the 187 * tail and other pointers so the bios can be properly reinserted 188 * into the list if we hit congestion 189 */ 190 if (!force_reg && device->pending_sync_bios.head) { 191 pending_bios = &device->pending_sync_bios; 192 force_reg = 1; 193 } else { 194 pending_bios = &device->pending_bios; 195 force_reg = 0; 196 } 197 198 pending = pending_bios->head; 199 tail = pending_bios->tail; 200 WARN_ON(pending && !tail); 201 202 /* 203 * if pending was null this time around, no bios need processing 204 * at all and we can stop. Otherwise it'll loop back up again 205 * and do an additional check so no bios are missed. 206 * 207 * device->running_pending is used to synchronize with the 208 * schedule_bio code. 
209 */ 210 if (device->pending_sync_bios.head == NULL && 211 device->pending_bios.head == NULL) { 212 again = 0; 213 device->running_pending = 0; 214 } else { 215 again = 1; 216 device->running_pending = 1; 217 } 218 219 pending_bios->head = NULL; 220 pending_bios->tail = NULL; 221 222 spin_unlock(&device->io_lock); 223 224 /* 225 * if we're doing the regular priority list, make sure we unplug 226 * for any high prio bios we've sent down 227 */ 228 if (pending_bios == &device->pending_bios && num_sync_run > 0) { 229 num_sync_run = 0; 230 blk_run_backing_dev(bdi, NULL); 231 } 232 233 while (pending) { 234 235 rmb(); 236 /* we want to work on both lists, but do more bios on the 237 * sync list than the regular list 238 */ 239 if ((num_run > 32 && 240 pending_bios != &device->pending_sync_bios && 241 device->pending_sync_bios.head) || 242 (num_run > 64 && pending_bios == &device->pending_sync_bios && 243 device->pending_bios.head)) { 244 spin_lock(&device->io_lock); 245 requeue_list(pending_bios, pending, tail); 246 goto loop_lock; 247 } 248 249 cur = pending; 250 pending = pending->bi_next; 251 cur->bi_next = NULL; 252 atomic_dec(&fs_info->nr_async_bios); 253 254 if (atomic_read(&fs_info->nr_async_bios) < limit && 255 waitqueue_active(&fs_info->async_submit_wait)) 256 wake_up(&fs_info->async_submit_wait); 257 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 submit_bio(cur->bi_rw, cur); 260 num_run++; 261 batch_run++; 262 263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 264 num_sync_run++; 265 266 if (need_resched()) { 267 if (num_sync_run) { 268 blk_run_backing_dev(bdi, NULL); 269 num_sync_run = 0; 270 } 271 cond_resched(); 272 } 273 274 /* 275 * we made progress, there is more work to do and the bdi 276 * is now congested. Back off and let other work structs 277 * run instead 278 */ 279 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 280 fs_info->fs_devices->open_devices > 1) { 281 struct io_context *ioc; 282 283 ioc = current->io_context; 284 285 /* 286 * the main goal here is that we don't want to 287 * block if we're going to be able to submit 288 * more requests without blocking. 289 * 290 * This code does two great things, it pokes into 291 * the elevator code from a filesystem _and_ 292 * it makes assumptions about how batching works. 293 */ 294 if (ioc && ioc->nr_batch_requests > 0 && 295 time_before(jiffies, ioc->last_waited + HZ/50UL) && 296 (last_waited == 0 || 297 ioc->last_waited == last_waited)) { 298 /* 299 * we want to go through our batch of 300 * requests and stop. So, we copy out 301 * the ioc->last_waited time and test 302 * against it before looping 303 */ 304 last_waited = ioc->last_waited; 305 if (need_resched()) { 306 if (num_sync_run) { 307 blk_run_backing_dev(bdi, NULL); 308 num_sync_run = 0; 309 } 310 cond_resched(); 311 } 312 continue; 313 } 314 spin_lock(&device->io_lock); 315 requeue_list(pending_bios, pending, tail); 316 device->running_pending = 1; 317 318 spin_unlock(&device->io_lock); 319 btrfs_requeue_work(&device->work); 320 goto done; 321 } 322 } 323 324 if (num_sync_run) { 325 num_sync_run = 0; 326 blk_run_backing_dev(bdi, NULL); 327 } 328 329 cond_resched(); 330 if (again) 331 goto loop; 332 333 spin_lock(&device->io_lock); 334 if (device->pending_bios.head || device->pending_sync_bios.head) 335 goto loop_lock; 336 spin_unlock(&device->io_lock); 337 338 /* 339 * IO has already been through a long path to get here. Checksumming, 340 * async helper threads, perhaps compression. 
We've done a pretty 341 * good job of collecting a batch of IO and should just unplug 342 * the device right away. 343 * 344 * This will help anyone who is waiting on the IO, they might have 345 * already unplugged, but managed to do so before the bio they 346 * cared about found its way down here. 347 */ 348 blk_run_backing_dev(bdi, NULL); 349 done: 350 return 0; 351 } 352 353 static void pending_bios_fn(struct btrfs_work *work) 354 { 355 struct btrfs_device *device; 356 357 device = container_of(work, struct btrfs_device, work); 358 run_scheduled_bios(device); 359 } 360 361 static noinline int device_list_add(const char *path, 362 struct btrfs_super_block *disk_super, 363 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 364 { 365 struct btrfs_device *device; 366 struct btrfs_fs_devices *fs_devices; 367 u64 found_transid = btrfs_super_generation(disk_super); 368 369 fs_devices = find_fsid(disk_super->fsid); 370 if (!fs_devices) { 371 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 372 if (!fs_devices) 373 return -ENOMEM; 374 INIT_LIST_HEAD(&fs_devices->devices); 375 INIT_LIST_HEAD(&fs_devices->alloc_list); 376 list_add(&fs_devices->list, &fs_uuids); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 378 fs_devices->latest_devid = devid; 379 fs_devices->latest_trans = found_transid; 380 mutex_init(&fs_devices->device_list_mutex); 381 device = NULL; 382 } else { 383 device = __find_device(&fs_devices->devices, devid, 384 disk_super->dev_item.uuid); 385 } 386 if (!device) { 387 if (fs_devices->opened) 388 return -EBUSY; 389 390 device = kzalloc(sizeof(*device), GFP_NOFS); 391 if (!device) { 392 /* we can safely leave the fs_devices entry around */ 393 return -ENOMEM; 394 } 395 device->devid = devid; 396 device->work.func = pending_bios_fn; 397 memcpy(device->uuid, disk_super->dev_item.uuid, 398 BTRFS_UUID_SIZE); 399 device->barriers = 1; 400 spin_lock_init(&device->io_lock); 401 device->name = kstrdup(path, GFP_NOFS); 402 if (!device->name) { 403 kfree(device); 404 return -ENOMEM; 405 } 406 INIT_LIST_HEAD(&device->dev_alloc_list); 407 408 mutex_lock(&fs_devices->device_list_mutex); 409 list_add(&device->dev_list, &fs_devices->devices); 410 mutex_unlock(&fs_devices->device_list_mutex); 411 412 device->fs_devices = fs_devices; 413 fs_devices->num_devices++; 414 } 415 416 if (found_transid > fs_devices->latest_trans) { 417 fs_devices->latest_devid = devid; 418 fs_devices->latest_trans = found_transid; 419 } 420 *fs_devices_ret = fs_devices; 421 return 0; 422 } 423 424 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 425 { 426 struct btrfs_fs_devices *fs_devices; 427 struct btrfs_device *device; 428 struct btrfs_device *orig_dev; 429 430 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 431 if (!fs_devices) 432 return ERR_PTR(-ENOMEM); 433 434 INIT_LIST_HEAD(&fs_devices->devices); 435 INIT_LIST_HEAD(&fs_devices->alloc_list); 436 INIT_LIST_HEAD(&fs_devices->list); 437 mutex_init(&fs_devices->device_list_mutex); 438 fs_devices->latest_devid = orig->latest_devid; 439 fs_devices->latest_trans = orig->latest_trans; 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 441 442 mutex_lock(&orig->device_list_mutex); 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 444 device = kzalloc(sizeof(*device), GFP_NOFS); 445 if (!device) 446 goto error; 447 448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 449 if (!device->name) { 450 kfree(device); 451 goto error; 452 } 453 454 device->devid = orig_dev->devid; 455 
device->work.func = pending_bios_fn; 456 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 457 device->barriers = 1; 458 spin_lock_init(&device->io_lock); 459 INIT_LIST_HEAD(&device->dev_list); 460 INIT_LIST_HEAD(&device->dev_alloc_list); 461 462 list_add(&device->dev_list, &fs_devices->devices); 463 device->fs_devices = fs_devices; 464 fs_devices->num_devices++; 465 } 466 mutex_unlock(&orig->device_list_mutex); 467 return fs_devices; 468 error: 469 mutex_unlock(&orig->device_list_mutex); 470 free_fs_devices(fs_devices); 471 return ERR_PTR(-ENOMEM); 472 } 473 474 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 475 { 476 struct btrfs_device *device, *next; 477 478 mutex_lock(&uuid_mutex); 479 again: 480 mutex_lock(&fs_devices->device_list_mutex); 481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 482 if (device->in_fs_metadata) 483 continue; 484 485 if (device->bdev) { 486 close_bdev_exclusive(device->bdev, device->mode); 487 device->bdev = NULL; 488 fs_devices->open_devices--; 489 } 490 if (device->writeable) { 491 list_del_init(&device->dev_alloc_list); 492 device->writeable = 0; 493 fs_devices->rw_devices--; 494 } 495 list_del_init(&device->dev_list); 496 fs_devices->num_devices--; 497 kfree(device->name); 498 kfree(device); 499 } 500 mutex_unlock(&fs_devices->device_list_mutex); 501 502 if (fs_devices->seed) { 503 fs_devices = fs_devices->seed; 504 goto again; 505 } 506 507 mutex_unlock(&uuid_mutex); 508 return 0; 509 } 510 511 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 512 { 513 struct btrfs_device *device; 514 515 if (--fs_devices->opened > 0) 516 return 0; 517 518 list_for_each_entry(device, &fs_devices->devices, dev_list) { 519 if (device->bdev) { 520 close_bdev_exclusive(device->bdev, device->mode); 521 fs_devices->open_devices--; 522 } 523 if (device->writeable) { 524 list_del_init(&device->dev_alloc_list); 525 fs_devices->rw_devices--; 526 } 527 528 device->bdev = NULL; 529 device->writeable = 0; 530 device->in_fs_metadata = 0; 531 } 532 WARN_ON(fs_devices->open_devices); 533 WARN_ON(fs_devices->rw_devices); 534 fs_devices->opened = 0; 535 fs_devices->seeding = 0; 536 537 return 0; 538 } 539 540 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 541 { 542 struct btrfs_fs_devices *seed_devices = NULL; 543 int ret; 544 545 mutex_lock(&uuid_mutex); 546 ret = __btrfs_close_devices(fs_devices); 547 if (!fs_devices->opened) { 548 seed_devices = fs_devices->seed; 549 fs_devices->seed = NULL; 550 } 551 mutex_unlock(&uuid_mutex); 552 553 while (seed_devices) { 554 fs_devices = seed_devices; 555 seed_devices = fs_devices->seed; 556 __btrfs_close_devices(fs_devices); 557 free_fs_devices(fs_devices); 558 } 559 return ret; 560 } 561 562 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 563 fmode_t flags, void *holder) 564 { 565 struct block_device *bdev; 566 struct list_head *head = &fs_devices->devices; 567 struct btrfs_device *device; 568 struct block_device *latest_bdev = NULL; 569 struct buffer_head *bh; 570 struct btrfs_super_block *disk_super; 571 u64 latest_devid = 0; 572 u64 latest_transid = 0; 573 u64 devid; 574 int seeding = 1; 575 int ret = 0; 576 577 list_for_each_entry(device, head, dev_list) { 578 if (device->bdev) 579 continue; 580 if (!device->name) 581 continue; 582 583 bdev = open_bdev_exclusive(device->name, flags, holder); 584 if (IS_ERR(bdev)) { 585 printk(KERN_INFO "open %s failed\n", device->name); 586 goto error; 587 } 588 set_blocksize(bdev, 4096); 589 590 bh 
= btrfs_read_dev_super(bdev); 591 if (!bh) 592 goto error_close; 593 594 disk_super = (struct btrfs_super_block *)bh->b_data; 595 devid = le64_to_cpu(disk_super->dev_item.devid); 596 if (devid != device->devid) 597 goto error_brelse; 598 599 if (memcmp(device->uuid, disk_super->dev_item.uuid, 600 BTRFS_UUID_SIZE)) 601 goto error_brelse; 602 603 device->generation = btrfs_super_generation(disk_super); 604 if (!latest_transid || device->generation > latest_transid) { 605 latest_devid = devid; 606 latest_transid = device->generation; 607 latest_bdev = bdev; 608 } 609 610 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 611 device->writeable = 0; 612 } else { 613 device->writeable = !bdev_read_only(bdev); 614 seeding = 0; 615 } 616 617 device->bdev = bdev; 618 device->in_fs_metadata = 0; 619 device->mode = flags; 620 621 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 622 fs_devices->rotating = 1; 623 624 fs_devices->open_devices++; 625 if (device->writeable) { 626 fs_devices->rw_devices++; 627 list_add(&device->dev_alloc_list, 628 &fs_devices->alloc_list); 629 } 630 continue; 631 632 error_brelse: 633 brelse(bh); 634 error_close: 635 close_bdev_exclusive(bdev, FMODE_READ); 636 error: 637 continue; 638 } 639 if (fs_devices->open_devices == 0) { 640 ret = -EIO; 641 goto out; 642 } 643 fs_devices->seeding = seeding; 644 fs_devices->opened = 1; 645 fs_devices->latest_bdev = latest_bdev; 646 fs_devices->latest_devid = latest_devid; 647 fs_devices->latest_trans = latest_transid; 648 fs_devices->total_rw_bytes = 0; 649 out: 650 return ret; 651 } 652 653 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 654 fmode_t flags, void *holder) 655 { 656 int ret; 657 658 mutex_lock(&uuid_mutex); 659 if (fs_devices->opened) { 660 fs_devices->opened++; 661 ret = 0; 662 } else { 663 ret = __btrfs_open_devices(fs_devices, flags, holder); 664 } 665 mutex_unlock(&uuid_mutex); 666 return ret; 667 } 668 669 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 670 struct btrfs_fs_devices **fs_devices_ret) 671 { 672 struct btrfs_super_block *disk_super; 673 struct block_device *bdev; 674 struct buffer_head *bh; 675 int ret; 676 u64 devid; 677 u64 transid; 678 679 mutex_lock(&uuid_mutex); 680 681 bdev = open_bdev_exclusive(path, flags, holder); 682 683 if (IS_ERR(bdev)) { 684 ret = PTR_ERR(bdev); 685 goto error; 686 } 687 688 ret = set_blocksize(bdev, 4096); 689 if (ret) 690 goto error_close; 691 bh = btrfs_read_dev_super(bdev); 692 if (!bh) { 693 ret = -EIO; 694 goto error_close; 695 } 696 disk_super = (struct btrfs_super_block *)bh->b_data; 697 devid = le64_to_cpu(disk_super->dev_item.devid); 698 transid = btrfs_super_generation(disk_super); 699 if (disk_super->label[0]) 700 printk(KERN_INFO "device label %s ", disk_super->label); 701 else { 702 /* FIXME, make a readl uuid parser */ 703 printk(KERN_INFO "device fsid %llx-%llx ", 704 *(unsigned long long *)disk_super->fsid, 705 *(unsigned long long *)(disk_super->fsid + 8)); 706 } 707 printk(KERN_CONT "devid %llu transid %llu %s\n", 708 (unsigned long long)devid, (unsigned long long)transid, path); 709 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 710 711 brelse(bh); 712 error_close: 713 close_bdev_exclusive(bdev, flags); 714 error: 715 mutex_unlock(&uuid_mutex); 716 return ret; 717 } 718 719 /* 720 * this uses a pretty simple search, the expectation is that it is 721 * called very infrequently and that a given device has a small number 722 * of extents 723 */ 724 int find_free_dev_extent(struct 
btrfs_trans_handle *trans, 725 struct btrfs_device *device, u64 num_bytes, 726 u64 *start, u64 *max_avail) 727 { 728 struct btrfs_key key; 729 struct btrfs_root *root = device->dev_root; 730 struct btrfs_dev_extent *dev_extent = NULL; 731 struct btrfs_path *path; 732 u64 hole_size = 0; 733 u64 last_byte = 0; 734 u64 search_start = 0; 735 u64 search_end = device->total_bytes; 736 int ret; 737 int slot = 0; 738 int start_found; 739 struct extent_buffer *l; 740 741 path = btrfs_alloc_path(); 742 if (!path) 743 return -ENOMEM; 744 path->reada = 2; 745 start_found = 0; 746 747 /* FIXME use last free of some kind */ 748 749 /* we don't want to overwrite the superblock on the drive, 750 * so we make sure to start at an offset of at least 1MB 751 */ 752 search_start = max((u64)1024 * 1024, search_start); 753 754 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 755 search_start = max(root->fs_info->alloc_start, search_start); 756 757 key.objectid = device->devid; 758 key.offset = search_start; 759 key.type = BTRFS_DEV_EXTENT_KEY; 760 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 761 if (ret < 0) 762 goto error; 763 if (ret > 0) { 764 ret = btrfs_previous_item(root, path, key.objectid, key.type); 765 if (ret < 0) 766 goto error; 767 if (ret > 0) 768 start_found = 1; 769 } 770 l = path->nodes[0]; 771 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 772 while (1) { 773 l = path->nodes[0]; 774 slot = path->slots[0]; 775 if (slot >= btrfs_header_nritems(l)) { 776 ret = btrfs_next_leaf(root, path); 777 if (ret == 0) 778 continue; 779 if (ret < 0) 780 goto error; 781 no_more_items: 782 if (!start_found) { 783 if (search_start >= search_end) { 784 ret = -ENOSPC; 785 goto error; 786 } 787 *start = search_start; 788 start_found = 1; 789 goto check_pending; 790 } 791 *start = last_byte > search_start ? 
792 last_byte : search_start; 793 if (search_end <= *start) { 794 ret = -ENOSPC; 795 goto error; 796 } 797 goto check_pending; 798 } 799 btrfs_item_key_to_cpu(l, &key, slot); 800 801 if (key.objectid < device->devid) 802 goto next; 803 804 if (key.objectid > device->devid) 805 goto no_more_items; 806 807 if (key.offset >= search_start && key.offset > last_byte && 808 start_found) { 809 if (last_byte < search_start) 810 last_byte = search_start; 811 hole_size = key.offset - last_byte; 812 813 if (hole_size > *max_avail) 814 *max_avail = hole_size; 815 816 if (key.offset > last_byte && 817 hole_size >= num_bytes) { 818 *start = last_byte; 819 goto check_pending; 820 } 821 } 822 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 823 goto next; 824 825 start_found = 1; 826 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 827 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 828 next: 829 path->slots[0]++; 830 cond_resched(); 831 } 832 check_pending: 833 /* we have to make sure we didn't find an extent that has already 834 * been allocated by the map tree or the original allocation 835 */ 836 BUG_ON(*start < search_start); 837 838 if (*start + num_bytes > search_end) { 839 ret = -ENOSPC; 840 goto error; 841 } 842 /* check for pending inserts here */ 843 ret = 0; 844 845 error: 846 btrfs_free_path(path); 847 return ret; 848 } 849 850 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 851 struct btrfs_device *device, 852 u64 start) 853 { 854 int ret; 855 struct btrfs_path *path; 856 struct btrfs_root *root = device->dev_root; 857 struct btrfs_key key; 858 struct btrfs_key found_key; 859 struct extent_buffer *leaf = NULL; 860 struct btrfs_dev_extent *extent = NULL; 861 862 path = btrfs_alloc_path(); 863 if (!path) 864 return -ENOMEM; 865 866 key.objectid = device->devid; 867 key.offset = start; 868 key.type = BTRFS_DEV_EXTENT_KEY; 869 870 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 871 if (ret > 0) { 872 ret = btrfs_previous_item(root, path, key.objectid, 873 BTRFS_DEV_EXTENT_KEY); 874 BUG_ON(ret); 875 leaf = path->nodes[0]; 876 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 877 extent = btrfs_item_ptr(leaf, path->slots[0], 878 struct btrfs_dev_extent); 879 BUG_ON(found_key.offset > start || found_key.offset + 880 btrfs_dev_extent_length(leaf, extent) < start); 881 ret = 0; 882 } else if (ret == 0) { 883 leaf = path->nodes[0]; 884 extent = btrfs_item_ptr(leaf, path->slots[0], 885 struct btrfs_dev_extent); 886 } 887 BUG_ON(ret); 888 889 if (device->bytes_used > 0) 890 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 891 ret = btrfs_del_item(trans, root, path); 892 BUG_ON(ret); 893 894 btrfs_free_path(path); 895 return ret; 896 } 897 898 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 899 struct btrfs_device *device, 900 u64 chunk_tree, u64 chunk_objectid, 901 u64 chunk_offset, u64 start, u64 num_bytes) 902 { 903 int ret; 904 struct btrfs_path *path; 905 struct btrfs_root *root = device->dev_root; 906 struct btrfs_dev_extent *extent; 907 struct extent_buffer *leaf; 908 struct btrfs_key key; 909 910 WARN_ON(!device->in_fs_metadata); 911 path = btrfs_alloc_path(); 912 if (!path) 913 return -ENOMEM; 914 915 key.objectid = device->devid; 916 key.offset = start; 917 key.type = BTRFS_DEV_EXTENT_KEY; 918 ret = btrfs_insert_empty_item(trans, root, path, &key, 919 sizeof(*extent)); 920 BUG_ON(ret); 921 922 leaf = path->nodes[0]; 923 extent = btrfs_item_ptr(leaf, path->slots[0], 924 struct btrfs_dev_extent); 925 
btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 926 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 927 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 928 929 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 930 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 931 BTRFS_UUID_SIZE); 932 933 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 934 btrfs_mark_buffer_dirty(leaf); 935 btrfs_free_path(path); 936 return ret; 937 } 938 939 static noinline int find_next_chunk(struct btrfs_root *root, 940 u64 objectid, u64 *offset) 941 { 942 struct btrfs_path *path; 943 int ret; 944 struct btrfs_key key; 945 struct btrfs_chunk *chunk; 946 struct btrfs_key found_key; 947 948 path = btrfs_alloc_path(); 949 BUG_ON(!path); 950 951 key.objectid = objectid; 952 key.offset = (u64)-1; 953 key.type = BTRFS_CHUNK_ITEM_KEY; 954 955 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 956 if (ret < 0) 957 goto error; 958 959 BUG_ON(ret == 0); 960 961 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 962 if (ret) { 963 *offset = 0; 964 } else { 965 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 966 path->slots[0]); 967 if (found_key.objectid != objectid) 968 *offset = 0; 969 else { 970 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 971 struct btrfs_chunk); 972 *offset = found_key.offset + 973 btrfs_chunk_length(path->nodes[0], chunk); 974 } 975 } 976 ret = 0; 977 error: 978 btrfs_free_path(path); 979 return ret; 980 } 981 982 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 983 { 984 int ret; 985 struct btrfs_key key; 986 struct btrfs_key found_key; 987 struct btrfs_path *path; 988 989 root = root->fs_info->chunk_root; 990 991 path = btrfs_alloc_path(); 992 if (!path) 993 return -ENOMEM; 994 995 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 996 key.type = BTRFS_DEV_ITEM_KEY; 997 key.offset = (u64)-1; 998 999 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1000 if (ret < 0) 1001 goto error; 1002 1003 BUG_ON(ret == 0); 1004 1005 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1006 BTRFS_DEV_ITEM_KEY); 1007 if (ret) { 1008 *objectid = 1; 1009 } else { 1010 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1011 path->slots[0]); 1012 *objectid = found_key.offset + 1; 1013 } 1014 ret = 0; 1015 error: 1016 btrfs_free_path(path); 1017 return ret; 1018 } 1019 1020 /* 1021 * the device information is stored in the chunk root 1022 * the btrfs_device struct should be fully filled in 1023 */ 1024 int btrfs_add_device(struct btrfs_trans_handle *trans, 1025 struct btrfs_root *root, 1026 struct btrfs_device *device) 1027 { 1028 int ret; 1029 struct btrfs_path *path; 1030 struct btrfs_dev_item *dev_item; 1031 struct extent_buffer *leaf; 1032 struct btrfs_key key; 1033 unsigned long ptr; 1034 1035 root = root->fs_info->chunk_root; 1036 1037 path = btrfs_alloc_path(); 1038 if (!path) 1039 return -ENOMEM; 1040 1041 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1042 key.type = BTRFS_DEV_ITEM_KEY; 1043 key.offset = device->devid; 1044 1045 ret = btrfs_insert_empty_item(trans, root, path, &key, 1046 sizeof(*dev_item)); 1047 if (ret) 1048 goto out; 1049 1050 leaf = path->nodes[0]; 1051 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1052 1053 btrfs_set_device_id(leaf, dev_item, device->devid); 1054 btrfs_set_device_generation(leaf, dev_item, 0); 1055 btrfs_set_device_type(leaf, dev_item, device->type); 1056 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1057 
btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1058 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1059 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1060 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1061 btrfs_set_device_group(leaf, dev_item, 0); 1062 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1063 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1064 btrfs_set_device_start_offset(leaf, dev_item, 0); 1065 1066 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1067 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1068 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1069 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1070 btrfs_mark_buffer_dirty(leaf); 1071 1072 ret = 0; 1073 out: 1074 btrfs_free_path(path); 1075 return ret; 1076 } 1077 1078 static int btrfs_rm_dev_item(struct btrfs_root *root, 1079 struct btrfs_device *device) 1080 { 1081 int ret; 1082 struct btrfs_path *path; 1083 struct btrfs_key key; 1084 struct btrfs_trans_handle *trans; 1085 1086 root = root->fs_info->chunk_root; 1087 1088 path = btrfs_alloc_path(); 1089 if (!path) 1090 return -ENOMEM; 1091 1092 trans = btrfs_start_transaction(root, 1); 1093 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1094 key.type = BTRFS_DEV_ITEM_KEY; 1095 key.offset = device->devid; 1096 lock_chunks(root); 1097 1098 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1099 if (ret < 0) 1100 goto out; 1101 1102 if (ret > 0) { 1103 ret = -ENOENT; 1104 goto out; 1105 } 1106 1107 ret = btrfs_del_item(trans, root, path); 1108 if (ret) 1109 goto out; 1110 out: 1111 btrfs_free_path(path); 1112 unlock_chunks(root); 1113 btrfs_commit_transaction(trans, root); 1114 return ret; 1115 } 1116 1117 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1118 { 1119 struct btrfs_device *device; 1120 struct btrfs_device *next_device; 1121 struct block_device *bdev; 1122 struct buffer_head *bh = NULL; 1123 struct btrfs_super_block *disk_super; 1124 u64 all_avail; 1125 u64 devid; 1126 u64 num_devices; 1127 u8 *dev_uuid; 1128 int ret = 0; 1129 1130 mutex_lock(&uuid_mutex); 1131 mutex_lock(&root->fs_info->volume_mutex); 1132 1133 all_avail = root->fs_info->avail_data_alloc_bits | 1134 root->fs_info->avail_system_alloc_bits | 1135 root->fs_info->avail_metadata_alloc_bits; 1136 1137 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1138 root->fs_info->fs_devices->rw_devices <= 4) { 1139 printk(KERN_ERR "btrfs: unable to go below four devices " 1140 "on raid10\n"); 1141 ret = -EINVAL; 1142 goto out; 1143 } 1144 1145 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1146 root->fs_info->fs_devices->rw_devices <= 2) { 1147 printk(KERN_ERR "btrfs: unable to go below two " 1148 "devices on raid1\n"); 1149 ret = -EINVAL; 1150 goto out; 1151 } 1152 1153 if (strcmp(device_path, "missing") == 0) { 1154 struct list_head *devices; 1155 struct btrfs_device *tmp; 1156 1157 device = NULL; 1158 devices = &root->fs_info->fs_devices->devices; 1159 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1160 list_for_each_entry(tmp, devices, dev_list) { 1161 if (tmp->in_fs_metadata && !tmp->bdev) { 1162 device = tmp; 1163 break; 1164 } 1165 } 1166 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1167 bdev = NULL; 1168 bh = NULL; 1169 disk_super = NULL; 1170 if (!device) { 1171 printk(KERN_ERR "btrfs: no missing devices found to " 1172 "remove\n"); 1173 goto out; 1174 } 1175 } else { 1176 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1177 
root->fs_info->bdev_holder); 1178 if (IS_ERR(bdev)) { 1179 ret = PTR_ERR(bdev); 1180 goto out; 1181 } 1182 1183 set_blocksize(bdev, 4096); 1184 bh = btrfs_read_dev_super(bdev); 1185 if (!bh) { 1186 ret = -EIO; 1187 goto error_close; 1188 } 1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1191 dev_uuid = disk_super->dev_item.uuid; 1192 device = btrfs_find_device(root, devid, dev_uuid, 1193 disk_super->fsid); 1194 if (!device) { 1195 ret = -ENOENT; 1196 goto error_brelse; 1197 } 1198 } 1199 1200 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1201 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1202 "device\n"); 1203 ret = -EINVAL; 1204 goto error_brelse; 1205 } 1206 1207 if (device->writeable) { 1208 list_del_init(&device->dev_alloc_list); 1209 root->fs_info->fs_devices->rw_devices--; 1210 } 1211 1212 ret = btrfs_shrink_device(device, 0); 1213 if (ret) 1214 goto error_brelse; 1215 1216 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1217 if (ret) 1218 goto error_brelse; 1219 1220 device->in_fs_metadata = 0; 1221 1222 /* 1223 * the device list mutex makes sure that we don't change 1224 * the device list while someone else is writing out all 1225 * the device supers. 1226 */ 1227 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1228 list_del_init(&device->dev_list); 1229 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1230 1231 device->fs_devices->num_devices--; 1232 1233 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1234 struct btrfs_device, dev_list); 1235 if (device->bdev == root->fs_info->sb->s_bdev) 1236 root->fs_info->sb->s_bdev = next_device->bdev; 1237 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1238 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1239 1240 if (device->bdev) { 1241 close_bdev_exclusive(device->bdev, device->mode); 1242 device->bdev = NULL; 1243 device->fs_devices->open_devices--; 1244 } 1245 1246 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1247 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1248 1249 if (device->fs_devices->open_devices == 0) { 1250 struct btrfs_fs_devices *fs_devices; 1251 fs_devices = root->fs_info->fs_devices; 1252 while (fs_devices) { 1253 if (fs_devices->seed == device->fs_devices) 1254 break; 1255 fs_devices = fs_devices->seed; 1256 } 1257 fs_devices->seed = device->fs_devices->seed; 1258 device->fs_devices->seed = NULL; 1259 __btrfs_close_devices(device->fs_devices); 1260 free_fs_devices(device->fs_devices); 1261 } 1262 1263 /* 1264 * at this point, the device is zero sized. We want to 1265 * remove it from the devices list and zero out the old super 1266 */ 1267 if (device->writeable) { 1268 /* make sure this device isn't detected as part of 1269 * the FS anymore 1270 */ 1271 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1272 set_buffer_dirty(bh); 1273 sync_dirty_buffer(bh); 1274 } 1275 1276 kfree(device->name); 1277 kfree(device); 1278 ret = 0; 1279 1280 error_brelse: 1281 brelse(bh); 1282 error_close: 1283 if (bdev) 1284 close_bdev_exclusive(bdev, FMODE_READ); 1285 out: 1286 mutex_unlock(&root->fs_info->volume_mutex); 1287 mutex_unlock(&uuid_mutex); 1288 return ret; 1289 } 1290 1291 /* 1292 * does all the dirty work required for changing file system's UUID. 
1293 */ 1294 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1295 struct btrfs_root *root) 1296 { 1297 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1298 struct btrfs_fs_devices *old_devices; 1299 struct btrfs_fs_devices *seed_devices; 1300 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1301 struct btrfs_device *device; 1302 u64 super_flags; 1303 1304 BUG_ON(!mutex_is_locked(&uuid_mutex)); 1305 if (!fs_devices->seeding) 1306 return -EINVAL; 1307 1308 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 1309 if (!seed_devices) 1310 return -ENOMEM; 1311 1312 old_devices = clone_fs_devices(fs_devices); 1313 if (IS_ERR(old_devices)) { 1314 kfree(seed_devices); 1315 return PTR_ERR(old_devices); 1316 } 1317 1318 list_add(&old_devices->list, &fs_uuids); 1319 1320 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 1321 seed_devices->opened = 1; 1322 INIT_LIST_HEAD(&seed_devices->devices); 1323 INIT_LIST_HEAD(&seed_devices->alloc_list); 1324 mutex_init(&seed_devices->device_list_mutex); 1325 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1326 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1327 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1328 device->fs_devices = seed_devices; 1329 } 1330 1331 fs_devices->seeding = 0; 1332 fs_devices->num_devices = 0; 1333 fs_devices->open_devices = 0; 1334 fs_devices->seed = seed_devices; 1335 1336 generate_random_uuid(fs_devices->fsid); 1337 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1338 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1339 super_flags = btrfs_super_flags(disk_super) & 1340 ~BTRFS_SUPER_FLAG_SEEDING; 1341 btrfs_set_super_flags(disk_super, super_flags); 1342 1343 return 0; 1344 } 1345 1346 /* 1347 * strore the expected generation for seed devices in device items. 
1348 */ 1349 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1350 struct btrfs_root *root) 1351 { 1352 struct btrfs_path *path; 1353 struct extent_buffer *leaf; 1354 struct btrfs_dev_item *dev_item; 1355 struct btrfs_device *device; 1356 struct btrfs_key key; 1357 u8 fs_uuid[BTRFS_UUID_SIZE]; 1358 u8 dev_uuid[BTRFS_UUID_SIZE]; 1359 u64 devid; 1360 int ret; 1361 1362 path = btrfs_alloc_path(); 1363 if (!path) 1364 return -ENOMEM; 1365 1366 root = root->fs_info->chunk_root; 1367 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1368 key.offset = 0; 1369 key.type = BTRFS_DEV_ITEM_KEY; 1370 1371 while (1) { 1372 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1373 if (ret < 0) 1374 goto error; 1375 1376 leaf = path->nodes[0]; 1377 next_slot: 1378 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1379 ret = btrfs_next_leaf(root, path); 1380 if (ret > 0) 1381 break; 1382 if (ret < 0) 1383 goto error; 1384 leaf = path->nodes[0]; 1385 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1386 btrfs_release_path(root, path); 1387 continue; 1388 } 1389 1390 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1391 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1392 key.type != BTRFS_DEV_ITEM_KEY) 1393 break; 1394 1395 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1396 struct btrfs_dev_item); 1397 devid = btrfs_device_id(leaf, dev_item); 1398 read_extent_buffer(leaf, dev_uuid, 1399 (unsigned long)btrfs_device_uuid(dev_item), 1400 BTRFS_UUID_SIZE); 1401 read_extent_buffer(leaf, fs_uuid, 1402 (unsigned long)btrfs_device_fsid(dev_item), 1403 BTRFS_UUID_SIZE); 1404 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1405 BUG_ON(!device); 1406 1407 if (device->fs_devices->seeding) { 1408 btrfs_set_device_generation(leaf, dev_item, 1409 device->generation); 1410 btrfs_mark_buffer_dirty(leaf); 1411 } 1412 1413 path->slots[0]++; 1414 goto next_slot; 1415 } 1416 ret = 0; 1417 error: 1418 btrfs_free_path(path); 1419 return ret; 1420 } 1421 1422 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1423 { 1424 struct btrfs_trans_handle *trans; 1425 struct btrfs_device *device; 1426 struct block_device *bdev; 1427 struct list_head *devices; 1428 struct super_block *sb = root->fs_info->sb; 1429 u64 total_bytes; 1430 int seeding_dev = 0; 1431 int ret = 0; 1432 1433 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1434 return -EINVAL; 1435 1436 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1437 if (!bdev) 1438 return -EIO; 1439 1440 if (root->fs_info->fs_devices->seeding) { 1441 seeding_dev = 1; 1442 down_write(&sb->s_umount); 1443 mutex_lock(&uuid_mutex); 1444 } 1445 1446 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1447 mutex_lock(&root->fs_info->volume_mutex); 1448 1449 devices = &root->fs_info->fs_devices->devices; 1450 /* 1451 * we have the volume lock, so we don't need the extra 1452 * device list mutex while reading the list here. 
1453 */ 1454 list_for_each_entry(device, devices, dev_list) { 1455 if (device->bdev == bdev) { 1456 ret = -EEXIST; 1457 goto error; 1458 } 1459 } 1460 1461 device = kzalloc(sizeof(*device), GFP_NOFS); 1462 if (!device) { 1463 /* we can safely leave the fs_devices entry around */ 1464 ret = -ENOMEM; 1465 goto error; 1466 } 1467 1468 device->name = kstrdup(device_path, GFP_NOFS); 1469 if (!device->name) { 1470 kfree(device); 1471 ret = -ENOMEM; 1472 goto error; 1473 } 1474 1475 ret = find_next_devid(root, &device->devid); 1476 if (ret) { 1477 kfree(device); 1478 goto error; 1479 } 1480 1481 trans = btrfs_start_transaction(root, 1); 1482 lock_chunks(root); 1483 1484 device->barriers = 1; 1485 device->writeable = 1; 1486 device->work.func = pending_bios_fn; 1487 generate_random_uuid(device->uuid); 1488 spin_lock_init(&device->io_lock); 1489 device->generation = trans->transid; 1490 device->io_width = root->sectorsize; 1491 device->io_align = root->sectorsize; 1492 device->sector_size = root->sectorsize; 1493 device->total_bytes = i_size_read(bdev->bd_inode); 1494 device->disk_total_bytes = device->total_bytes; 1495 device->dev_root = root->fs_info->dev_root; 1496 device->bdev = bdev; 1497 device->in_fs_metadata = 1; 1498 device->mode = 0; 1499 set_blocksize(device->bdev, 4096); 1500 1501 if (seeding_dev) { 1502 sb->s_flags &= ~MS_RDONLY; 1503 ret = btrfs_prepare_sprout(trans, root); 1504 BUG_ON(ret); 1505 } 1506 1507 device->fs_devices = root->fs_info->fs_devices; 1508 1509 /* 1510 * we don't want write_supers to jump in here with our device 1511 * half setup 1512 */ 1513 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1514 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1515 list_add(&device->dev_alloc_list, 1516 &root->fs_info->fs_devices->alloc_list); 1517 root->fs_info->fs_devices->num_devices++; 1518 root->fs_info->fs_devices->open_devices++; 1519 root->fs_info->fs_devices->rw_devices++; 1520 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1521 1522 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1523 root->fs_info->fs_devices->rotating = 1; 1524 1525 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1526 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1527 total_bytes + device->total_bytes); 1528 1529 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1530 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1531 total_bytes + 1); 1532 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1533 1534 if (seeding_dev) { 1535 ret = init_first_rw_device(trans, root, device); 1536 BUG_ON(ret); 1537 ret = btrfs_finish_sprout(trans, root); 1538 BUG_ON(ret); 1539 } else { 1540 ret = btrfs_add_device(trans, root, device); 1541 } 1542 1543 /* 1544 * we've got more storage, clear any full flags on the space 1545 * infos 1546 */ 1547 btrfs_clear_space_info_full(root->fs_info); 1548 1549 unlock_chunks(root); 1550 btrfs_commit_transaction(trans, root); 1551 1552 if (seeding_dev) { 1553 mutex_unlock(&uuid_mutex); 1554 up_write(&sb->s_umount); 1555 1556 ret = btrfs_relocate_sys_chunks(root); 1557 BUG_ON(ret); 1558 } 1559 out: 1560 mutex_unlock(&root->fs_info->volume_mutex); 1561 return ret; 1562 error: 1563 close_bdev_exclusive(bdev, 0); 1564 if (seeding_dev) { 1565 mutex_unlock(&uuid_mutex); 1566 up_write(&sb->s_umount); 1567 } 1568 goto out; 1569 } 1570 1571 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1572 struct btrfs_device *device) 1573 { 1574 int ret; 1575 struct btrfs_path 
*path; 1576 struct btrfs_root *root; 1577 struct btrfs_dev_item *dev_item; 1578 struct extent_buffer *leaf; 1579 struct btrfs_key key; 1580 1581 root = device->dev_root->fs_info->chunk_root; 1582 1583 path = btrfs_alloc_path(); 1584 if (!path) 1585 return -ENOMEM; 1586 1587 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1588 key.type = BTRFS_DEV_ITEM_KEY; 1589 key.offset = device->devid; 1590 1591 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1592 if (ret < 0) 1593 goto out; 1594 1595 if (ret > 0) { 1596 ret = -ENOENT; 1597 goto out; 1598 } 1599 1600 leaf = path->nodes[0]; 1601 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1602 1603 btrfs_set_device_id(leaf, dev_item, device->devid); 1604 btrfs_set_device_type(leaf, dev_item, device->type); 1605 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1606 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1607 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1608 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1609 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1610 btrfs_mark_buffer_dirty(leaf); 1611 1612 out: 1613 btrfs_free_path(path); 1614 return ret; 1615 } 1616 1617 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1618 struct btrfs_device *device, u64 new_size) 1619 { 1620 struct btrfs_super_block *super_copy = 1621 &device->dev_root->fs_info->super_copy; 1622 u64 old_total = btrfs_super_total_bytes(super_copy); 1623 u64 diff = new_size - device->total_bytes; 1624 1625 if (!device->writeable) 1626 return -EACCES; 1627 if (new_size <= device->total_bytes) 1628 return -EINVAL; 1629 1630 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1631 device->fs_devices->total_rw_bytes += diff; 1632 1633 device->total_bytes = new_size; 1634 device->disk_total_bytes = new_size; 1635 btrfs_clear_space_info_full(device->dev_root->fs_info); 1636 1637 return btrfs_update_device(trans, device); 1638 } 1639 1640 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1641 struct btrfs_device *device, u64 new_size) 1642 { 1643 int ret; 1644 lock_chunks(device->dev_root); 1645 ret = __btrfs_grow_device(trans, device, new_size); 1646 unlock_chunks(device->dev_root); 1647 return ret; 1648 } 1649 1650 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1651 struct btrfs_root *root, 1652 u64 chunk_tree, u64 chunk_objectid, 1653 u64 chunk_offset) 1654 { 1655 int ret; 1656 struct btrfs_path *path; 1657 struct btrfs_key key; 1658 1659 root = root->fs_info->chunk_root; 1660 path = btrfs_alloc_path(); 1661 if (!path) 1662 return -ENOMEM; 1663 1664 key.objectid = chunk_objectid; 1665 key.offset = chunk_offset; 1666 key.type = BTRFS_CHUNK_ITEM_KEY; 1667 1668 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1669 BUG_ON(ret); 1670 1671 ret = btrfs_del_item(trans, root, path); 1672 BUG_ON(ret); 1673 1674 btrfs_free_path(path); 1675 return 0; 1676 } 1677 1678 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1679 chunk_offset) 1680 { 1681 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1682 struct btrfs_disk_key *disk_key; 1683 struct btrfs_chunk *chunk; 1684 u8 *ptr; 1685 int ret = 0; 1686 u32 num_stripes; 1687 u32 array_size; 1688 u32 len = 0; 1689 u32 cur; 1690 struct btrfs_key key; 1691 1692 array_size = btrfs_super_sys_array_size(super_copy); 1693 1694 ptr = super_copy->sys_chunk_array; 1695 cur = 0; 1696 1697 while (cur < array_size) { 1698 disk_key = (struct 
btrfs_disk_key *)ptr; 1699 btrfs_disk_key_to_cpu(&key, disk_key); 1700 1701 len = sizeof(*disk_key); 1702 1703 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1704 chunk = (struct btrfs_chunk *)(ptr + len); 1705 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1706 len += btrfs_chunk_item_size(num_stripes); 1707 } else { 1708 ret = -EIO; 1709 break; 1710 } 1711 if (key.objectid == chunk_objectid && 1712 key.offset == chunk_offset) { 1713 memmove(ptr, ptr + len, array_size - (cur + len)); 1714 array_size -= len; 1715 btrfs_set_super_sys_array_size(super_copy, array_size); 1716 } else { 1717 ptr += len; 1718 cur += len; 1719 } 1720 } 1721 return ret; 1722 } 1723 1724 static int btrfs_relocate_chunk(struct btrfs_root *root, 1725 u64 chunk_tree, u64 chunk_objectid, 1726 u64 chunk_offset) 1727 { 1728 struct extent_map_tree *em_tree; 1729 struct btrfs_root *extent_root; 1730 struct btrfs_trans_handle *trans; 1731 struct extent_map *em; 1732 struct map_lookup *map; 1733 int ret; 1734 int i; 1735 1736 root = root->fs_info->chunk_root; 1737 extent_root = root->fs_info->extent_root; 1738 em_tree = &root->fs_info->mapping_tree.map_tree; 1739 1740 ret = btrfs_can_relocate(extent_root, chunk_offset); 1741 if (ret) 1742 return -ENOSPC; 1743 1744 /* step one, relocate all the extents inside this chunk */ 1745 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1746 BUG_ON(ret); 1747 1748 trans = btrfs_start_transaction(root, 1); 1749 BUG_ON(!trans); 1750 1751 lock_chunks(root); 1752 1753 /* 1754 * step two, delete the device extents and the 1755 * chunk tree entries 1756 */ 1757 read_lock(&em_tree->lock); 1758 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1759 read_unlock(&em_tree->lock); 1760 1761 BUG_ON(em->start > chunk_offset || 1762 em->start + em->len < chunk_offset); 1763 map = (struct map_lookup *)em->bdev; 1764 1765 for (i = 0; i < map->num_stripes; i++) { 1766 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1767 map->stripes[i].physical); 1768 BUG_ON(ret); 1769 1770 if (map->stripes[i].dev) { 1771 ret = btrfs_update_device(trans, map->stripes[i].dev); 1772 BUG_ON(ret); 1773 } 1774 } 1775 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1776 chunk_offset); 1777 1778 BUG_ON(ret); 1779 1780 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1781 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1782 BUG_ON(ret); 1783 } 1784 1785 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1786 BUG_ON(ret); 1787 1788 write_lock(&em_tree->lock); 1789 remove_extent_mapping(em_tree, em); 1790 write_unlock(&em_tree->lock); 1791 1792 kfree(map); 1793 em->bdev = NULL; 1794 1795 /* once for the tree */ 1796 free_extent_map(em); 1797 /* once for us */ 1798 free_extent_map(em); 1799 1800 unlock_chunks(root); 1801 btrfs_end_transaction(trans, root); 1802 return 0; 1803 } 1804 1805 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1806 { 1807 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1808 struct btrfs_path *path; 1809 struct extent_buffer *leaf; 1810 struct btrfs_chunk *chunk; 1811 struct btrfs_key key; 1812 struct btrfs_key found_key; 1813 u64 chunk_tree = chunk_root->root_key.objectid; 1814 u64 chunk_type; 1815 bool retried = false; 1816 int failed = 0; 1817 int ret; 1818 1819 path = btrfs_alloc_path(); 1820 if (!path) 1821 return -ENOMEM; 1822 1823 again: 1824 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1825 key.offset = (u64)-1; 1826 key.type = BTRFS_CHUNK_ITEM_KEY; 1827 1828 while (1) { 1829 ret = btrfs_search_slot(NULL, 
chunk_root, &key, path, 0, 0); 1830 if (ret < 0) 1831 goto error; 1832 BUG_ON(ret == 0); 1833 1834 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1835 key.type); 1836 if (ret < 0) 1837 goto error; 1838 if (ret > 0) 1839 break; 1840 1841 leaf = path->nodes[0]; 1842 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1843 1844 chunk = btrfs_item_ptr(leaf, path->slots[0], 1845 struct btrfs_chunk); 1846 chunk_type = btrfs_chunk_type(leaf, chunk); 1847 btrfs_release_path(chunk_root, path); 1848 1849 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1850 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1851 found_key.objectid, 1852 found_key.offset); 1853 if (ret == -ENOSPC) 1854 failed++; 1855 else if (ret) 1856 BUG(); 1857 } 1858 1859 if (found_key.offset == 0) 1860 break; 1861 key.offset = found_key.offset - 1; 1862 } 1863 ret = 0; 1864 if (failed && !retried) { 1865 failed = 0; 1866 retried = true; 1867 goto again; 1868 } else if (failed && retried) { 1869 WARN_ON(1); 1870 ret = -ENOSPC; 1871 } 1872 error: 1873 btrfs_free_path(path); 1874 return ret; 1875 } 1876 1877 static u64 div_factor(u64 num, int factor) 1878 { 1879 if (factor == 10) 1880 return num; 1881 num *= factor; 1882 do_div(num, 10); 1883 return num; 1884 } 1885 1886 int btrfs_balance(struct btrfs_root *dev_root) 1887 { 1888 int ret; 1889 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1890 struct btrfs_device *device; 1891 u64 old_size; 1892 u64 size_to_free; 1893 struct btrfs_path *path; 1894 struct btrfs_key key; 1895 struct btrfs_chunk *chunk; 1896 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1897 struct btrfs_trans_handle *trans; 1898 struct btrfs_key found_key; 1899 1900 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1901 return -EROFS; 1902 1903 mutex_lock(&dev_root->fs_info->volume_mutex); 1904 dev_root = dev_root->fs_info->dev_root; 1905 1906 /* step one make some room on all the devices */ 1907 list_for_each_entry(device, devices, dev_list) { 1908 old_size = device->total_bytes; 1909 size_to_free = div_factor(old_size, 1); 1910 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1911 if (!device->writeable || 1912 device->total_bytes - device->bytes_used > size_to_free) 1913 continue; 1914 1915 ret = btrfs_shrink_device(device, old_size - size_to_free); 1916 if (ret == -ENOSPC) 1917 break; 1918 BUG_ON(ret); 1919 1920 trans = btrfs_start_transaction(dev_root, 1); 1921 BUG_ON(!trans); 1922 1923 ret = btrfs_grow_device(trans, device, old_size); 1924 BUG_ON(ret); 1925 1926 btrfs_end_transaction(trans, dev_root); 1927 } 1928 1929 /* step two, relocate all the chunks */ 1930 path = btrfs_alloc_path(); 1931 BUG_ON(!path); 1932 1933 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1934 key.offset = (u64)-1; 1935 key.type = BTRFS_CHUNK_ITEM_KEY; 1936 1937 while (1) { 1938 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1939 if (ret < 0) 1940 goto error; 1941 1942 /* 1943 * this shouldn't happen, it means the last relocate 1944 * failed 1945 */ 1946 if (ret == 0) 1947 break; 1948 1949 ret = btrfs_previous_item(chunk_root, path, 0, 1950 BTRFS_CHUNK_ITEM_KEY); 1951 if (ret) 1952 break; 1953 1954 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1955 path->slots[0]); 1956 if (found_key.objectid != key.objectid) 1957 break; 1958 1959 chunk = btrfs_item_ptr(path->nodes[0], 1960 path->slots[0], 1961 struct btrfs_chunk); 1962 /* chunk zero is special */ 1963 if (found_key.offset == 0) 1964 break; 1965 1966 btrfs_release_path(chunk_root, path); 1967 ret = 
btrfs_relocate_chunk(chunk_root, 1968 chunk_root->root_key.objectid, 1969 found_key.objectid, 1970 found_key.offset); 1971 BUG_ON(ret && ret != -ENOSPC); 1972 key.offset = found_key.offset - 1; 1973 } 1974 ret = 0; 1975 error: 1976 btrfs_free_path(path); 1977 mutex_unlock(&dev_root->fs_info->volume_mutex); 1978 return ret; 1979 } 1980 1981 /* 1982 * shrinking a device means finding all of the device extents past 1983 * the new size, and then following the back refs to the chunks. 1984 * The chunk relocation code actually frees the device extent 1985 */ 1986 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 1987 { 1988 struct btrfs_trans_handle *trans; 1989 struct btrfs_root *root = device->dev_root; 1990 struct btrfs_dev_extent *dev_extent = NULL; 1991 struct btrfs_path *path; 1992 u64 length; 1993 u64 chunk_tree; 1994 u64 chunk_objectid; 1995 u64 chunk_offset; 1996 int ret; 1997 int slot; 1998 int failed = 0; 1999 bool retried = false; 2000 struct extent_buffer *l; 2001 struct btrfs_key key; 2002 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2003 u64 old_total = btrfs_super_total_bytes(super_copy); 2004 u64 old_size = device->total_bytes; 2005 u64 diff = device->total_bytes - new_size; 2006 2007 if (new_size >= device->total_bytes) 2008 return -EINVAL; 2009 2010 path = btrfs_alloc_path(); 2011 if (!path) 2012 return -ENOMEM; 2013 2014 path->reada = 2; 2015 2016 lock_chunks(root); 2017 2018 device->total_bytes = new_size; 2019 if (device->writeable) 2020 device->fs_devices->total_rw_bytes -= diff; 2021 unlock_chunks(root); 2022 2023 again: 2024 key.objectid = device->devid; 2025 key.offset = (u64)-1; 2026 key.type = BTRFS_DEV_EXTENT_KEY; 2027 2028 while (1) { 2029 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2030 if (ret < 0) 2031 goto done; 2032 2033 ret = btrfs_previous_item(root, path, 0, key.type); 2034 if (ret < 0) 2035 goto done; 2036 if (ret) { 2037 ret = 0; 2038 btrfs_release_path(root, path); 2039 break; 2040 } 2041 2042 l = path->nodes[0]; 2043 slot = path->slots[0]; 2044 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2045 2046 if (key.objectid != device->devid) { 2047 btrfs_release_path(root, path); 2048 break; 2049 } 2050 2051 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2052 length = btrfs_dev_extent_length(l, dev_extent); 2053 2054 if (key.offset + length <= new_size) { 2055 btrfs_release_path(root, path); 2056 break; 2057 } 2058 2059 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2060 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2061 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2062 btrfs_release_path(root, path); 2063 2064 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2065 chunk_offset); 2066 if (ret && ret != -ENOSPC) 2067 goto done; 2068 if (ret == -ENOSPC) 2069 failed++; 2070 key.offset -= 1; 2071 } 2072 2073 if (failed && !retried) { 2074 failed = 0; 2075 retried = true; 2076 goto again; 2077 } else if (failed && retried) { 2078 ret = -ENOSPC; 2079 lock_chunks(root); 2080 2081 device->total_bytes = old_size; 2082 if (device->writeable) 2083 device->fs_devices->total_rw_bytes += diff; 2084 unlock_chunks(root); 2085 goto done; 2086 } 2087 2088 /* Shrinking succeeded, else we would be at "done". */ 2089 trans = btrfs_start_transaction(root, 1); 2090 if (!trans) { 2091 ret = -ENOMEM; 2092 goto done; 2093 } 2094 lock_chunks(root); 2095 2096 device->disk_total_bytes = new_size; 2097 /* Now btrfs_update_device() will change the on-disk size. 
*/ 2098 ret = btrfs_update_device(trans, device); 2099 if (ret) { 2100 unlock_chunks(root); 2101 btrfs_end_transaction(trans, root); 2102 goto done; 2103 } 2104 WARN_ON(diff > old_total); 2105 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2106 unlock_chunks(root); 2107 btrfs_end_transaction(trans, root); 2108 done: 2109 btrfs_free_path(path); 2110 return ret; 2111 } 2112 2113 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2114 struct btrfs_root *root, 2115 struct btrfs_key *key, 2116 struct btrfs_chunk *chunk, int item_size) 2117 { 2118 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2119 struct btrfs_disk_key disk_key; 2120 u32 array_size; 2121 u8 *ptr; 2122 2123 array_size = btrfs_super_sys_array_size(super_copy); 2124 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2125 return -EFBIG; 2126 2127 ptr = super_copy->sys_chunk_array + array_size; 2128 btrfs_cpu_key_to_disk(&disk_key, key); 2129 memcpy(ptr, &disk_key, sizeof(disk_key)); 2130 ptr += sizeof(disk_key); 2131 memcpy(ptr, chunk, item_size); 2132 item_size += sizeof(disk_key); 2133 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2134 return 0; 2135 } 2136 2137 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2138 int num_stripes, int sub_stripes) 2139 { 2140 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2141 return calc_size; 2142 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2143 return calc_size * (num_stripes / sub_stripes); 2144 else 2145 return calc_size * num_stripes; 2146 } 2147 2148 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2149 struct btrfs_root *extent_root, 2150 struct map_lookup **map_ret, 2151 u64 *num_bytes, u64 *stripe_size, 2152 u64 start, u64 type) 2153 { 2154 struct btrfs_fs_info *info = extent_root->fs_info; 2155 struct btrfs_device *device = NULL; 2156 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2157 struct list_head *cur; 2158 struct map_lookup *map = NULL; 2159 struct extent_map_tree *em_tree; 2160 struct extent_map *em; 2161 struct list_head private_devs; 2162 int min_stripe_size = 1 * 1024 * 1024; 2163 u64 calc_size = 1024 * 1024 * 1024; 2164 u64 max_chunk_size = calc_size; 2165 u64 min_free; 2166 u64 avail; 2167 u64 max_avail = 0; 2168 u64 dev_offset; 2169 int num_stripes = 1; 2170 int min_stripes = 1; 2171 int sub_stripes = 0; 2172 int looped = 0; 2173 int ret; 2174 int index; 2175 int stripe_len = 64 * 1024; 2176 2177 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2178 (type & BTRFS_BLOCK_GROUP_DUP)) { 2179 WARN_ON(1); 2180 type &= ~BTRFS_BLOCK_GROUP_DUP; 2181 } 2182 if (list_empty(&fs_devices->alloc_list)) 2183 return -ENOSPC; 2184 2185 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2186 num_stripes = fs_devices->rw_devices; 2187 min_stripes = 2; 2188 } 2189 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2190 num_stripes = 2; 2191 min_stripes = 2; 2192 } 2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2195 if (num_stripes < 2) 2196 return -ENOSPC; 2197 min_stripes = 2; 2198 } 2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2200 num_stripes = fs_devices->rw_devices; 2201 if (num_stripes < 4) 2202 return -ENOSPC; 2203 num_stripes &= ~(u32)1; 2204 sub_stripes = 2; 2205 min_stripes = 4; 2206 } 2207 2208 if (type & BTRFS_BLOCK_GROUP_DATA) { 2209 max_chunk_size = 10 * calc_size; 2210 min_stripe_size = 64 * 1024 * 1024; 2211 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2212 max_chunk_size = 256 * 1024 * 1024; 2213 min_stripe_size = 32 
* 1024 * 1024; 2214 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2215 calc_size = 8 * 1024 * 1024; 2216 max_chunk_size = calc_size * 2; 2217 min_stripe_size = 1 * 1024 * 1024; 2218 } 2219 2220 /* we don't want a chunk larger than 10% of writeable space */ 2221 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2222 max_chunk_size); 2223 2224 again: 2225 max_avail = 0; 2226 if (!map || map->num_stripes != num_stripes) { 2227 kfree(map); 2228 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2229 if (!map) 2230 return -ENOMEM; 2231 map->num_stripes = num_stripes; 2232 } 2233 2234 if (calc_size * num_stripes > max_chunk_size) { 2235 calc_size = max_chunk_size; 2236 do_div(calc_size, num_stripes); 2237 do_div(calc_size, stripe_len); 2238 calc_size *= stripe_len; 2239 } 2240 /* we don't want tiny stripes */ 2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2242 2243 do_div(calc_size, stripe_len); 2244 calc_size *= stripe_len; 2245 2246 cur = fs_devices->alloc_list.next; 2247 index = 0; 2248 2249 if (type & BTRFS_BLOCK_GROUP_DUP) 2250 min_free = calc_size * 2; 2251 else 2252 min_free = calc_size; 2253 2254 /* 2255 * we add 1MB because we never use the first 1MB of the device, unless 2256 * we've looped, then we are likely allocating the maximum amount of 2257 * space left already 2258 */ 2259 if (!looped) 2260 min_free += 1024 * 1024; 2261 2262 INIT_LIST_HEAD(&private_devs); 2263 while (index < num_stripes) { 2264 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2265 BUG_ON(!device->writeable); 2266 if (device->total_bytes > device->bytes_used) 2267 avail = device->total_bytes - device->bytes_used; 2268 else 2269 avail = 0; 2270 cur = cur->next; 2271 2272 if (device->in_fs_metadata && avail >= min_free) { 2273 ret = find_free_dev_extent(trans, device, 2274 min_free, &dev_offset, 2275 &max_avail); 2276 if (ret == 0) { 2277 list_move_tail(&device->dev_alloc_list, 2278 &private_devs); 2279 map->stripes[index].dev = device; 2280 map->stripes[index].physical = dev_offset; 2281 index++; 2282 if (type & BTRFS_BLOCK_GROUP_DUP) { 2283 map->stripes[index].dev = device; 2284 map->stripes[index].physical = 2285 dev_offset + calc_size; 2286 index++; 2287 } 2288 } 2289 } else if (device->in_fs_metadata && avail > max_avail) 2290 max_avail = avail; 2291 if (cur == &fs_devices->alloc_list) 2292 break; 2293 } 2294 list_splice(&private_devs, &fs_devices->alloc_list); 2295 if (index < num_stripes) { 2296 if (index >= min_stripes) { 2297 num_stripes = index; 2298 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2299 num_stripes /= sub_stripes; 2300 num_stripes *= sub_stripes; 2301 } 2302 looped = 1; 2303 goto again; 2304 } 2305 if (!looped && max_avail > 0) { 2306 looped = 1; 2307 calc_size = max_avail; 2308 goto again; 2309 } 2310 kfree(map); 2311 return -ENOSPC; 2312 } 2313 map->sector_size = extent_root->sectorsize; 2314 map->stripe_len = stripe_len; 2315 map->io_align = stripe_len; 2316 map->io_width = stripe_len; 2317 map->type = type; 2318 map->num_stripes = num_stripes; 2319 map->sub_stripes = sub_stripes; 2320 2321 *map_ret = map; 2322 *stripe_size = calc_size; 2323 *num_bytes = chunk_bytes_by_type(type, calc_size, 2324 num_stripes, sub_stripes); 2325 2326 em = alloc_extent_map(GFP_NOFS); 2327 if (!em) { 2328 kfree(map); 2329 return -ENOMEM; 2330 } 2331 em->bdev = (struct block_device *)map; 2332 em->start = start; 2333 em->len = *num_bytes; 2334 em->block_start = 0; 2335 em->block_len = em->len; 2336 2337 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2338 
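/* stash the new mapping in the in-memory extent map tree; the map_lookup is hidden behind em->bdev so __btrfs_map_block() can recover the stripes later */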
write_lock(&em_tree->lock); 2339 ret = add_extent_mapping(em_tree, em); 2340 write_unlock(&em_tree->lock); 2341 BUG_ON(ret); 2342 free_extent_map(em); 2343 2344 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2345 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2346 start, *num_bytes); 2347 BUG_ON(ret); 2348 2349 index = 0; 2350 while (index < map->num_stripes) { 2351 device = map->stripes[index].dev; 2352 dev_offset = map->stripes[index].physical; 2353 2354 ret = btrfs_alloc_dev_extent(trans, device, 2355 info->chunk_root->root_key.objectid, 2356 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2357 start, dev_offset, calc_size); 2358 BUG_ON(ret); 2359 index++; 2360 } 2361 2362 return 0; 2363 } 2364 2365 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2366 struct btrfs_root *extent_root, 2367 struct map_lookup *map, u64 chunk_offset, 2368 u64 chunk_size, u64 stripe_size) 2369 { 2370 u64 dev_offset; 2371 struct btrfs_key key; 2372 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2373 struct btrfs_device *device; 2374 struct btrfs_chunk *chunk; 2375 struct btrfs_stripe *stripe; 2376 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2377 int index = 0; 2378 int ret; 2379 2380 chunk = kzalloc(item_size, GFP_NOFS); 2381 if (!chunk) 2382 return -ENOMEM; 2383 2384 index = 0; 2385 while (index < map->num_stripes) { 2386 device = map->stripes[index].dev; 2387 device->bytes_used += stripe_size; 2388 ret = btrfs_update_device(trans, device); 2389 BUG_ON(ret); 2390 index++; 2391 } 2392 2393 index = 0; 2394 stripe = &chunk->stripe; 2395 while (index < map->num_stripes) { 2396 device = map->stripes[index].dev; 2397 dev_offset = map->stripes[index].physical; 2398 2399 btrfs_set_stack_stripe_devid(stripe, device->devid); 2400 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2401 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2402 stripe++; 2403 index++; 2404 } 2405 2406 btrfs_set_stack_chunk_length(chunk, chunk_size); 2407 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2408 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2409 btrfs_set_stack_chunk_type(chunk, map->type); 2410 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2411 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2412 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2413 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2414 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2415 2416 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2417 key.type = BTRFS_CHUNK_ITEM_KEY; 2418 key.offset = chunk_offset; 2419 2420 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2421 BUG_ON(ret); 2422 2423 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2424 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2425 item_size); 2426 BUG_ON(ret); 2427 } 2428 kfree(chunk); 2429 return 0; 2430 } 2431 2432 /* 2433 * Chunk allocation falls into two parts. The first part does the work 2434 * that makes the newly allocated chunk usable, but does not do any operation 2435 * that modifies the chunk tree. The second part does the work that 2436 * requires modifying the chunk tree. This division is important for the 2437 * bootstrap process of adding storage to a seed btrfs.
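 * Here __btrfs_alloc_chunk() does the first part: it picks the devices, reserves the device extents, creates the block group and inserts the new mapping into the in-memory extent map tree. __finish_chunk_alloc() does the second part: it updates each device item's bytes_used, inserts the chunk item into the chunk tree and, for system chunks, appends it to the super block's sys_chunk_array.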
 2438 */ 2439 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2440 struct btrfs_root *extent_root, u64 type) 2441 { 2442 u64 chunk_offset; 2443 u64 chunk_size; 2444 u64 stripe_size; 2445 struct map_lookup *map; 2446 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2447 int ret; 2448 2449 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2450 &chunk_offset); 2451 if (ret) 2452 return ret; 2453 2454 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2455 &stripe_size, chunk_offset, type); 2456 if (ret) 2457 return ret; 2458 2459 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2460 chunk_size, stripe_size); 2461 BUG_ON(ret); 2462 return 0; 2463 } 2464 2465 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2466 struct btrfs_root *root, 2467 struct btrfs_device *device) 2468 { 2469 u64 chunk_offset; 2470 u64 sys_chunk_offset; 2471 u64 chunk_size; 2472 u64 sys_chunk_size; 2473 u64 stripe_size; 2474 u64 sys_stripe_size; 2475 u64 alloc_profile; 2476 struct map_lookup *map; 2477 struct map_lookup *sys_map; 2478 struct btrfs_fs_info *fs_info = root->fs_info; 2479 struct btrfs_root *extent_root = fs_info->extent_root; 2480 int ret; 2481 2482 ret = find_next_chunk(fs_info->chunk_root, 2483 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2484 BUG_ON(ret); 2485 2486 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2487 (fs_info->metadata_alloc_profile & 2488 fs_info->avail_metadata_alloc_bits); 2489 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2490 2491 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2492 &stripe_size, chunk_offset, alloc_profile); 2493 BUG_ON(ret); 2494 2495 sys_chunk_offset = chunk_offset + chunk_size; 2496 2497 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2498 (fs_info->system_alloc_profile & 2499 fs_info->avail_system_alloc_bits); 2500 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2501 2502 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2503 &sys_chunk_size, &sys_stripe_size, 2504 sys_chunk_offset, alloc_profile); 2505 BUG_ON(ret); 2506 2507 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2508 BUG_ON(ret); 2509 2510 /* 2511 * Modifying the chunk tree requires allocating new blocks from both the 2512 * system block group and the metadata block group, so we can only 2513 * perform operations that modify the chunk tree after both 2514 * block groups have been created.
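 * That is why the metadata chunk and the system chunk were both allocated above, and the new device item added, before __finish_chunk_alloc() is called for either of them.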
2515 */ 2516 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2517 chunk_size, stripe_size); 2518 BUG_ON(ret); 2519 2520 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2521 sys_chunk_offset, sys_chunk_size, 2522 sys_stripe_size); 2523 BUG_ON(ret); 2524 return 0; 2525 } 2526 2527 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2528 { 2529 struct extent_map *em; 2530 struct map_lookup *map; 2531 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2532 int readonly = 0; 2533 int i; 2534 2535 read_lock(&map_tree->map_tree.lock); 2536 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2537 read_unlock(&map_tree->map_tree.lock); 2538 if (!em) 2539 return 1; 2540 2541 map = (struct map_lookup *)em->bdev; 2542 for (i = 0; i < map->num_stripes; i++) { 2543 if (!map->stripes[i].dev->writeable) { 2544 readonly = 1; 2545 break; 2546 } 2547 } 2548 free_extent_map(em); 2549 return readonly; 2550 } 2551 2552 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2553 { 2554 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2555 } 2556 2557 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2558 { 2559 struct extent_map *em; 2560 2561 while (1) { 2562 write_lock(&tree->map_tree.lock); 2563 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2564 if (em) 2565 remove_extent_mapping(&tree->map_tree, em); 2566 write_unlock(&tree->map_tree.lock); 2567 if (!em) 2568 break; 2569 kfree(em->bdev); 2570 /* once for us */ 2571 free_extent_map(em); 2572 /* once for the tree */ 2573 free_extent_map(em); 2574 } 2575 } 2576 2577 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2578 { 2579 struct extent_map *em; 2580 struct map_lookup *map; 2581 struct extent_map_tree *em_tree = &map_tree->map_tree; 2582 int ret; 2583 2584 read_lock(&em_tree->lock); 2585 em = lookup_extent_mapping(em_tree, logical, len); 2586 read_unlock(&em_tree->lock); 2587 BUG_ON(!em); 2588 2589 BUG_ON(em->start > logical || em->start + em->len < logical); 2590 map = (struct map_lookup *)em->bdev; 2591 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2592 ret = map->num_stripes; 2593 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2594 ret = map->sub_stripes; 2595 else 2596 ret = 1; 2597 free_extent_map(em); 2598 return ret; 2599 } 2600 2601 static int find_live_mirror(struct map_lookup *map, int first, int num, 2602 int optimal) 2603 { 2604 int i; 2605 if (map->stripes[optimal].dev->bdev) 2606 return optimal; 2607 for (i = first; i < first + num; i++) { 2608 if (map->stripes[i].dev->bdev) 2609 return i; 2610 } 2611 /* we couldn't find one that doesn't fail. 
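Every stripe in the range (and the preferred mirror) is missing its bdev.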
Just return something 2612 * and the io error handling code will clean up eventually 2613 */ 2614 return optimal; 2615 } 2616 2617 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2618 u64 logical, u64 *length, 2619 struct btrfs_multi_bio **multi_ret, 2620 int mirror_num, struct page *unplug_page) 2621 { 2622 struct extent_map *em; 2623 struct map_lookup *map; 2624 struct extent_map_tree *em_tree = &map_tree->map_tree; 2625 u64 offset; 2626 u64 stripe_offset; 2627 u64 stripe_nr; 2628 int stripes_allocated = 8; 2629 int stripes_required = 1; 2630 int stripe_index; 2631 int i; 2632 int num_stripes; 2633 int max_errors = 0; 2634 struct btrfs_multi_bio *multi = NULL; 2635 2636 if (multi_ret && !(rw & (1 << BIO_RW))) 2637 stripes_allocated = 1; 2638 again: 2639 if (multi_ret) { 2640 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2641 GFP_NOFS); 2642 if (!multi) 2643 return -ENOMEM; 2644 2645 atomic_set(&multi->error, 0); 2646 } 2647 2648 read_lock(&em_tree->lock); 2649 em = lookup_extent_mapping(em_tree, logical, *length); 2650 read_unlock(&em_tree->lock); 2651 2652 if (!em && unplug_page) { 2653 kfree(multi); 2654 return 0; 2655 } 2656 2657 if (!em) { 2658 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2659 (unsigned long long)logical, 2660 (unsigned long long)*length); 2661 BUG(); 2662 } 2663 2664 BUG_ON(em->start > logical || em->start + em->len < logical); 2665 map = (struct map_lookup *)em->bdev; 2666 offset = logical - em->start; 2667 2668 if (mirror_num > map->num_stripes) 2669 mirror_num = 0; 2670 2671 /* if our multi bio struct is too small, back off and try again */ 2672 if (rw & (1 << BIO_RW)) { 2673 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2674 BTRFS_BLOCK_GROUP_DUP)) { 2675 stripes_required = map->num_stripes; 2676 max_errors = 1; 2677 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2678 stripes_required = map->sub_stripes; 2679 max_errors = 1; 2680 } 2681 } 2682 if (multi_ret && (rw & (1 << BIO_RW)) && 2683 stripes_allocated < stripes_required) { 2684 stripes_allocated = map->num_stripes; 2685 free_extent_map(em); 2686 kfree(multi); 2687 goto again; 2688 } 2689 stripe_nr = offset; 2690 /* 2691 * stripe_nr counts the total number of stripes we have to stride 2692 * to get to this block 2693 */ 2694 do_div(stripe_nr, map->stripe_len); 2695 2696 stripe_offset = stripe_nr * map->stripe_len; 2697 BUG_ON(offset < stripe_offset); 2698 2699 /* stripe_offset is the offset of this block in its stripe*/ 2700 stripe_offset = offset - stripe_offset; 2701 2702 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2703 BTRFS_BLOCK_GROUP_RAID10 | 2704 BTRFS_BLOCK_GROUP_DUP)) { 2705 /* we limit the length of each bio to what fits in a stripe */ 2706 *length = min_t(u64, em->len - offset, 2707 map->stripe_len - stripe_offset); 2708 } else { 2709 *length = em->len - offset; 2710 } 2711 2712 if (!multi_ret && !unplug_page) 2713 goto out; 2714 2715 num_stripes = 1; 2716 stripe_index = 0; 2717 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2718 if (unplug_page || (rw & (1 << BIO_RW))) 2719 num_stripes = map->num_stripes; 2720 else if (mirror_num) 2721 stripe_index = mirror_num - 1; 2722 else { 2723 stripe_index = find_live_mirror(map, 0, 2724 map->num_stripes, 2725 current->pid % map->num_stripes); 2726 } 2727 2728 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2729 if (rw & (1 << BIO_RW)) 2730 num_stripes = map->num_stripes; 2731 else if (mirror_num) 2732 stripe_index = mirror_num - 1; 2733 2734 } else if (map->type & 
BTRFS_BLOCK_GROUP_RAID10) { 2735 int factor = map->num_stripes / map->sub_stripes; 2736 2737 stripe_index = do_div(stripe_nr, factor); 2738 stripe_index *= map->sub_stripes; 2739 2740 if (unplug_page || (rw & (1 << BIO_RW))) 2741 num_stripes = map->sub_stripes; 2742 else if (mirror_num) 2743 stripe_index += mirror_num - 1; 2744 else { 2745 stripe_index = find_live_mirror(map, stripe_index, 2746 map->sub_stripes, stripe_index + 2747 current->pid % map->sub_stripes); 2748 } 2749 } else { 2750 /* 2751 * after this do_div call, stripe_nr is the number of stripes 2752 * on this device we have to walk to find the data, and 2753 * stripe_index is the number of our device in the stripe array 2754 */ 2755 stripe_index = do_div(stripe_nr, map->num_stripes); 2756 } 2757 BUG_ON(stripe_index >= map->num_stripes); 2758 2759 for (i = 0; i < num_stripes; i++) { 2760 if (unplug_page) { 2761 struct btrfs_device *device; 2762 struct backing_dev_info *bdi; 2763 2764 device = map->stripes[stripe_index].dev; 2765 if (device->bdev) { 2766 bdi = blk_get_backing_dev_info(device->bdev); 2767 if (bdi->unplug_io_fn) 2768 bdi->unplug_io_fn(bdi, unplug_page); 2769 } 2770 } else { 2771 multi->stripes[i].physical = 2772 map->stripes[stripe_index].physical + 2773 stripe_offset + stripe_nr * map->stripe_len; 2774 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2775 } 2776 stripe_index++; 2777 } 2778 if (multi_ret) { 2779 *multi_ret = multi; 2780 multi->num_stripes = num_stripes; 2781 multi->max_errors = max_errors; 2782 } 2783 out: 2784 free_extent_map(em); 2785 return 0; 2786 } 2787 2788 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2789 u64 logical, u64 *length, 2790 struct btrfs_multi_bio **multi_ret, int mirror_num) 2791 { 2792 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 2793 mirror_num, NULL); 2794 } 2795 2796 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 2797 u64 chunk_start, u64 physical, u64 devid, 2798 u64 **logical, int *naddrs, int *stripe_len) 2799 { 2800 struct extent_map_tree *em_tree = &map_tree->map_tree; 2801 struct extent_map *em; 2802 struct map_lookup *map; 2803 u64 *buf; 2804 u64 bytenr; 2805 u64 length; 2806 u64 stripe_nr; 2807 int i, j, nr = 0; 2808 2809 read_lock(&em_tree->lock); 2810 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2811 read_unlock(&em_tree->lock); 2812 2813 BUG_ON(!em || em->start != chunk_start); 2814 map = (struct map_lookup *)em->bdev; 2815 2816 length = em->len; 2817 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2818 do_div(length, map->num_stripes / map->sub_stripes); 2819 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 2820 do_div(length, map->num_stripes); 2821 2822 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 2823 BUG_ON(!buf); 2824 2825 for (i = 0; i < map->num_stripes; i++) { 2826 if (devid && map->stripes[i].dev->devid != devid) 2827 continue; 2828 if (map->stripes[i].physical > physical || 2829 map->stripes[i].physical + length <= physical) 2830 continue; 2831 2832 stripe_nr = physical - map->stripes[i].physical; 2833 do_div(stripe_nr, map->stripe_len); 2834 2835 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2836 stripe_nr = stripe_nr * map->num_stripes + i; 2837 do_div(stripe_nr, map->sub_stripes); 2838 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2839 stripe_nr = stripe_nr * map->num_stripes + i; 2840 } 2841 bytenr = chunk_start + stripe_nr * map->stripe_len; 2842 WARN_ON(nr >= map->num_stripes); 2843 for (j = 0; j < nr; j++) { 2844 if (buf[j] == bytenr) 2845 break; 2846 } 2847 if (j == nr) { 
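/* only record each chunk-relative logical address once */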
2848 WARN_ON(nr >= map->num_stripes); 2849 buf[nr++] = bytenr; 2850 } 2851 } 2852 2853 *logical = buf; 2854 *naddrs = nr; 2855 *stripe_len = map->stripe_len; 2856 2857 free_extent_map(em); 2858 return 0; 2859 } 2860 2861 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, 2862 u64 logical, struct page *page) 2863 { 2864 u64 length = PAGE_CACHE_SIZE; 2865 return __btrfs_map_block(map_tree, READ, logical, &length, 2866 NULL, 0, page); 2867 } 2868 2869 static void end_bio_multi_stripe(struct bio *bio, int err) 2870 { 2871 struct btrfs_multi_bio *multi = bio->bi_private; 2872 int is_orig_bio = 0; 2873 2874 if (err) 2875 atomic_inc(&multi->error); 2876 2877 if (bio == multi->orig_bio) 2878 is_orig_bio = 1; 2879 2880 if (atomic_dec_and_test(&multi->stripes_pending)) { 2881 if (!is_orig_bio) { 2882 bio_put(bio); 2883 bio = multi->orig_bio; 2884 } 2885 bio->bi_private = multi->private; 2886 bio->bi_end_io = multi->end_io; 2887 /* only send an error to the higher layers if it is 2888 * beyond the tolerance of the multi-bio 2889 */ 2890 if (atomic_read(&multi->error) > multi->max_errors) { 2891 err = -EIO; 2892 } else if (err) { 2893 /* 2894 * this bio is actually up to date, we didn't 2895 * go over the max number of errors 2896 */ 2897 set_bit(BIO_UPTODATE, &bio->bi_flags); 2898 err = 0; 2899 } 2900 kfree(multi); 2901 2902 bio_endio(bio, err); 2903 } else if (!is_orig_bio) { 2904 bio_put(bio); 2905 } 2906 } 2907 2908 struct async_sched { 2909 struct bio *bio; 2910 int rw; 2911 struct btrfs_fs_info *info; 2912 struct btrfs_work work; 2913 }; 2914 2915 /* 2916 * see run_scheduled_bios for a description of why bios are collected for 2917 * async submit. 2918 * 2919 * This will add one bio to the pending list for a device and make sure 2920 * the work struct is scheduled. 2921 */ 2922 static noinline int schedule_bio(struct btrfs_root *root, 2923 struct btrfs_device *device, 2924 int rw, struct bio *bio) 2925 { 2926 int should_queue = 1; 2927 struct btrfs_pending_bios *pending_bios; 2928 2929 /* don't bother with additional async steps for reads, right now */ 2930 if (!(rw & (1 << BIO_RW))) { 2931 bio_get(bio); 2932 submit_bio(rw, bio); 2933 bio_put(bio); 2934 return 0; 2935 } 2936 2937 /* 2938 * nr_async_bios allows us to reliably return congestion to the 2939 * higher layers. 
Otherwise, the async bio makes it appear we have 2940 * made progress against dirty pages when we've really just put it 2941 * on a queue for later 2942 */ 2943 atomic_inc(&root->fs_info->nr_async_bios); 2944 WARN_ON(bio->bi_next); 2945 bio->bi_next = NULL; 2946 bio->bi_rw |= rw; 2947 2948 spin_lock(&device->io_lock); 2949 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 2950 pending_bios = &device->pending_sync_bios; 2951 else 2952 pending_bios = &device->pending_bios; 2953 2954 if (pending_bios->tail) 2955 pending_bios->tail->bi_next = bio; 2956 2957 pending_bios->tail = bio; 2958 if (!pending_bios->head) 2959 pending_bios->head = bio; 2960 if (device->running_pending) 2961 should_queue = 0; 2962 2963 spin_unlock(&device->io_lock); 2964 2965 if (should_queue) 2966 btrfs_queue_worker(&root->fs_info->submit_workers, 2967 &device->work); 2968 return 0; 2969 } 2970 2971 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 2972 int mirror_num, int async_submit) 2973 { 2974 struct btrfs_mapping_tree *map_tree; 2975 struct btrfs_device *dev; 2976 struct bio *first_bio = bio; 2977 u64 logical = (u64)bio->bi_sector << 9; 2978 u64 length = 0; 2979 u64 map_length; 2980 struct btrfs_multi_bio *multi = NULL; 2981 int ret; 2982 int dev_nr = 0; 2983 int total_devs = 1; 2984 2985 length = bio->bi_size; 2986 map_tree = &root->fs_info->mapping_tree; 2987 map_length = length; 2988 2989 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 2990 mirror_num); 2991 BUG_ON(ret); 2992 2993 total_devs = multi->num_stripes; 2994 if (map_length < length) { 2995 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 2996 "len %llu\n", (unsigned long long)logical, 2997 (unsigned long long)length, 2998 (unsigned long long)map_length); 2999 BUG(); 3000 } 3001 multi->end_io = first_bio->bi_end_io; 3002 multi->private = first_bio->bi_private; 3003 multi->orig_bio = first_bio; 3004 atomic_set(&multi->stripes_pending, multi->num_stripes); 3005 3006 while (dev_nr < total_devs) { 3007 if (total_devs > 1) { 3008 if (dev_nr < total_devs - 1) { 3009 bio = bio_clone(first_bio, GFP_NOFS); 3010 BUG_ON(!bio); 3011 } else { 3012 bio = first_bio; 3013 } 3014 bio->bi_private = multi; 3015 bio->bi_end_io = end_bio_multi_stripe; 3016 } 3017 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3018 dev = multi->stripes[dev_nr].dev; 3019 BUG_ON(rw == WRITE && !dev->writeable); 3020 if (dev && dev->bdev) { 3021 bio->bi_bdev = dev->bdev; 3022 if (async_submit) 3023 schedule_bio(root, dev, rw, bio); 3024 else 3025 submit_bio(rw, bio); 3026 } else { 3027 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 3028 bio->bi_sector = logical >> 9; 3029 bio_endio(bio, -EIO); 3030 } 3031 dev_nr++; 3032 } 3033 if (total_devs == 1) 3034 kfree(multi); 3035 return 0; 3036 } 3037 3038 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3039 u8 *uuid, u8 *fsid) 3040 { 3041 struct btrfs_device *device; 3042 struct btrfs_fs_devices *cur_devices; 3043 3044 cur_devices = root->fs_info->fs_devices; 3045 while (cur_devices) { 3046 if (!fsid || 3047 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3048 device = __find_device(&cur_devices->devices, 3049 devid, uuid); 3050 if (device) 3051 return device; 3052 } 3053 cur_devices = cur_devices->seed; 3054 } 3055 return NULL; 3056 } 3057 3058 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3059 u64 devid, u8 *dev_uuid) 3060 { 3061 struct btrfs_device *device; 3062 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3063 3064 
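/* Create a stub btrfs_device for a devid that on-disk metadata references but which has no block device present; read_one_chunk() and read_one_dev() fall back to this when the filesystem is mounted with the DEGRADED option. */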
device = kzalloc(sizeof(*device), GFP_NOFS); 3065 if (!device) 3066 return NULL; 3067 list_add(&device->dev_list, 3068 &fs_devices->devices); 3069 device->barriers = 1; 3070 device->dev_root = root->fs_info->dev_root; 3071 device->devid = devid; 3072 device->work.func = pending_bios_fn; 3073 device->fs_devices = fs_devices; 3074 fs_devices->num_devices++; 3075 spin_lock_init(&device->io_lock); 3076 INIT_LIST_HEAD(&device->dev_alloc_list); 3077 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3078 return device; 3079 } 3080 3081 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3082 struct extent_buffer *leaf, 3083 struct btrfs_chunk *chunk) 3084 { 3085 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3086 struct map_lookup *map; 3087 struct extent_map *em; 3088 u64 logical; 3089 u64 length; 3090 u64 devid; 3091 u8 uuid[BTRFS_UUID_SIZE]; 3092 int num_stripes; 3093 int ret; 3094 int i; 3095 3096 logical = key->offset; 3097 length = btrfs_chunk_length(leaf, chunk); 3098 3099 read_lock(&map_tree->map_tree.lock); 3100 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3101 read_unlock(&map_tree->map_tree.lock); 3102 3103 /* already mapped? */ 3104 if (em && em->start <= logical && em->start + em->len > logical) { 3105 free_extent_map(em); 3106 return 0; 3107 } else if (em) { 3108 free_extent_map(em); 3109 } 3110 3111 em = alloc_extent_map(GFP_NOFS); 3112 if (!em) 3113 return -ENOMEM; 3114 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3115 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3116 if (!map) { 3117 free_extent_map(em); 3118 return -ENOMEM; 3119 } 3120 3121 em->bdev = (struct block_device *)map; 3122 em->start = logical; 3123 em->len = length; 3124 em->block_start = 0; 3125 em->block_len = em->len; 3126 3127 map->num_stripes = num_stripes; 3128 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3129 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3130 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3131 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3132 map->type = btrfs_chunk_type(leaf, chunk); 3133 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3134 for (i = 0; i < num_stripes; i++) { 3135 map->stripes[i].physical = 3136 btrfs_stripe_offset_nr(leaf, chunk, i); 3137 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3138 read_extent_buffer(leaf, uuid, (unsigned long) 3139 btrfs_stripe_dev_uuid_nr(chunk, i), 3140 BTRFS_UUID_SIZE); 3141 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3142 NULL); 3143 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3144 kfree(map); 3145 free_extent_map(em); 3146 return -EIO; 3147 } 3148 if (!map->stripes[i].dev) { 3149 map->stripes[i].dev = 3150 add_missing_dev(root, devid, uuid); 3151 if (!map->stripes[i].dev) { 3152 kfree(map); 3153 free_extent_map(em); 3154 return -EIO; 3155 } 3156 } 3157 map->stripes[i].dev->in_fs_metadata = 1; 3158 } 3159 3160 write_lock(&map_tree->map_tree.lock); 3161 ret = add_extent_mapping(&map_tree->map_tree, em); 3162 write_unlock(&map_tree->map_tree.lock); 3163 BUG_ON(ret); 3164 free_extent_map(em); 3165 3166 return 0; 3167 } 3168 3169 static int fill_device_from_item(struct extent_buffer *leaf, 3170 struct btrfs_dev_item *dev_item, 3171 struct btrfs_device *device) 3172 { 3173 unsigned long ptr; 3174 3175 device->devid = btrfs_device_id(leaf, dev_item); 3176 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3177 device->total_bytes = device->disk_total_bytes; 3178 device->bytes_used = 
btrfs_device_bytes_used(leaf, dev_item); 3179 device->type = btrfs_device_type(leaf, dev_item); 3180 device->io_align = btrfs_device_io_align(leaf, dev_item); 3181 device->io_width = btrfs_device_io_width(leaf, dev_item); 3182 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3183 3184 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3185 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3186 3187 return 0; 3188 } 3189 3190 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3191 { 3192 struct btrfs_fs_devices *fs_devices; 3193 int ret; 3194 3195 mutex_lock(&uuid_mutex); 3196 3197 fs_devices = root->fs_info->fs_devices->seed; 3198 while (fs_devices) { 3199 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3200 ret = 0; 3201 goto out; 3202 } 3203 fs_devices = fs_devices->seed; 3204 } 3205 3206 fs_devices = find_fsid(fsid); 3207 if (!fs_devices) { 3208 ret = -ENOENT; 3209 goto out; 3210 } 3211 3212 fs_devices = clone_fs_devices(fs_devices); 3213 if (IS_ERR(fs_devices)) { 3214 ret = PTR_ERR(fs_devices); 3215 goto out; 3216 } 3217 3218 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3219 root->fs_info->bdev_holder); 3220 if (ret) 3221 goto out; 3222 3223 if (!fs_devices->seeding) { 3224 __btrfs_close_devices(fs_devices); 3225 free_fs_devices(fs_devices); 3226 ret = -EINVAL; 3227 goto out; 3228 } 3229 3230 fs_devices->seed = root->fs_info->fs_devices->seed; 3231 root->fs_info->fs_devices->seed = fs_devices; 3232 out: 3233 mutex_unlock(&uuid_mutex); 3234 return ret; 3235 } 3236 3237 static int read_one_dev(struct btrfs_root *root, 3238 struct extent_buffer *leaf, 3239 struct btrfs_dev_item *dev_item) 3240 { 3241 struct btrfs_device *device; 3242 u64 devid; 3243 int ret; 3244 u8 fs_uuid[BTRFS_UUID_SIZE]; 3245 u8 dev_uuid[BTRFS_UUID_SIZE]; 3246 3247 devid = btrfs_device_id(leaf, dev_item); 3248 read_extent_buffer(leaf, dev_uuid, 3249 (unsigned long)btrfs_device_uuid(dev_item), 3250 BTRFS_UUID_SIZE); 3251 read_extent_buffer(leaf, fs_uuid, 3252 (unsigned long)btrfs_device_fsid(dev_item), 3253 BTRFS_UUID_SIZE); 3254 3255 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3256 ret = open_seed_devices(root, fs_uuid); 3257 if (ret && !btrfs_test_opt(root, DEGRADED)) 3258 return ret; 3259 } 3260 3261 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3262 if (!device || !device->bdev) { 3263 if (!btrfs_test_opt(root, DEGRADED)) 3264 return -EIO; 3265 3266 if (!device) { 3267 printk(KERN_WARNING "warning devid %llu missing\n", 3268 (unsigned long long)devid); 3269 device = add_missing_dev(root, devid, dev_uuid); 3270 if (!device) 3271 return -ENOMEM; 3272 } 3273 } 3274 3275 if (device->fs_devices != root->fs_info->fs_devices) { 3276 BUG_ON(device->writeable); 3277 if (device->generation != 3278 btrfs_device_generation(leaf, dev_item)) 3279 return -EINVAL; 3280 } 3281 3282 fill_device_from_item(leaf, dev_item, device); 3283 device->dev_root = root->fs_info->dev_root; 3284 device->in_fs_metadata = 1; 3285 if (device->writeable) 3286 device->fs_devices->total_rw_bytes += device->total_bytes; 3287 ret = 0; 3288 return ret; 3289 } 3290 3291 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3292 { 3293 struct btrfs_dev_item *dev_item; 3294 3295 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3296 dev_item); 3297 return read_one_dev(root, buf, dev_item); 3298 } 3299 3300 int btrfs_read_sys_array(struct btrfs_root *root) 3301 { 3302 struct btrfs_super_block *super_copy = 
&root->fs_info->super_copy; 3303 struct extent_buffer *sb; 3304 struct btrfs_disk_key *disk_key; 3305 struct btrfs_chunk *chunk; 3306 u8 *ptr; 3307 unsigned long sb_ptr; 3308 int ret = 0; 3309 u32 num_stripes; 3310 u32 array_size; 3311 u32 len = 0; 3312 u32 cur; 3313 struct btrfs_key key; 3314 3315 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3316 BTRFS_SUPER_INFO_SIZE); 3317 if (!sb) 3318 return -ENOMEM; 3319 btrfs_set_buffer_uptodate(sb); 3320 btrfs_set_buffer_lockdep_class(sb, 0); 3321 3322 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3323 array_size = btrfs_super_sys_array_size(super_copy); 3324 3325 ptr = super_copy->sys_chunk_array; 3326 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3327 cur = 0; 3328 3329 while (cur < array_size) { 3330 disk_key = (struct btrfs_disk_key *)ptr; 3331 btrfs_disk_key_to_cpu(&key, disk_key); 3332 3333 len = sizeof(*disk_key); ptr += len; 3334 sb_ptr += len; 3335 cur += len; 3336 3337 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3338 chunk = (struct btrfs_chunk *)sb_ptr; 3339 ret = read_one_chunk(root, &key, sb, chunk); 3340 if (ret) 3341 break; 3342 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3343 len = btrfs_chunk_item_size(num_stripes); 3344 } else { 3345 ret = -EIO; 3346 break; 3347 } 3348 ptr += len; 3349 sb_ptr += len; 3350 cur += len; 3351 } 3352 free_extent_buffer(sb); 3353 return ret; 3354 } 3355 3356 int btrfs_read_chunk_tree(struct btrfs_root *root) 3357 { 3358 struct btrfs_path *path; 3359 struct extent_buffer *leaf; 3360 struct btrfs_key key; 3361 struct btrfs_key found_key; 3362 int ret; 3363 int slot; 3364 3365 root = root->fs_info->chunk_root; 3366 3367 path = btrfs_alloc_path(); 3368 if (!path) 3369 return -ENOMEM; 3370 3371 /* first we search for all of the device items, and then we 3372 * read in all of the chunk items. This way we can create chunk 3373 * mappings that reference all of the devices that are found 3374 */ 3375 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3376 key.offset = 0; 3377 key.type = 0; 3378 again: 3379 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3380 while (1) { 3381 leaf = path->nodes[0]; 3382 slot = path->slots[0]; 3383 if (slot >= btrfs_header_nritems(leaf)) { 3384 ret = btrfs_next_leaf(root, path); 3385 if (ret == 0) 3386 continue; 3387 if (ret < 0) 3388 goto error; 3389 break; 3390 } 3391 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3392 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3393 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3394 break; 3395 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3396 struct btrfs_dev_item *dev_item; 3397 dev_item = btrfs_item_ptr(leaf, slot, 3398 struct btrfs_dev_item); 3399 ret = read_one_dev(root, leaf, dev_item); 3400 if (ret) 3401 goto error; 3402 } 3403 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3404 struct btrfs_chunk *chunk; 3405 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3406 ret = read_one_chunk(root, &found_key, leaf, chunk); 3407 if (ret) 3408 goto error; 3409 } 3410 path->slots[0]++; 3411 } 3412 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3413 key.objectid = 0; 3414 btrfs_release_path(root, path); 3415 goto again; 3416 } 3417 ret = 0; 3418 error: 3419 btrfs_free_path(path); 3420 return ret; 3421 } 3422