1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/buffer_head.h> 21 #include <linux/blkdev.h> 22 #include <linux/random.h> 23 #include <linux/iocontext.h> 24 #include <asm/div64.h> 25 #include "compat.h" 26 #include "ctree.h" 27 #include "extent_map.h" 28 #include "disk-io.h" 29 #include "transaction.h" 30 #include "print-tree.h" 31 #include "volumes.h" 32 #include "async-thread.h" 33 34 struct map_lookup { 35 u64 type; 36 int io_align; 37 int io_width; 38 int stripe_len; 39 int sector_size; 40 int num_stripes; 41 int sub_stripes; 42 struct btrfs_bio_stripe stripes[]; 43 }; 44 45 static int init_first_rw_device(struct btrfs_trans_handle *trans, 46 struct btrfs_root *root, 47 struct btrfs_device *device); 48 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 49 50 #define map_lookup_size(n) (sizeof(struct map_lookup) + \ 51 (sizeof(struct btrfs_bio_stripe) * (n))) 52 53 static DEFINE_MUTEX(uuid_mutex); 54 static LIST_HEAD(fs_uuids); 55 56 void btrfs_lock_volumes(void) 57 { 58 mutex_lock(&uuid_mutex); 59 } 60 61 void btrfs_unlock_volumes(void) 62 { 63 mutex_unlock(&uuid_mutex); 64 } 65 66 static void lock_chunks(struct btrfs_root *root) 67 { 68 mutex_lock(&root->fs_info->chunk_mutex); 69 } 70 71 static void unlock_chunks(struct btrfs_root *root) 72 { 73 mutex_unlock(&root->fs_info->chunk_mutex); 74 } 75 76 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 77 { 78 struct btrfs_device *device; 79 WARN_ON(fs_devices->opened); 80 while (!list_empty(&fs_devices->devices)) { 81 device = list_entry(fs_devices->devices.next, 82 struct btrfs_device, dev_list); 83 list_del(&device->dev_list); 84 kfree(device->name); 85 kfree(device); 86 } 87 kfree(fs_devices); 88 } 89 90 int btrfs_cleanup_fs_uuids(void) 91 { 92 struct btrfs_fs_devices *fs_devices; 93 94 while (!list_empty(&fs_uuids)) { 95 fs_devices = list_entry(fs_uuids.next, 96 struct btrfs_fs_devices, list); 97 list_del(&fs_devices->list); 98 free_fs_devices(fs_devices); 99 } 100 return 0; 101 } 102 103 static noinline struct btrfs_device *__find_device(struct list_head *head, 104 u64 devid, u8 *uuid) 105 { 106 struct btrfs_device *dev; 107 108 list_for_each_entry(dev, head, dev_list) { 109 if (dev->devid == devid && 110 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 111 return dev; 112 } 113 } 114 return NULL; 115 } 116 117 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 118 { 119 struct btrfs_fs_devices *fs_devices; 120 121 list_for_each_entry(fs_devices, &fs_uuids, list) { 122 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 123 return fs_devices; 124 } 125 return NULL; 126 } 127 128 static void requeue_list(struct btrfs_pending_bios *pending_bios, 129 struct bio *head, struct bio *tail) 130 { 131 132 struct bio *old_head; 133 134 old_head = 
pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long num_sync_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

	/* we want to make sure that every time we switch from the sync
	 * list to the normal list, we unplug
	 */
	num_sync_run = 0;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held). But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop. Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
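	 * If both the sync and regular lists are empty we are done:
	 * again and running_pending are cleared.  Otherwise both are
	 * set so we take another pass and no bios are missed.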
209 */ 210 if (device->pending_sync_bios.head == NULL && 211 device->pending_bios.head == NULL) { 212 again = 0; 213 device->running_pending = 0; 214 } else { 215 again = 1; 216 device->running_pending = 1; 217 } 218 219 pending_bios->head = NULL; 220 pending_bios->tail = NULL; 221 222 spin_unlock(&device->io_lock); 223 224 /* 225 * if we're doing the regular priority list, make sure we unplug 226 * for any high prio bios we've sent down 227 */ 228 if (pending_bios == &device->pending_bios && num_sync_run > 0) { 229 num_sync_run = 0; 230 blk_run_backing_dev(bdi, NULL); 231 } 232 233 while (pending) { 234 235 rmb(); 236 /* we want to work on both lists, but do more bios on the 237 * sync list than the regular list 238 */ 239 if ((num_run > 32 && 240 pending_bios != &device->pending_sync_bios && 241 device->pending_sync_bios.head) || 242 (num_run > 64 && pending_bios == &device->pending_sync_bios && 243 device->pending_bios.head)) { 244 spin_lock(&device->io_lock); 245 requeue_list(pending_bios, pending, tail); 246 goto loop_lock; 247 } 248 249 cur = pending; 250 pending = pending->bi_next; 251 cur->bi_next = NULL; 252 atomic_dec(&fs_info->nr_async_bios); 253 254 if (atomic_read(&fs_info->nr_async_bios) < limit && 255 waitqueue_active(&fs_info->async_submit_wait)) 256 wake_up(&fs_info->async_submit_wait); 257 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 submit_bio(cur->bi_rw, cur); 260 num_run++; 261 batch_run++; 262 263 if (bio_sync(cur)) 264 num_sync_run++; 265 266 if (need_resched()) { 267 if (num_sync_run) { 268 blk_run_backing_dev(bdi, NULL); 269 num_sync_run = 0; 270 } 271 cond_resched(); 272 } 273 274 /* 275 * we made progress, there is more work to do and the bdi 276 * is now congested. Back off and let other work structs 277 * run instead 278 */ 279 if (pending && bdi_write_congested(bdi) && batch_run > 32 && 280 fs_info->fs_devices->open_devices > 1) { 281 struct io_context *ioc; 282 283 ioc = current->io_context; 284 285 /* 286 * the main goal here is that we don't want to 287 * block if we're going to be able to submit 288 * more requests without blocking. 289 * 290 * This code does two great things, it pokes into 291 * the elevator code from a filesystem _and_ 292 * it makes assumptions about how batching works. 293 */ 294 if (ioc && ioc->nr_batch_requests > 0 && 295 time_before(jiffies, ioc->last_waited + HZ/50UL) && 296 (last_waited == 0 || 297 ioc->last_waited == last_waited)) { 298 /* 299 * we want to go through our batch of 300 * requests and stop. So, we copy out 301 * the ioc->last_waited time and test 302 * against it before looping 303 */ 304 last_waited = ioc->last_waited; 305 if (need_resched()) { 306 if (num_sync_run) { 307 blk_run_backing_dev(bdi, NULL); 308 num_sync_run = 0; 309 } 310 cond_resched(); 311 } 312 continue; 313 } 314 spin_lock(&device->io_lock); 315 requeue_list(pending_bios, pending, tail); 316 device->running_pending = 1; 317 318 spin_unlock(&device->io_lock); 319 btrfs_requeue_work(&device->work); 320 goto done; 321 } 322 } 323 324 if (num_sync_run) { 325 num_sync_run = 0; 326 blk_run_backing_dev(bdi, NULL); 327 } 328 329 cond_resched(); 330 if (again) 331 goto loop; 332 333 spin_lock(&device->io_lock); 334 if (device->pending_bios.head || device->pending_sync_bios.head) 335 goto loop_lock; 336 spin_unlock(&device->io_lock); 337 338 /* 339 * IO has already been through a long path to get here. Checksumming, 340 * async helper threads, perhaps compression. 
We've done a pretty 341 * good job of collecting a batch of IO and should just unplug 342 * the device right away. 343 * 344 * This will help anyone who is waiting on the IO, they might have 345 * already unplugged, but managed to do so before the bio they 346 * cared about found its way down here. 347 */ 348 blk_run_backing_dev(bdi, NULL); 349 done: 350 return 0; 351 } 352 353 static void pending_bios_fn(struct btrfs_work *work) 354 { 355 struct btrfs_device *device; 356 357 device = container_of(work, struct btrfs_device, work); 358 run_scheduled_bios(device); 359 } 360 361 static noinline int device_list_add(const char *path, 362 struct btrfs_super_block *disk_super, 363 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 364 { 365 struct btrfs_device *device; 366 struct btrfs_fs_devices *fs_devices; 367 u64 found_transid = btrfs_super_generation(disk_super); 368 369 fs_devices = find_fsid(disk_super->fsid); 370 if (!fs_devices) { 371 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 372 if (!fs_devices) 373 return -ENOMEM; 374 INIT_LIST_HEAD(&fs_devices->devices); 375 INIT_LIST_HEAD(&fs_devices->alloc_list); 376 list_add(&fs_devices->list, &fs_uuids); 377 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 378 fs_devices->latest_devid = devid; 379 fs_devices->latest_trans = found_transid; 380 mutex_init(&fs_devices->device_list_mutex); 381 device = NULL; 382 } else { 383 device = __find_device(&fs_devices->devices, devid, 384 disk_super->dev_item.uuid); 385 } 386 if (!device) { 387 if (fs_devices->opened) 388 return -EBUSY; 389 390 device = kzalloc(sizeof(*device), GFP_NOFS); 391 if (!device) { 392 /* we can safely leave the fs_devices entry around */ 393 return -ENOMEM; 394 } 395 device->devid = devid; 396 device->work.func = pending_bios_fn; 397 memcpy(device->uuid, disk_super->dev_item.uuid, 398 BTRFS_UUID_SIZE); 399 device->barriers = 1; 400 spin_lock_init(&device->io_lock); 401 device->name = kstrdup(path, GFP_NOFS); 402 if (!device->name) { 403 kfree(device); 404 return -ENOMEM; 405 } 406 INIT_LIST_HEAD(&device->dev_alloc_list); 407 408 mutex_lock(&fs_devices->device_list_mutex); 409 list_add(&device->dev_list, &fs_devices->devices); 410 mutex_unlock(&fs_devices->device_list_mutex); 411 412 device->fs_devices = fs_devices; 413 fs_devices->num_devices++; 414 } 415 416 if (found_transid > fs_devices->latest_trans) { 417 fs_devices->latest_devid = devid; 418 fs_devices->latest_trans = found_transid; 419 } 420 *fs_devices_ret = fs_devices; 421 return 0; 422 } 423 424 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 425 { 426 struct btrfs_fs_devices *fs_devices; 427 struct btrfs_device *device; 428 struct btrfs_device *orig_dev; 429 430 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 431 if (!fs_devices) 432 return ERR_PTR(-ENOMEM); 433 434 INIT_LIST_HEAD(&fs_devices->devices); 435 INIT_LIST_HEAD(&fs_devices->alloc_list); 436 INIT_LIST_HEAD(&fs_devices->list); 437 mutex_init(&fs_devices->device_list_mutex); 438 fs_devices->latest_devid = orig->latest_devid; 439 fs_devices->latest_trans = orig->latest_trans; 440 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 441 442 mutex_lock(&orig->device_list_mutex); 443 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 444 device = kzalloc(sizeof(*device), GFP_NOFS); 445 if (!device) 446 goto error; 447 448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 449 if (!device->name) 450 goto error; 451 452 device->devid = orig_dev->devid; 453 device->work.func = pending_bios_fn; 
454 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 455 device->barriers = 1; 456 spin_lock_init(&device->io_lock); 457 INIT_LIST_HEAD(&device->dev_list); 458 INIT_LIST_HEAD(&device->dev_alloc_list); 459 460 list_add(&device->dev_list, &fs_devices->devices); 461 device->fs_devices = fs_devices; 462 fs_devices->num_devices++; 463 } 464 mutex_unlock(&orig->device_list_mutex); 465 return fs_devices; 466 error: 467 mutex_unlock(&orig->device_list_mutex); 468 free_fs_devices(fs_devices); 469 return ERR_PTR(-ENOMEM); 470 } 471 472 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 473 { 474 struct btrfs_device *device, *next; 475 476 mutex_lock(&uuid_mutex); 477 again: 478 mutex_lock(&fs_devices->device_list_mutex); 479 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 480 if (device->in_fs_metadata) 481 continue; 482 483 if (device->bdev) { 484 close_bdev_exclusive(device->bdev, device->mode); 485 device->bdev = NULL; 486 fs_devices->open_devices--; 487 } 488 if (device->writeable) { 489 list_del_init(&device->dev_alloc_list); 490 device->writeable = 0; 491 fs_devices->rw_devices--; 492 } 493 list_del_init(&device->dev_list); 494 fs_devices->num_devices--; 495 kfree(device->name); 496 kfree(device); 497 } 498 mutex_unlock(&fs_devices->device_list_mutex); 499 500 if (fs_devices->seed) { 501 fs_devices = fs_devices->seed; 502 goto again; 503 } 504 505 mutex_unlock(&uuid_mutex); 506 return 0; 507 } 508 509 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 510 { 511 struct btrfs_device *device; 512 513 if (--fs_devices->opened > 0) 514 return 0; 515 516 list_for_each_entry(device, &fs_devices->devices, dev_list) { 517 if (device->bdev) { 518 close_bdev_exclusive(device->bdev, device->mode); 519 fs_devices->open_devices--; 520 } 521 if (device->writeable) { 522 list_del_init(&device->dev_alloc_list); 523 fs_devices->rw_devices--; 524 } 525 526 device->bdev = NULL; 527 device->writeable = 0; 528 device->in_fs_metadata = 0; 529 } 530 WARN_ON(fs_devices->open_devices); 531 WARN_ON(fs_devices->rw_devices); 532 fs_devices->opened = 0; 533 fs_devices->seeding = 0; 534 535 return 0; 536 } 537 538 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 539 { 540 struct btrfs_fs_devices *seed_devices = NULL; 541 int ret; 542 543 mutex_lock(&uuid_mutex); 544 ret = __btrfs_close_devices(fs_devices); 545 if (!fs_devices->opened) { 546 seed_devices = fs_devices->seed; 547 fs_devices->seed = NULL; 548 } 549 mutex_unlock(&uuid_mutex); 550 551 while (seed_devices) { 552 fs_devices = seed_devices; 553 seed_devices = fs_devices->seed; 554 __btrfs_close_devices(fs_devices); 555 free_fs_devices(fs_devices); 556 } 557 return ret; 558 } 559 560 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 561 fmode_t flags, void *holder) 562 { 563 struct block_device *bdev; 564 struct list_head *head = &fs_devices->devices; 565 struct btrfs_device *device; 566 struct block_device *latest_bdev = NULL; 567 struct buffer_head *bh; 568 struct btrfs_super_block *disk_super; 569 u64 latest_devid = 0; 570 u64 latest_transid = 0; 571 u64 devid; 572 int seeding = 1; 573 int ret = 0; 574 575 list_for_each_entry(device, head, dev_list) { 576 if (device->bdev) 577 continue; 578 if (!device->name) 579 continue; 580 581 bdev = open_bdev_exclusive(device->name, flags, holder); 582 if (IS_ERR(bdev)) { 583 printk(KERN_INFO "open %s failed\n", device->name); 584 goto error; 585 } 586 set_blocksize(bdev, 4096); 587 588 bh = btrfs_read_dev_super(bdev); 589 if 
(!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = le64_to_cpu(disk_super->dev_item.devid);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		continue;

error_brelse:
		brelse(bh);
error_close:
		close_bdev_exclusive(bdev, FMODE_READ);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EIO;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	bdev = open_bdev_exclusive(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EIO;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = le64_to_cpu(disk_super->dev_item.devid);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else {
		/* FIXME, make a real uuid parser */
		printk(KERN_INFO "device fsid %llx-%llx ",
		       *(unsigned long long *)disk_super->fsid,
		       *(unsigned long long *)(disk_super->fsid + 8));
	}
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	close_bdev_exclusive(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, 724 u64 num_bytes, u64 *start) 725 { 726 struct btrfs_key key; 727 struct btrfs_root *root = device->dev_root; 728 struct btrfs_dev_extent *dev_extent = NULL; 729 struct btrfs_path *path; 730 u64 hole_size = 0; 731 u64 last_byte = 0; 732 u64 search_start = 0; 733 u64 search_end = device->total_bytes; 734 int ret; 735 int slot = 0; 736 int start_found; 737 struct extent_buffer *l; 738 739 path = btrfs_alloc_path(); 740 if (!path) 741 return -ENOMEM; 742 path->reada = 2; 743 start_found = 0; 744 745 /* FIXME use last free of some kind */ 746 747 /* we don't want to overwrite the superblock on the drive, 748 * so we make sure to start at an offset of at least 1MB 749 */ 750 search_start = max((u64)1024 * 1024, search_start); 751 752 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 753 search_start = max(root->fs_info->alloc_start, search_start); 754 755 key.objectid = device->devid; 756 key.offset = search_start; 757 key.type = BTRFS_DEV_EXTENT_KEY; 758 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 if (ret < 0) 760 goto error; 761 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret < 0) 763 goto error; 764 l = path->nodes[0]; 765 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 766 while (1) { 767 l = path->nodes[0]; 768 slot = path->slots[0]; 769 if (slot >= btrfs_header_nritems(l)) { 770 ret = btrfs_next_leaf(root, path); 771 if (ret == 0) 772 continue; 773 if (ret < 0) 774 goto error; 775 no_more_items: 776 if (!start_found) { 777 if (search_start >= search_end) { 778 ret = -ENOSPC; 779 goto error; 780 } 781 *start = search_start; 782 start_found = 1; 783 goto check_pending; 784 } 785 *start = last_byte > search_start ? 786 last_byte : search_start; 787 if (search_end <= *start) { 788 ret = -ENOSPC; 789 goto error; 790 } 791 goto check_pending; 792 } 793 btrfs_item_key_to_cpu(l, &key, slot); 794 795 if (key.objectid < device->devid) 796 goto next; 797 798 if (key.objectid > device->devid) 799 goto no_more_items; 800 801 if (key.offset >= search_start && key.offset > last_byte && 802 start_found) { 803 if (last_byte < search_start) 804 last_byte = search_start; 805 hole_size = key.offset - last_byte; 806 if (key.offset > last_byte && 807 hole_size >= num_bytes) { 808 *start = last_byte; 809 goto check_pending; 810 } 811 } 812 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 813 goto next; 814 815 start_found = 1; 816 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 817 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 818 next: 819 path->slots[0]++; 820 cond_resched(); 821 } 822 check_pending: 823 /* we have to make sure we didn't find an extent that has already 824 * been allocated by the map tree or the original allocation 825 */ 826 BUG_ON(*start < search_start); 827 828 if (*start + num_bytes > search_end) { 829 ret = -ENOSPC; 830 goto error; 831 } 832 /* check for pending inserts here */ 833 ret = 0; 834 835 error: 836 btrfs_free_path(path); 837 return ret; 838 } 839 840 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 841 struct btrfs_device *device, 842 u64 start) 843 { 844 int ret; 845 struct btrfs_path *path; 846 struct btrfs_root *root = device->dev_root; 847 struct btrfs_key key; 848 struct btrfs_key found_key; 849 struct extent_buffer *leaf = NULL; 850 struct btrfs_dev_extent *extent = NULL; 851 852 path = btrfs_alloc_path(); 853 if (!path) 854 return -ENOMEM; 855 856 key.objectid = device->devid; 857 key.offset = start; 858 key.type = BTRFS_DEV_EXTENT_KEY; 859 
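	/*
	 * the search may land just past the extent that covers 'start';
	 * in that case step back to the previous dev extent item and
	 * verify that it really contains 'start' before deleting it
	 */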
860 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 861 if (ret > 0) { 862 ret = btrfs_previous_item(root, path, key.objectid, 863 BTRFS_DEV_EXTENT_KEY); 864 BUG_ON(ret); 865 leaf = path->nodes[0]; 866 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 867 extent = btrfs_item_ptr(leaf, path->slots[0], 868 struct btrfs_dev_extent); 869 BUG_ON(found_key.offset > start || found_key.offset + 870 btrfs_dev_extent_length(leaf, extent) < start); 871 ret = 0; 872 } else if (ret == 0) { 873 leaf = path->nodes[0]; 874 extent = btrfs_item_ptr(leaf, path->slots[0], 875 struct btrfs_dev_extent); 876 } 877 BUG_ON(ret); 878 879 if (device->bytes_used > 0) 880 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 881 ret = btrfs_del_item(trans, root, path); 882 BUG_ON(ret); 883 884 btrfs_free_path(path); 885 return ret; 886 } 887 888 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 889 struct btrfs_device *device, 890 u64 chunk_tree, u64 chunk_objectid, 891 u64 chunk_offset, u64 start, u64 num_bytes) 892 { 893 int ret; 894 struct btrfs_path *path; 895 struct btrfs_root *root = device->dev_root; 896 struct btrfs_dev_extent *extent; 897 struct extent_buffer *leaf; 898 struct btrfs_key key; 899 900 WARN_ON(!device->in_fs_metadata); 901 path = btrfs_alloc_path(); 902 if (!path) 903 return -ENOMEM; 904 905 key.objectid = device->devid; 906 key.offset = start; 907 key.type = BTRFS_DEV_EXTENT_KEY; 908 ret = btrfs_insert_empty_item(trans, root, path, &key, 909 sizeof(*extent)); 910 BUG_ON(ret); 911 912 leaf = path->nodes[0]; 913 extent = btrfs_item_ptr(leaf, path->slots[0], 914 struct btrfs_dev_extent); 915 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 916 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 917 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 918 919 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 920 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 921 BTRFS_UUID_SIZE); 922 923 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 924 btrfs_mark_buffer_dirty(leaf); 925 btrfs_free_path(path); 926 return ret; 927 } 928 929 static noinline int find_next_chunk(struct btrfs_root *root, 930 u64 objectid, u64 *offset) 931 { 932 struct btrfs_path *path; 933 int ret; 934 struct btrfs_key key; 935 struct btrfs_chunk *chunk; 936 struct btrfs_key found_key; 937 938 path = btrfs_alloc_path(); 939 BUG_ON(!path); 940 941 key.objectid = objectid; 942 key.offset = (u64)-1; 943 key.type = BTRFS_CHUNK_ITEM_KEY; 944 945 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 946 if (ret < 0) 947 goto error; 948 949 BUG_ON(ret == 0); 950 951 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 952 if (ret) { 953 *offset = 0; 954 } else { 955 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 956 path->slots[0]); 957 if (found_key.objectid != objectid) 958 *offset = 0; 959 else { 960 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 961 struct btrfs_chunk); 962 *offset = found_key.offset + 963 btrfs_chunk_length(path->nodes[0], chunk); 964 } 965 } 966 ret = 0; 967 error: 968 btrfs_free_path(path); 969 return ret; 970 } 971 972 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 973 { 974 int ret; 975 struct btrfs_key key; 976 struct btrfs_key found_key; 977 struct btrfs_path *path; 978 979 root = root->fs_info->chunk_root; 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return -ENOMEM; 984 985 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 986 key.type = BTRFS_DEV_ITEM_KEY; 987 
key.offset = (u64)-1; 988 989 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 990 if (ret < 0) 991 goto error; 992 993 BUG_ON(ret == 0); 994 995 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 996 BTRFS_DEV_ITEM_KEY); 997 if (ret) { 998 *objectid = 1; 999 } else { 1000 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1001 path->slots[0]); 1002 *objectid = found_key.offset + 1; 1003 } 1004 ret = 0; 1005 error: 1006 btrfs_free_path(path); 1007 return ret; 1008 } 1009 1010 /* 1011 * the device information is stored in the chunk root 1012 * the btrfs_device struct should be fully filled in 1013 */ 1014 int btrfs_add_device(struct btrfs_trans_handle *trans, 1015 struct btrfs_root *root, 1016 struct btrfs_device *device) 1017 { 1018 int ret; 1019 struct btrfs_path *path; 1020 struct btrfs_dev_item *dev_item; 1021 struct extent_buffer *leaf; 1022 struct btrfs_key key; 1023 unsigned long ptr; 1024 1025 root = root->fs_info->chunk_root; 1026 1027 path = btrfs_alloc_path(); 1028 if (!path) 1029 return -ENOMEM; 1030 1031 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1032 key.type = BTRFS_DEV_ITEM_KEY; 1033 key.offset = device->devid; 1034 1035 ret = btrfs_insert_empty_item(trans, root, path, &key, 1036 sizeof(*dev_item)); 1037 if (ret) 1038 goto out; 1039 1040 leaf = path->nodes[0]; 1041 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1042 1043 btrfs_set_device_id(leaf, dev_item, device->devid); 1044 btrfs_set_device_generation(leaf, dev_item, 0); 1045 btrfs_set_device_type(leaf, dev_item, device->type); 1046 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1047 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1048 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1049 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1050 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1051 btrfs_set_device_group(leaf, dev_item, 0); 1052 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1053 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1054 btrfs_set_device_start_offset(leaf, dev_item, 0); 1055 1056 ptr = (unsigned long)btrfs_device_uuid(dev_item); 1057 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1058 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1059 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1060 btrfs_mark_buffer_dirty(leaf); 1061 1062 ret = 0; 1063 out: 1064 btrfs_free_path(path); 1065 return ret; 1066 } 1067 1068 static int btrfs_rm_dev_item(struct btrfs_root *root, 1069 struct btrfs_device *device) 1070 { 1071 int ret; 1072 struct btrfs_path *path; 1073 struct btrfs_key key; 1074 struct btrfs_trans_handle *trans; 1075 1076 root = root->fs_info->chunk_root; 1077 1078 path = btrfs_alloc_path(); 1079 if (!path) 1080 return -ENOMEM; 1081 1082 trans = btrfs_start_transaction(root, 1); 1083 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1084 key.type = BTRFS_DEV_ITEM_KEY; 1085 key.offset = device->devid; 1086 lock_chunks(root); 1087 1088 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1089 if (ret < 0) 1090 goto out; 1091 1092 if (ret > 0) { 1093 ret = -ENOENT; 1094 goto out; 1095 } 1096 1097 ret = btrfs_del_item(trans, root, path); 1098 if (ret) 1099 goto out; 1100 out: 1101 btrfs_free_path(path); 1102 unlock_chunks(root); 1103 btrfs_commit_transaction(trans, root); 1104 return ret; 1105 } 1106 1107 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1108 { 1109 struct btrfs_device *device; 1110 struct btrfs_device *next_device; 1111 
struct block_device *bdev; 1112 struct buffer_head *bh = NULL; 1113 struct btrfs_super_block *disk_super; 1114 u64 all_avail; 1115 u64 devid; 1116 u64 num_devices; 1117 u8 *dev_uuid; 1118 int ret = 0; 1119 1120 mutex_lock(&uuid_mutex); 1121 mutex_lock(&root->fs_info->volume_mutex); 1122 1123 all_avail = root->fs_info->avail_data_alloc_bits | 1124 root->fs_info->avail_system_alloc_bits | 1125 root->fs_info->avail_metadata_alloc_bits; 1126 1127 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1128 root->fs_info->fs_devices->rw_devices <= 4) { 1129 printk(KERN_ERR "btrfs: unable to go below four devices " 1130 "on raid10\n"); 1131 ret = -EINVAL; 1132 goto out; 1133 } 1134 1135 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1136 root->fs_info->fs_devices->rw_devices <= 2) { 1137 printk(KERN_ERR "btrfs: unable to go below two " 1138 "devices on raid1\n"); 1139 ret = -EINVAL; 1140 goto out; 1141 } 1142 1143 if (strcmp(device_path, "missing") == 0) { 1144 struct list_head *devices; 1145 struct btrfs_device *tmp; 1146 1147 device = NULL; 1148 devices = &root->fs_info->fs_devices->devices; 1149 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1150 list_for_each_entry(tmp, devices, dev_list) { 1151 if (tmp->in_fs_metadata && !tmp->bdev) { 1152 device = tmp; 1153 break; 1154 } 1155 } 1156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1157 bdev = NULL; 1158 bh = NULL; 1159 disk_super = NULL; 1160 if (!device) { 1161 printk(KERN_ERR "btrfs: no missing devices found to " 1162 "remove\n"); 1163 goto out; 1164 } 1165 } else { 1166 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1167 root->fs_info->bdev_holder); 1168 if (IS_ERR(bdev)) { 1169 ret = PTR_ERR(bdev); 1170 goto out; 1171 } 1172 1173 set_blocksize(bdev, 4096); 1174 bh = btrfs_read_dev_super(bdev); 1175 if (!bh) { 1176 ret = -EIO; 1177 goto error_close; 1178 } 1179 disk_super = (struct btrfs_super_block *)bh->b_data; 1180 devid = le64_to_cpu(disk_super->dev_item.devid); 1181 dev_uuid = disk_super->dev_item.uuid; 1182 device = btrfs_find_device(root, devid, dev_uuid, 1183 disk_super->fsid); 1184 if (!device) { 1185 ret = -ENOENT; 1186 goto error_brelse; 1187 } 1188 } 1189 1190 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1191 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1192 "device\n"); 1193 ret = -EINVAL; 1194 goto error_brelse; 1195 } 1196 1197 if (device->writeable) { 1198 list_del_init(&device->dev_alloc_list); 1199 root->fs_info->fs_devices->rw_devices--; 1200 } 1201 1202 ret = btrfs_shrink_device(device, 0); 1203 if (ret) 1204 goto error_brelse; 1205 1206 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1207 if (ret) 1208 goto error_brelse; 1209 1210 device->in_fs_metadata = 0; 1211 1212 /* 1213 * the device list mutex makes sure that we don't change 1214 * the device list while someone else is writing out all 1215 * the device supers. 
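	 * Removing the device from that list below is therefore done
	 * with device_list_mutex held.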
	 */
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_init(&device->dev_list);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	device->fs_devices->num_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev) {
		close_bdev_exclusive(device->bdev, device->mode);
		device->bdev = NULL;
		device->fs_devices->open_devices--;
	}

	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

	if (device->fs_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == device->fs_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = device->fs_devices->seed;
		device->fs_devices->seed = NULL;
		__btrfs_close_devices(device->fs_devices);
		free_fs_devices(device->fs_devices);
	}

	/*
	 * at this point, the device is zero sized. We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (device->writeable) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	kfree(device->name);
	kfree(device);
	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		close_bdev_exclusive(bdev, FMODE_READ);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
}

/*
 * does all the dirty work required for changing the filesystem's UUID.
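 * The existing devices are moved over to a new "seed" btrfs_fs_devices
 * struct, a fresh fsid is generated for this filesystem and the seeding
 * flag is cleared in the super block.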
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);
	list_splice_init(&fs_devices->devices, &seed_devices->devices);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
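 * This walks the device items in the chunk tree and, for every device
 * that still belongs to a seed filesystem, copies device->generation
 * into the on-disk item.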
1338 */ 1339 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1340 struct btrfs_root *root) 1341 { 1342 struct btrfs_path *path; 1343 struct extent_buffer *leaf; 1344 struct btrfs_dev_item *dev_item; 1345 struct btrfs_device *device; 1346 struct btrfs_key key; 1347 u8 fs_uuid[BTRFS_UUID_SIZE]; 1348 u8 dev_uuid[BTRFS_UUID_SIZE]; 1349 u64 devid; 1350 int ret; 1351 1352 path = btrfs_alloc_path(); 1353 if (!path) 1354 return -ENOMEM; 1355 1356 root = root->fs_info->chunk_root; 1357 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1358 key.offset = 0; 1359 key.type = BTRFS_DEV_ITEM_KEY; 1360 1361 while (1) { 1362 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1363 if (ret < 0) 1364 goto error; 1365 1366 leaf = path->nodes[0]; 1367 next_slot: 1368 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1369 ret = btrfs_next_leaf(root, path); 1370 if (ret > 0) 1371 break; 1372 if (ret < 0) 1373 goto error; 1374 leaf = path->nodes[0]; 1375 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1376 btrfs_release_path(root, path); 1377 continue; 1378 } 1379 1380 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1381 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1382 key.type != BTRFS_DEV_ITEM_KEY) 1383 break; 1384 1385 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1386 struct btrfs_dev_item); 1387 devid = btrfs_device_id(leaf, dev_item); 1388 read_extent_buffer(leaf, dev_uuid, 1389 (unsigned long)btrfs_device_uuid(dev_item), 1390 BTRFS_UUID_SIZE); 1391 read_extent_buffer(leaf, fs_uuid, 1392 (unsigned long)btrfs_device_fsid(dev_item), 1393 BTRFS_UUID_SIZE); 1394 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1395 BUG_ON(!device); 1396 1397 if (device->fs_devices->seeding) { 1398 btrfs_set_device_generation(leaf, dev_item, 1399 device->generation); 1400 btrfs_mark_buffer_dirty(leaf); 1401 } 1402 1403 path->slots[0]++; 1404 goto next_slot; 1405 } 1406 ret = 0; 1407 error: 1408 btrfs_free_path(path); 1409 return ret; 1410 } 1411 1412 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1413 { 1414 struct btrfs_trans_handle *trans; 1415 struct btrfs_device *device; 1416 struct block_device *bdev; 1417 struct list_head *devices; 1418 struct super_block *sb = root->fs_info->sb; 1419 u64 total_bytes; 1420 int seeding_dev = 0; 1421 int ret = 0; 1422 1423 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1424 return -EINVAL; 1425 1426 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1427 if (!bdev) 1428 return -EIO; 1429 1430 if (root->fs_info->fs_devices->seeding) { 1431 seeding_dev = 1; 1432 down_write(&sb->s_umount); 1433 mutex_lock(&uuid_mutex); 1434 } 1435 1436 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1437 mutex_lock(&root->fs_info->volume_mutex); 1438 1439 devices = &root->fs_info->fs_devices->devices; 1440 /* 1441 * we have the volume lock, so we don't need the extra 1442 * device list mutex while reading the list here. 
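	 * Adding the new device to the list further down is done under
	 * device_list_mutex.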
1443 */ 1444 list_for_each_entry(device, devices, dev_list) { 1445 if (device->bdev == bdev) { 1446 ret = -EEXIST; 1447 goto error; 1448 } 1449 } 1450 1451 device = kzalloc(sizeof(*device), GFP_NOFS); 1452 if (!device) { 1453 /* we can safely leave the fs_devices entry around */ 1454 ret = -ENOMEM; 1455 goto error; 1456 } 1457 1458 device->name = kstrdup(device_path, GFP_NOFS); 1459 if (!device->name) { 1460 kfree(device); 1461 ret = -ENOMEM; 1462 goto error; 1463 } 1464 1465 ret = find_next_devid(root, &device->devid); 1466 if (ret) { 1467 kfree(device); 1468 goto error; 1469 } 1470 1471 trans = btrfs_start_transaction(root, 1); 1472 lock_chunks(root); 1473 1474 device->barriers = 1; 1475 device->writeable = 1; 1476 device->work.func = pending_bios_fn; 1477 generate_random_uuid(device->uuid); 1478 spin_lock_init(&device->io_lock); 1479 device->generation = trans->transid; 1480 device->io_width = root->sectorsize; 1481 device->io_align = root->sectorsize; 1482 device->sector_size = root->sectorsize; 1483 device->total_bytes = i_size_read(bdev->bd_inode); 1484 device->disk_total_bytes = device->total_bytes; 1485 device->dev_root = root->fs_info->dev_root; 1486 device->bdev = bdev; 1487 device->in_fs_metadata = 1; 1488 device->mode = 0; 1489 set_blocksize(device->bdev, 4096); 1490 1491 if (seeding_dev) { 1492 sb->s_flags &= ~MS_RDONLY; 1493 ret = btrfs_prepare_sprout(trans, root); 1494 BUG_ON(ret); 1495 } 1496 1497 device->fs_devices = root->fs_info->fs_devices; 1498 1499 /* 1500 * we don't want write_supers to jump in here with our device 1501 * half setup 1502 */ 1503 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1504 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1505 list_add(&device->dev_alloc_list, 1506 &root->fs_info->fs_devices->alloc_list); 1507 root->fs_info->fs_devices->num_devices++; 1508 root->fs_info->fs_devices->open_devices++; 1509 root->fs_info->fs_devices->rw_devices++; 1510 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1511 1512 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1513 root->fs_info->fs_devices->rotating = 1; 1514 1515 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1516 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1517 total_bytes + device->total_bytes); 1518 1519 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1520 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1521 total_bytes + 1); 1522 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1523 1524 if (seeding_dev) { 1525 ret = init_first_rw_device(trans, root, device); 1526 BUG_ON(ret); 1527 ret = btrfs_finish_sprout(trans, root); 1528 BUG_ON(ret); 1529 } else { 1530 ret = btrfs_add_device(trans, root, device); 1531 } 1532 1533 /* 1534 * we've got more storage, clear any full flags on the space 1535 * infos 1536 */ 1537 btrfs_clear_space_info_full(root->fs_info); 1538 1539 unlock_chunks(root); 1540 btrfs_commit_transaction(trans, root); 1541 1542 if (seeding_dev) { 1543 mutex_unlock(&uuid_mutex); 1544 up_write(&sb->s_umount); 1545 1546 ret = btrfs_relocate_sys_chunks(root); 1547 BUG_ON(ret); 1548 } 1549 out: 1550 mutex_unlock(&root->fs_info->volume_mutex); 1551 return ret; 1552 error: 1553 close_bdev_exclusive(bdev, 0); 1554 if (seeding_dev) { 1555 mutex_unlock(&uuid_mutex); 1556 up_write(&sb->s_umount); 1557 } 1558 goto out; 1559 } 1560 1561 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1562 struct btrfs_device *device) 1563 { 1564 int ret; 1565 struct btrfs_path 
*path; 1566 struct btrfs_root *root; 1567 struct btrfs_dev_item *dev_item; 1568 struct extent_buffer *leaf; 1569 struct btrfs_key key; 1570 1571 root = device->dev_root->fs_info->chunk_root; 1572 1573 path = btrfs_alloc_path(); 1574 if (!path) 1575 return -ENOMEM; 1576 1577 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1578 key.type = BTRFS_DEV_ITEM_KEY; 1579 key.offset = device->devid; 1580 1581 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1582 if (ret < 0) 1583 goto out; 1584 1585 if (ret > 0) { 1586 ret = -ENOENT; 1587 goto out; 1588 } 1589 1590 leaf = path->nodes[0]; 1591 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1592 1593 btrfs_set_device_id(leaf, dev_item, device->devid); 1594 btrfs_set_device_type(leaf, dev_item, device->type); 1595 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1596 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1597 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1598 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1599 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1600 btrfs_mark_buffer_dirty(leaf); 1601 1602 out: 1603 btrfs_free_path(path); 1604 return ret; 1605 } 1606 1607 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1608 struct btrfs_device *device, u64 new_size) 1609 { 1610 struct btrfs_super_block *super_copy = 1611 &device->dev_root->fs_info->super_copy; 1612 u64 old_total = btrfs_super_total_bytes(super_copy); 1613 u64 diff = new_size - device->total_bytes; 1614 1615 if (!device->writeable) 1616 return -EACCES; 1617 if (new_size <= device->total_bytes) 1618 return -EINVAL; 1619 1620 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1621 device->fs_devices->total_rw_bytes += diff; 1622 1623 device->total_bytes = new_size; 1624 btrfs_clear_space_info_full(device->dev_root->fs_info); 1625 1626 return btrfs_update_device(trans, device); 1627 } 1628 1629 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1630 struct btrfs_device *device, u64 new_size) 1631 { 1632 int ret; 1633 lock_chunks(device->dev_root); 1634 ret = __btrfs_grow_device(trans, device, new_size); 1635 unlock_chunks(device->dev_root); 1636 return ret; 1637 } 1638 1639 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1640 struct btrfs_root *root, 1641 u64 chunk_tree, u64 chunk_objectid, 1642 u64 chunk_offset) 1643 { 1644 int ret; 1645 struct btrfs_path *path; 1646 struct btrfs_key key; 1647 1648 root = root->fs_info->chunk_root; 1649 path = btrfs_alloc_path(); 1650 if (!path) 1651 return -ENOMEM; 1652 1653 key.objectid = chunk_objectid; 1654 key.offset = chunk_offset; 1655 key.type = BTRFS_CHUNK_ITEM_KEY; 1656 1657 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1658 BUG_ON(ret); 1659 1660 ret = btrfs_del_item(trans, root, path); 1661 BUG_ON(ret); 1662 1663 btrfs_free_path(path); 1664 return 0; 1665 } 1666 1667 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1668 chunk_offset) 1669 { 1670 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1671 struct btrfs_disk_key *disk_key; 1672 struct btrfs_chunk *chunk; 1673 u8 *ptr; 1674 int ret = 0; 1675 u32 num_stripes; 1676 u32 array_size; 1677 u32 len = 0; 1678 u32 cur; 1679 struct btrfs_key key; 1680 1681 array_size = btrfs_super_sys_array_size(super_copy); 1682 1683 ptr = super_copy->sys_chunk_array; 1684 cur = 0; 1685 1686 while (cur < array_size) { 1687 disk_key = (struct btrfs_disk_key *)ptr; 1688 
btrfs_disk_key_to_cpu(&key, disk_key); 1689 1690 len = sizeof(*disk_key); 1691 1692 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1693 chunk = (struct btrfs_chunk *)(ptr + len); 1694 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1695 len += btrfs_chunk_item_size(num_stripes); 1696 } else { 1697 ret = -EIO; 1698 break; 1699 } 1700 if (key.objectid == chunk_objectid && 1701 key.offset == chunk_offset) { 1702 memmove(ptr, ptr + len, array_size - (cur + len)); 1703 array_size -= len; 1704 btrfs_set_super_sys_array_size(super_copy, array_size); 1705 } else { 1706 ptr += len; 1707 cur += len; 1708 } 1709 } 1710 return ret; 1711 } 1712 1713 static int btrfs_relocate_chunk(struct btrfs_root *root, 1714 u64 chunk_tree, u64 chunk_objectid, 1715 u64 chunk_offset) 1716 { 1717 struct extent_map_tree *em_tree; 1718 struct btrfs_root *extent_root; 1719 struct btrfs_trans_handle *trans; 1720 struct extent_map *em; 1721 struct map_lookup *map; 1722 int ret; 1723 int i; 1724 1725 root = root->fs_info->chunk_root; 1726 extent_root = root->fs_info->extent_root; 1727 em_tree = &root->fs_info->mapping_tree.map_tree; 1728 1729 /* step one, relocate all the extents inside this chunk */ 1730 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1731 BUG_ON(ret); 1732 1733 trans = btrfs_start_transaction(root, 1); 1734 BUG_ON(!trans); 1735 1736 lock_chunks(root); 1737 1738 /* 1739 * step two, delete the device extents and the 1740 * chunk tree entries 1741 */ 1742 spin_lock(&em_tree->lock); 1743 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1744 spin_unlock(&em_tree->lock); 1745 1746 BUG_ON(em->start > chunk_offset || 1747 em->start + em->len < chunk_offset); 1748 map = (struct map_lookup *)em->bdev; 1749 1750 for (i = 0; i < map->num_stripes; i++) { 1751 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 1752 map->stripes[i].physical); 1753 BUG_ON(ret); 1754 1755 if (map->stripes[i].dev) { 1756 ret = btrfs_update_device(trans, map->stripes[i].dev); 1757 BUG_ON(ret); 1758 } 1759 } 1760 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 1761 chunk_offset); 1762 1763 BUG_ON(ret); 1764 1765 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1766 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1767 BUG_ON(ret); 1768 } 1769 1770 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 1771 BUG_ON(ret); 1772 1773 spin_lock(&em_tree->lock); 1774 remove_extent_mapping(em_tree, em); 1775 spin_unlock(&em_tree->lock); 1776 1777 kfree(map); 1778 em->bdev = NULL; 1779 1780 /* once for the tree */ 1781 free_extent_map(em); 1782 /* once for us */ 1783 free_extent_map(em); 1784 1785 unlock_chunks(root); 1786 btrfs_end_transaction(trans, root); 1787 return 0; 1788 } 1789 1790 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 1791 { 1792 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 1793 struct btrfs_path *path; 1794 struct extent_buffer *leaf; 1795 struct btrfs_chunk *chunk; 1796 struct btrfs_key key; 1797 struct btrfs_key found_key; 1798 u64 chunk_tree = chunk_root->root_key.objectid; 1799 u64 chunk_type; 1800 int ret; 1801 1802 path = btrfs_alloc_path(); 1803 if (!path) 1804 return -ENOMEM; 1805 1806 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1807 key.offset = (u64)-1; 1808 key.type = BTRFS_CHUNK_ITEM_KEY; 1809 1810 while (1) { 1811 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1812 if (ret < 0) 1813 goto error; 1814 BUG_ON(ret == 0); 1815 1816 ret = btrfs_previous_item(chunk_root, path, key.objectid, 1817 key.type); 1818 if (ret < 0) 
1819 goto error; 1820 if (ret > 0) 1821 break; 1822 1823 leaf = path->nodes[0]; 1824 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1825 1826 chunk = btrfs_item_ptr(leaf, path->slots[0], 1827 struct btrfs_chunk); 1828 chunk_type = btrfs_chunk_type(leaf, chunk); 1829 btrfs_release_path(chunk_root, path); 1830 1831 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1832 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1833 found_key.objectid, 1834 found_key.offset); 1835 BUG_ON(ret); 1836 } 1837 1838 if (found_key.offset == 0) 1839 break; 1840 key.offset = found_key.offset - 1; 1841 } 1842 ret = 0; 1843 error: 1844 btrfs_free_path(path); 1845 return ret; 1846 } 1847 1848 static u64 div_factor(u64 num, int factor) 1849 { 1850 if (factor == 10) 1851 return num; 1852 num *= factor; 1853 do_div(num, 10); 1854 return num; 1855 } 1856 1857 int btrfs_balance(struct btrfs_root *dev_root) 1858 { 1859 int ret; 1860 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1861 struct btrfs_device *device; 1862 u64 old_size; 1863 u64 size_to_free; 1864 struct btrfs_path *path; 1865 struct btrfs_key key; 1866 struct btrfs_chunk *chunk; 1867 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 1868 struct btrfs_trans_handle *trans; 1869 struct btrfs_key found_key; 1870 1871 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 1872 return -EROFS; 1873 1874 mutex_lock(&dev_root->fs_info->volume_mutex); 1875 dev_root = dev_root->fs_info->dev_root; 1876 1877 /* step one make some room on all the devices */ 1878 list_for_each_entry(device, devices, dev_list) { 1879 old_size = device->total_bytes; 1880 size_to_free = div_factor(old_size, 1); 1881 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1882 if (!device->writeable || 1883 device->total_bytes - device->bytes_used > size_to_free) 1884 continue; 1885 1886 ret = btrfs_shrink_device(device, old_size - size_to_free); 1887 BUG_ON(ret); 1888 1889 trans = btrfs_start_transaction(dev_root, 1); 1890 BUG_ON(!trans); 1891 1892 ret = btrfs_grow_device(trans, device, old_size); 1893 BUG_ON(ret); 1894 1895 btrfs_end_transaction(trans, dev_root); 1896 } 1897 1898 /* step two, relocate all the chunks */ 1899 path = btrfs_alloc_path(); 1900 BUG_ON(!path); 1901 1902 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 1903 key.offset = (u64)-1; 1904 key.type = BTRFS_CHUNK_ITEM_KEY; 1905 1906 while (1) { 1907 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 1908 if (ret < 0) 1909 goto error; 1910 1911 /* 1912 * this shouldn't happen, it means the last relocate 1913 * failed 1914 */ 1915 if (ret == 0) 1916 break; 1917 1918 ret = btrfs_previous_item(chunk_root, path, 0, 1919 BTRFS_CHUNK_ITEM_KEY); 1920 if (ret) 1921 break; 1922 1923 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1924 path->slots[0]); 1925 if (found_key.objectid != key.objectid) 1926 break; 1927 1928 chunk = btrfs_item_ptr(path->nodes[0], 1929 path->slots[0], 1930 struct btrfs_chunk); 1931 key.offset = found_key.offset; 1932 /* chunk zero is special */ 1933 if (key.offset == 0) 1934 break; 1935 1936 btrfs_release_path(chunk_root, path); 1937 ret = btrfs_relocate_chunk(chunk_root, 1938 chunk_root->root_key.objectid, 1939 found_key.objectid, 1940 found_key.offset); 1941 BUG_ON(ret); 1942 } 1943 ret = 0; 1944 error: 1945 btrfs_free_path(path); 1946 mutex_unlock(&dev_root->fs_info->volume_mutex); 1947 return ret; 1948 } 1949 1950 /* 1951 * shrinking a device means finding all of the device extents past 1952 * the new size, and then following the back refs to the chunks. 
1953 * The chunk relocation code actually frees the device extent 1954 */ 1955 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 1956 { 1957 struct btrfs_trans_handle *trans; 1958 struct btrfs_root *root = device->dev_root; 1959 struct btrfs_dev_extent *dev_extent = NULL; 1960 struct btrfs_path *path; 1961 u64 length; 1962 u64 chunk_tree; 1963 u64 chunk_objectid; 1964 u64 chunk_offset; 1965 int ret; 1966 int slot; 1967 struct extent_buffer *l; 1968 struct btrfs_key key; 1969 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1970 u64 old_total = btrfs_super_total_bytes(super_copy); 1971 u64 diff = device->total_bytes - new_size; 1972 1973 if (new_size >= device->total_bytes) 1974 return -EINVAL; 1975 1976 path = btrfs_alloc_path(); 1977 if (!path) 1978 return -ENOMEM; 1979 1980 trans = btrfs_start_transaction(root, 1); 1981 if (!trans) { 1982 ret = -ENOMEM; 1983 goto done; 1984 } 1985 1986 path->reada = 2; 1987 1988 lock_chunks(root); 1989 1990 device->total_bytes = new_size; 1991 if (device->writeable) 1992 device->fs_devices->total_rw_bytes -= diff; 1993 unlock_chunks(root); 1994 btrfs_end_transaction(trans, root); 1995 1996 key.objectid = device->devid; 1997 key.offset = (u64)-1; 1998 key.type = BTRFS_DEV_EXTENT_KEY; 1999 2000 while (1) { 2001 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2002 if (ret < 0) 2003 goto done; 2004 2005 ret = btrfs_previous_item(root, path, 0, key.type); 2006 if (ret < 0) 2007 goto done; 2008 if (ret) { 2009 ret = 0; 2010 goto done; 2011 } 2012 2013 l = path->nodes[0]; 2014 slot = path->slots[0]; 2015 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2016 2017 if (key.objectid != device->devid) 2018 goto done; 2019 2020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2021 length = btrfs_dev_extent_length(l, dev_extent); 2022 2023 if (key.offset + length <= new_size) 2024 break; 2025 2026 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2027 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2028 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2029 btrfs_release_path(root, path); 2030 2031 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2032 chunk_offset); 2033 if (ret) 2034 goto done; 2035 } 2036 2037 /* Shrinking succeeded, else we would be at "done". */ 2038 trans = btrfs_start_transaction(root, 1); 2039 if (!trans) { 2040 ret = -ENOMEM; 2041 goto done; 2042 } 2043 lock_chunks(root); 2044 2045 device->disk_total_bytes = new_size; 2046 /* Now btrfs_update_device() will change the on-disk size. 
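	 * device->total_bytes was already reduced in memory, before the
	 * chunks past the new size were relocated above.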
*/ 2047 ret = btrfs_update_device(trans, device); 2048 if (ret) { 2049 unlock_chunks(root); 2050 btrfs_end_transaction(trans, root); 2051 goto done; 2052 } 2053 WARN_ON(diff > old_total); 2054 btrfs_set_super_total_bytes(super_copy, old_total - diff); 2055 unlock_chunks(root); 2056 btrfs_end_transaction(trans, root); 2057 done: 2058 btrfs_free_path(path); 2059 return ret; 2060 } 2061 2062 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 2063 struct btrfs_root *root, 2064 struct btrfs_key *key, 2065 struct btrfs_chunk *chunk, int item_size) 2066 { 2067 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2068 struct btrfs_disk_key disk_key; 2069 u32 array_size; 2070 u8 *ptr; 2071 2072 array_size = btrfs_super_sys_array_size(super_copy); 2073 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 2074 return -EFBIG; 2075 2076 ptr = super_copy->sys_chunk_array + array_size; 2077 btrfs_cpu_key_to_disk(&disk_key, key); 2078 memcpy(ptr, &disk_key, sizeof(disk_key)); 2079 ptr += sizeof(disk_key); 2080 memcpy(ptr, chunk, item_size); 2081 item_size += sizeof(disk_key); 2082 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 2083 return 0; 2084 } 2085 2086 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2087 int num_stripes, int sub_stripes) 2088 { 2089 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2090 return calc_size; 2091 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2092 return calc_size * (num_stripes / sub_stripes); 2093 else 2094 return calc_size * num_stripes; 2095 } 2096 2097 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2098 struct btrfs_root *extent_root, 2099 struct map_lookup **map_ret, 2100 u64 *num_bytes, u64 *stripe_size, 2101 u64 start, u64 type) 2102 { 2103 struct btrfs_fs_info *info = extent_root->fs_info; 2104 struct btrfs_device *device = NULL; 2105 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2106 struct list_head *cur; 2107 struct map_lookup *map = NULL; 2108 struct extent_map_tree *em_tree; 2109 struct extent_map *em; 2110 struct list_head private_devs; 2111 int min_stripe_size = 1 * 1024 * 1024; 2112 u64 calc_size = 1024 * 1024 * 1024; 2113 u64 max_chunk_size = calc_size; 2114 u64 min_free; 2115 u64 avail; 2116 u64 max_avail = 0; 2117 u64 dev_offset; 2118 int num_stripes = 1; 2119 int min_stripes = 1; 2120 int sub_stripes = 0; 2121 int looped = 0; 2122 int ret; 2123 int index; 2124 int stripe_len = 64 * 1024; 2125 2126 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2127 (type & BTRFS_BLOCK_GROUP_DUP)) { 2128 WARN_ON(1); 2129 type &= ~BTRFS_BLOCK_GROUP_DUP; 2130 } 2131 if (list_empty(&fs_devices->alloc_list)) 2132 return -ENOSPC; 2133 2134 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2135 num_stripes = fs_devices->rw_devices; 2136 min_stripes = 2; 2137 } 2138 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2139 num_stripes = 2; 2140 min_stripes = 2; 2141 } 2142 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2143 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2144 if (num_stripes < 2) 2145 return -ENOSPC; 2146 min_stripes = 2; 2147 } 2148 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2149 num_stripes = fs_devices->rw_devices; 2150 if (num_stripes < 4) 2151 return -ENOSPC; 2152 num_stripes &= ~(u32)1; 2153 sub_stripes = 2; 2154 min_stripes = 4; 2155 } 2156 2157 if (type & BTRFS_BLOCK_GROUP_DATA) { 2158 max_chunk_size = 10 * calc_size; 2159 min_stripe_size = 64 * 1024 * 1024; 2160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2161 max_chunk_size = 4 * calc_size; 2162 min_stripe_size = 32 * 
1024 * 1024; 2163 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2164 calc_size = 8 * 1024 * 1024; 2165 max_chunk_size = calc_size * 2; 2166 min_stripe_size = 1 * 1024 * 1024; 2167 } 2168 2169 /* we don't want a chunk larger than 10% of writeable space */ 2170 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2171 max_chunk_size); 2172 2173 again: 2174 if (!map || map->num_stripes != num_stripes) { 2175 kfree(map); 2176 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2177 if (!map) 2178 return -ENOMEM; 2179 map->num_stripes = num_stripes; 2180 } 2181 2182 if (calc_size * num_stripes > max_chunk_size) { 2183 calc_size = max_chunk_size; 2184 do_div(calc_size, num_stripes); 2185 do_div(calc_size, stripe_len); 2186 calc_size *= stripe_len; 2187 } 2188 /* we don't want tiny stripes */ 2189 calc_size = max_t(u64, min_stripe_size, calc_size); 2190 2191 do_div(calc_size, stripe_len); 2192 calc_size *= stripe_len; 2193 2194 cur = fs_devices->alloc_list.next; 2195 index = 0; 2196 2197 if (type & BTRFS_BLOCK_GROUP_DUP) 2198 min_free = calc_size * 2; 2199 else 2200 min_free = calc_size; 2201 2202 /* 2203 * we add 1MB because we never use the first 1MB of the device, unless 2204 * we've looped, then we are likely allocating the maximum amount of 2205 * space left already 2206 */ 2207 if (!looped) 2208 min_free += 1024 * 1024; 2209 2210 INIT_LIST_HEAD(&private_devs); 2211 while (index < num_stripes) { 2212 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 2213 BUG_ON(!device->writeable); 2214 if (device->total_bytes > device->bytes_used) 2215 avail = device->total_bytes - device->bytes_used; 2216 else 2217 avail = 0; 2218 cur = cur->next; 2219 2220 if (device->in_fs_metadata && avail >= min_free) { 2221 ret = find_free_dev_extent(trans, device, 2222 min_free, &dev_offset); 2223 if (ret == 0) { 2224 list_move_tail(&device->dev_alloc_list, 2225 &private_devs); 2226 map->stripes[index].dev = device; 2227 map->stripes[index].physical = dev_offset; 2228 index++; 2229 if (type & BTRFS_BLOCK_GROUP_DUP) { 2230 map->stripes[index].dev = device; 2231 map->stripes[index].physical = 2232 dev_offset + calc_size; 2233 index++; 2234 } 2235 } 2236 } else if (device->in_fs_metadata && avail > max_avail) 2237 max_avail = avail; 2238 if (cur == &fs_devices->alloc_list) 2239 break; 2240 } 2241 list_splice(&private_devs, &fs_devices->alloc_list); 2242 if (index < num_stripes) { 2243 if (index >= min_stripes) { 2244 num_stripes = index; 2245 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2246 num_stripes /= sub_stripes; 2247 num_stripes *= sub_stripes; 2248 } 2249 looped = 1; 2250 goto again; 2251 } 2252 if (!looped && max_avail > 0) { 2253 looped = 1; 2254 calc_size = max_avail; 2255 goto again; 2256 } 2257 kfree(map); 2258 return -ENOSPC; 2259 } 2260 map->sector_size = extent_root->sectorsize; 2261 map->stripe_len = stripe_len; 2262 map->io_align = stripe_len; 2263 map->io_width = stripe_len; 2264 map->type = type; 2265 map->num_stripes = num_stripes; 2266 map->sub_stripes = sub_stripes; 2267 2268 *map_ret = map; 2269 *stripe_size = calc_size; 2270 *num_bytes = chunk_bytes_by_type(type, calc_size, 2271 num_stripes, sub_stripes); 2272 2273 em = alloc_extent_map(GFP_NOFS); 2274 if (!em) { 2275 kfree(map); 2276 return -ENOMEM; 2277 } 2278 em->bdev = (struct block_device *)map; 2279 em->start = start; 2280 em->len = *num_bytes; 2281 em->block_start = 0; 2282 em->block_len = em->len; 2283 2284 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 2285 spin_lock(&em_tree->lock); 2286 ret = 
add_extent_mapping(em_tree, em); 2287 spin_unlock(&em_tree->lock); 2288 BUG_ON(ret); 2289 free_extent_map(em); 2290 2291 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2292 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2293 start, *num_bytes); 2294 BUG_ON(ret); 2295 2296 index = 0; 2297 while (index < map->num_stripes) { 2298 device = map->stripes[index].dev; 2299 dev_offset = map->stripes[index].physical; 2300 2301 ret = btrfs_alloc_dev_extent(trans, device, 2302 info->chunk_root->root_key.objectid, 2303 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2304 start, dev_offset, calc_size); 2305 BUG_ON(ret); 2306 index++; 2307 } 2308 2309 return 0; 2310 } 2311 2312 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2313 struct btrfs_root *extent_root, 2314 struct map_lookup *map, u64 chunk_offset, 2315 u64 chunk_size, u64 stripe_size) 2316 { 2317 u64 dev_offset; 2318 struct btrfs_key key; 2319 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2320 struct btrfs_device *device; 2321 struct btrfs_chunk *chunk; 2322 struct btrfs_stripe *stripe; 2323 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 2324 int index = 0; 2325 int ret; 2326 2327 chunk = kzalloc(item_size, GFP_NOFS); 2328 if (!chunk) 2329 return -ENOMEM; 2330 2331 index = 0; 2332 while (index < map->num_stripes) { 2333 device = map->stripes[index].dev; 2334 device->bytes_used += stripe_size; 2335 ret = btrfs_update_device(trans, device); 2336 BUG_ON(ret); 2337 index++; 2338 } 2339 2340 index = 0; 2341 stripe = &chunk->stripe; 2342 while (index < map->num_stripes) { 2343 device = map->stripes[index].dev; 2344 dev_offset = map->stripes[index].physical; 2345 2346 btrfs_set_stack_stripe_devid(stripe, device->devid); 2347 btrfs_set_stack_stripe_offset(stripe, dev_offset); 2348 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 2349 stripe++; 2350 index++; 2351 } 2352 2353 btrfs_set_stack_chunk_length(chunk, chunk_size); 2354 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 2355 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 2356 btrfs_set_stack_chunk_type(chunk, map->type); 2357 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 2358 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 2359 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 2360 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 2361 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 2362 2363 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2364 key.type = BTRFS_CHUNK_ITEM_KEY; 2365 key.offset = chunk_offset; 2366 2367 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 2368 BUG_ON(ret); 2369 2370 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2371 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 2372 item_size); 2373 BUG_ON(ret); 2374 } 2375 kfree(chunk); 2376 return 0; 2377 } 2378 2379 /* 2380 * Chunk allocation falls into two parts. The first part does the work 2381 * that makes the newly allocated chunk usable, but does not touch the 2382 * chunk tree. The second part does the work that 2383 * requires modifying the chunk tree. This division is important for the 2384 * bootstrap process of adding storage to a seed btrfs.
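 * In this file __btrfs_alloc_chunk() is the first part: it builds the
 * map_lookup, inserts the extent mapping, creates the block group and
 * allocates the per-device extents. __finish_chunk_alloc() is the second
 * part: it updates the device items and inserts the chunk item (plus the
 * sys_chunk_array copy for SYSTEM chunks).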
2385 */ 2386 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2387 struct btrfs_root *extent_root, u64 type) 2388 { 2389 u64 chunk_offset; 2390 u64 chunk_size; 2391 u64 stripe_size; 2392 struct map_lookup *map; 2393 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 2394 int ret; 2395 2396 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2397 &chunk_offset); 2398 if (ret) 2399 return ret; 2400 2401 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2402 &stripe_size, chunk_offset, type); 2403 if (ret) 2404 return ret; 2405 2406 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2407 chunk_size, stripe_size); 2408 BUG_ON(ret); 2409 return 0; 2410 } 2411 2412 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 2413 struct btrfs_root *root, 2414 struct btrfs_device *device) 2415 { 2416 u64 chunk_offset; 2417 u64 sys_chunk_offset; 2418 u64 chunk_size; 2419 u64 sys_chunk_size; 2420 u64 stripe_size; 2421 u64 sys_stripe_size; 2422 u64 alloc_profile; 2423 struct map_lookup *map; 2424 struct map_lookup *sys_map; 2425 struct btrfs_fs_info *fs_info = root->fs_info; 2426 struct btrfs_root *extent_root = fs_info->extent_root; 2427 int ret; 2428 2429 ret = find_next_chunk(fs_info->chunk_root, 2430 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2431 BUG_ON(ret); 2432 2433 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2434 (fs_info->metadata_alloc_profile & 2435 fs_info->avail_metadata_alloc_bits); 2436 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2437 2438 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 2439 &stripe_size, chunk_offset, alloc_profile); 2440 BUG_ON(ret); 2441 2442 sys_chunk_offset = chunk_offset + chunk_size; 2443 2444 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 2445 (fs_info->system_alloc_profile & 2446 fs_info->avail_system_alloc_bits); 2447 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 2448 2449 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 2450 &sys_chunk_size, &sys_stripe_size, 2451 sys_chunk_offset, alloc_profile); 2452 BUG_ON(ret); 2453 2454 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 2455 BUG_ON(ret); 2456 2457 /* 2458 * Modifying the chunk tree requires allocating new blocks from both 2459 * the system block group and the metadata block group, so we can 2460 * only do operations that modify the chunk tree after both 2461 * block groups have been created.
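 * Hence both block groups are set up via __btrfs_alloc_chunk() above
 * before btrfs_add_device() and the two __finish_chunk_alloc() calls
 * below touch the chunk tree.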
2462 */ 2463 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 2464 chunk_size, stripe_size); 2465 BUG_ON(ret); 2466 2467 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 2468 sys_chunk_offset, sys_chunk_size, 2469 sys_stripe_size); 2470 BUG_ON(ret); 2471 return 0; 2472 } 2473 2474 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 2475 { 2476 struct extent_map *em; 2477 struct map_lookup *map; 2478 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 2479 int readonly = 0; 2480 int i; 2481 2482 spin_lock(&map_tree->map_tree.lock); 2483 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2484 spin_unlock(&map_tree->map_tree.lock); 2485 if (!em) 2486 return 1; 2487 2488 map = (struct map_lookup *)em->bdev; 2489 for (i = 0; i < map->num_stripes; i++) { 2490 if (!map->stripes[i].dev->writeable) { 2491 readonly = 1; 2492 break; 2493 } 2494 } 2495 free_extent_map(em); 2496 return readonly; 2497 } 2498 2499 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2500 { 2501 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2502 } 2503 2504 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2505 { 2506 struct extent_map *em; 2507 2508 while (1) { 2509 spin_lock(&tree->map_tree.lock); 2510 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 2511 if (em) 2512 remove_extent_mapping(&tree->map_tree, em); 2513 spin_unlock(&tree->map_tree.lock); 2514 if (!em) 2515 break; 2516 kfree(em->bdev); 2517 /* once for us */ 2518 free_extent_map(em); 2519 /* once for the tree */ 2520 free_extent_map(em); 2521 } 2522 } 2523 2524 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 2525 { 2526 struct extent_map *em; 2527 struct map_lookup *map; 2528 struct extent_map_tree *em_tree = &map_tree->map_tree; 2529 int ret; 2530 2531 spin_lock(&em_tree->lock); 2532 em = lookup_extent_mapping(em_tree, logical, len); 2533 spin_unlock(&em_tree->lock); 2534 BUG_ON(!em); 2535 2536 BUG_ON(em->start > logical || em->start + em->len < logical); 2537 map = (struct map_lookup *)em->bdev; 2538 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 2539 ret = map->num_stripes; 2540 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2541 ret = map->sub_stripes; 2542 else 2543 ret = 1; 2544 free_extent_map(em); 2545 return ret; 2546 } 2547 2548 static int find_live_mirror(struct map_lookup *map, int first, int num, 2549 int optimal) 2550 { 2551 int i; 2552 if (map->stripes[optimal].dev->bdev) 2553 return optimal; 2554 for (i = first; i < first + num; i++) { 2555 if (map->stripes[i].dev->bdev) 2556 return i; 2557 } 2558 /* we couldn't find one that doesn't fail. 
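 * (callers pass current->pid % num_stripes as 'optimal' so that healthy
 * reads are spread across the mirrors).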
Just return something 2559 * and the io error handling code will clean up eventually 2560 */ 2561 return optimal; 2562 } 2563 2564 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2565 u64 logical, u64 *length, 2566 struct btrfs_multi_bio **multi_ret, 2567 int mirror_num, struct page *unplug_page) 2568 { 2569 struct extent_map *em; 2570 struct map_lookup *map; 2571 struct extent_map_tree *em_tree = &map_tree->map_tree; 2572 u64 offset; 2573 u64 stripe_offset; 2574 u64 stripe_nr; 2575 int stripes_allocated = 8; 2576 int stripes_required = 1; 2577 int stripe_index; 2578 int i; 2579 int num_stripes; 2580 int max_errors = 0; 2581 struct btrfs_multi_bio *multi = NULL; 2582 2583 if (multi_ret && !(rw & (1 << BIO_RW))) 2584 stripes_allocated = 1; 2585 again: 2586 if (multi_ret) { 2587 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2588 GFP_NOFS); 2589 if (!multi) 2590 return -ENOMEM; 2591 2592 atomic_set(&multi->error, 0); 2593 } 2594 2595 spin_lock(&em_tree->lock); 2596 em = lookup_extent_mapping(em_tree, logical, *length); 2597 spin_unlock(&em_tree->lock); 2598 2599 if (!em && unplug_page) 2600 return 0; 2601 2602 if (!em) { 2603 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2604 (unsigned long long)logical, 2605 (unsigned long long)*length); 2606 BUG(); 2607 } 2608 2609 BUG_ON(em->start > logical || em->start + em->len < logical); 2610 map = (struct map_lookup *)em->bdev; 2611 offset = logical - em->start; 2612 2613 if (mirror_num > map->num_stripes) 2614 mirror_num = 0; 2615 2616 /* if our multi bio struct is too small, back off and try again */ 2617 if (rw & (1 << BIO_RW)) { 2618 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2619 BTRFS_BLOCK_GROUP_DUP)) { 2620 stripes_required = map->num_stripes; 2621 max_errors = 1; 2622 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2623 stripes_required = map->sub_stripes; 2624 max_errors = 1; 2625 } 2626 } 2627 if (multi_ret && (rw & (1 << BIO_RW)) && 2628 stripes_allocated < stripes_required) { 2629 stripes_allocated = map->num_stripes; 2630 free_extent_map(em); 2631 kfree(multi); 2632 goto again; 2633 } 2634 stripe_nr = offset; 2635 /* 2636 * stripe_nr counts the total number of stripes we have to stride 2637 * to get to this block 2638 */ 2639 do_div(stripe_nr, map->stripe_len); 2640 2641 stripe_offset = stripe_nr * map->stripe_len; 2642 BUG_ON(offset < stripe_offset); 2643 2644 /* stripe_offset is the offset of this block in its stripe*/ 2645 stripe_offset = offset - stripe_offset; 2646 2647 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2648 BTRFS_BLOCK_GROUP_RAID10 | 2649 BTRFS_BLOCK_GROUP_DUP)) { 2650 /* we limit the length of each bio to what fits in a stripe */ 2651 *length = min_t(u64, em->len - offset, 2652 map->stripe_len - stripe_offset); 2653 } else { 2654 *length = em->len - offset; 2655 } 2656 2657 if (!multi_ret && !unplug_page) 2658 goto out; 2659 2660 num_stripes = 1; 2661 stripe_index = 0; 2662 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2663 if (unplug_page || (rw & (1 << BIO_RW))) 2664 num_stripes = map->num_stripes; 2665 else if (mirror_num) 2666 stripe_index = mirror_num - 1; 2667 else { 2668 stripe_index = find_live_mirror(map, 0, 2669 map->num_stripes, 2670 current->pid % map->num_stripes); 2671 } 2672 2673 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2674 if (rw & (1 << BIO_RW)) 2675 num_stripes = map->num_stripes; 2676 else if (mirror_num) 2677 stripe_index = mirror_num - 1; 2678 2679 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2680 int factor = 
map->num_stripes / map->sub_stripes; 2681 2682 stripe_index = do_div(stripe_nr, factor); 2683 stripe_index *= map->sub_stripes; 2684 2685 if (unplug_page || (rw & (1 << BIO_RW))) 2686 num_stripes = map->sub_stripes; 2687 else if (mirror_num) 2688 stripe_index += mirror_num - 1; 2689 else { 2690 stripe_index = find_live_mirror(map, stripe_index, 2691 map->sub_stripes, stripe_index + 2692 current->pid % map->sub_stripes); 2693 } 2694 } else { 2695 /* 2696 * after this do_div call, stripe_nr is the number of stripes 2697 * on this device we have to walk to find the data, and 2698 * stripe_index is the number of our device in the stripe array 2699 */ 2700 stripe_index = do_div(stripe_nr, map->num_stripes); 2701 } 2702 BUG_ON(stripe_index >= map->num_stripes); 2703 2704 for (i = 0; i < num_stripes; i++) { 2705 if (unplug_page) { 2706 struct btrfs_device *device; 2707 struct backing_dev_info *bdi; 2708 2709 device = map->stripes[stripe_index].dev; 2710 if (device->bdev) { 2711 bdi = blk_get_backing_dev_info(device->bdev); 2712 if (bdi->unplug_io_fn) 2713 bdi->unplug_io_fn(bdi, unplug_page); 2714 } 2715 } else { 2716 multi->stripes[i].physical = 2717 map->stripes[stripe_index].physical + 2718 stripe_offset + stripe_nr * map->stripe_len; 2719 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2720 } 2721 stripe_index++; 2722 } 2723 if (multi_ret) { 2724 *multi_ret = multi; 2725 multi->num_stripes = num_stripes; 2726 multi->max_errors = max_errors; 2727 } 2728 out: 2729 free_extent_map(em); 2730 return 0; 2731 } 2732 2733 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2734 u64 logical, u64 *length, 2735 struct btrfs_multi_bio **multi_ret, int mirror_num) 2736 { 2737 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 2738 mirror_num, NULL); 2739 } 2740 2741 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 2742 u64 chunk_start, u64 physical, u64 devid, 2743 u64 **logical, int *naddrs, int *stripe_len) 2744 { 2745 struct extent_map_tree *em_tree = &map_tree->map_tree; 2746 struct extent_map *em; 2747 struct map_lookup *map; 2748 u64 *buf; 2749 u64 bytenr; 2750 u64 length; 2751 u64 stripe_nr; 2752 int i, j, nr = 0; 2753 2754 spin_lock(&em_tree->lock); 2755 em = lookup_extent_mapping(em_tree, chunk_start, 1); 2756 spin_unlock(&em_tree->lock); 2757 2758 BUG_ON(!em || em->start != chunk_start); 2759 map = (struct map_lookup *)em->bdev; 2760 2761 length = em->len; 2762 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 2763 do_div(length, map->num_stripes / map->sub_stripes); 2764 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 2765 do_div(length, map->num_stripes); 2766 2767 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 2768 BUG_ON(!buf); 2769 2770 for (i = 0; i < map->num_stripes; i++) { 2771 if (devid && map->stripes[i].dev->devid != devid) 2772 continue; 2773 if (map->stripes[i].physical > physical || 2774 map->stripes[i].physical + length <= physical) 2775 continue; 2776 2777 stripe_nr = physical - map->stripes[i].physical; 2778 do_div(stripe_nr, map->stripe_len); 2779 2780 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 2781 stripe_nr = stripe_nr * map->num_stripes + i; 2782 do_div(stripe_nr, map->sub_stripes); 2783 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2784 stripe_nr = stripe_nr * map->num_stripes + i; 2785 } 2786 bytenr = chunk_start + stripe_nr * map->stripe_len; 2787 WARN_ON(nr >= map->num_stripes); 2788 for (j = 0; j < nr; j++) { 2789 if (buf[j] == bytenr) 2790 break; 2791 } 2792 if (j == nr) { 2793 WARN_ON(nr >= map->num_stripes); 2794 
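/* this logical address has not been seen on a previous stripe, record it;
 * duplicates happen for RAID1/DUP where several stripes map to the same
 * bytenr */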
buf[nr++] = bytenr; 2795 } 2796 } 2797 2798 for (i = 0; i < nr; i++) { 2799 struct btrfs_multi_bio *multi; 2800 struct btrfs_bio_stripe *stripe; 2801 int ret; 2802 2803 length = 1; 2804 ret = btrfs_map_block(map_tree, WRITE, buf[i], 2805 &length, &multi, 0); 2806 BUG_ON(ret); 2807 2808 stripe = multi->stripes; 2809 for (j = 0; j < multi->num_stripes; j++, stripe++) { 2810 if (physical >= stripe->physical && 2811 physical < stripe->physical + length) 2812 break; 2813 } 2814 BUG_ON(j >= multi->num_stripes); 2815 kfree(multi); 2816 } 2817 2818 *logical = buf; 2819 *naddrs = nr; 2820 *stripe_len = map->stripe_len; 2821 2822 free_extent_map(em); 2823 return 0; 2824 } 2825 2826 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, 2827 u64 logical, struct page *page) 2828 { 2829 u64 length = PAGE_CACHE_SIZE; 2830 return __btrfs_map_block(map_tree, READ, logical, &length, 2831 NULL, 0, page); 2832 } 2833 2834 static void end_bio_multi_stripe(struct bio *bio, int err) 2835 { 2836 struct btrfs_multi_bio *multi = bio->bi_private; 2837 int is_orig_bio = 0; 2838 2839 if (err) 2840 atomic_inc(&multi->error); 2841 2842 if (bio == multi->orig_bio) 2843 is_orig_bio = 1; 2844 2845 if (atomic_dec_and_test(&multi->stripes_pending)) { 2846 if (!is_orig_bio) { 2847 bio_put(bio); 2848 bio = multi->orig_bio; 2849 } 2850 bio->bi_private = multi->private; 2851 bio->bi_end_io = multi->end_io; 2852 /* only send an error to the higher layers if it is 2853 * beyond the tolerance of the multi-bio 2854 */ 2855 if (atomic_read(&multi->error) > multi->max_errors) { 2856 err = -EIO; 2857 } else if (err) { 2858 /* 2859 * this bio is actually up to date, we didn't 2860 * go over the max number of errors 2861 */ 2862 set_bit(BIO_UPTODATE, &bio->bi_flags); 2863 err = 0; 2864 } 2865 kfree(multi); 2866 2867 bio_endio(bio, err); 2868 } else if (!is_orig_bio) { 2869 bio_put(bio); 2870 } 2871 } 2872 2873 struct async_sched { 2874 struct bio *bio; 2875 int rw; 2876 struct btrfs_fs_info *info; 2877 struct btrfs_work work; 2878 }; 2879 2880 /* 2881 * see run_scheduled_bios for a description of why bios are collected for 2882 * async submit. 2883 * 2884 * This will add one bio to the pending list for a device and make sure 2885 * the work struct is scheduled. 2886 */ 2887 static noinline int schedule_bio(struct btrfs_root *root, 2888 struct btrfs_device *device, 2889 int rw, struct bio *bio) 2890 { 2891 int should_queue = 1; 2892 struct btrfs_pending_bios *pending_bios; 2893 2894 /* don't bother with additional async steps for reads, right now */ 2895 if (!(rw & (1 << BIO_RW))) { 2896 bio_get(bio); 2897 submit_bio(rw, bio); 2898 bio_put(bio); 2899 return 0; 2900 } 2901 2902 /* 2903 * nr_async_bios allows us to reliably return congestion to the 2904 * higher layers.
Otherwise, the async bio makes it appear we have 2905 * made progress against dirty pages when we've really just put it 2906 * on a queue for later 2907 */ 2908 atomic_inc(&root->fs_info->nr_async_bios); 2909 WARN_ON(bio->bi_next); 2910 bio->bi_next = NULL; 2911 bio->bi_rw |= rw; 2912 2913 spin_lock(&device->io_lock); 2914 if (bio_sync(bio)) 2915 pending_bios = &device->pending_sync_bios; 2916 else 2917 pending_bios = &device->pending_bios; 2918 2919 if (pending_bios->tail) 2920 pending_bios->tail->bi_next = bio; 2921 2922 pending_bios->tail = bio; 2923 if (!pending_bios->head) 2924 pending_bios->head = bio; 2925 if (device->running_pending) 2926 should_queue = 0; 2927 2928 spin_unlock(&device->io_lock); 2929 2930 if (should_queue) 2931 btrfs_queue_worker(&root->fs_info->submit_workers, 2932 &device->work); 2933 return 0; 2934 } 2935 2936 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 2937 int mirror_num, int async_submit) 2938 { 2939 struct btrfs_mapping_tree *map_tree; 2940 struct btrfs_device *dev; 2941 struct bio *first_bio = bio; 2942 u64 logical = (u64)bio->bi_sector << 9; 2943 u64 length = 0; 2944 u64 map_length; 2945 struct btrfs_multi_bio *multi = NULL; 2946 int ret; 2947 int dev_nr = 0; 2948 int total_devs = 1; 2949 2950 length = bio->bi_size; 2951 map_tree = &root->fs_info->mapping_tree; 2952 map_length = length; 2953 2954 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 2955 mirror_num); 2956 BUG_ON(ret); 2957 2958 total_devs = multi->num_stripes; 2959 if (map_length < length) { 2960 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 2961 "len %llu\n", (unsigned long long)logical, 2962 (unsigned long long)length, 2963 (unsigned long long)map_length); 2964 BUG(); 2965 } 2966 multi->end_io = first_bio->bi_end_io; 2967 multi->private = first_bio->bi_private; 2968 multi->orig_bio = first_bio; 2969 atomic_set(&multi->stripes_pending, multi->num_stripes); 2970 2971 while (dev_nr < total_devs) { 2972 if (total_devs > 1) { 2973 if (dev_nr < total_devs - 1) { 2974 bio = bio_clone(first_bio, GFP_NOFS); 2975 BUG_ON(!bio); 2976 } else { 2977 bio = first_bio; 2978 } 2979 bio->bi_private = multi; 2980 bio->bi_end_io = end_bio_multi_stripe; 2981 } 2982 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 2983 dev = multi->stripes[dev_nr].dev; 2984 BUG_ON(rw == WRITE && !dev->writeable); 2985 if (dev && dev->bdev) { 2986 bio->bi_bdev = dev->bdev; 2987 if (async_submit) 2988 schedule_bio(root, dev, rw, bio); 2989 else 2990 submit_bio(rw, bio); 2991 } else { 2992 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 2993 bio->bi_sector = logical >> 9; 2994 bio_endio(bio, -EIO); 2995 } 2996 dev_nr++; 2997 } 2998 if (total_devs == 1) 2999 kfree(multi); 3000 return 0; 3001 } 3002 3003 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 3004 u8 *uuid, u8 *fsid) 3005 { 3006 struct btrfs_device *device; 3007 struct btrfs_fs_devices *cur_devices; 3008 3009 cur_devices = root->fs_info->fs_devices; 3010 while (cur_devices) { 3011 if (!fsid || 3012 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3013 device = __find_device(&cur_devices->devices, 3014 devid, uuid); 3015 if (device) 3016 return device; 3017 } 3018 cur_devices = cur_devices->seed; 3019 } 3020 return NULL; 3021 } 3022 3023 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 3024 u64 devid, u8 *dev_uuid) 3025 { 3026 struct btrfs_device *device; 3027 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 3028 3029 device = 
kzalloc(sizeof(*device), GFP_NOFS); 3030 if (!device) 3031 return NULL; 3032 list_add(&device->dev_list, 3033 &fs_devices->devices); 3034 device->barriers = 1; 3035 device->dev_root = root->fs_info->dev_root; 3036 device->devid = devid; 3037 device->work.func = pending_bios_fn; 3038 device->fs_devices = fs_devices; 3039 fs_devices->num_devices++; 3040 spin_lock_init(&device->io_lock); 3041 INIT_LIST_HEAD(&device->dev_alloc_list); 3042 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3043 return device; 3044 } 3045 3046 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 3047 struct extent_buffer *leaf, 3048 struct btrfs_chunk *chunk) 3049 { 3050 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3051 struct map_lookup *map; 3052 struct extent_map *em; 3053 u64 logical; 3054 u64 length; 3055 u64 devid; 3056 u8 uuid[BTRFS_UUID_SIZE]; 3057 int num_stripes; 3058 int ret; 3059 int i; 3060 3061 logical = key->offset; 3062 length = btrfs_chunk_length(leaf, chunk); 3063 3064 spin_lock(&map_tree->map_tree.lock); 3065 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 3066 spin_unlock(&map_tree->map_tree.lock); 3067 3068 /* already mapped? */ 3069 if (em && em->start <= logical && em->start + em->len > logical) { 3070 free_extent_map(em); 3071 return 0; 3072 } else if (em) { 3073 free_extent_map(em); 3074 } 3075 3076 em = alloc_extent_map(GFP_NOFS); 3077 if (!em) 3078 return -ENOMEM; 3079 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3080 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3081 if (!map) { 3082 free_extent_map(em); 3083 return -ENOMEM; 3084 } 3085 3086 em->bdev = (struct block_device *)map; 3087 em->start = logical; 3088 em->len = length; 3089 em->block_start = 0; 3090 em->block_len = em->len; 3091 3092 map->num_stripes = num_stripes; 3093 map->io_width = btrfs_chunk_io_width(leaf, chunk); 3094 map->io_align = btrfs_chunk_io_align(leaf, chunk); 3095 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 3096 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 3097 map->type = btrfs_chunk_type(leaf, chunk); 3098 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 3099 for (i = 0; i < num_stripes; i++) { 3100 map->stripes[i].physical = 3101 btrfs_stripe_offset_nr(leaf, chunk, i); 3102 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 3103 read_extent_buffer(leaf, uuid, (unsigned long) 3104 btrfs_stripe_dev_uuid_nr(chunk, i), 3105 BTRFS_UUID_SIZE); 3106 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 3107 NULL); 3108 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 3109 kfree(map); 3110 free_extent_map(em); 3111 return -EIO; 3112 } 3113 if (!map->stripes[i].dev) { 3114 map->stripes[i].dev = 3115 add_missing_dev(root, devid, uuid); 3116 if (!map->stripes[i].dev) { 3117 kfree(map); 3118 free_extent_map(em); 3119 return -EIO; 3120 } 3121 } 3122 map->stripes[i].dev->in_fs_metadata = 1; 3123 } 3124 3125 spin_lock(&map_tree->map_tree.lock); 3126 ret = add_extent_mapping(&map_tree->map_tree, em); 3127 spin_unlock(&map_tree->map_tree.lock); 3128 BUG_ON(ret); 3129 free_extent_map(em); 3130 3131 return 0; 3132 } 3133 3134 static int fill_device_from_item(struct extent_buffer *leaf, 3135 struct btrfs_dev_item *dev_item, 3136 struct btrfs_device *device) 3137 { 3138 unsigned long ptr; 3139 3140 device->devid = btrfs_device_id(leaf, dev_item); 3141 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 3142 device->total_bytes = device->disk_total_bytes; 3143 device->bytes_used = 
btrfs_device_bytes_used(leaf, dev_item); 3144 device->type = btrfs_device_type(leaf, dev_item); 3145 device->io_align = btrfs_device_io_align(leaf, dev_item); 3146 device->io_width = btrfs_device_io_width(leaf, dev_item); 3147 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 3148 3149 ptr = (unsigned long)btrfs_device_uuid(dev_item); 3150 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 3151 3152 return 0; 3153 } 3154 3155 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 3156 { 3157 struct btrfs_fs_devices *fs_devices; 3158 int ret; 3159 3160 mutex_lock(&uuid_mutex); 3161 3162 fs_devices = root->fs_info->fs_devices->seed; 3163 while (fs_devices) { 3164 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 3165 ret = 0; 3166 goto out; 3167 } 3168 fs_devices = fs_devices->seed; 3169 } 3170 3171 fs_devices = find_fsid(fsid); 3172 if (!fs_devices) { 3173 ret = -ENOENT; 3174 goto out; 3175 } 3176 3177 fs_devices = clone_fs_devices(fs_devices); 3178 if (IS_ERR(fs_devices)) { 3179 ret = PTR_ERR(fs_devices); 3180 goto out; 3181 } 3182 3183 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 3184 root->fs_info->bdev_holder); 3185 if (ret) 3186 goto out; 3187 3188 if (!fs_devices->seeding) { 3189 __btrfs_close_devices(fs_devices); 3190 free_fs_devices(fs_devices); 3191 ret = -EINVAL; 3192 goto out; 3193 } 3194 3195 fs_devices->seed = root->fs_info->fs_devices->seed; 3196 root->fs_info->fs_devices->seed = fs_devices; 3197 out: 3198 mutex_unlock(&uuid_mutex); 3199 return ret; 3200 } 3201 3202 static int read_one_dev(struct btrfs_root *root, 3203 struct extent_buffer *leaf, 3204 struct btrfs_dev_item *dev_item) 3205 { 3206 struct btrfs_device *device; 3207 u64 devid; 3208 int ret; 3209 u8 fs_uuid[BTRFS_UUID_SIZE]; 3210 u8 dev_uuid[BTRFS_UUID_SIZE]; 3211 3212 devid = btrfs_device_id(leaf, dev_item); 3213 read_extent_buffer(leaf, dev_uuid, 3214 (unsigned long)btrfs_device_uuid(dev_item), 3215 BTRFS_UUID_SIZE); 3216 read_extent_buffer(leaf, fs_uuid, 3217 (unsigned long)btrfs_device_fsid(dev_item), 3218 BTRFS_UUID_SIZE); 3219 3220 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 3221 ret = open_seed_devices(root, fs_uuid); 3222 if (ret && !btrfs_test_opt(root, DEGRADED)) 3223 return ret; 3224 } 3225 3226 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 3227 if (!device || !device->bdev) { 3228 if (!btrfs_test_opt(root, DEGRADED)) 3229 return -EIO; 3230 3231 if (!device) { 3232 printk(KERN_WARNING "warning devid %llu missing\n", 3233 (unsigned long long)devid); 3234 device = add_missing_dev(root, devid, dev_uuid); 3235 if (!device) 3236 return -ENOMEM; 3237 } 3238 } 3239 3240 if (device->fs_devices != root->fs_info->fs_devices) { 3241 BUG_ON(device->writeable); 3242 if (device->generation != 3243 btrfs_device_generation(leaf, dev_item)) 3244 return -EINVAL; 3245 } 3246 3247 fill_device_from_item(leaf, dev_item, device); 3248 device->dev_root = root->fs_info->dev_root; 3249 device->in_fs_metadata = 1; 3250 if (device->writeable) 3251 device->fs_devices->total_rw_bytes += device->total_bytes; 3252 ret = 0; 3253 return ret; 3254 } 3255 3256 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) 3257 { 3258 struct btrfs_dev_item *dev_item; 3259 3260 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, 3261 dev_item); 3262 return read_one_dev(root, buf, dev_item); 3263 } 3264 3265 int btrfs_read_sys_array(struct btrfs_root *root) 3266 { 3267 struct btrfs_super_block *super_copy = 
&root->fs_info->super_copy; 3268 struct extent_buffer *sb; 3269 struct btrfs_disk_key *disk_key; 3270 struct btrfs_chunk *chunk; 3271 u8 *ptr; 3272 unsigned long sb_ptr; 3273 int ret = 0; 3274 u32 num_stripes; 3275 u32 array_size; 3276 u32 len = 0; 3277 u32 cur; 3278 struct btrfs_key key; 3279 3280 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 3281 BTRFS_SUPER_INFO_SIZE); 3282 if (!sb) 3283 return -ENOMEM; 3284 btrfs_set_buffer_uptodate(sb); 3285 btrfs_set_buffer_lockdep_class(sb, 0); 3286 3287 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3288 array_size = btrfs_super_sys_array_size(super_copy); 3289 3290 ptr = super_copy->sys_chunk_array; 3291 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 3292 cur = 0; 3293 3294 while (cur < array_size) { 3295 disk_key = (struct btrfs_disk_key *)ptr; 3296 btrfs_disk_key_to_cpu(&key, disk_key); 3297 3298 len = sizeof(*disk_key); ptr += len; 3299 sb_ptr += len; 3300 cur += len; 3301 3302 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3303 chunk = (struct btrfs_chunk *)sb_ptr; 3304 ret = read_one_chunk(root, &key, sb, chunk); 3305 if (ret) 3306 break; 3307 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 3308 len = btrfs_chunk_item_size(num_stripes); 3309 } else { 3310 ret = -EIO; 3311 break; 3312 } 3313 ptr += len; 3314 sb_ptr += len; 3315 cur += len; 3316 } 3317 free_extent_buffer(sb); 3318 return ret; 3319 } 3320 3321 int btrfs_read_chunk_tree(struct btrfs_root *root) 3322 { 3323 struct btrfs_path *path; 3324 struct extent_buffer *leaf; 3325 struct btrfs_key key; 3326 struct btrfs_key found_key; 3327 int ret; 3328 int slot; 3329 3330 root = root->fs_info->chunk_root; 3331 3332 path = btrfs_alloc_path(); 3333 if (!path) 3334 return -ENOMEM; 3335 3336 /* first we search for all of the device items, and then we 3337 * read in all of the chunk items. This way we can create chunk 3338 * mappings that reference all of the devices that are afound 3339 */ 3340 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 3341 key.offset = 0; 3342 key.type = 0; 3343 again: 3344 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3345 while (1) { 3346 leaf = path->nodes[0]; 3347 slot = path->slots[0]; 3348 if (slot >= btrfs_header_nritems(leaf)) { 3349 ret = btrfs_next_leaf(root, path); 3350 if (ret == 0) 3351 continue; 3352 if (ret < 0) 3353 goto error; 3354 break; 3355 } 3356 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3357 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3358 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) 3359 break; 3360 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 3361 struct btrfs_dev_item *dev_item; 3362 dev_item = btrfs_item_ptr(leaf, slot, 3363 struct btrfs_dev_item); 3364 ret = read_one_dev(root, leaf, dev_item); 3365 if (ret) 3366 goto error; 3367 } 3368 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 3369 struct btrfs_chunk *chunk; 3370 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3371 ret = read_one_chunk(root, &found_key, leaf, chunk); 3372 if (ret) 3373 goto error; 3374 } 3375 path->slots[0]++; 3376 } 3377 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3378 key.objectid = 0; 3379 btrfs_release_path(root, path); 3380 goto again; 3381 } 3382 ret = 0; 3383 error: 3384 btrfs_free_path(path); 3385 return ret; 3386 } 3387