1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <linux/ratelimit.h> 27 #include <linux/kthread.h> 28 #include <linux/raid/pq.h> 29 #include <linux/semaphore.h> 30 #include <asm/div64.h> 31 #include "ctree.h" 32 #include "extent_map.h" 33 #include "disk-io.h" 34 #include "transaction.h" 35 #include "print-tree.h" 36 #include "volumes.h" 37 #include "raid56.h" 38 #include "async-thread.h" 39 #include "check-integrity.h" 40 #include "rcu-string.h" 41 #include "math.h" 42 #include "dev-replace.h" 43 44 static int init_first_rw_device(struct btrfs_trans_handle *trans, 45 struct btrfs_root *root, 46 struct btrfs_device *device); 47 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 48 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 49 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 50 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 51 52 static DEFINE_MUTEX(uuid_mutex); 53 static LIST_HEAD(fs_uuids); 54 55 static void lock_chunks(struct btrfs_root *root) 56 { 57 mutex_lock(&root->fs_info->chunk_mutex); 58 } 59 60 static void unlock_chunks(struct btrfs_root *root) 61 { 62 mutex_unlock(&root->fs_info->chunk_mutex); 63 } 64 65 static struct btrfs_fs_devices *__alloc_fs_devices(void) 66 { 67 struct btrfs_fs_devices *fs_devs; 68 69 fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); 70 if (!fs_devs) 71 return ERR_PTR(-ENOMEM); 72 73 mutex_init(&fs_devs->device_list_mutex); 74 75 INIT_LIST_HEAD(&fs_devs->devices); 76 INIT_LIST_HEAD(&fs_devs->alloc_list); 77 INIT_LIST_HEAD(&fs_devs->list); 78 79 return fs_devs; 80 } 81 82 /** 83 * alloc_fs_devices - allocate struct btrfs_fs_devices 84 * @fsid: a pointer to UUID for this FS. If NULL a new UUID is 85 * generated. 86 * 87 * Return: a pointer to a new &struct btrfs_fs_devices on success; 88 * ERR_PTR() on error. Returned struct is not linked onto any lists and 89 * can be destroyed with kfree() right away. 
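 *
 * Minimal usage sketch (illustrative, not part of the original kernel-doc;
 * passing NULL asks for a freshly generated fsid):
 *
 *	fs_devs = alloc_fs_devices(NULL);
 *	if (IS_ERR(fs_devs))
 *		return PTR_ERR(fs_devs);
 *	...
 *	kfree(fs_devs);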
90 */ 91 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 92 { 93 struct btrfs_fs_devices *fs_devs; 94 95 fs_devs = __alloc_fs_devices(); 96 if (IS_ERR(fs_devs)) 97 return fs_devs; 98 99 if (fsid) 100 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 101 else 102 generate_random_uuid(fs_devs->fsid); 103 104 return fs_devs; 105 } 106 107 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 108 { 109 struct btrfs_device *device; 110 WARN_ON(fs_devices->opened); 111 while (!list_empty(&fs_devices->devices)) { 112 device = list_entry(fs_devices->devices.next, 113 struct btrfs_device, dev_list); 114 list_del(&device->dev_list); 115 rcu_string_free(device->name); 116 kfree(device); 117 } 118 kfree(fs_devices); 119 } 120 121 static void btrfs_kobject_uevent(struct block_device *bdev, 122 enum kobject_action action) 123 { 124 int ret; 125 126 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 127 if (ret) 128 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", 129 action, 130 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 131 &disk_to_dev(bdev->bd_disk)->kobj); 132 } 133 134 void btrfs_cleanup_fs_uuids(void) 135 { 136 struct btrfs_fs_devices *fs_devices; 137 138 while (!list_empty(&fs_uuids)) { 139 fs_devices = list_entry(fs_uuids.next, 140 struct btrfs_fs_devices, list); 141 list_del(&fs_devices->list); 142 free_fs_devices(fs_devices); 143 } 144 } 145 146 static struct btrfs_device *__alloc_device(void) 147 { 148 struct btrfs_device *dev; 149 150 dev = kzalloc(sizeof(*dev), GFP_NOFS); 151 if (!dev) 152 return ERR_PTR(-ENOMEM); 153 154 INIT_LIST_HEAD(&dev->dev_list); 155 INIT_LIST_HEAD(&dev->dev_alloc_list); 156 157 spin_lock_init(&dev->io_lock); 158 159 spin_lock_init(&dev->reada_lock); 160 atomic_set(&dev->reada_in_flight, 0); 161 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); 162 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); 163 164 return dev; 165 } 166 167 static noinline struct btrfs_device *__find_device(struct list_head *head, 168 u64 devid, u8 *uuid) 169 { 170 struct btrfs_device *dev; 171 172 list_for_each_entry(dev, head, dev_list) { 173 if (dev->devid == devid && 174 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 175 return dev; 176 } 177 } 178 return NULL; 179 } 180 181 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 182 { 183 struct btrfs_fs_devices *fs_devices; 184 185 list_for_each_entry(fs_devices, &fs_uuids, list) { 186 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 187 return fs_devices; 188 } 189 return NULL; 190 } 191 192 static int 193 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 194 int flush, struct block_device **bdev, 195 struct buffer_head **bh) 196 { 197 int ret; 198 199 *bdev = blkdev_get_by_path(device_path, flags, holder); 200 201 if (IS_ERR(*bdev)) { 202 ret = PTR_ERR(*bdev); 203 printk(KERN_INFO "BTRFS: open %s failed\n", device_path); 204 goto error; 205 } 206 207 if (flush) 208 filemap_write_and_wait((*bdev)->bd_inode->i_mapping); 209 ret = set_blocksize(*bdev, 4096); 210 if (ret) { 211 blkdev_put(*bdev, flags); 212 goto error; 213 } 214 invalidate_bdev(*bdev); 215 *bh = btrfs_read_dev_super(*bdev); 216 if (!*bh) { 217 ret = -EINVAL; 218 blkdev_put(*bdev, flags); 219 goto error; 220 } 221 222 return 0; 223 224 error: 225 *bdev = NULL; 226 *bh = NULL; 227 return ret; 228 } 229 230 static void requeue_list(struct btrfs_pending_bios *pending_bios, 231 struct bio *head, struct bio *tail) 232 { 233 234 struct bio 
*old_head; 235 236 old_head = pending_bios->head; 237 pending_bios->head = head; 238 if (pending_bios->tail) 239 tail->bi_next = old_head; 240 else 241 pending_bios->tail = tail; 242 } 243 244 /* 245 * we try to collect pending bios for a device so we don't get a large 246 * number of procs sending bios down to the same device. This greatly 247 * improves the schedulers ability to collect and merge the bios. 248 * 249 * But, it also turns into a long list of bios to process and that is sure 250 * to eventually make the worker thread block. The solution here is to 251 * make some progress and then put this work struct back at the end of 252 * the list if the block device is congested. This way, multiple devices 253 * can make progress from a single worker thread. 254 */ 255 static noinline void run_scheduled_bios(struct btrfs_device *device) 256 { 257 struct bio *pending; 258 struct backing_dev_info *bdi; 259 struct btrfs_fs_info *fs_info; 260 struct btrfs_pending_bios *pending_bios; 261 struct bio *tail; 262 struct bio *cur; 263 int again = 0; 264 unsigned long num_run; 265 unsigned long batch_run = 0; 266 unsigned long limit; 267 unsigned long last_waited = 0; 268 int force_reg = 0; 269 int sync_pending = 0; 270 struct blk_plug plug; 271 272 /* 273 * this function runs all the bios we've collected for 274 * a particular device. We don't want to wander off to 275 * another device without first sending all of these down. 276 * So, setup a plug here and finish it off before we return 277 */ 278 blk_start_plug(&plug); 279 280 bdi = blk_get_backing_dev_info(device->bdev); 281 fs_info = device->dev_root->fs_info; 282 limit = btrfs_async_submit_limit(fs_info); 283 limit = limit * 2 / 3; 284 285 loop: 286 spin_lock(&device->io_lock); 287 288 loop_lock: 289 num_run = 0; 290 291 /* take all the bios off the list at once and process them 292 * later on (without the lock held). But, remember the 293 * tail and other pointers so the bios can be properly reinserted 294 * into the list if we hit congestion 295 */ 296 if (!force_reg && device->pending_sync_bios.head) { 297 pending_bios = &device->pending_sync_bios; 298 force_reg = 1; 299 } else { 300 pending_bios = &device->pending_bios; 301 force_reg = 0; 302 } 303 304 pending = pending_bios->head; 305 tail = pending_bios->tail; 306 WARN_ON(pending && !tail); 307 308 /* 309 * if pending was null this time around, no bios need processing 310 * at all and we can stop. Otherwise it'll loop back up again 311 * and do an additional check so no bios are missed. 312 * 313 * device->running_pending is used to synchronize with the 314 * schedule_bio code. 
315 */ 316 if (device->pending_sync_bios.head == NULL && 317 device->pending_bios.head == NULL) { 318 again = 0; 319 device->running_pending = 0; 320 } else { 321 again = 1; 322 device->running_pending = 1; 323 } 324 325 pending_bios->head = NULL; 326 pending_bios->tail = NULL; 327 328 spin_unlock(&device->io_lock); 329 330 while (pending) { 331 332 rmb(); 333 /* we want to work on both lists, but do more bios on the 334 * sync list than the regular list 335 */ 336 if ((num_run > 32 && 337 pending_bios != &device->pending_sync_bios && 338 device->pending_sync_bios.head) || 339 (num_run > 64 && pending_bios == &device->pending_sync_bios && 340 device->pending_bios.head)) { 341 spin_lock(&device->io_lock); 342 requeue_list(pending_bios, pending, tail); 343 goto loop_lock; 344 } 345 346 cur = pending; 347 pending = pending->bi_next; 348 cur->bi_next = NULL; 349 350 if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 351 waitqueue_active(&fs_info->async_submit_wait)) 352 wake_up(&fs_info->async_submit_wait); 353 354 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 355 356 /* 357 * if we're doing the sync list, record that our 358 * plug has some sync requests on it 359 * 360 * If we're doing the regular list and there are 361 * sync requests sitting around, unplug before 362 * we add more 363 */ 364 if (pending_bios == &device->pending_sync_bios) { 365 sync_pending = 1; 366 } else if (sync_pending) { 367 blk_finish_plug(&plug); 368 blk_start_plug(&plug); 369 sync_pending = 0; 370 } 371 372 btrfsic_submit_bio(cur->bi_rw, cur); 373 num_run++; 374 batch_run++; 375 if (need_resched()) 376 cond_resched(); 377 378 /* 379 * we made progress, there is more work to do and the bdi 380 * is now congested. Back off and let other work structs 381 * run instead 382 */ 383 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 384 fs_info->fs_devices->open_devices > 1) { 385 struct io_context *ioc; 386 387 ioc = current->io_context; 388 389 /* 390 * the main goal here is that we don't want to 391 * block if we're going to be able to submit 392 * more requests without blocking. 393 * 394 * This code does two great things, it pokes into 395 * the elevator code from a filesystem _and_ 396 * it makes assumptions about how batching works. 397 */ 398 if (ioc && ioc->nr_batch_requests > 0 && 399 time_before(jiffies, ioc->last_waited + HZ/50UL) && 400 (last_waited == 0 || 401 ioc->last_waited == last_waited)) { 402 /* 403 * we want to go through our batch of 404 * requests and stop. 
So, we copy out 405 * the ioc->last_waited time and test 406 * against it before looping 407 */ 408 last_waited = ioc->last_waited; 409 if (need_resched()) 410 cond_resched(); 411 continue; 412 } 413 spin_lock(&device->io_lock); 414 requeue_list(pending_bios, pending, tail); 415 device->running_pending = 1; 416 417 spin_unlock(&device->io_lock); 418 btrfs_queue_work(fs_info->submit_workers, 419 &device->work); 420 goto done; 421 } 422 /* unplug every 64 requests just for good measure */ 423 if (batch_run % 64 == 0) { 424 blk_finish_plug(&plug); 425 blk_start_plug(&plug); 426 sync_pending = 0; 427 } 428 } 429 430 cond_resched(); 431 if (again) 432 goto loop; 433 434 spin_lock(&device->io_lock); 435 if (device->pending_bios.head || device->pending_sync_bios.head) 436 goto loop_lock; 437 spin_unlock(&device->io_lock); 438 439 done: 440 blk_finish_plug(&plug); 441 } 442 443 static void pending_bios_fn(struct btrfs_work *work) 444 { 445 struct btrfs_device *device; 446 447 device = container_of(work, struct btrfs_device, work); 448 run_scheduled_bios(device); 449 } 450 451 /* 452 * Add new device to list of registered devices 453 * 454 * Returns: 455 * 1 - first time device is seen 456 * 0 - device already known 457 * < 0 - error 458 */ 459 static noinline int device_list_add(const char *path, 460 struct btrfs_super_block *disk_super, 461 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 462 { 463 struct btrfs_device *device; 464 struct btrfs_fs_devices *fs_devices; 465 struct rcu_string *name; 466 int ret = 0; 467 u64 found_transid = btrfs_super_generation(disk_super); 468 469 fs_devices = find_fsid(disk_super->fsid); 470 if (!fs_devices) { 471 fs_devices = alloc_fs_devices(disk_super->fsid); 472 if (IS_ERR(fs_devices)) 473 return PTR_ERR(fs_devices); 474 475 list_add(&fs_devices->list, &fs_uuids); 476 fs_devices->latest_devid = devid; 477 fs_devices->latest_trans = found_transid; 478 479 device = NULL; 480 } else { 481 device = __find_device(&fs_devices->devices, devid, 482 disk_super->dev_item.uuid); 483 } 484 if (!device) { 485 if (fs_devices->opened) 486 return -EBUSY; 487 488 device = btrfs_alloc_device(NULL, &devid, 489 disk_super->dev_item.uuid); 490 if (IS_ERR(device)) { 491 /* we can safely leave the fs_devices entry around */ 492 return PTR_ERR(device); 493 } 494 495 name = rcu_string_strdup(path, GFP_NOFS); 496 if (!name) { 497 kfree(device); 498 return -ENOMEM; 499 } 500 rcu_assign_pointer(device->name, name); 501 502 mutex_lock(&fs_devices->device_list_mutex); 503 list_add_rcu(&device->dev_list, &fs_devices->devices); 504 fs_devices->num_devices++; 505 mutex_unlock(&fs_devices->device_list_mutex); 506 507 ret = 1; 508 device->fs_devices = fs_devices; 509 } else if (!device->name || strcmp(device->name->str, path)) { 510 name = rcu_string_strdup(path, GFP_NOFS); 511 if (!name) 512 return -ENOMEM; 513 rcu_string_free(device->name); 514 rcu_assign_pointer(device->name, name); 515 if (device->missing) { 516 fs_devices->missing_devices--; 517 device->missing = 0; 518 } 519 } 520 521 if (found_transid > fs_devices->latest_trans) { 522 fs_devices->latest_devid = devid; 523 fs_devices->latest_trans = found_transid; 524 } 525 *fs_devices_ret = fs_devices; 526 527 return ret; 528 } 529 530 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 531 { 532 struct btrfs_fs_devices *fs_devices; 533 struct btrfs_device *device; 534 struct btrfs_device *orig_dev; 535 536 fs_devices = alloc_fs_devices(orig->fsid); 537 if (IS_ERR(fs_devices)) 538 return fs_devices; 539 
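	/*
	 * Aside (illustrative sketch, not part of the original source): the
	 * 1 / 0 / negative return convention of device_list_add() above means
	 * a typical caller looks like
	 *
	 *	ret = device_list_add(path, disk_super, devid, &fs_devices);
	 *	if (ret < 0)
	 *		return ret;
	 *	if (ret > 0)
	 *		pr_info("BTRFS: new device %s registered\n", path);
	 *
	 * btrfs_scan_one_device() below is the real caller and only prints
	 * device details for the "first time seen" case before clearing ret.
	 */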
540 fs_devices->latest_devid = orig->latest_devid; 541 fs_devices->latest_trans = orig->latest_trans; 542 fs_devices->total_devices = orig->total_devices; 543 544 /* We have held the volume lock, it is safe to get the devices. */ 545 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 546 struct rcu_string *name; 547 548 device = btrfs_alloc_device(NULL, &orig_dev->devid, 549 orig_dev->uuid); 550 if (IS_ERR(device)) 551 goto error; 552 553 /* 554 * This is ok to do without rcu read locked because we hold the 555 * uuid mutex so nothing we touch in here is going to disappear. 556 */ 557 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 558 if (!name) { 559 kfree(device); 560 goto error; 561 } 562 rcu_assign_pointer(device->name, name); 563 564 list_add(&device->dev_list, &fs_devices->devices); 565 device->fs_devices = fs_devices; 566 fs_devices->num_devices++; 567 } 568 return fs_devices; 569 error: 570 free_fs_devices(fs_devices); 571 return ERR_PTR(-ENOMEM); 572 } 573 574 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 575 struct btrfs_fs_devices *fs_devices, int step) 576 { 577 struct btrfs_device *device, *next; 578 579 struct block_device *latest_bdev = NULL; 580 u64 latest_devid = 0; 581 u64 latest_transid = 0; 582 583 mutex_lock(&uuid_mutex); 584 again: 585 /* This is the initialized path, it is safe to release the devices. */ 586 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 587 if (device->in_fs_metadata) { 588 if (!device->is_tgtdev_for_dev_replace && 589 (!latest_transid || 590 device->generation > latest_transid)) { 591 latest_devid = device->devid; 592 latest_transid = device->generation; 593 latest_bdev = device->bdev; 594 } 595 continue; 596 } 597 598 if (device->devid == BTRFS_DEV_REPLACE_DEVID) { 599 /* 600 * In the first step, keep the device which has 601 * the correct fsid and the devid that is used 602 * for the dev_replace procedure. 603 * In the second step, the dev_replace state is 604 * read from the device tree and it is known 605 * whether the procedure is really active or 606 * not, which means whether this device is 607 * used or whether it should be removed. 
608 */ 609 if (step == 0 || device->is_tgtdev_for_dev_replace) { 610 continue; 611 } 612 } 613 if (device->bdev) { 614 blkdev_put(device->bdev, device->mode); 615 device->bdev = NULL; 616 fs_devices->open_devices--; 617 } 618 if (device->writeable) { 619 list_del_init(&device->dev_alloc_list); 620 device->writeable = 0; 621 if (!device->is_tgtdev_for_dev_replace) 622 fs_devices->rw_devices--; 623 } 624 list_del_init(&device->dev_list); 625 fs_devices->num_devices--; 626 rcu_string_free(device->name); 627 kfree(device); 628 } 629 630 if (fs_devices->seed) { 631 fs_devices = fs_devices->seed; 632 goto again; 633 } 634 635 fs_devices->latest_bdev = latest_bdev; 636 fs_devices->latest_devid = latest_devid; 637 fs_devices->latest_trans = latest_transid; 638 639 mutex_unlock(&uuid_mutex); 640 } 641 642 static void __free_device(struct work_struct *work) 643 { 644 struct btrfs_device *device; 645 646 device = container_of(work, struct btrfs_device, rcu_work); 647 648 if (device->bdev) 649 blkdev_put(device->bdev, device->mode); 650 651 rcu_string_free(device->name); 652 kfree(device); 653 } 654 655 static void free_device(struct rcu_head *head) 656 { 657 struct btrfs_device *device; 658 659 device = container_of(head, struct btrfs_device, rcu); 660 661 INIT_WORK(&device->rcu_work, __free_device); 662 schedule_work(&device->rcu_work); 663 } 664 665 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 666 { 667 struct btrfs_device *device; 668 669 if (--fs_devices->opened > 0) 670 return 0; 671 672 mutex_lock(&fs_devices->device_list_mutex); 673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 674 struct btrfs_device *new_device; 675 struct rcu_string *name; 676 677 if (device->bdev) 678 fs_devices->open_devices--; 679 680 if (device->writeable && 681 device->devid != BTRFS_DEV_REPLACE_DEVID) { 682 list_del_init(&device->dev_alloc_list); 683 fs_devices->rw_devices--; 684 } 685 686 if (device->can_discard) 687 fs_devices->num_can_discard--; 688 if (device->missing) 689 fs_devices->missing_devices--; 690 691 new_device = btrfs_alloc_device(NULL, &device->devid, 692 device->uuid); 693 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 694 695 /* Safe because we are under uuid_mutex */ 696 if (device->name) { 697 name = rcu_string_strdup(device->name->str, GFP_NOFS); 698 BUG_ON(!name); /* -ENOMEM */ 699 rcu_assign_pointer(new_device->name, name); 700 } 701 702 list_replace_rcu(&device->dev_list, &new_device->dev_list); 703 new_device->fs_devices = device->fs_devices; 704 705 call_rcu(&device->rcu, free_device); 706 } 707 mutex_unlock(&fs_devices->device_list_mutex); 708 709 WARN_ON(fs_devices->open_devices); 710 WARN_ON(fs_devices->rw_devices); 711 fs_devices->opened = 0; 712 fs_devices->seeding = 0; 713 714 return 0; 715 } 716 717 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 718 { 719 struct btrfs_fs_devices *seed_devices = NULL; 720 int ret; 721 722 mutex_lock(&uuid_mutex); 723 ret = __btrfs_close_devices(fs_devices); 724 if (!fs_devices->opened) { 725 seed_devices = fs_devices->seed; 726 fs_devices->seed = NULL; 727 } 728 mutex_unlock(&uuid_mutex); 729 730 while (seed_devices) { 731 fs_devices = seed_devices; 732 seed_devices = fs_devices->seed; 733 __btrfs_close_devices(fs_devices); 734 free_fs_devices(fs_devices); 735 } 736 /* 737 * Wait for rcu kworkers under __btrfs_close_devices 738 * to finish all blkdev_puts so device is really 739 * free when umount is done. 
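 *
 * For reference, the deferred free path built from the helpers above
 * (illustrative summary of __free_device()/free_device()):
 *
 *	call_rcu(&device->rcu, free_device)
 *	  free_device()                   runs after an RCU grace period
 *	    INIT_WORK(&device->rcu_work, __free_device)
 *	    schedule_work(&device->rcu_work)
 *	  __free_device()                 runs from a kworker
 *	    blkdev_put(device->bdev, device->mode)
 *	    rcu_string_free(device->name)
 *	    kfree(device)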
740 */ 741 rcu_barrier(); 742 return ret; 743 } 744 745 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 746 fmode_t flags, void *holder) 747 { 748 struct request_queue *q; 749 struct block_device *bdev; 750 struct list_head *head = &fs_devices->devices; 751 struct btrfs_device *device; 752 struct block_device *latest_bdev = NULL; 753 struct buffer_head *bh; 754 struct btrfs_super_block *disk_super; 755 u64 latest_devid = 0; 756 u64 latest_transid = 0; 757 u64 devid; 758 int seeding = 1; 759 int ret = 0; 760 761 flags |= FMODE_EXCL; 762 763 list_for_each_entry(device, head, dev_list) { 764 if (device->bdev) 765 continue; 766 if (!device->name) 767 continue; 768 769 /* Just open everything we can; ignore failures here */ 770 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 771 &bdev, &bh)) 772 continue; 773 774 disk_super = (struct btrfs_super_block *)bh->b_data; 775 devid = btrfs_stack_device_id(&disk_super->dev_item); 776 if (devid != device->devid) 777 goto error_brelse; 778 779 if (memcmp(device->uuid, disk_super->dev_item.uuid, 780 BTRFS_UUID_SIZE)) 781 goto error_brelse; 782 783 device->generation = btrfs_super_generation(disk_super); 784 if (!latest_transid || device->generation > latest_transid) { 785 latest_devid = devid; 786 latest_transid = device->generation; 787 latest_bdev = bdev; 788 } 789 790 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 791 device->writeable = 0; 792 } else { 793 device->writeable = !bdev_read_only(bdev); 794 seeding = 0; 795 } 796 797 q = bdev_get_queue(bdev); 798 if (blk_queue_discard(q)) { 799 device->can_discard = 1; 800 fs_devices->num_can_discard++; 801 } 802 803 device->bdev = bdev; 804 device->in_fs_metadata = 0; 805 device->mode = flags; 806 807 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 808 fs_devices->rotating = 1; 809 810 fs_devices->open_devices++; 811 if (device->writeable && 812 device->devid != BTRFS_DEV_REPLACE_DEVID) { 813 fs_devices->rw_devices++; 814 list_add(&device->dev_alloc_list, 815 &fs_devices->alloc_list); 816 } 817 brelse(bh); 818 continue; 819 820 error_brelse: 821 brelse(bh); 822 blkdev_put(bdev, flags); 823 continue; 824 } 825 if (fs_devices->open_devices == 0) { 826 ret = -EINVAL; 827 goto out; 828 } 829 fs_devices->seeding = seeding; 830 fs_devices->opened = 1; 831 fs_devices->latest_bdev = latest_bdev; 832 fs_devices->latest_devid = latest_devid; 833 fs_devices->latest_trans = latest_transid; 834 fs_devices->total_rw_bytes = 0; 835 out: 836 return ret; 837 } 838 839 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 840 fmode_t flags, void *holder) 841 { 842 int ret; 843 844 mutex_lock(&uuid_mutex); 845 if (fs_devices->opened) { 846 fs_devices->opened++; 847 ret = 0; 848 } else { 849 ret = __btrfs_open_devices(fs_devices, flags, holder); 850 } 851 mutex_unlock(&uuid_mutex); 852 return ret; 853 } 854 855 /* 856 * Look for a btrfs signature on a device. This may be called out of the mount path 857 * and we are not allowed to call set_blocksize during the scan. 
The superblock 858 * is read via pagecache 859 */ 860 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 861 struct btrfs_fs_devices **fs_devices_ret) 862 { 863 struct btrfs_super_block *disk_super; 864 struct block_device *bdev; 865 struct page *page; 866 void *p; 867 int ret = -EINVAL; 868 u64 devid; 869 u64 transid; 870 u64 total_devices; 871 u64 bytenr; 872 pgoff_t index; 873 874 /* 875 * we would like to check all the supers, but that would make 876 * a btrfs mount succeed after a mkfs from a different FS. 877 * So, we need to add a special mount option to scan for 878 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 879 */ 880 bytenr = btrfs_sb_offset(0); 881 flags |= FMODE_EXCL; 882 mutex_lock(&uuid_mutex); 883 884 bdev = blkdev_get_by_path(path, flags, holder); 885 886 if (IS_ERR(bdev)) { 887 ret = PTR_ERR(bdev); 888 goto error; 889 } 890 891 /* make sure our super fits in the device */ 892 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) 893 goto error_bdev_put; 894 895 /* make sure our super fits in the page */ 896 if (sizeof(*disk_super) > PAGE_CACHE_SIZE) 897 goto error_bdev_put; 898 899 /* make sure our super doesn't straddle pages on disk */ 900 index = bytenr >> PAGE_CACHE_SHIFT; 901 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) 902 goto error_bdev_put; 903 904 /* pull in the page with our super */ 905 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 906 index, GFP_NOFS); 907 908 if (IS_ERR_OR_NULL(page)) 909 goto error_bdev_put; 910 911 p = kmap(page); 912 913 /* align our pointer to the offset of the super block */ 914 disk_super = p + (bytenr & ~PAGE_CACHE_MASK); 915 916 if (btrfs_super_bytenr(disk_super) != bytenr || 917 btrfs_super_magic(disk_super) != BTRFS_MAGIC) 918 goto error_unmap; 919 920 devid = btrfs_stack_device_id(&disk_super->dev_item); 921 transid = btrfs_super_generation(disk_super); 922 total_devices = btrfs_super_num_devices(disk_super); 923 924 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 925 if (ret > 0) { 926 if (disk_super->label[0]) { 927 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 928 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 929 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); 930 } else { 931 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); 932 } 933 934 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 935 ret = 0; 936 } 937 if (!ret && fs_devices_ret) 938 (*fs_devices_ret)->total_devices = total_devices; 939 940 error_unmap: 941 kunmap(page); 942 page_cache_release(page); 943 944 error_bdev_put: 945 blkdev_put(bdev, flags); 946 error: 947 mutex_unlock(&uuid_mutex); 948 return ret; 949 } 950 951 /* helper to account the used device space in the range */ 952 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 953 u64 end, u64 *length) 954 { 955 struct btrfs_key key; 956 struct btrfs_root *root = device->dev_root; 957 struct btrfs_dev_extent *dev_extent; 958 struct btrfs_path *path; 959 u64 extent_end; 960 int ret; 961 int slot; 962 struct extent_buffer *l; 963 964 *length = 0; 965 966 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 967 return 0; 968 969 path = btrfs_alloc_path(); 970 if (!path) 971 return -ENOMEM; 972 path->reada = 2; 973 974 key.objectid = device->devid; 975 key.offset = start; 976 key.type = BTRFS_DEV_EXTENT_KEY; 977 978 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 979 if (ret < 0) 980 goto out; 981 if (ret > 0) { 982 ret = 
btrfs_previous_item(root, path, key.objectid, key.type); 983 if (ret < 0) 984 goto out; 985 } 986 987 while (1) { 988 l = path->nodes[0]; 989 slot = path->slots[0]; 990 if (slot >= btrfs_header_nritems(l)) { 991 ret = btrfs_next_leaf(root, path); 992 if (ret == 0) 993 continue; 994 if (ret < 0) 995 goto out; 996 997 break; 998 } 999 btrfs_item_key_to_cpu(l, &key, slot); 1000 1001 if (key.objectid < device->devid) 1002 goto next; 1003 1004 if (key.objectid > device->devid) 1005 break; 1006 1007 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1008 goto next; 1009 1010 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1011 extent_end = key.offset + btrfs_dev_extent_length(l, 1012 dev_extent); 1013 if (key.offset <= start && extent_end > end) { 1014 *length = end - start + 1; 1015 break; 1016 } else if (key.offset <= start && extent_end > start) 1017 *length += extent_end - start; 1018 else if (key.offset > start && extent_end <= end) 1019 *length += extent_end - key.offset; 1020 else if (key.offset > start && key.offset <= end) { 1021 *length += end - key.offset + 1; 1022 break; 1023 } else if (key.offset > end) 1024 break; 1025 1026 next: 1027 path->slots[0]++; 1028 } 1029 ret = 0; 1030 out: 1031 btrfs_free_path(path); 1032 return ret; 1033 } 1034 1035 static int contains_pending_extent(struct btrfs_trans_handle *trans, 1036 struct btrfs_device *device, 1037 u64 *start, u64 len) 1038 { 1039 struct extent_map *em; 1040 int ret = 0; 1041 1042 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1043 struct map_lookup *map; 1044 int i; 1045 1046 map = (struct map_lookup *)em->bdev; 1047 for (i = 0; i < map->num_stripes; i++) { 1048 if (map->stripes[i].dev != device) 1049 continue; 1050 if (map->stripes[i].physical >= *start + len || 1051 map->stripes[i].physical + em->orig_block_len <= 1052 *start) 1053 continue; 1054 *start = map->stripes[i].physical + 1055 em->orig_block_len; 1056 ret = 1; 1057 } 1058 } 1059 1060 return ret; 1061 } 1062 1063 1064 /* 1065 * find_free_dev_extent - find free space in the specified device 1066 * @device: the device which we search the free space in 1067 * @num_bytes: the size of the free space that we need 1068 * @start: store the start of the free space. 1069 * @len: the size of the free space. that we find, or the size of the max 1070 * free space if we don't find suitable free space 1071 * 1072 * this uses a pretty simple search, the expectation is that it is 1073 * called very infrequently and that a given device has a small number 1074 * of extents 1075 * 1076 * @start is used to store the start of the free space if we find. But if we 1077 * don't find suitable free space, it will be used to store the start position 1078 * of the max free space. 1079 * 1080 * @len is used to store the size of the free space that we find. 1081 * But if we don't find suitable free space, it is used to store the size of 1082 * the max free space. 
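 *
 * Illustrative example (not part of the original comment): asking for
 * num_bytes = 1GiB on a device whose largest hole is only 512MiB fails
 * with -ENOSPC, but @start and @len still describe that 512MiB hole, so
 * the caller can decide whether a smaller allocation is acceptable:
 *
 *	ret = find_free_dev_extent(trans, device, num_bytes, &start, &len);
 *	if (ret == -ENOSPC)
 *		len holds the largest hole that was found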
1083 */ 1084 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1085 struct btrfs_device *device, u64 num_bytes, 1086 u64 *start, u64 *len) 1087 { 1088 struct btrfs_key key; 1089 struct btrfs_root *root = device->dev_root; 1090 struct btrfs_dev_extent *dev_extent; 1091 struct btrfs_path *path; 1092 u64 hole_size; 1093 u64 max_hole_start; 1094 u64 max_hole_size; 1095 u64 extent_end; 1096 u64 search_start; 1097 u64 search_end = device->total_bytes; 1098 int ret; 1099 int slot; 1100 struct extent_buffer *l; 1101 1102 /* FIXME use last free of some kind */ 1103 1104 /* we don't want to overwrite the superblock on the drive, 1105 * so we make sure to start at an offset of at least 1MB 1106 */ 1107 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1108 1109 path = btrfs_alloc_path(); 1110 if (!path) 1111 return -ENOMEM; 1112 again: 1113 max_hole_start = search_start; 1114 max_hole_size = 0; 1115 hole_size = 0; 1116 1117 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1118 ret = -ENOSPC; 1119 goto out; 1120 } 1121 1122 path->reada = 2; 1123 path->search_commit_root = 1; 1124 path->skip_locking = 1; 1125 1126 key.objectid = device->devid; 1127 key.offset = search_start; 1128 key.type = BTRFS_DEV_EXTENT_KEY; 1129 1130 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1131 if (ret < 0) 1132 goto out; 1133 if (ret > 0) { 1134 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1135 if (ret < 0) 1136 goto out; 1137 } 1138 1139 while (1) { 1140 l = path->nodes[0]; 1141 slot = path->slots[0]; 1142 if (slot >= btrfs_header_nritems(l)) { 1143 ret = btrfs_next_leaf(root, path); 1144 if (ret == 0) 1145 continue; 1146 if (ret < 0) 1147 goto out; 1148 1149 break; 1150 } 1151 btrfs_item_key_to_cpu(l, &key, slot); 1152 1153 if (key.objectid < device->devid) 1154 goto next; 1155 1156 if (key.objectid > device->devid) 1157 break; 1158 1159 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1160 goto next; 1161 1162 if (key.offset > search_start) { 1163 hole_size = key.offset - search_start; 1164 1165 /* 1166 * Have to check before we set max_hole_start, otherwise 1167 * we could end up sending back this offset anyway. 1168 */ 1169 if (contains_pending_extent(trans, device, 1170 &search_start, 1171 hole_size)) 1172 hole_size = 0; 1173 1174 if (hole_size > max_hole_size) { 1175 max_hole_start = search_start; 1176 max_hole_size = hole_size; 1177 } 1178 1179 /* 1180 * If this free space is greater than which we need, 1181 * it must be the max free space that we have found 1182 * until now, so max_hole_start must point to the start 1183 * of this free space and the length of this free space 1184 * is stored in max_hole_size. Thus, we return 1185 * max_hole_start and max_hole_size and go back to the 1186 * caller. 1187 */ 1188 if (hole_size >= num_bytes) { 1189 ret = 0; 1190 goto out; 1191 } 1192 } 1193 1194 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1195 extent_end = key.offset + btrfs_dev_extent_length(l, 1196 dev_extent); 1197 if (extent_end > search_start) 1198 search_start = extent_end; 1199 next: 1200 path->slots[0]++; 1201 cond_resched(); 1202 } 1203 1204 /* 1205 * At this point, search_start should be the end of 1206 * allocated dev extents, and when shrinking the device, 1207 * search_end may be smaller than search_start. 
1208 */ 1209 if (search_end > search_start) 1210 hole_size = search_end - search_start; 1211 1212 if (hole_size > max_hole_size) { 1213 max_hole_start = search_start; 1214 max_hole_size = hole_size; 1215 } 1216 1217 if (contains_pending_extent(trans, device, &search_start, hole_size)) { 1218 btrfs_release_path(path); 1219 goto again; 1220 } 1221 1222 /* See above. */ 1223 if (hole_size < num_bytes) 1224 ret = -ENOSPC; 1225 else 1226 ret = 0; 1227 1228 out: 1229 btrfs_free_path(path); 1230 *start = max_hole_start; 1231 if (len) 1232 *len = max_hole_size; 1233 return ret; 1234 } 1235 1236 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1237 struct btrfs_device *device, 1238 u64 start) 1239 { 1240 int ret; 1241 struct btrfs_path *path; 1242 struct btrfs_root *root = device->dev_root; 1243 struct btrfs_key key; 1244 struct btrfs_key found_key; 1245 struct extent_buffer *leaf = NULL; 1246 struct btrfs_dev_extent *extent = NULL; 1247 1248 path = btrfs_alloc_path(); 1249 if (!path) 1250 return -ENOMEM; 1251 1252 key.objectid = device->devid; 1253 key.offset = start; 1254 key.type = BTRFS_DEV_EXTENT_KEY; 1255 again: 1256 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1257 if (ret > 0) { 1258 ret = btrfs_previous_item(root, path, key.objectid, 1259 BTRFS_DEV_EXTENT_KEY); 1260 if (ret) 1261 goto out; 1262 leaf = path->nodes[0]; 1263 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1264 extent = btrfs_item_ptr(leaf, path->slots[0], 1265 struct btrfs_dev_extent); 1266 BUG_ON(found_key.offset > start || found_key.offset + 1267 btrfs_dev_extent_length(leaf, extent) < start); 1268 key = found_key; 1269 btrfs_release_path(path); 1270 goto again; 1271 } else if (ret == 0) { 1272 leaf = path->nodes[0]; 1273 extent = btrfs_item_ptr(leaf, path->slots[0], 1274 struct btrfs_dev_extent); 1275 } else { 1276 btrfs_error(root->fs_info, ret, "Slot search failed"); 1277 goto out; 1278 } 1279 1280 if (device->bytes_used > 0) { 1281 u64 len = btrfs_dev_extent_length(leaf, extent); 1282 device->bytes_used -= len; 1283 spin_lock(&root->fs_info->free_chunk_lock); 1284 root->fs_info->free_chunk_space += len; 1285 spin_unlock(&root->fs_info->free_chunk_lock); 1286 } 1287 ret = btrfs_del_item(trans, root, path); 1288 if (ret) { 1289 btrfs_error(root->fs_info, ret, 1290 "Failed to remove dev extent item"); 1291 } 1292 out: 1293 btrfs_free_path(path); 1294 return ret; 1295 } 1296 1297 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1298 struct btrfs_device *device, 1299 u64 chunk_tree, u64 chunk_objectid, 1300 u64 chunk_offset, u64 start, u64 num_bytes) 1301 { 1302 int ret; 1303 struct btrfs_path *path; 1304 struct btrfs_root *root = device->dev_root; 1305 struct btrfs_dev_extent *extent; 1306 struct extent_buffer *leaf; 1307 struct btrfs_key key; 1308 1309 WARN_ON(!device->in_fs_metadata); 1310 WARN_ON(device->is_tgtdev_for_dev_replace); 1311 path = btrfs_alloc_path(); 1312 if (!path) 1313 return -ENOMEM; 1314 1315 key.objectid = device->devid; 1316 key.offset = start; 1317 key.type = BTRFS_DEV_EXTENT_KEY; 1318 ret = btrfs_insert_empty_item(trans, root, path, &key, 1319 sizeof(*extent)); 1320 if (ret) 1321 goto out; 1322 1323 leaf = path->nodes[0]; 1324 extent = btrfs_item_ptr(leaf, path->slots[0], 1325 struct btrfs_dev_extent); 1326 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1327 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1328 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1329 1330 write_extent_buffer(leaf, 
root->fs_info->chunk_tree_uuid, 1331 btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); 1332 1333 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1334 btrfs_mark_buffer_dirty(leaf); 1335 out: 1336 btrfs_free_path(path); 1337 return ret; 1338 } 1339 1340 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1341 { 1342 struct extent_map_tree *em_tree; 1343 struct extent_map *em; 1344 struct rb_node *n; 1345 u64 ret = 0; 1346 1347 em_tree = &fs_info->mapping_tree.map_tree; 1348 read_lock(&em_tree->lock); 1349 n = rb_last(&em_tree->map); 1350 if (n) { 1351 em = rb_entry(n, struct extent_map, rb_node); 1352 ret = em->start + em->len; 1353 } 1354 read_unlock(&em_tree->lock); 1355 1356 return ret; 1357 } 1358 1359 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1360 u64 *devid_ret) 1361 { 1362 int ret; 1363 struct btrfs_key key; 1364 struct btrfs_key found_key; 1365 struct btrfs_path *path; 1366 1367 path = btrfs_alloc_path(); 1368 if (!path) 1369 return -ENOMEM; 1370 1371 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1372 key.type = BTRFS_DEV_ITEM_KEY; 1373 key.offset = (u64)-1; 1374 1375 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1376 if (ret < 0) 1377 goto error; 1378 1379 BUG_ON(ret == 0); /* Corruption */ 1380 1381 ret = btrfs_previous_item(fs_info->chunk_root, path, 1382 BTRFS_DEV_ITEMS_OBJECTID, 1383 BTRFS_DEV_ITEM_KEY); 1384 if (ret) { 1385 *devid_ret = 1; 1386 } else { 1387 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1388 path->slots[0]); 1389 *devid_ret = found_key.offset + 1; 1390 } 1391 ret = 0; 1392 error: 1393 btrfs_free_path(path); 1394 return ret; 1395 } 1396 1397 /* 1398 * the device information is stored in the chunk root 1399 * the btrfs_device struct should be fully filled in 1400 */ 1401 static int btrfs_add_device(struct btrfs_trans_handle *trans, 1402 struct btrfs_root *root, 1403 struct btrfs_device *device) 1404 { 1405 int ret; 1406 struct btrfs_path *path; 1407 struct btrfs_dev_item *dev_item; 1408 struct extent_buffer *leaf; 1409 struct btrfs_key key; 1410 unsigned long ptr; 1411 1412 root = root->fs_info->chunk_root; 1413 1414 path = btrfs_alloc_path(); 1415 if (!path) 1416 return -ENOMEM; 1417 1418 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1419 key.type = BTRFS_DEV_ITEM_KEY; 1420 key.offset = device->devid; 1421 1422 ret = btrfs_insert_empty_item(trans, root, path, &key, 1423 sizeof(*dev_item)); 1424 if (ret) 1425 goto out; 1426 1427 leaf = path->nodes[0]; 1428 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1429 1430 btrfs_set_device_id(leaf, dev_item, device->devid); 1431 btrfs_set_device_generation(leaf, dev_item, 0); 1432 btrfs_set_device_type(leaf, dev_item, device->type); 1433 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1434 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1435 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1436 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1437 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1438 btrfs_set_device_group(leaf, dev_item, 0); 1439 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1440 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1441 btrfs_set_device_start_offset(leaf, dev_item, 0); 1442 1443 ptr = btrfs_device_uuid(dev_item); 1444 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1445 ptr = btrfs_device_fsid(dev_item); 1446 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1447 btrfs_mark_buffer_dirty(leaf); 
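	/*
	 * Aside (illustrative, not part of the original source): device items
	 * all share BTRFS_DEV_ITEMS_OBJECTID and are keyed by devid, which is
	 * why find_next_devid() above can search for offset (u64)-1, step back
	 * to the last existing item and add one:
	 *
	 *	(BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, 1)
	 *	(BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, 2)
	 *	(BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, 5)   <- last item
	 *	                                        next devid = 5 + 1 = 6
	 */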
1448 1449 ret = 0; 1450 out: 1451 btrfs_free_path(path); 1452 return ret; 1453 } 1454 1455 static int btrfs_rm_dev_item(struct btrfs_root *root, 1456 struct btrfs_device *device) 1457 { 1458 int ret; 1459 struct btrfs_path *path; 1460 struct btrfs_key key; 1461 struct btrfs_trans_handle *trans; 1462 1463 root = root->fs_info->chunk_root; 1464 1465 path = btrfs_alloc_path(); 1466 if (!path) 1467 return -ENOMEM; 1468 1469 trans = btrfs_start_transaction(root, 0); 1470 if (IS_ERR(trans)) { 1471 btrfs_free_path(path); 1472 return PTR_ERR(trans); 1473 } 1474 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1475 key.type = BTRFS_DEV_ITEM_KEY; 1476 key.offset = device->devid; 1477 lock_chunks(root); 1478 1479 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1480 if (ret < 0) 1481 goto out; 1482 1483 if (ret > 0) { 1484 ret = -ENOENT; 1485 goto out; 1486 } 1487 1488 ret = btrfs_del_item(trans, root, path); 1489 if (ret) 1490 goto out; 1491 out: 1492 btrfs_free_path(path); 1493 unlock_chunks(root); 1494 btrfs_commit_transaction(trans, root); 1495 return ret; 1496 } 1497 1498 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1499 { 1500 struct btrfs_device *device; 1501 struct btrfs_device *next_device; 1502 struct block_device *bdev; 1503 struct buffer_head *bh = NULL; 1504 struct btrfs_super_block *disk_super; 1505 struct btrfs_fs_devices *cur_devices; 1506 u64 all_avail; 1507 u64 devid; 1508 u64 num_devices; 1509 u8 *dev_uuid; 1510 unsigned seq; 1511 int ret = 0; 1512 bool clear_super = false; 1513 1514 mutex_lock(&uuid_mutex); 1515 1516 do { 1517 seq = read_seqbegin(&root->fs_info->profiles_lock); 1518 1519 all_avail = root->fs_info->avail_data_alloc_bits | 1520 root->fs_info->avail_system_alloc_bits | 1521 root->fs_info->avail_metadata_alloc_bits; 1522 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1523 1524 num_devices = root->fs_info->fs_devices->num_devices; 1525 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1526 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1527 WARN_ON(num_devices < 1); 1528 num_devices--; 1529 } 1530 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1531 1532 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1533 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1534 goto out; 1535 } 1536 1537 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1538 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; 1539 goto out; 1540 } 1541 1542 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1543 root->fs_info->fs_devices->rw_devices <= 2) { 1544 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1545 goto out; 1546 } 1547 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1548 root->fs_info->fs_devices->rw_devices <= 3) { 1549 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; 1550 goto out; 1551 } 1552 1553 if (strcmp(device_path, "missing") == 0) { 1554 struct list_head *devices; 1555 struct btrfs_device *tmp; 1556 1557 device = NULL; 1558 devices = &root->fs_info->fs_devices->devices; 1559 /* 1560 * It is safe to read the devices since the volume_mutex 1561 * is held. 
1562 */ 1563 list_for_each_entry(tmp, devices, dev_list) { 1564 if (tmp->in_fs_metadata && 1565 !tmp->is_tgtdev_for_dev_replace && 1566 !tmp->bdev) { 1567 device = tmp; 1568 break; 1569 } 1570 } 1571 bdev = NULL; 1572 bh = NULL; 1573 disk_super = NULL; 1574 if (!device) { 1575 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 1576 goto out; 1577 } 1578 } else { 1579 ret = btrfs_get_bdev_and_sb(device_path, 1580 FMODE_WRITE | FMODE_EXCL, 1581 root->fs_info->bdev_holder, 0, 1582 &bdev, &bh); 1583 if (ret) 1584 goto out; 1585 disk_super = (struct btrfs_super_block *)bh->b_data; 1586 devid = btrfs_stack_device_id(&disk_super->dev_item); 1587 dev_uuid = disk_super->dev_item.uuid; 1588 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1589 disk_super->fsid); 1590 if (!device) { 1591 ret = -ENOENT; 1592 goto error_brelse; 1593 } 1594 } 1595 1596 if (device->is_tgtdev_for_dev_replace) { 1597 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1598 goto error_brelse; 1599 } 1600 1601 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1602 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1603 goto error_brelse; 1604 } 1605 1606 if (device->writeable) { 1607 lock_chunks(root); 1608 list_del_init(&device->dev_alloc_list); 1609 unlock_chunks(root); 1610 root->fs_info->fs_devices->rw_devices--; 1611 clear_super = true; 1612 } 1613 1614 mutex_unlock(&uuid_mutex); 1615 ret = btrfs_shrink_device(device, 0); 1616 mutex_lock(&uuid_mutex); 1617 if (ret) 1618 goto error_undo; 1619 1620 /* 1621 * TODO: the superblock still includes this device in its num_devices 1622 * counter although write_all_supers() is not locked out. This 1623 * could give a filesystem state which requires a degraded mount. 1624 */ 1625 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1626 if (ret) 1627 goto error_undo; 1628 1629 spin_lock(&root->fs_info->free_chunk_lock); 1630 root->fs_info->free_chunk_space = device->total_bytes - 1631 device->bytes_used; 1632 spin_unlock(&root->fs_info->free_chunk_lock); 1633 1634 device->in_fs_metadata = 0; 1635 btrfs_scrub_cancel_dev(root->fs_info, device); 1636 1637 /* 1638 * the device list mutex makes sure that we don't change 1639 * the device list while someone else is writing out all 1640 * the device supers. Whoever is writing all supers, should 1641 * lock the device list mutex before getting the number of 1642 * devices in the super block (super_copy). Conversely, 1643 * whoever updates the number of devices in the super block 1644 * (super_copy) should hold the device list mutex. 
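 *
 * Rough picture of that ordering (illustrative, not part of the original
 * comment):
 *
 *	writer of all supers                btrfs_rm_device()
 *	--------------------                -----------------
 *	lock device_list_mutex              lock device_list_mutex
 *	read num_devices from super_copy    list_del_rcu(&device->dev_list)
 *	write the supers                    btrfs_set_super_num_devices(...)
 *	unlock                              unlock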
1645 */ 1646 1647 cur_devices = device->fs_devices; 1648 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1649 list_del_rcu(&device->dev_list); 1650 1651 device->fs_devices->num_devices--; 1652 device->fs_devices->total_devices--; 1653 1654 if (device->missing) 1655 root->fs_info->fs_devices->missing_devices--; 1656 1657 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1658 struct btrfs_device, dev_list); 1659 if (device->bdev == root->fs_info->sb->s_bdev) 1660 root->fs_info->sb->s_bdev = next_device->bdev; 1661 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1662 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1663 1664 if (device->bdev) 1665 device->fs_devices->open_devices--; 1666 1667 call_rcu(&device->rcu, free_device); 1668 1669 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1670 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1671 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1672 1673 if (cur_devices->open_devices == 0) { 1674 struct btrfs_fs_devices *fs_devices; 1675 fs_devices = root->fs_info->fs_devices; 1676 while (fs_devices) { 1677 if (fs_devices->seed == cur_devices) 1678 break; 1679 fs_devices = fs_devices->seed; 1680 } 1681 fs_devices->seed = cur_devices->seed; 1682 cur_devices->seed = NULL; 1683 lock_chunks(root); 1684 __btrfs_close_devices(cur_devices); 1685 unlock_chunks(root); 1686 free_fs_devices(cur_devices); 1687 } 1688 1689 root->fs_info->num_tolerated_disk_barrier_failures = 1690 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 1691 1692 /* 1693 * at this point, the device is zero sized. We want to 1694 * remove it from the devices list and zero out the old super 1695 */ 1696 if (clear_super && disk_super) { 1697 /* make sure this device isn't detected as part of 1698 * the FS anymore 1699 */ 1700 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1701 set_buffer_dirty(bh); 1702 sync_dirty_buffer(bh); 1703 } 1704 1705 ret = 0; 1706 1707 /* Notify udev that device has changed */ 1708 if (bdev) 1709 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1710 1711 error_brelse: 1712 brelse(bh); 1713 if (bdev) 1714 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1715 out: 1716 mutex_unlock(&uuid_mutex); 1717 return ret; 1718 error_undo: 1719 if (device->writeable) { 1720 lock_chunks(root); 1721 list_add(&device->dev_alloc_list, 1722 &root->fs_info->fs_devices->alloc_list); 1723 unlock_chunks(root); 1724 root->fs_info->fs_devices->rw_devices++; 1725 } 1726 goto error_brelse; 1727 } 1728 1729 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1730 struct btrfs_device *srcdev) 1731 { 1732 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1733 1734 list_del_rcu(&srcdev->dev_list); 1735 list_del_rcu(&srcdev->dev_alloc_list); 1736 fs_info->fs_devices->num_devices--; 1737 if (srcdev->missing) { 1738 fs_info->fs_devices->missing_devices--; 1739 fs_info->fs_devices->rw_devices++; 1740 } 1741 if (srcdev->can_discard) 1742 fs_info->fs_devices->num_can_discard--; 1743 if (srcdev->bdev) { 1744 fs_info->fs_devices->open_devices--; 1745 1746 /* zero out the old super */ 1747 btrfs_scratch_superblock(srcdev); 1748 } 1749 1750 call_rcu(&srcdev->rcu, free_device); 1751 } 1752 1753 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1754 struct btrfs_device *tgtdev) 1755 { 1756 struct btrfs_device *next_device; 1757 1758 WARN_ON(!tgtdev); 1759 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1760 if (tgtdev->bdev) { 1761 
btrfs_scratch_superblock(tgtdev); 1762 fs_info->fs_devices->open_devices--; 1763 } 1764 fs_info->fs_devices->num_devices--; 1765 if (tgtdev->can_discard) 1766 fs_info->fs_devices->num_can_discard++; 1767 1768 next_device = list_entry(fs_info->fs_devices->devices.next, 1769 struct btrfs_device, dev_list); 1770 if (tgtdev->bdev == fs_info->sb->s_bdev) 1771 fs_info->sb->s_bdev = next_device->bdev; 1772 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1773 fs_info->fs_devices->latest_bdev = next_device->bdev; 1774 list_del_rcu(&tgtdev->dev_list); 1775 1776 call_rcu(&tgtdev->rcu, free_device); 1777 1778 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1779 } 1780 1781 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1782 struct btrfs_device **device) 1783 { 1784 int ret = 0; 1785 struct btrfs_super_block *disk_super; 1786 u64 devid; 1787 u8 *dev_uuid; 1788 struct block_device *bdev; 1789 struct buffer_head *bh; 1790 1791 *device = NULL; 1792 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1793 root->fs_info->bdev_holder, 0, &bdev, &bh); 1794 if (ret) 1795 return ret; 1796 disk_super = (struct btrfs_super_block *)bh->b_data; 1797 devid = btrfs_stack_device_id(&disk_super->dev_item); 1798 dev_uuid = disk_super->dev_item.uuid; 1799 *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1800 disk_super->fsid); 1801 brelse(bh); 1802 if (!*device) 1803 ret = -ENOENT; 1804 blkdev_put(bdev, FMODE_READ); 1805 return ret; 1806 } 1807 1808 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 1809 char *device_path, 1810 struct btrfs_device **device) 1811 { 1812 *device = NULL; 1813 if (strcmp(device_path, "missing") == 0) { 1814 struct list_head *devices; 1815 struct btrfs_device *tmp; 1816 1817 devices = &root->fs_info->fs_devices->devices; 1818 /* 1819 * It is safe to read the devices since the volume_mutex 1820 * is held by the caller. 1821 */ 1822 list_for_each_entry(tmp, devices, dev_list) { 1823 if (tmp->in_fs_metadata && !tmp->bdev) { 1824 *device = tmp; 1825 break; 1826 } 1827 } 1828 1829 if (!*device) { 1830 btrfs_err(root->fs_info, "no missing device found"); 1831 return -ENOENT; 1832 } 1833 1834 return 0; 1835 } else { 1836 return btrfs_find_device_by_path(root, device_path, device); 1837 } 1838 } 1839 1840 /* 1841 * does all the dirty work required for changing file system's UUID. 
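 *
 * Resulting in-memory layout, as an illustrative summary of the code below:
 *
 *	fs_devices                   keeps its identity, gets a new random
 *	                             fsid, starts with empty device lists
 *	    ->seed = seed_devices    copy holding the old fsid and all of the
 *	                             existing (seed) devices, marked opened
 *
 * and the SEEDING flag is cleared in the superblock so the sprouted
 * filesystem becomes writable.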
1842 */
1843 static int btrfs_prepare_sprout(struct btrfs_root *root)
1844 {
1845 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1846 	struct btrfs_fs_devices *old_devices;
1847 	struct btrfs_fs_devices *seed_devices;
1848 	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1849 	struct btrfs_device *device;
1850 	u64 super_flags;
1851 
1852 	BUG_ON(!mutex_is_locked(&uuid_mutex));
1853 	if (!fs_devices->seeding)
1854 		return -EINVAL;
1855 
1856 	seed_devices = __alloc_fs_devices();
1857 	if (IS_ERR(seed_devices))
1858 		return PTR_ERR(seed_devices);
1859 
1860 	old_devices = clone_fs_devices(fs_devices);
1861 	if (IS_ERR(old_devices)) {
1862 		kfree(seed_devices);
1863 		return PTR_ERR(old_devices);
1864 	}
1865 
1866 	list_add(&old_devices->list, &fs_uuids);
1867 
1868 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1869 	seed_devices->opened = 1;
1870 	INIT_LIST_HEAD(&seed_devices->devices);
1871 	INIT_LIST_HEAD(&seed_devices->alloc_list);
1872 	mutex_init(&seed_devices->device_list_mutex);
1873 
1874 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1875 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1876 			     synchronize_rcu);
1877 
1878 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1879 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
1880 		device->fs_devices = seed_devices;
1881 	}
1882 
1883 	fs_devices->seeding = 0;
1884 	fs_devices->num_devices = 0;
1885 	fs_devices->open_devices = 0;
1886 	fs_devices->total_devices = 0;
1887 	fs_devices->seed = seed_devices;
1888 
1889 	generate_random_uuid(fs_devices->fsid);
1890 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1891 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1892 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1893 
1894 	super_flags = btrfs_super_flags(disk_super) &
1895 		      ~BTRFS_SUPER_FLAG_SEEDING;
1896 	btrfs_set_super_flags(disk_super, super_flags);
1897 
1898 	return 0;
1899 }
1900 
1901 /*
1902  * store the expected generation for seed devices in device items.
1903 */ 1904 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1905 struct btrfs_root *root) 1906 { 1907 struct btrfs_path *path; 1908 struct extent_buffer *leaf; 1909 struct btrfs_dev_item *dev_item; 1910 struct btrfs_device *device; 1911 struct btrfs_key key; 1912 u8 fs_uuid[BTRFS_UUID_SIZE]; 1913 u8 dev_uuid[BTRFS_UUID_SIZE]; 1914 u64 devid; 1915 int ret; 1916 1917 path = btrfs_alloc_path(); 1918 if (!path) 1919 return -ENOMEM; 1920 1921 root = root->fs_info->chunk_root; 1922 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1923 key.offset = 0; 1924 key.type = BTRFS_DEV_ITEM_KEY; 1925 1926 while (1) { 1927 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1928 if (ret < 0) 1929 goto error; 1930 1931 leaf = path->nodes[0]; 1932 next_slot: 1933 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1934 ret = btrfs_next_leaf(root, path); 1935 if (ret > 0) 1936 break; 1937 if (ret < 0) 1938 goto error; 1939 leaf = path->nodes[0]; 1940 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1941 btrfs_release_path(path); 1942 continue; 1943 } 1944 1945 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1946 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1947 key.type != BTRFS_DEV_ITEM_KEY) 1948 break; 1949 1950 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1951 struct btrfs_dev_item); 1952 devid = btrfs_device_id(leaf, dev_item); 1953 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 1954 BTRFS_UUID_SIZE); 1955 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 1956 BTRFS_UUID_SIZE); 1957 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1958 fs_uuid); 1959 BUG_ON(!device); /* Logic error */ 1960 1961 if (device->fs_devices->seeding) { 1962 btrfs_set_device_generation(leaf, dev_item, 1963 device->generation); 1964 btrfs_mark_buffer_dirty(leaf); 1965 } 1966 1967 path->slots[0]++; 1968 goto next_slot; 1969 } 1970 ret = 0; 1971 error: 1972 btrfs_free_path(path); 1973 return ret; 1974 } 1975 1976 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1977 { 1978 struct request_queue *q; 1979 struct btrfs_trans_handle *trans; 1980 struct btrfs_device *device; 1981 struct block_device *bdev; 1982 struct list_head *devices; 1983 struct super_block *sb = root->fs_info->sb; 1984 struct rcu_string *name; 1985 u64 total_bytes; 1986 int seeding_dev = 0; 1987 int ret = 0; 1988 1989 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1990 return -EROFS; 1991 1992 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1993 root->fs_info->bdev_holder); 1994 if (IS_ERR(bdev)) 1995 return PTR_ERR(bdev); 1996 1997 if (root->fs_info->fs_devices->seeding) { 1998 seeding_dev = 1; 1999 down_write(&sb->s_umount); 2000 mutex_lock(&uuid_mutex); 2001 } 2002 2003 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2004 2005 devices = &root->fs_info->fs_devices->devices; 2006 2007 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2008 list_for_each_entry(device, devices, dev_list) { 2009 if (device->bdev == bdev) { 2010 ret = -EEXIST; 2011 mutex_unlock( 2012 &root->fs_info->fs_devices->device_list_mutex); 2013 goto error; 2014 } 2015 } 2016 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2017 2018 device = btrfs_alloc_device(root->fs_info, NULL, NULL); 2019 if (IS_ERR(device)) { 2020 /* we can safely leave the fs_devices entry around */ 2021 ret = PTR_ERR(device); 2022 goto error; 2023 } 2024 2025 name = rcu_string_strdup(device_path, GFP_NOFS); 2026 if (!name) { 2027 kfree(device); 2028 ret = -ENOMEM; 2029 goto 
error; 2030 } 2031 rcu_assign_pointer(device->name, name); 2032 2033 trans = btrfs_start_transaction(root, 0); 2034 if (IS_ERR(trans)) { 2035 rcu_string_free(device->name); 2036 kfree(device); 2037 ret = PTR_ERR(trans); 2038 goto error; 2039 } 2040 2041 lock_chunks(root); 2042 2043 q = bdev_get_queue(bdev); 2044 if (blk_queue_discard(q)) 2045 device->can_discard = 1; 2046 device->writeable = 1; 2047 device->generation = trans->transid; 2048 device->io_width = root->sectorsize; 2049 device->io_align = root->sectorsize; 2050 device->sector_size = root->sectorsize; 2051 device->total_bytes = i_size_read(bdev->bd_inode); 2052 device->disk_total_bytes = device->total_bytes; 2053 device->dev_root = root->fs_info->dev_root; 2054 device->bdev = bdev; 2055 device->in_fs_metadata = 1; 2056 device->is_tgtdev_for_dev_replace = 0; 2057 device->mode = FMODE_EXCL; 2058 device->dev_stats_valid = 1; 2059 set_blocksize(device->bdev, 4096); 2060 2061 if (seeding_dev) { 2062 sb->s_flags &= ~MS_RDONLY; 2063 ret = btrfs_prepare_sprout(root); 2064 BUG_ON(ret); /* -ENOMEM */ 2065 } 2066 2067 device->fs_devices = root->fs_info->fs_devices; 2068 2069 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2070 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2071 list_add(&device->dev_alloc_list, 2072 &root->fs_info->fs_devices->alloc_list); 2073 root->fs_info->fs_devices->num_devices++; 2074 root->fs_info->fs_devices->open_devices++; 2075 root->fs_info->fs_devices->rw_devices++; 2076 root->fs_info->fs_devices->total_devices++; 2077 if (device->can_discard) 2078 root->fs_info->fs_devices->num_can_discard++; 2079 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2080 2081 spin_lock(&root->fs_info->free_chunk_lock); 2082 root->fs_info->free_chunk_space += device->total_bytes; 2083 spin_unlock(&root->fs_info->free_chunk_lock); 2084 2085 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2086 root->fs_info->fs_devices->rotating = 1; 2087 2088 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2089 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2090 total_bytes + device->total_bytes); 2091 2092 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2093 btrfs_set_super_num_devices(root->fs_info->super_copy, 2094 total_bytes + 1); 2095 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2096 2097 if (seeding_dev) { 2098 ret = init_first_rw_device(trans, root, device); 2099 if (ret) { 2100 btrfs_abort_transaction(trans, root, ret); 2101 goto error_trans; 2102 } 2103 ret = btrfs_finish_sprout(trans, root); 2104 if (ret) { 2105 btrfs_abort_transaction(trans, root, ret); 2106 goto error_trans; 2107 } 2108 } else { 2109 ret = btrfs_add_device(trans, root, device); 2110 if (ret) { 2111 btrfs_abort_transaction(trans, root, ret); 2112 goto error_trans; 2113 } 2114 } 2115 2116 /* 2117 * we've got more storage, clear any full flags on the space 2118 * infos 2119 */ 2120 btrfs_clear_space_info_full(root->fs_info); 2121 2122 unlock_chunks(root); 2123 root->fs_info->num_tolerated_disk_barrier_failures = 2124 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2125 ret = btrfs_commit_transaction(trans, root); 2126 2127 if (seeding_dev) { 2128 mutex_unlock(&uuid_mutex); 2129 up_write(&sb->s_umount); 2130 2131 if (ret) /* transaction commit */ 2132 return ret; 2133 2134 ret = btrfs_relocate_sys_chunks(root); 2135 if (ret < 0) 2136 btrfs_error(root->fs_info, ret, 2137 "Failed to relocate sys chunks after " 2138 "device initialization. 
This can be fixed " 2139 "using the \"btrfs balance\" command."); 2140 trans = btrfs_attach_transaction(root); 2141 if (IS_ERR(trans)) { 2142 if (PTR_ERR(trans) == -ENOENT) 2143 return 0; 2144 return PTR_ERR(trans); 2145 } 2146 ret = btrfs_commit_transaction(trans, root); 2147 } 2148 2149 return ret; 2150 2151 error_trans: 2152 unlock_chunks(root); 2153 btrfs_end_transaction(trans, root); 2154 rcu_string_free(device->name); 2155 kfree(device); 2156 error: 2157 blkdev_put(bdev, FMODE_EXCL); 2158 if (seeding_dev) { 2159 mutex_unlock(&uuid_mutex); 2160 up_write(&sb->s_umount); 2161 } 2162 return ret; 2163 } 2164 2165 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2166 struct btrfs_device **device_out) 2167 { 2168 struct request_queue *q; 2169 struct btrfs_device *device; 2170 struct block_device *bdev; 2171 struct btrfs_fs_info *fs_info = root->fs_info; 2172 struct list_head *devices; 2173 struct rcu_string *name; 2174 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2175 int ret = 0; 2176 2177 *device_out = NULL; 2178 if (fs_info->fs_devices->seeding) 2179 return -EINVAL; 2180 2181 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2182 fs_info->bdev_holder); 2183 if (IS_ERR(bdev)) 2184 return PTR_ERR(bdev); 2185 2186 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2187 2188 devices = &fs_info->fs_devices->devices; 2189 list_for_each_entry(device, devices, dev_list) { 2190 if (device->bdev == bdev) { 2191 ret = -EEXIST; 2192 goto error; 2193 } 2194 } 2195 2196 device = btrfs_alloc_device(NULL, &devid, NULL); 2197 if (IS_ERR(device)) { 2198 ret = PTR_ERR(device); 2199 goto error; 2200 } 2201 2202 name = rcu_string_strdup(device_path, GFP_NOFS); 2203 if (!name) { 2204 kfree(device); 2205 ret = -ENOMEM; 2206 goto error; 2207 } 2208 rcu_assign_pointer(device->name, name); 2209 2210 q = bdev_get_queue(bdev); 2211 if (blk_queue_discard(q)) 2212 device->can_discard = 1; 2213 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2214 device->writeable = 1; 2215 device->generation = 0; 2216 device->io_width = root->sectorsize; 2217 device->io_align = root->sectorsize; 2218 device->sector_size = root->sectorsize; 2219 device->total_bytes = i_size_read(bdev->bd_inode); 2220 device->disk_total_bytes = device->total_bytes; 2221 device->dev_root = fs_info->dev_root; 2222 device->bdev = bdev; 2223 device->in_fs_metadata = 1; 2224 device->is_tgtdev_for_dev_replace = 1; 2225 device->mode = FMODE_EXCL; 2226 device->dev_stats_valid = 1; 2227 set_blocksize(device->bdev, 4096); 2228 device->fs_devices = fs_info->fs_devices; 2229 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2230 fs_info->fs_devices->num_devices++; 2231 fs_info->fs_devices->open_devices++; 2232 if (device->can_discard) 2233 fs_info->fs_devices->num_can_discard++; 2234 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2235 2236 *device_out = device; 2237 return ret; 2238 2239 error: 2240 blkdev_put(bdev, FMODE_EXCL); 2241 return ret; 2242 } 2243 2244 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2245 struct btrfs_device *tgtdev) 2246 { 2247 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2248 tgtdev->io_width = fs_info->dev_root->sectorsize; 2249 tgtdev->io_align = fs_info->dev_root->sectorsize; 2250 tgtdev->sector_size = fs_info->dev_root->sectorsize; 2251 tgtdev->dev_root = fs_info->dev_root; 2252 tgtdev->in_fs_metadata = 1; 2253 } 2254 2255 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2256 struct btrfs_device *device) 
2257 { 2258 int ret; 2259 struct btrfs_path *path; 2260 struct btrfs_root *root; 2261 struct btrfs_dev_item *dev_item; 2262 struct extent_buffer *leaf; 2263 struct btrfs_key key; 2264 2265 root = device->dev_root->fs_info->chunk_root; 2266 2267 path = btrfs_alloc_path(); 2268 if (!path) 2269 return -ENOMEM; 2270 2271 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2272 key.type = BTRFS_DEV_ITEM_KEY; 2273 key.offset = device->devid; 2274 2275 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2276 if (ret < 0) 2277 goto out; 2278 2279 if (ret > 0) { 2280 ret = -ENOENT; 2281 goto out; 2282 } 2283 2284 leaf = path->nodes[0]; 2285 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2286 2287 btrfs_set_device_id(leaf, dev_item, device->devid); 2288 btrfs_set_device_type(leaf, dev_item, device->type); 2289 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2290 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2291 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2292 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2293 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2294 btrfs_mark_buffer_dirty(leaf); 2295 2296 out: 2297 btrfs_free_path(path); 2298 return ret; 2299 } 2300 2301 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2302 struct btrfs_device *device, u64 new_size) 2303 { 2304 struct btrfs_super_block *super_copy = 2305 device->dev_root->fs_info->super_copy; 2306 u64 old_total = btrfs_super_total_bytes(super_copy); 2307 u64 diff = new_size - device->total_bytes; 2308 2309 if (!device->writeable) 2310 return -EACCES; 2311 if (new_size <= device->total_bytes || 2312 device->is_tgtdev_for_dev_replace) 2313 return -EINVAL; 2314 2315 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2316 device->fs_devices->total_rw_bytes += diff; 2317 2318 device->total_bytes = new_size; 2319 device->disk_total_bytes = new_size; 2320 btrfs_clear_space_info_full(device->dev_root->fs_info); 2321 2322 return btrfs_update_device(trans, device); 2323 } 2324 2325 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2326 struct btrfs_device *device, u64 new_size) 2327 { 2328 int ret; 2329 lock_chunks(device->dev_root); 2330 ret = __btrfs_grow_device(trans, device, new_size); 2331 unlock_chunks(device->dev_root); 2332 return ret; 2333 } 2334 2335 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2336 struct btrfs_root *root, 2337 u64 chunk_tree, u64 chunk_objectid, 2338 u64 chunk_offset) 2339 { 2340 int ret; 2341 struct btrfs_path *path; 2342 struct btrfs_key key; 2343 2344 root = root->fs_info->chunk_root; 2345 path = btrfs_alloc_path(); 2346 if (!path) 2347 return -ENOMEM; 2348 2349 key.objectid = chunk_objectid; 2350 key.offset = chunk_offset; 2351 key.type = BTRFS_CHUNK_ITEM_KEY; 2352 2353 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2354 if (ret < 0) 2355 goto out; 2356 else if (ret > 0) { /* Logic error or corruption */ 2357 btrfs_error(root->fs_info, -ENOENT, 2358 "Failed lookup while freeing chunk."); 2359 ret = -ENOENT; 2360 goto out; 2361 } 2362 2363 ret = btrfs_del_item(trans, root, path); 2364 if (ret < 0) 2365 btrfs_error(root->fs_info, ret, 2366 "Failed to delete chunk item."); 2367 out: 2368 btrfs_free_path(path); 2369 return ret; 2370 } 2371 2372 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 2373 chunk_offset) 2374 { 2375 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2376 struct btrfs_disk_key 
*disk_key; 2377 struct btrfs_chunk *chunk; 2378 u8 *ptr; 2379 int ret = 0; 2380 u32 num_stripes; 2381 u32 array_size; 2382 u32 len = 0; 2383 u32 cur; 2384 struct btrfs_key key; 2385 2386 array_size = btrfs_super_sys_array_size(super_copy); 2387 2388 ptr = super_copy->sys_chunk_array; 2389 cur = 0; 2390 2391 while (cur < array_size) { 2392 disk_key = (struct btrfs_disk_key *)ptr; 2393 btrfs_disk_key_to_cpu(&key, disk_key); 2394 2395 len = sizeof(*disk_key); 2396 2397 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2398 chunk = (struct btrfs_chunk *)(ptr + len); 2399 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2400 len += btrfs_chunk_item_size(num_stripes); 2401 } else { 2402 ret = -EIO; 2403 break; 2404 } 2405 if (key.objectid == chunk_objectid && 2406 key.offset == chunk_offset) { 2407 memmove(ptr, ptr + len, array_size - (cur + len)); 2408 array_size -= len; 2409 btrfs_set_super_sys_array_size(super_copy, array_size); 2410 } else { 2411 ptr += len; 2412 cur += len; 2413 } 2414 } 2415 return ret; 2416 } 2417 2418 static int btrfs_relocate_chunk(struct btrfs_root *root, 2419 u64 chunk_tree, u64 chunk_objectid, 2420 u64 chunk_offset) 2421 { 2422 struct extent_map_tree *em_tree; 2423 struct btrfs_root *extent_root; 2424 struct btrfs_trans_handle *trans; 2425 struct extent_map *em; 2426 struct map_lookup *map; 2427 int ret; 2428 int i; 2429 2430 root = root->fs_info->chunk_root; 2431 extent_root = root->fs_info->extent_root; 2432 em_tree = &root->fs_info->mapping_tree.map_tree; 2433 2434 ret = btrfs_can_relocate(extent_root, chunk_offset); 2435 if (ret) 2436 return -ENOSPC; 2437 2438 /* step one, relocate all the extents inside this chunk */ 2439 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2440 if (ret) 2441 return ret; 2442 2443 trans = btrfs_start_transaction(root, 0); 2444 if (IS_ERR(trans)) { 2445 ret = PTR_ERR(trans); 2446 btrfs_std_error(root->fs_info, ret); 2447 return ret; 2448 } 2449 2450 lock_chunks(root); 2451 2452 /* 2453 * step two, delete the device extents and the 2454 * chunk tree entries 2455 */ 2456 read_lock(&em_tree->lock); 2457 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2458 read_unlock(&em_tree->lock); 2459 2460 BUG_ON(!em || em->start > chunk_offset || 2461 em->start + em->len < chunk_offset); 2462 map = (struct map_lookup *)em->bdev; 2463 2464 for (i = 0; i < map->num_stripes; i++) { 2465 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2466 map->stripes[i].physical); 2467 BUG_ON(ret); 2468 2469 if (map->stripes[i].dev) { 2470 ret = btrfs_update_device(trans, map->stripes[i].dev); 2471 BUG_ON(ret); 2472 } 2473 } 2474 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2475 chunk_offset); 2476 2477 BUG_ON(ret); 2478 2479 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2480 2481 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2482 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2483 BUG_ON(ret); 2484 } 2485 2486 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2487 BUG_ON(ret); 2488 2489 write_lock(&em_tree->lock); 2490 remove_extent_mapping(em_tree, em); 2491 write_unlock(&em_tree->lock); 2492 2493 kfree(map); 2494 em->bdev = NULL; 2495 2496 /* once for the tree */ 2497 free_extent_map(em); 2498 /* once for us */ 2499 free_extent_map(em); 2500 2501 unlock_chunks(root); 2502 btrfs_end_transaction(trans, root); 2503 return 0; 2504 } 2505 2506 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2507 { 2508 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2509 struct 
btrfs_path *path; 2510 struct extent_buffer *leaf; 2511 struct btrfs_chunk *chunk; 2512 struct btrfs_key key; 2513 struct btrfs_key found_key; 2514 u64 chunk_tree = chunk_root->root_key.objectid; 2515 u64 chunk_type; 2516 bool retried = false; 2517 int failed = 0; 2518 int ret; 2519 2520 path = btrfs_alloc_path(); 2521 if (!path) 2522 return -ENOMEM; 2523 2524 again: 2525 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2526 key.offset = (u64)-1; 2527 key.type = BTRFS_CHUNK_ITEM_KEY; 2528 2529 while (1) { 2530 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2531 if (ret < 0) 2532 goto error; 2533 BUG_ON(ret == 0); /* Corruption */ 2534 2535 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2536 key.type); 2537 if (ret < 0) 2538 goto error; 2539 if (ret > 0) 2540 break; 2541 2542 leaf = path->nodes[0]; 2543 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2544 2545 chunk = btrfs_item_ptr(leaf, path->slots[0], 2546 struct btrfs_chunk); 2547 chunk_type = btrfs_chunk_type(leaf, chunk); 2548 btrfs_release_path(path); 2549 2550 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2551 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2552 found_key.objectid, 2553 found_key.offset); 2554 if (ret == -ENOSPC) 2555 failed++; 2556 else if (ret) 2557 BUG(); 2558 } 2559 2560 if (found_key.offset == 0) 2561 break; 2562 key.offset = found_key.offset - 1; 2563 } 2564 ret = 0; 2565 if (failed && !retried) { 2566 failed = 0; 2567 retried = true; 2568 goto again; 2569 } else if (WARN_ON(failed && retried)) { 2570 ret = -ENOSPC; 2571 } 2572 error: 2573 btrfs_free_path(path); 2574 return ret; 2575 } 2576 2577 static int insert_balance_item(struct btrfs_root *root, 2578 struct btrfs_balance_control *bctl) 2579 { 2580 struct btrfs_trans_handle *trans; 2581 struct btrfs_balance_item *item; 2582 struct btrfs_disk_balance_args disk_bargs; 2583 struct btrfs_path *path; 2584 struct extent_buffer *leaf; 2585 struct btrfs_key key; 2586 int ret, err; 2587 2588 path = btrfs_alloc_path(); 2589 if (!path) 2590 return -ENOMEM; 2591 2592 trans = btrfs_start_transaction(root, 0); 2593 if (IS_ERR(trans)) { 2594 btrfs_free_path(path); 2595 return PTR_ERR(trans); 2596 } 2597 2598 key.objectid = BTRFS_BALANCE_OBJECTID; 2599 key.type = BTRFS_BALANCE_ITEM_KEY; 2600 key.offset = 0; 2601 2602 ret = btrfs_insert_empty_item(trans, root, path, &key, 2603 sizeof(*item)); 2604 if (ret) 2605 goto out; 2606 2607 leaf = path->nodes[0]; 2608 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2609 2610 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2611 2612 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2613 btrfs_set_balance_data(leaf, item, &disk_bargs); 2614 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2615 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2616 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2617 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2618 2619 btrfs_set_balance_flags(leaf, item, bctl->flags); 2620 2621 btrfs_mark_buffer_dirty(leaf); 2622 out: 2623 btrfs_free_path(path); 2624 err = btrfs_commit_transaction(trans, root); 2625 if (err && !ret) 2626 ret = err; 2627 return ret; 2628 } 2629 2630 static int del_balance_item(struct btrfs_root *root) 2631 { 2632 struct btrfs_trans_handle *trans; 2633 struct btrfs_path *path; 2634 struct btrfs_key key; 2635 int ret, err; 2636 2637 path = btrfs_alloc_path(); 2638 if (!path) 2639 return -ENOMEM; 2640 2641 trans = btrfs_start_transaction(root, 0); 2642 if (IS_ERR(trans)) { 2643 
btrfs_free_path(path); 2644 return PTR_ERR(trans); 2645 } 2646 2647 key.objectid = BTRFS_BALANCE_OBJECTID; 2648 key.type = BTRFS_BALANCE_ITEM_KEY; 2649 key.offset = 0; 2650 2651 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2652 if (ret < 0) 2653 goto out; 2654 if (ret > 0) { 2655 ret = -ENOENT; 2656 goto out; 2657 } 2658 2659 ret = btrfs_del_item(trans, root, path); 2660 out: 2661 btrfs_free_path(path); 2662 err = btrfs_commit_transaction(trans, root); 2663 if (err && !ret) 2664 ret = err; 2665 return ret; 2666 } 2667 2668 /* 2669 * This is a heuristic used to reduce the number of chunks balanced on 2670 * resume after balance was interrupted. 2671 */ 2672 static void update_balance_args(struct btrfs_balance_control *bctl) 2673 { 2674 /* 2675 * Turn on soft mode for chunk types that were being converted. 2676 */ 2677 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2678 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2679 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2680 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2681 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2682 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2683 2684 /* 2685 * Turn on usage filter if is not already used. The idea is 2686 * that chunks that we have already balanced should be 2687 * reasonably full. Don't do it for chunks that are being 2688 * converted - that will keep us from relocating unconverted 2689 * (albeit full) chunks. 2690 */ 2691 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2692 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2693 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2694 bctl->data.usage = 90; 2695 } 2696 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2697 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2698 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2699 bctl->sys.usage = 90; 2700 } 2701 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2702 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2703 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2704 bctl->meta.usage = 90; 2705 } 2706 } 2707 2708 /* 2709 * Should be called with both balance and volume mutexes held to 2710 * serialize other volume operations (add_dev/rm_dev/resize) with 2711 * restriper. Same goes for unset_balance_control. 2712 */ 2713 static void set_balance_control(struct btrfs_balance_control *bctl) 2714 { 2715 struct btrfs_fs_info *fs_info = bctl->fs_info; 2716 2717 BUG_ON(fs_info->balance_ctl); 2718 2719 spin_lock(&fs_info->balance_lock); 2720 fs_info->balance_ctl = bctl; 2721 spin_unlock(&fs_info->balance_lock); 2722 } 2723 2724 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2725 { 2726 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2727 2728 BUG_ON(!fs_info->balance_ctl); 2729 2730 spin_lock(&fs_info->balance_lock); 2731 fs_info->balance_ctl = NULL; 2732 spin_unlock(&fs_info->balance_lock); 2733 2734 kfree(bctl); 2735 } 2736 2737 /* 2738 * Balance filters. Return 1 if chunk should be filtered out 2739 * (should not be balanced). 
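 * As an illustration of the usage filter below: with bargs->usage == 50 and a 1GiB chunk, user_thresh works out to roughly 512MiB, so the chunk is kept for balancing only while its used bytes stay below that threshold.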
2740 */ 2741 static int chunk_profiles_filter(u64 chunk_type, 2742 struct btrfs_balance_args *bargs) 2743 { 2744 chunk_type = chunk_to_extended(chunk_type) & 2745 BTRFS_EXTENDED_PROFILE_MASK; 2746 2747 if (bargs->profiles & chunk_type) 2748 return 0; 2749 2750 return 1; 2751 } 2752 2753 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2754 struct btrfs_balance_args *bargs) 2755 { 2756 struct btrfs_block_group_cache *cache; 2757 u64 chunk_used, user_thresh; 2758 int ret = 1; 2759 2760 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2761 chunk_used = btrfs_block_group_used(&cache->item); 2762 2763 if (bargs->usage == 0) 2764 user_thresh = 1; 2765 else if (bargs->usage > 100) 2766 user_thresh = cache->key.offset; 2767 else 2768 user_thresh = div_factor_fine(cache->key.offset, 2769 bargs->usage); 2770 2771 if (chunk_used < user_thresh) 2772 ret = 0; 2773 2774 btrfs_put_block_group(cache); 2775 return ret; 2776 } 2777 2778 static int chunk_devid_filter(struct extent_buffer *leaf, 2779 struct btrfs_chunk *chunk, 2780 struct btrfs_balance_args *bargs) 2781 { 2782 struct btrfs_stripe *stripe; 2783 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2784 int i; 2785 2786 for (i = 0; i < num_stripes; i++) { 2787 stripe = btrfs_stripe_nr(chunk, i); 2788 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2789 return 0; 2790 } 2791 2792 return 1; 2793 } 2794 2795 /* [pstart, pend) */ 2796 static int chunk_drange_filter(struct extent_buffer *leaf, 2797 struct btrfs_chunk *chunk, 2798 u64 chunk_offset, 2799 struct btrfs_balance_args *bargs) 2800 { 2801 struct btrfs_stripe *stripe; 2802 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2803 u64 stripe_offset; 2804 u64 stripe_length; 2805 int factor; 2806 int i; 2807 2808 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2809 return 0; 2810 2811 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2812 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 2813 factor = num_stripes / 2; 2814 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 2815 factor = num_stripes - 1; 2816 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 2817 factor = num_stripes - 2; 2818 } else { 2819 factor = num_stripes; 2820 } 2821 2822 for (i = 0; i < num_stripes; i++) { 2823 stripe = btrfs_stripe_nr(chunk, i); 2824 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2825 continue; 2826 2827 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2828 stripe_length = btrfs_chunk_length(leaf, chunk); 2829 do_div(stripe_length, factor); 2830 2831 if (stripe_offset < bargs->pend && 2832 stripe_offset + stripe_length > bargs->pstart) 2833 return 0; 2834 } 2835 2836 return 1; 2837 } 2838 2839 /* [vstart, vend) */ 2840 static int chunk_vrange_filter(struct extent_buffer *leaf, 2841 struct btrfs_chunk *chunk, 2842 u64 chunk_offset, 2843 struct btrfs_balance_args *bargs) 2844 { 2845 if (chunk_offset < bargs->vend && 2846 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2847 /* at least part of the chunk is inside this vrange */ 2848 return 0; 2849 2850 return 1; 2851 } 2852 2853 static int chunk_soft_convert_filter(u64 chunk_type, 2854 struct btrfs_balance_args *bargs) 2855 { 2856 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2857 return 0; 2858 2859 chunk_type = chunk_to_extended(chunk_type) & 2860 BTRFS_EXTENDED_PROFILE_MASK; 2861 2862 if (bargs->target == chunk_type) 2863 return 1; 2864 2865 return 0; 2866 } 2867 2868 static int should_balance_chunk(struct btrfs_root 
*root, 2869 struct extent_buffer *leaf, 2870 struct btrfs_chunk *chunk, u64 chunk_offset) 2871 { 2872 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2873 struct btrfs_balance_args *bargs = NULL; 2874 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2875 2876 /* type filter */ 2877 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2878 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2879 return 0; 2880 } 2881 2882 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2883 bargs = &bctl->data; 2884 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2885 bargs = &bctl->sys; 2886 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2887 bargs = &bctl->meta; 2888 2889 /* profiles filter */ 2890 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2891 chunk_profiles_filter(chunk_type, bargs)) { 2892 return 0; 2893 } 2894 2895 /* usage filter */ 2896 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2897 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2898 return 0; 2899 } 2900 2901 /* devid filter */ 2902 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2903 chunk_devid_filter(leaf, chunk, bargs)) { 2904 return 0; 2905 } 2906 2907 /* drange filter, makes sense only with devid filter */ 2908 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2909 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2910 return 0; 2911 } 2912 2913 /* vrange filter */ 2914 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2915 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2916 return 0; 2917 } 2918 2919 /* soft profile changing mode */ 2920 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2921 chunk_soft_convert_filter(chunk_type, bargs)) { 2922 return 0; 2923 } 2924 2925 return 1; 2926 } 2927 2928 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2929 { 2930 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2931 struct btrfs_root *chunk_root = fs_info->chunk_root; 2932 struct btrfs_root *dev_root = fs_info->dev_root; 2933 struct list_head *devices; 2934 struct btrfs_device *device; 2935 u64 old_size; 2936 u64 size_to_free; 2937 struct btrfs_chunk *chunk; 2938 struct btrfs_path *path; 2939 struct btrfs_key key; 2940 struct btrfs_key found_key; 2941 struct btrfs_trans_handle *trans; 2942 struct extent_buffer *leaf; 2943 int slot; 2944 int ret; 2945 int enospc_errors = 0; 2946 bool counting = true; 2947 2948 /* step one make some room on all the devices */ 2949 devices = &fs_info->fs_devices->devices; 2950 list_for_each_entry(device, devices, dev_list) { 2951 old_size = device->total_bytes; 2952 size_to_free = div_factor(old_size, 1); 2953 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2954 if (!device->writeable || 2955 device->total_bytes - device->bytes_used > size_to_free || 2956 device->is_tgtdev_for_dev_replace) 2957 continue; 2958 2959 ret = btrfs_shrink_device(device, old_size - size_to_free); 2960 if (ret == -ENOSPC) 2961 break; 2962 BUG_ON(ret); 2963 2964 trans = btrfs_start_transaction(dev_root, 0); 2965 BUG_ON(IS_ERR(trans)); 2966 2967 ret = btrfs_grow_device(trans, device, old_size); 2968 BUG_ON(ret); 2969 2970 btrfs_end_transaction(trans, dev_root); 2971 } 2972 2973 /* step two, relocate all the chunks */ 2974 path = btrfs_alloc_path(); 2975 if (!path) { 2976 ret = -ENOMEM; 2977 goto error; 2978 } 2979 2980 /* zero out stat counters */ 2981 spin_lock(&fs_info->balance_lock); 2982 memset(&bctl->stat, 0, sizeof(bctl->stat)); 2983 spin_unlock(&fs_info->balance_lock); 2984 again: 2985 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2986 key.offset = (u64)-1; 2987 
key.type = BTRFS_CHUNK_ITEM_KEY; 2988 2989 while (1) { 2990 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 2991 atomic_read(&fs_info->balance_cancel_req)) { 2992 ret = -ECANCELED; 2993 goto error; 2994 } 2995 2996 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2997 if (ret < 0) 2998 goto error; 2999 3000 /* 3001 * this shouldn't happen, it means the last relocate 3002 * failed 3003 */ 3004 if (ret == 0) 3005 BUG(); /* FIXME break ? */ 3006 3007 ret = btrfs_previous_item(chunk_root, path, 0, 3008 BTRFS_CHUNK_ITEM_KEY); 3009 if (ret) { 3010 ret = 0; 3011 break; 3012 } 3013 3014 leaf = path->nodes[0]; 3015 slot = path->slots[0]; 3016 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3017 3018 if (found_key.objectid != key.objectid) 3019 break; 3020 3021 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3022 3023 if (!counting) { 3024 spin_lock(&fs_info->balance_lock); 3025 bctl->stat.considered++; 3026 spin_unlock(&fs_info->balance_lock); 3027 } 3028 3029 ret = should_balance_chunk(chunk_root, leaf, chunk, 3030 found_key.offset); 3031 btrfs_release_path(path); 3032 if (!ret) 3033 goto loop; 3034 3035 if (counting) { 3036 spin_lock(&fs_info->balance_lock); 3037 bctl->stat.expected++; 3038 spin_unlock(&fs_info->balance_lock); 3039 goto loop; 3040 } 3041 3042 ret = btrfs_relocate_chunk(chunk_root, 3043 chunk_root->root_key.objectid, 3044 found_key.objectid, 3045 found_key.offset); 3046 if (ret && ret != -ENOSPC) 3047 goto error; 3048 if (ret == -ENOSPC) { 3049 enospc_errors++; 3050 } else { 3051 spin_lock(&fs_info->balance_lock); 3052 bctl->stat.completed++; 3053 spin_unlock(&fs_info->balance_lock); 3054 } 3055 loop: 3056 if (found_key.offset == 0) 3057 break; 3058 key.offset = found_key.offset - 1; 3059 } 3060 3061 if (counting) { 3062 btrfs_release_path(path); 3063 counting = false; 3064 goto again; 3065 } 3066 error: 3067 btrfs_free_path(path); 3068 if (enospc_errors) { 3069 btrfs_info(fs_info, "%d enospc errors during balance", 3070 enospc_errors); 3071 if (!ret) 3072 ret = -ENOSPC; 3073 } 3074 3075 return ret; 3076 } 3077 3078 /** 3079 * alloc_profile_is_valid - see if a given profile is valid and reduced 3080 * @flags: profile to validate 3081 * @extended: if true @flags is treated as an extended profile 3082 */ 3083 static int alloc_profile_is_valid(u64 flags, int extended) 3084 { 3085 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3086 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3087 3088 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3089 3090 /* 1) check that all other bits are zeroed */ 3091 if (flags & ~mask) 3092 return 0; 3093 3094 /* 2) see if profile is reduced */ 3095 if (flags == 0) 3096 return !extended; /* "0" is valid for usual profiles */ 3097 3098 /* true if exactly one bit set */ 3099 return (flags & (flags - 1)) == 0; 3100 } 3101 3102 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3103 { 3104 /* cancel requested || normal exit path */ 3105 return atomic_read(&fs_info->balance_cancel_req) || 3106 (atomic_read(&fs_info->balance_pause_req) == 0 && 3107 atomic_read(&fs_info->balance_cancel_req) == 0); 3108 } 3109 3110 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3111 { 3112 int ret; 3113 3114 unset_balance_control(fs_info); 3115 ret = del_balance_item(fs_info->tree_root); 3116 if (ret) 3117 btrfs_std_error(fs_info, ret); 3118 3119 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3120 } 3121 3122 /* 3123 * Should be called with both balance and volume mutexes held 3124 */ 3125 int btrfs_balance(struct btrfs_balance_control *bctl, 3126 struct btrfs_ioctl_balance_args *bargs) 3127 { 3128 struct btrfs_fs_info *fs_info = bctl->fs_info; 3129 u64 allowed; 3130 int mixed = 0; 3131 int ret; 3132 u64 num_devices; 3133 unsigned seq; 3134 3135 if (btrfs_fs_closing(fs_info) || 3136 atomic_read(&fs_info->balance_pause_req) || 3137 atomic_read(&fs_info->balance_cancel_req)) { 3138 ret = -EINVAL; 3139 goto out; 3140 } 3141 3142 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3143 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3144 mixed = 1; 3145 3146 /* 3147 * In case of mixed groups both data and meta should be picked, 3148 * and identical options should be given for both of them. 
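 * For example, requesting only a data convert on a mixed filesystem is rejected; the same convert (with identical args) must be requested for metadata as well.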
3149 */ 3150 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3151 if (mixed && (bctl->flags & allowed)) { 3152 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3153 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3154 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3155 btrfs_err(fs_info, "with mixed groups data and " 3156 "metadata balance options must be the same"); 3157 ret = -EINVAL; 3158 goto out; 3159 } 3160 } 3161 3162 num_devices = fs_info->fs_devices->num_devices; 3163 btrfs_dev_replace_lock(&fs_info->dev_replace); 3164 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3165 BUG_ON(num_devices < 1); 3166 num_devices--; 3167 } 3168 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3169 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3170 if (num_devices == 1) 3171 allowed |= BTRFS_BLOCK_GROUP_DUP; 3172 else if (num_devices > 1) 3173 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3174 if (num_devices > 2) 3175 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3176 if (num_devices > 3) 3177 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3178 BTRFS_BLOCK_GROUP_RAID6); 3179 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3180 (!alloc_profile_is_valid(bctl->data.target, 1) || 3181 (bctl->data.target & ~allowed))) { 3182 btrfs_err(fs_info, "unable to start balance with target " 3183 "data profile %llu", 3184 bctl->data.target); 3185 ret = -EINVAL; 3186 goto out; 3187 } 3188 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3189 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3190 (bctl->meta.target & ~allowed))) { 3191 btrfs_err(fs_info, 3192 "unable to start balance with target metadata profile %llu", 3193 bctl->meta.target); 3194 ret = -EINVAL; 3195 goto out; 3196 } 3197 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3198 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3199 (bctl->sys.target & ~allowed))) { 3200 btrfs_err(fs_info, 3201 "unable to start balance with target system profile %llu", 3202 bctl->sys.target); 3203 ret = -EINVAL; 3204 goto out; 3205 } 3206 3207 /* allow dup'ed data chunks only in mixed mode */ 3208 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3209 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3210 btrfs_err(fs_info, "dup for data is not allowed"); 3211 ret = -EINVAL; 3212 goto out; 3213 } 3214 3215 /* allow to reduce meta or sys integrity only if force set */ 3216 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3217 BTRFS_BLOCK_GROUP_RAID10 | 3218 BTRFS_BLOCK_GROUP_RAID5 | 3219 BTRFS_BLOCK_GROUP_RAID6; 3220 do { 3221 seq = read_seqbegin(&fs_info->profiles_lock); 3222 3223 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3224 (fs_info->avail_system_alloc_bits & allowed) && 3225 !(bctl->sys.target & allowed)) || 3226 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3227 (fs_info->avail_metadata_alloc_bits & allowed) && 3228 !(bctl->meta.target & allowed))) { 3229 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3230 btrfs_info(fs_info, "force reducing metadata integrity"); 3231 } else { 3232 btrfs_err(fs_info, "balance will reduce metadata " 3233 "integrity, use force if you want this"); 3234 ret = -EINVAL; 3235 goto out; 3236 } 3237 } 3238 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3239 3240 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3241 int num_tolerated_disk_barrier_failures; 3242 u64 target = bctl->sys.target; 3243 3244 num_tolerated_disk_barrier_failures = 3245 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3246 if (num_tolerated_disk_barrier_failures > 0 && 3247 (target & 
3248 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3249 BTRFS_AVAIL_ALLOC_BIT_SINGLE))) 3250 num_tolerated_disk_barrier_failures = 0; 3251 else if (num_tolerated_disk_barrier_failures > 1 && 3252 (target & 3253 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) 3254 num_tolerated_disk_barrier_failures = 1; 3255 3256 fs_info->num_tolerated_disk_barrier_failures = 3257 num_tolerated_disk_barrier_failures; 3258 } 3259 3260 ret = insert_balance_item(fs_info->tree_root, bctl); 3261 if (ret && ret != -EEXIST) 3262 goto out; 3263 3264 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3265 BUG_ON(ret == -EEXIST); 3266 set_balance_control(bctl); 3267 } else { 3268 BUG_ON(ret != -EEXIST); 3269 spin_lock(&fs_info->balance_lock); 3270 update_balance_args(bctl); 3271 spin_unlock(&fs_info->balance_lock); 3272 } 3273 3274 atomic_inc(&fs_info->balance_running); 3275 mutex_unlock(&fs_info->balance_mutex); 3276 3277 ret = __btrfs_balance(fs_info); 3278 3279 mutex_lock(&fs_info->balance_mutex); 3280 atomic_dec(&fs_info->balance_running); 3281 3282 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3283 fs_info->num_tolerated_disk_barrier_failures = 3284 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3285 } 3286 3287 if (bargs) { 3288 memset(bargs, 0, sizeof(*bargs)); 3289 update_ioctl_balance_args(fs_info, 0, bargs); 3290 } 3291 3292 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3293 balance_need_close(fs_info)) { 3294 __cancel_balance(fs_info); 3295 } 3296 3297 wake_up(&fs_info->balance_wait_q); 3298 3299 return ret; 3300 out: 3301 if (bctl->flags & BTRFS_BALANCE_RESUME) 3302 __cancel_balance(fs_info); 3303 else { 3304 kfree(bctl); 3305 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3306 } 3307 return ret; 3308 } 3309 3310 static int balance_kthread(void *data) 3311 { 3312 struct btrfs_fs_info *fs_info = data; 3313 int ret = 0; 3314 3315 mutex_lock(&fs_info->volume_mutex); 3316 mutex_lock(&fs_info->balance_mutex); 3317 3318 if (fs_info->balance_ctl) { 3319 btrfs_info(fs_info, "continuing balance"); 3320 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3321 } 3322 3323 mutex_unlock(&fs_info->balance_mutex); 3324 mutex_unlock(&fs_info->volume_mutex); 3325 3326 return ret; 3327 } 3328 3329 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3330 { 3331 struct task_struct *tsk; 3332 3333 spin_lock(&fs_info->balance_lock); 3334 if (!fs_info->balance_ctl) { 3335 spin_unlock(&fs_info->balance_lock); 3336 return 0; 3337 } 3338 spin_unlock(&fs_info->balance_lock); 3339 3340 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3341 btrfs_info(fs_info, "force skipping balance"); 3342 return 0; 3343 } 3344 3345 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3346 return PTR_ERR_OR_ZERO(tsk); 3347 } 3348 3349 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3350 { 3351 struct btrfs_balance_control *bctl; 3352 struct btrfs_balance_item *item; 3353 struct btrfs_disk_balance_args disk_bargs; 3354 struct btrfs_path *path; 3355 struct extent_buffer *leaf; 3356 struct btrfs_key key; 3357 int ret; 3358 3359 path = btrfs_alloc_path(); 3360 if (!path) 3361 return -ENOMEM; 3362 3363 key.objectid = BTRFS_BALANCE_OBJECTID; 3364 key.type = BTRFS_BALANCE_ITEM_KEY; 3365 key.offset = 0; 3366 3367 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3368 if (ret < 0) 3369 goto out; 3370 if (ret > 0) { /* ret = -ENOENT; */ 3371 ret = 0; 3372 goto out; 3373 } 3374 3375 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3376 if (!bctl) { 3377 ret = -ENOMEM; 3378 
goto out; 3379 } 3380 3381 leaf = path->nodes[0]; 3382 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3383 3384 bctl->fs_info = fs_info; 3385 bctl->flags = btrfs_balance_flags(leaf, item); 3386 bctl->flags |= BTRFS_BALANCE_RESUME; 3387 3388 btrfs_balance_data(leaf, item, &disk_bargs); 3389 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3390 btrfs_balance_meta(leaf, item, &disk_bargs); 3391 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3392 btrfs_balance_sys(leaf, item, &disk_bargs); 3393 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3394 3395 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3396 3397 mutex_lock(&fs_info->volume_mutex); 3398 mutex_lock(&fs_info->balance_mutex); 3399 3400 set_balance_control(bctl); 3401 3402 mutex_unlock(&fs_info->balance_mutex); 3403 mutex_unlock(&fs_info->volume_mutex); 3404 out: 3405 btrfs_free_path(path); 3406 return ret; 3407 } 3408 3409 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3410 { 3411 int ret = 0; 3412 3413 mutex_lock(&fs_info->balance_mutex); 3414 if (!fs_info->balance_ctl) { 3415 mutex_unlock(&fs_info->balance_mutex); 3416 return -ENOTCONN; 3417 } 3418 3419 if (atomic_read(&fs_info->balance_running)) { 3420 atomic_inc(&fs_info->balance_pause_req); 3421 mutex_unlock(&fs_info->balance_mutex); 3422 3423 wait_event(fs_info->balance_wait_q, 3424 atomic_read(&fs_info->balance_running) == 0); 3425 3426 mutex_lock(&fs_info->balance_mutex); 3427 /* we are good with balance_ctl ripped off from under us */ 3428 BUG_ON(atomic_read(&fs_info->balance_running)); 3429 atomic_dec(&fs_info->balance_pause_req); 3430 } else { 3431 ret = -ENOTCONN; 3432 } 3433 3434 mutex_unlock(&fs_info->balance_mutex); 3435 return ret; 3436 } 3437 3438 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 3439 { 3440 if (fs_info->sb->s_flags & MS_RDONLY) 3441 return -EROFS; 3442 3443 mutex_lock(&fs_info->balance_mutex); 3444 if (!fs_info->balance_ctl) { 3445 mutex_unlock(&fs_info->balance_mutex); 3446 return -ENOTCONN; 3447 } 3448 3449 atomic_inc(&fs_info->balance_cancel_req); 3450 /* 3451 * if we are running just wait and return, balance item is 3452 * deleted in btrfs_balance in this case 3453 */ 3454 if (atomic_read(&fs_info->balance_running)) { 3455 mutex_unlock(&fs_info->balance_mutex); 3456 wait_event(fs_info->balance_wait_q, 3457 atomic_read(&fs_info->balance_running) == 0); 3458 mutex_lock(&fs_info->balance_mutex); 3459 } else { 3460 /* __cancel_balance needs volume_mutex */ 3461 mutex_unlock(&fs_info->balance_mutex); 3462 mutex_lock(&fs_info->volume_mutex); 3463 mutex_lock(&fs_info->balance_mutex); 3464 3465 if (fs_info->balance_ctl) 3466 __cancel_balance(fs_info); 3467 3468 mutex_unlock(&fs_info->volume_mutex); 3469 } 3470 3471 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3472 atomic_dec(&fs_info->balance_cancel_req); 3473 mutex_unlock(&fs_info->balance_mutex); 3474 return 0; 3475 } 3476 3477 static int btrfs_uuid_scan_kthread(void *data) 3478 { 3479 struct btrfs_fs_info *fs_info = data; 3480 struct btrfs_root *root = fs_info->tree_root; 3481 struct btrfs_key key; 3482 struct btrfs_key max_key; 3483 struct btrfs_path *path = NULL; 3484 int ret = 0; 3485 struct extent_buffer *eb; 3486 int slot; 3487 struct btrfs_root_item root_item; 3488 u32 item_size; 3489 struct btrfs_trans_handle *trans = NULL; 3490 3491 path = btrfs_alloc_path(); 3492 if (!path) { 3493 ret = -ENOMEM; 3494 goto out; 3495 } 3496 3497 key.objectid = 0; 3498 key.type = 
BTRFS_ROOT_ITEM_KEY; 3499 key.offset = 0; 3500 3501 max_key.objectid = (u64)-1; 3502 max_key.type = BTRFS_ROOT_ITEM_KEY; 3503 max_key.offset = (u64)-1; 3504 3505 path->keep_locks = 1; 3506 3507 while (1) { 3508 ret = btrfs_search_forward(root, &key, path, 0); 3509 if (ret) { 3510 if (ret > 0) 3511 ret = 0; 3512 break; 3513 } 3514 3515 if (key.type != BTRFS_ROOT_ITEM_KEY || 3516 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 3517 key.objectid != BTRFS_FS_TREE_OBJECTID) || 3518 key.objectid > BTRFS_LAST_FREE_OBJECTID) 3519 goto skip; 3520 3521 eb = path->nodes[0]; 3522 slot = path->slots[0]; 3523 item_size = btrfs_item_size_nr(eb, slot); 3524 if (item_size < sizeof(root_item)) 3525 goto skip; 3526 3527 read_extent_buffer(eb, &root_item, 3528 btrfs_item_ptr_offset(eb, slot), 3529 (int)sizeof(root_item)); 3530 if (btrfs_root_refs(&root_item) == 0) 3531 goto skip; 3532 3533 if (!btrfs_is_empty_uuid(root_item.uuid) || 3534 !btrfs_is_empty_uuid(root_item.received_uuid)) { 3535 if (trans) 3536 goto update_tree; 3537 3538 btrfs_release_path(path); 3539 /* 3540 * 1 - subvol uuid item 3541 * 1 - received_subvol uuid item 3542 */ 3543 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 3544 if (IS_ERR(trans)) { 3545 ret = PTR_ERR(trans); 3546 break; 3547 } 3548 continue; 3549 } else { 3550 goto skip; 3551 } 3552 update_tree: 3553 if (!btrfs_is_empty_uuid(root_item.uuid)) { 3554 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3555 root_item.uuid, 3556 BTRFS_UUID_KEY_SUBVOL, 3557 key.objectid); 3558 if (ret < 0) { 3559 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3560 ret); 3561 break; 3562 } 3563 } 3564 3565 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 3566 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3567 root_item.received_uuid, 3568 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3569 key.objectid); 3570 if (ret < 0) { 3571 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3572 ret); 3573 break; 3574 } 3575 } 3576 3577 skip: 3578 if (trans) { 3579 ret = btrfs_end_transaction(trans, fs_info->uuid_root); 3580 trans = NULL; 3581 if (ret) 3582 break; 3583 } 3584 3585 btrfs_release_path(path); 3586 if (key.offset < (u64)-1) { 3587 key.offset++; 3588 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 3589 key.offset = 0; 3590 key.type = BTRFS_ROOT_ITEM_KEY; 3591 } else if (key.objectid < (u64)-1) { 3592 key.offset = 0; 3593 key.type = BTRFS_ROOT_ITEM_KEY; 3594 key.objectid++; 3595 } else { 3596 break; 3597 } 3598 cond_resched(); 3599 } 3600 3601 out: 3602 btrfs_free_path(path); 3603 if (trans && !IS_ERR(trans)) 3604 btrfs_end_transaction(trans, fs_info->uuid_root); 3605 if (ret) 3606 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 3607 else 3608 fs_info->update_uuid_tree_gen = 1; 3609 up(&fs_info->uuid_tree_rescan_sem); 3610 return 0; 3611 } 3612 3613 /* 3614 * Callback for btrfs_uuid_tree_iterate(). 3615 * returns: 3616 * 0 check succeeded, the entry is not outdated. 3617 * < 0 if an error occured. 3618 * > 0 if the check failed, which means the caller shall remove the entry. 
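 * btrfs_uuid_rescan_kthread() relies on this to prune outdated entries before re-adding any missing ones.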
3619 */ 3620 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 3621 u8 *uuid, u8 type, u64 subid) 3622 { 3623 struct btrfs_key key; 3624 int ret = 0; 3625 struct btrfs_root *subvol_root; 3626 3627 if (type != BTRFS_UUID_KEY_SUBVOL && 3628 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 3629 goto out; 3630 3631 key.objectid = subid; 3632 key.type = BTRFS_ROOT_ITEM_KEY; 3633 key.offset = (u64)-1; 3634 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 3635 if (IS_ERR(subvol_root)) { 3636 ret = PTR_ERR(subvol_root); 3637 if (ret == -ENOENT) 3638 ret = 1; 3639 goto out; 3640 } 3641 3642 switch (type) { 3643 case BTRFS_UUID_KEY_SUBVOL: 3644 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 3645 ret = 1; 3646 break; 3647 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 3648 if (memcmp(uuid, subvol_root->root_item.received_uuid, 3649 BTRFS_UUID_SIZE)) 3650 ret = 1; 3651 break; 3652 } 3653 3654 out: 3655 return ret; 3656 } 3657 3658 static int btrfs_uuid_rescan_kthread(void *data) 3659 { 3660 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3661 int ret; 3662 3663 /* 3664 * 1st step is to iterate through the existing UUID tree and 3665 * to delete all entries that contain outdated data. 3666 * 2nd step is to add all missing entries to the UUID tree. 3667 */ 3668 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3669 if (ret < 0) { 3670 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 3671 up(&fs_info->uuid_tree_rescan_sem); 3672 return ret; 3673 } 3674 return btrfs_uuid_scan_kthread(data); 3675 } 3676 3677 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 3678 { 3679 struct btrfs_trans_handle *trans; 3680 struct btrfs_root *tree_root = fs_info->tree_root; 3681 struct btrfs_root *uuid_root; 3682 struct task_struct *task; 3683 int ret; 3684 3685 /* 3686 * 1 - root node 3687 * 1 - root item 3688 */ 3689 trans = btrfs_start_transaction(tree_root, 2); 3690 if (IS_ERR(trans)) 3691 return PTR_ERR(trans); 3692 3693 uuid_root = btrfs_create_tree(trans, fs_info, 3694 BTRFS_UUID_TREE_OBJECTID); 3695 if (IS_ERR(uuid_root)) { 3696 btrfs_abort_transaction(trans, tree_root, 3697 PTR_ERR(uuid_root)); 3698 return PTR_ERR(uuid_root); 3699 } 3700 3701 fs_info->uuid_root = uuid_root; 3702 3703 ret = btrfs_commit_transaction(trans, tree_root); 3704 if (ret) 3705 return ret; 3706 3707 down(&fs_info->uuid_tree_rescan_sem); 3708 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3709 if (IS_ERR(task)) { 3710 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3711 btrfs_warn(fs_info, "failed to start uuid_scan task"); 3712 up(&fs_info->uuid_tree_rescan_sem); 3713 return PTR_ERR(task); 3714 } 3715 3716 return 0; 3717 } 3718 3719 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3720 { 3721 struct task_struct *task; 3722 3723 down(&fs_info->uuid_tree_rescan_sem); 3724 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3725 if (IS_ERR(task)) { 3726 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3727 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 3728 up(&fs_info->uuid_tree_rescan_sem); 3729 return PTR_ERR(task); 3730 } 3731 3732 return 0; 3733 } 3734 3735 /* 3736 * shrinking a device means finding all of the device extents past 3737 * the new size, and then following the back refs to the chunks. 
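 * Relocating such a chunk re-creates its data elsewhere; device->total_bytes is lowered first so no new allocation lands past the boundary.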
3738 * The chunk relocation code actually frees the device extent 3739 */ 3740 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3741 { 3742 struct btrfs_trans_handle *trans; 3743 struct btrfs_root *root = device->dev_root; 3744 struct btrfs_dev_extent *dev_extent = NULL; 3745 struct btrfs_path *path; 3746 u64 length; 3747 u64 chunk_tree; 3748 u64 chunk_objectid; 3749 u64 chunk_offset; 3750 int ret; 3751 int slot; 3752 int failed = 0; 3753 bool retried = false; 3754 struct extent_buffer *l; 3755 struct btrfs_key key; 3756 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3757 u64 old_total = btrfs_super_total_bytes(super_copy); 3758 u64 old_size = device->total_bytes; 3759 u64 diff = device->total_bytes - new_size; 3760 3761 if (device->is_tgtdev_for_dev_replace) 3762 return -EINVAL; 3763 3764 path = btrfs_alloc_path(); 3765 if (!path) 3766 return -ENOMEM; 3767 3768 path->reada = 2; 3769 3770 lock_chunks(root); 3771 3772 device->total_bytes = new_size; 3773 if (device->writeable) { 3774 device->fs_devices->total_rw_bytes -= diff; 3775 spin_lock(&root->fs_info->free_chunk_lock); 3776 root->fs_info->free_chunk_space -= diff; 3777 spin_unlock(&root->fs_info->free_chunk_lock); 3778 } 3779 unlock_chunks(root); 3780 3781 again: 3782 key.objectid = device->devid; 3783 key.offset = (u64)-1; 3784 key.type = BTRFS_DEV_EXTENT_KEY; 3785 3786 do { 3787 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3788 if (ret < 0) 3789 goto done; 3790 3791 ret = btrfs_previous_item(root, path, 0, key.type); 3792 if (ret < 0) 3793 goto done; 3794 if (ret) { 3795 ret = 0; 3796 btrfs_release_path(path); 3797 break; 3798 } 3799 3800 l = path->nodes[0]; 3801 slot = path->slots[0]; 3802 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 3803 3804 if (key.objectid != device->devid) { 3805 btrfs_release_path(path); 3806 break; 3807 } 3808 3809 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3810 length = btrfs_dev_extent_length(l, dev_extent); 3811 3812 if (key.offset + length <= new_size) { 3813 btrfs_release_path(path); 3814 break; 3815 } 3816 3817 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3818 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3819 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3820 btrfs_release_path(path); 3821 3822 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3823 chunk_offset); 3824 if (ret && ret != -ENOSPC) 3825 goto done; 3826 if (ret == -ENOSPC) 3827 failed++; 3828 } while (key.offset-- > 0); 3829 3830 if (failed && !retried) { 3831 failed = 0; 3832 retried = true; 3833 goto again; 3834 } else if (failed && retried) { 3835 ret = -ENOSPC; 3836 lock_chunks(root); 3837 3838 device->total_bytes = old_size; 3839 if (device->writeable) 3840 device->fs_devices->total_rw_bytes += diff; 3841 spin_lock(&root->fs_info->free_chunk_lock); 3842 root->fs_info->free_chunk_space += diff; 3843 spin_unlock(&root->fs_info->free_chunk_lock); 3844 unlock_chunks(root); 3845 goto done; 3846 } 3847 3848 /* Shrinking succeeded, else we would be at "done". */ 3849 trans = btrfs_start_transaction(root, 0); 3850 if (IS_ERR(trans)) { 3851 ret = PTR_ERR(trans); 3852 goto done; 3853 } 3854 3855 lock_chunks(root); 3856 3857 device->disk_total_bytes = new_size; 3858 /* Now btrfs_update_device() will change the on-disk size. 
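 * (device->total_bytes was shrunk earlier; here disk_total_bytes follows it, and the super block total is reduced by the same diff below.)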
*/ 3859 ret = btrfs_update_device(trans, device); 3860 if (ret) { 3861 unlock_chunks(root); 3862 btrfs_end_transaction(trans, root); 3863 goto done; 3864 } 3865 WARN_ON(diff > old_total); 3866 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3867 unlock_chunks(root); 3868 btrfs_end_transaction(trans, root); 3869 done: 3870 btrfs_free_path(path); 3871 return ret; 3872 } 3873 3874 static int btrfs_add_system_chunk(struct btrfs_root *root, 3875 struct btrfs_key *key, 3876 struct btrfs_chunk *chunk, int item_size) 3877 { 3878 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3879 struct btrfs_disk_key disk_key; 3880 u32 array_size; 3881 u8 *ptr; 3882 3883 array_size = btrfs_super_sys_array_size(super_copy); 3884 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3885 return -EFBIG; 3886 3887 ptr = super_copy->sys_chunk_array + array_size; 3888 btrfs_cpu_key_to_disk(&disk_key, key); 3889 memcpy(ptr, &disk_key, sizeof(disk_key)); 3890 ptr += sizeof(disk_key); 3891 memcpy(ptr, chunk, item_size); 3892 item_size += sizeof(disk_key); 3893 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3894 return 0; 3895 } 3896 3897 /* 3898 * sort the devices in descending order by max_avail, total_avail 3899 */ 3900 static int btrfs_cmp_device_info(const void *a, const void *b) 3901 { 3902 const struct btrfs_device_info *di_a = a; 3903 const struct btrfs_device_info *di_b = b; 3904 3905 if (di_a->max_avail > di_b->max_avail) 3906 return -1; 3907 if (di_a->max_avail < di_b->max_avail) 3908 return 1; 3909 if (di_a->total_avail > di_b->total_avail) 3910 return -1; 3911 if (di_a->total_avail < di_b->total_avail) 3912 return 1; 3913 return 0; 3914 } 3915 3916 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3917 [BTRFS_RAID_RAID10] = { 3918 .sub_stripes = 2, 3919 .dev_stripes = 1, 3920 .devs_max = 0, /* 0 == as many as possible */ 3921 .devs_min = 4, 3922 .devs_increment = 2, 3923 .ncopies = 2, 3924 }, 3925 [BTRFS_RAID_RAID1] = { 3926 .sub_stripes = 1, 3927 .dev_stripes = 1, 3928 .devs_max = 2, 3929 .devs_min = 2, 3930 .devs_increment = 2, 3931 .ncopies = 2, 3932 }, 3933 [BTRFS_RAID_DUP] = { 3934 .sub_stripes = 1, 3935 .dev_stripes = 2, 3936 .devs_max = 1, 3937 .devs_min = 1, 3938 .devs_increment = 1, 3939 .ncopies = 2, 3940 }, 3941 [BTRFS_RAID_RAID0] = { 3942 .sub_stripes = 1, 3943 .dev_stripes = 1, 3944 .devs_max = 0, 3945 .devs_min = 2, 3946 .devs_increment = 1, 3947 .ncopies = 1, 3948 }, 3949 [BTRFS_RAID_SINGLE] = { 3950 .sub_stripes = 1, 3951 .dev_stripes = 1, 3952 .devs_max = 1, 3953 .devs_min = 1, 3954 .devs_increment = 1, 3955 .ncopies = 1, 3956 }, 3957 [BTRFS_RAID_RAID5] = { 3958 .sub_stripes = 1, 3959 .dev_stripes = 1, 3960 .devs_max = 0, 3961 .devs_min = 2, 3962 .devs_increment = 1, 3963 .ncopies = 2, 3964 }, 3965 [BTRFS_RAID_RAID6] = { 3966 .sub_stripes = 1, 3967 .dev_stripes = 1, 3968 .devs_max = 0, 3969 .devs_min = 3, 3970 .devs_increment = 1, 3971 .ncopies = 3, 3972 }, 3973 }; 3974 3975 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 3976 { 3977 /* TODO allow them to set a preferred stripe size */ 3978 return 64 * 1024; 3979 } 3980 3981 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 3982 { 3983 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 3984 return; 3985 3986 btrfs_set_fs_incompat(info, RAID56); 3987 } 3988 3989 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3990 struct btrfs_root *extent_root, u64 start, 3991 u64 type) 3992 { 
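	/* Rough outline: pick the RAID parameters for this profile from btrfs_raid_array, collect the largest free hole on every writable device, sort devices by that hole, then size the stripes so the resulting chunk stays within max_chunk_size. */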
3993 struct btrfs_fs_info *info = extent_root->fs_info; 3994 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3995 struct list_head *cur; 3996 struct map_lookup *map = NULL; 3997 struct extent_map_tree *em_tree; 3998 struct extent_map *em; 3999 struct btrfs_device_info *devices_info = NULL; 4000 u64 total_avail; 4001 int num_stripes; /* total number of stripes to allocate */ 4002 int data_stripes; /* number of stripes that count for 4003 block group size */ 4004 int sub_stripes; /* sub_stripes info for map */ 4005 int dev_stripes; /* stripes per dev */ 4006 int devs_max; /* max devs to use */ 4007 int devs_min; /* min devs needed */ 4008 int devs_increment; /* ndevs has to be a multiple of this */ 4009 int ncopies; /* how many copies to data has */ 4010 int ret; 4011 u64 max_stripe_size; 4012 u64 max_chunk_size; 4013 u64 stripe_size; 4014 u64 num_bytes; 4015 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 4016 int ndevs; 4017 int i; 4018 int j; 4019 int index; 4020 4021 BUG_ON(!alloc_profile_is_valid(type, 0)); 4022 4023 if (list_empty(&fs_devices->alloc_list)) 4024 return -ENOSPC; 4025 4026 index = __get_raid_index(type); 4027 4028 sub_stripes = btrfs_raid_array[index].sub_stripes; 4029 dev_stripes = btrfs_raid_array[index].dev_stripes; 4030 devs_max = btrfs_raid_array[index].devs_max; 4031 devs_min = btrfs_raid_array[index].devs_min; 4032 devs_increment = btrfs_raid_array[index].devs_increment; 4033 ncopies = btrfs_raid_array[index].ncopies; 4034 4035 if (type & BTRFS_BLOCK_GROUP_DATA) { 4036 max_stripe_size = 1024 * 1024 * 1024; 4037 max_chunk_size = 10 * max_stripe_size; 4038 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4039 /* for larger filesystems, use larger metadata chunks */ 4040 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4041 max_stripe_size = 1024 * 1024 * 1024; 4042 else 4043 max_stripe_size = 256 * 1024 * 1024; 4044 max_chunk_size = max_stripe_size; 4045 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4046 max_stripe_size = 32 * 1024 * 1024; 4047 max_chunk_size = 2 * max_stripe_size; 4048 } else { 4049 btrfs_err(info, "invalid chunk type 0x%llx requested\n", 4050 type); 4051 BUG_ON(1); 4052 } 4053 4054 /* we don't want a chunk larger than 10% of writeable space */ 4055 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4056 max_chunk_size); 4057 4058 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 4059 GFP_NOFS); 4060 if (!devices_info) 4061 return -ENOMEM; 4062 4063 cur = fs_devices->alloc_list.next; 4064 4065 /* 4066 * in the first pass through the devices list, we gather information 4067 * about the available holes on each device. 4068 */ 4069 ndevs = 0; 4070 while (cur != &fs_devices->alloc_list) { 4071 struct btrfs_device *device; 4072 u64 max_avail; 4073 u64 dev_offset; 4074 4075 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4076 4077 cur = cur->next; 4078 4079 if (!device->writeable) { 4080 WARN(1, KERN_ERR 4081 "BTRFS: read-only device in alloc_list\n"); 4082 continue; 4083 } 4084 4085 if (!device->in_fs_metadata || 4086 device->is_tgtdev_for_dev_replace) 4087 continue; 4088 4089 if (device->total_bytes > device->bytes_used) 4090 total_avail = device->total_bytes - device->bytes_used; 4091 else 4092 total_avail = 0; 4093 4094 /* If there is no space on this device, skip it. 
*/ 4095 if (total_avail == 0) 4096 continue; 4097 4098 ret = find_free_dev_extent(trans, device, 4099 max_stripe_size * dev_stripes, 4100 &dev_offset, &max_avail); 4101 if (ret && ret != -ENOSPC) 4102 goto error; 4103 4104 if (ret == 0) 4105 max_avail = max_stripe_size * dev_stripes; 4106 4107 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4108 continue; 4109 4110 if (ndevs == fs_devices->rw_devices) { 4111 WARN(1, "%s: found more than %llu devices\n", 4112 __func__, fs_devices->rw_devices); 4113 break; 4114 } 4115 devices_info[ndevs].dev_offset = dev_offset; 4116 devices_info[ndevs].max_avail = max_avail; 4117 devices_info[ndevs].total_avail = total_avail; 4118 devices_info[ndevs].dev = device; 4119 ++ndevs; 4120 } 4121 4122 /* 4123 * now sort the devices by hole size / available space 4124 */ 4125 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4126 btrfs_cmp_device_info, NULL); 4127 4128 /* round down to number of usable stripes */ 4129 ndevs -= ndevs % devs_increment; 4130 4131 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4132 ret = -ENOSPC; 4133 goto error; 4134 } 4135 4136 if (devs_max && ndevs > devs_max) 4137 ndevs = devs_max; 4138 /* 4139 * the primary goal is to maximize the number of stripes, so use as many 4140 * devices as possible, even if the stripes are not maximum sized. 4141 */ 4142 stripe_size = devices_info[ndevs-1].max_avail; 4143 num_stripes = ndevs * dev_stripes; 4144 4145 /* 4146 * this will have to be fixed for RAID1 and RAID10 over 4147 * more drives 4148 */ 4149 data_stripes = num_stripes / ncopies; 4150 4151 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4152 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4153 btrfs_super_stripesize(info->super_copy)); 4154 data_stripes = num_stripes - 1; 4155 } 4156 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4157 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4158 btrfs_super_stripesize(info->super_copy)); 4159 data_stripes = num_stripes - 2; 4160 } 4161 4162 /* 4163 * Use the number of data stripes to figure out how big this chunk 4164 * is really going to be in terms of logical address space, 4165 * and compare that answer with the max chunk size 4166 */ 4167 if (stripe_size * data_stripes > max_chunk_size) { 4168 u64 mask = (1ULL << 24) - 1; 4169 stripe_size = max_chunk_size; 4170 do_div(stripe_size, data_stripes); 4171 4172 /* bump the answer up to a 16MB boundary */ 4173 stripe_size = (stripe_size + mask) & ~mask; 4174 4175 /* but don't go higher than the limits we found 4176 * while searching for free extents 4177 */ 4178 if (stripe_size > devices_info[ndevs-1].max_avail) 4179 stripe_size = devices_info[ndevs-1].max_avail; 4180 } 4181 4182 do_div(stripe_size, dev_stripes); 4183 4184 /* align to BTRFS_STRIPE_LEN */ 4185 do_div(stripe_size, raid_stripe_len); 4186 stripe_size *= raid_stripe_len; 4187 4188 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4189 if (!map) { 4190 ret = -ENOMEM; 4191 goto error; 4192 } 4193 map->num_stripes = num_stripes; 4194 4195 for (i = 0; i < ndevs; ++i) { 4196 for (j = 0; j < dev_stripes; ++j) { 4197 int s = i * dev_stripes + j; 4198 map->stripes[s].dev = devices_info[i].dev; 4199 map->stripes[s].physical = devices_info[i].dev_offset + 4200 j * stripe_size; 4201 } 4202 } 4203 map->sector_size = extent_root->sectorsize; 4204 map->stripe_len = raid_stripe_len; 4205 map->io_align = raid_stripe_len; 4206 map->io_width = raid_stripe_len; 4207 map->type = type; 4208 map->sub_stripes = sub_stripes; 4209 4210 num_bytes = stripe_size * data_stripes; 4211 4212 
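	/*
	 * Worked example with hypothetical numbers: a DATA chunk of type
	 * RAID6 on ndevs = 6 devices with dev_stripes = 1 gives
	 * num_stripes = 6 and data_stripes = 6 - 2 = 4.  If the smallest
	 * selected hole is 1GiB, stripe_size starts at 1GiB; 1GiB * 4 = 4GiB
	 * does not exceed max_chunk_size (10GiB for data, assuming the 10%
	 * writeable-space cap does not bind), and 1GiB is already a multiple
	 * of the 64KiB raid_stripe_len, so the chunk covers
	 * num_bytes = 4GiB of logical address space while consuming 1GiB on
	 * each of the 6 devices.
	 */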
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 4213 4214 em = alloc_extent_map(); 4215 if (!em) { 4216 ret = -ENOMEM; 4217 goto error; 4218 } 4219 em->bdev = (struct block_device *)map; 4220 em->start = start; 4221 em->len = num_bytes; 4222 em->block_start = 0; 4223 em->block_len = em->len; 4224 em->orig_block_len = stripe_size; 4225 4226 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4227 write_lock(&em_tree->lock); 4228 ret = add_extent_mapping(em_tree, em, 0); 4229 if (!ret) { 4230 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4231 atomic_inc(&em->refs); 4232 } 4233 write_unlock(&em_tree->lock); 4234 if (ret) { 4235 free_extent_map(em); 4236 goto error; 4237 } 4238 4239 ret = btrfs_make_block_group(trans, extent_root, 0, type, 4240 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4241 start, num_bytes); 4242 if (ret) 4243 goto error_del_extent; 4244 4245 free_extent_map(em); 4246 check_raid56_incompat_flag(extent_root->fs_info, type); 4247 4248 kfree(devices_info); 4249 return 0; 4250 4251 error_del_extent: 4252 write_lock(&em_tree->lock); 4253 remove_extent_mapping(em_tree, em); 4254 write_unlock(&em_tree->lock); 4255 4256 /* One for our allocation */ 4257 free_extent_map(em); 4258 /* One for the tree reference */ 4259 free_extent_map(em); 4260 error: 4261 kfree(map); 4262 kfree(devices_info); 4263 return ret; 4264 } 4265 4266 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4267 struct btrfs_root *extent_root, 4268 u64 chunk_offset, u64 chunk_size) 4269 { 4270 struct btrfs_key key; 4271 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 4272 struct btrfs_device *device; 4273 struct btrfs_chunk *chunk; 4274 struct btrfs_stripe *stripe; 4275 struct extent_map_tree *em_tree; 4276 struct extent_map *em; 4277 struct map_lookup *map; 4278 size_t item_size; 4279 u64 dev_offset; 4280 u64 stripe_size; 4281 int i = 0; 4282 int ret; 4283 4284 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4285 read_lock(&em_tree->lock); 4286 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4287 read_unlock(&em_tree->lock); 4288 4289 if (!em) { 4290 btrfs_crit(extent_root->fs_info, "unable to find logical " 4291 "%Lu len %Lu", chunk_offset, chunk_size); 4292 return -EINVAL; 4293 } 4294 4295 if (em->start != chunk_offset || em->len != chunk_size) { 4296 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4297 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, 4298 chunk_size, em->start, em->len); 4299 free_extent_map(em); 4300 return -EINVAL; 4301 } 4302 4303 map = (struct map_lookup *)em->bdev; 4304 item_size = btrfs_chunk_item_size(map->num_stripes); 4305 stripe_size = em->orig_block_len; 4306 4307 chunk = kzalloc(item_size, GFP_NOFS); 4308 if (!chunk) { 4309 ret = -ENOMEM; 4310 goto out; 4311 } 4312 4313 for (i = 0; i < map->num_stripes; i++) { 4314 device = map->stripes[i].dev; 4315 dev_offset = map->stripes[i].physical; 4316 4317 device->bytes_used += stripe_size; 4318 ret = btrfs_update_device(trans, device); 4319 if (ret) 4320 goto out; 4321 ret = btrfs_alloc_dev_extent(trans, device, 4322 chunk_root->root_key.objectid, 4323 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4324 chunk_offset, dev_offset, 4325 stripe_size); 4326 if (ret) 4327 goto out; 4328 } 4329 4330 spin_lock(&extent_root->fs_info->free_chunk_lock); 4331 extent_root->fs_info->free_chunk_space -= (stripe_size * 4332 map->num_stripes); 4333 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4334 4335 stripe = &chunk->stripe; 4336 for (i = 0; i < map->num_stripes; i++) { 
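		/*
		 * Copy each stripe of the in-memory map into the on-disk
		 * chunk item: the backing device's id, the physical offset
		 * on that device, and the device uuid.
		 */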
4337 device = map->stripes[i].dev; 4338 dev_offset = map->stripes[i].physical; 4339 4340 btrfs_set_stack_stripe_devid(stripe, device->devid); 4341 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4342 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4343 stripe++; 4344 } 4345 4346 btrfs_set_stack_chunk_length(chunk, chunk_size); 4347 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4348 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4349 btrfs_set_stack_chunk_type(chunk, map->type); 4350 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4351 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4352 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4353 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 4354 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4355 4356 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4357 key.type = BTRFS_CHUNK_ITEM_KEY; 4358 key.offset = chunk_offset; 4359 4360 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4361 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4362 /* 4363 * TODO: Cleanup of inserted chunk root in case of 4364 * failure. 4365 */ 4366 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 4367 item_size); 4368 } 4369 4370 out: 4371 kfree(chunk); 4372 free_extent_map(em); 4373 return ret; 4374 } 4375 4376 /* 4377 * Chunk allocation falls into two parts. The first part does the work 4378 * that makes the newly allocated chunk usable, but does not do any 4379 * operation that modifies the chunk tree. The second part does the work 4380 * that requires modifying the chunk tree. This division is important for 4381 * the bootstrap process of adding storage to a seed btrfs. 4382 */ 4383 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4384 struct btrfs_root *extent_root, u64 type) 4385 { 4386 u64 chunk_offset; 4387 4388 chunk_offset = find_next_chunk(extent_root->fs_info); 4389 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); 4390 } 4391 4392 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4393 struct btrfs_root *root, 4394 struct btrfs_device *device) 4395 { 4396 u64 chunk_offset; 4397 u64 sys_chunk_offset; 4398 u64 alloc_profile; 4399 struct btrfs_fs_info *fs_info = root->fs_info; 4400 struct btrfs_root *extent_root = fs_info->extent_root; 4401 int ret; 4402 4403 chunk_offset = find_next_chunk(fs_info); 4404 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4405 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, 4406 alloc_profile); 4407 if (ret) 4408 return ret; 4409 4410 sys_chunk_offset = find_next_chunk(root->fs_info); 4411 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4412 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4413 alloc_profile); 4414 if (ret) { 4415 btrfs_abort_transaction(trans, root, ret); 4416 goto out; 4417 } 4418 4419 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4420 if (ret) 4421 btrfs_abort_transaction(trans, root, ret); 4422 out: 4423 return ret; 4424 } 4425 4426 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4427 { 4428 struct extent_map *em; 4429 struct map_lookup *map; 4430 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4431 int readonly = 0; 4432 int i; 4433 4434 read_lock(&map_tree->map_tree.lock); 4435 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 4436 read_unlock(&map_tree->map_tree.lock); 4437 if (!em) 4438 return 1; 4439 4440 if
(btrfs_test_opt(root, DEGRADED)) { 4441 free_extent_map(em); 4442 return 0; 4443 } 4444 4445 map = (struct map_lookup *)em->bdev; 4446 for (i = 0; i < map->num_stripes; i++) { 4447 if (!map->stripes[i].dev->writeable) { 4448 readonly = 1; 4449 break; 4450 } 4451 } 4452 free_extent_map(em); 4453 return readonly; 4454 } 4455 4456 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 4457 { 4458 extent_map_tree_init(&tree->map_tree); 4459 } 4460 4461 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 4462 { 4463 struct extent_map *em; 4464 4465 while (1) { 4466 write_lock(&tree->map_tree.lock); 4467 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 4468 if (em) 4469 remove_extent_mapping(&tree->map_tree, em); 4470 write_unlock(&tree->map_tree.lock); 4471 if (!em) 4472 break; 4473 kfree(em->bdev); 4474 /* once for us */ 4475 free_extent_map(em); 4476 /* once for the tree */ 4477 free_extent_map(em); 4478 } 4479 } 4480 4481 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 4482 { 4483 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4484 struct extent_map *em; 4485 struct map_lookup *map; 4486 struct extent_map_tree *em_tree = &map_tree->map_tree; 4487 int ret; 4488 4489 read_lock(&em_tree->lock); 4490 em = lookup_extent_mapping(em_tree, logical, len); 4491 read_unlock(&em_tree->lock); 4492 4493 /* 4494 * We could return errors for these cases, but that could get ugly and 4495 * we'd probably do the same thing which is just not do anything else 4496 * and exit, so return 1 so the callers don't try to use other copies. 4497 */ 4498 if (!em) { 4499 btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, 4500 logical+len); 4501 return 1; 4502 } 4503 4504 if (em->start > logical || em->start + em->len < logical) { 4505 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4506 "%Lu-%Lu\n", logical, logical+len, em->start, 4507 em->start + em->len); 4508 free_extent_map(em); 4509 return 1; 4510 } 4511 4512 map = (struct map_lookup *)em->bdev; 4513 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 4514 ret = map->num_stripes; 4515 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4516 ret = map->sub_stripes; 4517 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4518 ret = 2; 4519 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4520 ret = 3; 4521 else 4522 ret = 1; 4523 free_extent_map(em); 4524 4525 btrfs_dev_replace_lock(&fs_info->dev_replace); 4526 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 4527 ret++; 4528 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4529 4530 return ret; 4531 } 4532 4533 unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4534 struct btrfs_mapping_tree *map_tree, 4535 u64 logical) 4536 { 4537 struct extent_map *em; 4538 struct map_lookup *map; 4539 struct extent_map_tree *em_tree = &map_tree->map_tree; 4540 unsigned long len = root->sectorsize; 4541 4542 read_lock(&em_tree->lock); 4543 em = lookup_extent_mapping(em_tree, logical, len); 4544 read_unlock(&em_tree->lock); 4545 BUG_ON(!em); 4546 4547 BUG_ON(em->start > logical || em->start + em->len < logical); 4548 map = (struct map_lookup *)em->bdev; 4549 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4550 BTRFS_BLOCK_GROUP_RAID6)) { 4551 len = map->stripe_len * nr_data_stripes(map); 4552 } 4553 free_extent_map(em); 4554 return len; 4555 } 4556 4557 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4558 u64 logical, u64 len, int mirror_num) 4559 { 4560 struct extent_map *em; 4561 struct map_lookup *map; 4562 struct 
extent_map_tree *em_tree = &map_tree->map_tree; 4563 int ret = 0; 4564 4565 read_lock(&em_tree->lock); 4566 em = lookup_extent_mapping(em_tree, logical, len); 4567 read_unlock(&em_tree->lock); 4568 BUG_ON(!em); 4569 4570 BUG_ON(em->start > logical || em->start + em->len < logical); 4571 map = (struct map_lookup *)em->bdev; 4572 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4573 BTRFS_BLOCK_GROUP_RAID6)) 4574 ret = 1; 4575 free_extent_map(em); 4576 return ret; 4577 } 4578 4579 static int find_live_mirror(struct btrfs_fs_info *fs_info, 4580 struct map_lookup *map, int first, int num, 4581 int optimal, int dev_replace_is_ongoing) 4582 { 4583 int i; 4584 int tolerance; 4585 struct btrfs_device *srcdev; 4586 4587 if (dev_replace_is_ongoing && 4588 fs_info->dev_replace.cont_reading_from_srcdev_mode == 4589 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 4590 srcdev = fs_info->dev_replace.srcdev; 4591 else 4592 srcdev = NULL; 4593 4594 /* 4595 * try to avoid the drive that is the source drive for a 4596 * dev-replace procedure, only choose it if no other non-missing 4597 * mirror is available 4598 */ 4599 for (tolerance = 0; tolerance < 2; tolerance++) { 4600 if (map->stripes[optimal].dev->bdev && 4601 (tolerance || map->stripes[optimal].dev != srcdev)) 4602 return optimal; 4603 for (i = first; i < first + num; i++) { 4604 if (map->stripes[i].dev->bdev && 4605 (tolerance || map->stripes[i].dev != srcdev)) 4606 return i; 4607 } 4608 } 4609 4610 /* we couldn't find one that doesn't fail. Just return something 4611 * and the io error handling code will clean up eventually 4612 */ 4613 return optimal; 4614 } 4615 4616 static inline int parity_smaller(u64 a, u64 b) 4617 { 4618 return a > b; 4619 } 4620 4621 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4622 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4623 { 4624 struct btrfs_bio_stripe s; 4625 int i; 4626 u64 l; 4627 int again = 1; 4628 4629 while (again) { 4630 again = 0; 4631 for (i = 0; i < bbio->num_stripes - 1; i++) { 4632 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4633 s = bbio->stripes[i]; 4634 l = raid_map[i]; 4635 bbio->stripes[i] = bbio->stripes[i+1]; 4636 raid_map[i] = raid_map[i+1]; 4637 bbio->stripes[i+1] = s; 4638 raid_map[i+1] = l; 4639 again = 1; 4640 } 4641 } 4642 } 4643 } 4644 4645 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4646 u64 logical, u64 *length, 4647 struct btrfs_bio **bbio_ret, 4648 int mirror_num, u64 **raid_map_ret) 4649 { 4650 struct extent_map *em; 4651 struct map_lookup *map; 4652 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4653 struct extent_map_tree *em_tree = &map_tree->map_tree; 4654 u64 offset; 4655 u64 stripe_offset; 4656 u64 stripe_end_offset; 4657 u64 stripe_nr; 4658 u64 stripe_nr_orig; 4659 u64 stripe_nr_end; 4660 u64 stripe_len; 4661 u64 *raid_map = NULL; 4662 int stripe_index; 4663 int i; 4664 int ret = 0; 4665 int num_stripes; 4666 int max_errors = 0; 4667 struct btrfs_bio *bbio = NULL; 4668 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4669 int dev_replace_is_ongoing = 0; 4670 int num_alloc_stripes; 4671 int patch_the_first_stripe_for_dev_replace = 0; 4672 u64 physical_to_patch_in_first_stripe = 0; 4673 u64 raid56_full_stripe_start = (u64)-1; 4674 4675 read_lock(&em_tree->lock); 4676 em = lookup_extent_mapping(em_tree, logical, *length); 4677 read_unlock(&em_tree->lock); 4678 4679 if (!em) { 4680 btrfs_crit(fs_info, "unable to find logical %llu len %llu", 4681 logical, *length); 
4682 return -EINVAL; 4683 } 4684 4685 if (em->start > logical || em->start + em->len < logical) { 4686 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4687 "found %Lu-%Lu\n", logical, em->start, 4688 em->start + em->len); 4689 free_extent_map(em); 4690 return -EINVAL; 4691 } 4692 4693 map = (struct map_lookup *)em->bdev; 4694 offset = logical - em->start; 4695 4696 stripe_len = map->stripe_len; 4697 stripe_nr = offset; 4698 /* 4699 * stripe_nr counts the total number of stripes we have to stride 4700 * to get to this block 4701 */ 4702 do_div(stripe_nr, stripe_len); 4703 4704 stripe_offset = stripe_nr * stripe_len; 4705 BUG_ON(offset < stripe_offset); 4706 4707 /* stripe_offset is the offset of this block in its stripe*/ 4708 stripe_offset = offset - stripe_offset; 4709 4710 /* if we're here for raid56, we need to know the stripe aligned start */ 4711 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4712 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4713 raid56_full_stripe_start = offset; 4714 4715 /* allow a write of a full stripe, but make sure we don't 4716 * allow straddling of stripes 4717 */ 4718 do_div(raid56_full_stripe_start, full_stripe_len); 4719 raid56_full_stripe_start *= full_stripe_len; 4720 } 4721 4722 if (rw & REQ_DISCARD) { 4723 /* we don't discard raid56 yet */ 4724 if (map->type & 4725 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4726 ret = -EOPNOTSUPP; 4727 goto out; 4728 } 4729 *length = min_t(u64, em->len - offset, *length); 4730 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4731 u64 max_len; 4732 /* For writes to RAID[56], allow a full stripeset across all disks. 4733 For other RAID types and for RAID[56] reads, just allow a single 4734 stripe (on a single disk). */ 4735 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 4736 (rw & REQ_WRITE)) { 4737 max_len = stripe_len * nr_data_stripes(map) - 4738 (offset - raid56_full_stripe_start); 4739 } else { 4740 /* we limit the length of each bio to what fits in a stripe */ 4741 max_len = stripe_len - stripe_offset; 4742 } 4743 *length = min_t(u64, em->len - offset, max_len); 4744 } else { 4745 *length = em->len - offset; 4746 } 4747 4748 /* This is for when we're called from btrfs_merge_bio_hook() and all 4749 it cares about is the length */ 4750 if (!bbio_ret) 4751 goto out; 4752 4753 btrfs_dev_replace_lock(dev_replace); 4754 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 4755 if (!dev_replace_is_ongoing) 4756 btrfs_dev_replace_unlock(dev_replace); 4757 4758 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 4759 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 4760 dev_replace->tgtdev != NULL) { 4761 /* 4762 * in dev-replace case, for repair case (that's the only 4763 * case where the mirror is selected explicitly when 4764 * calling btrfs_map_block), blocks left of the left cursor 4765 * can also be read from the target drive. 4766 * For REQ_GET_READ_MIRRORS, the target drive is added as 4767 * the last one to the array of stripes. For READ, it also 4768 * needs to be supported using the same mirror number. 4769 * If the requested block is not left of the left cursor, 4770 * EIO is returned. This can happen because btrfs_num_copies() 4771 * returns one more in the dev-replace case. 
4772 */ 4773 u64 tmp_length = *length; 4774 struct btrfs_bio *tmp_bbio = NULL; 4775 int tmp_num_stripes; 4776 u64 srcdev_devid = dev_replace->srcdev->devid; 4777 int index_srcdev = 0; 4778 int found = 0; 4779 u64 physical_of_found = 0; 4780 4781 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4782 logical, &tmp_length, &tmp_bbio, 0, NULL); 4783 if (ret) { 4784 WARN_ON(tmp_bbio != NULL); 4785 goto out; 4786 } 4787 4788 tmp_num_stripes = tmp_bbio->num_stripes; 4789 if (mirror_num > tmp_num_stripes) { 4790 /* 4791 * REQ_GET_READ_MIRRORS does not contain this 4792 * mirror, that means that the requested area 4793 * is not left of the left cursor 4794 */ 4795 ret = -EIO; 4796 kfree(tmp_bbio); 4797 goto out; 4798 } 4799 4800 /* 4801 * process the rest of the function using the mirror_num 4802 * of the source drive. Therefore look it up first. 4803 * At the end, patch the device pointer to the one of the 4804 * target drive. 4805 */ 4806 for (i = 0; i < tmp_num_stripes; i++) { 4807 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 4808 /* 4809 * In case of DUP, in order to keep it 4810 * simple, only add the mirror with the 4811 * lowest physical address 4812 */ 4813 if (found && 4814 physical_of_found <= 4815 tmp_bbio->stripes[i].physical) 4816 continue; 4817 index_srcdev = i; 4818 found = 1; 4819 physical_of_found = 4820 tmp_bbio->stripes[i].physical; 4821 } 4822 } 4823 4824 if (found) { 4825 mirror_num = index_srcdev + 1; 4826 patch_the_first_stripe_for_dev_replace = 1; 4827 physical_to_patch_in_first_stripe = physical_of_found; 4828 } else { 4829 WARN_ON(1); 4830 ret = -EIO; 4831 kfree(tmp_bbio); 4832 goto out; 4833 } 4834 4835 kfree(tmp_bbio); 4836 } else if (mirror_num > map->num_stripes) { 4837 mirror_num = 0; 4838 } 4839 4840 num_stripes = 1; 4841 stripe_index = 0; 4842 stripe_nr_orig = stripe_nr; 4843 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 4844 do_div(stripe_nr_end, map->stripe_len); 4845 stripe_end_offset = stripe_nr_end * map->stripe_len - 4846 (offset + *length); 4847 4848 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4849 if (rw & REQ_DISCARD) 4850 num_stripes = min_t(u64, map->num_stripes, 4851 stripe_nr_end - stripe_nr_orig); 4852 stripe_index = do_div(stripe_nr, map->num_stripes); 4853 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 4854 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 4855 num_stripes = map->num_stripes; 4856 else if (mirror_num) 4857 stripe_index = mirror_num - 1; 4858 else { 4859 stripe_index = find_live_mirror(fs_info, map, 0, 4860 map->num_stripes, 4861 current->pid % map->num_stripes, 4862 dev_replace_is_ongoing); 4863 mirror_num = stripe_index + 1; 4864 } 4865 4866 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 4867 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 4868 num_stripes = map->num_stripes; 4869 } else if (mirror_num) { 4870 stripe_index = mirror_num - 1; 4871 } else { 4872 mirror_num = 1; 4873 } 4874 4875 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 4876 int factor = map->num_stripes / map->sub_stripes; 4877 4878 stripe_index = do_div(stripe_nr, factor); 4879 stripe_index *= map->sub_stripes; 4880 4881 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 4882 num_stripes = map->sub_stripes; 4883 else if (rw & REQ_DISCARD) 4884 num_stripes = min_t(u64, map->sub_stripes * 4885 (stripe_nr_end - stripe_nr_orig), 4886 map->num_stripes); 4887 else if (mirror_num) 4888 stripe_index += mirror_num - 1; 4889 else { 4890 int old_stripe_index = stripe_index; 4891 stripe_index = find_live_mirror(fs_info, map, 
4892 stripe_index, 4893 map->sub_stripes, stripe_index + 4894 current->pid % map->sub_stripes, 4895 dev_replace_is_ongoing); 4896 mirror_num = stripe_index - old_stripe_index + 1; 4897 } 4898 4899 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4900 BTRFS_BLOCK_GROUP_RAID6)) { 4901 u64 tmp; 4902 4903 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 4904 && raid_map_ret) { 4905 int i, rot; 4906 4907 /* push stripe_nr back to the start of the full stripe */ 4908 stripe_nr = raid56_full_stripe_start; 4909 do_div(stripe_nr, stripe_len); 4910 4911 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4912 4913 /* RAID[56] write or recovery. Return all stripes */ 4914 num_stripes = map->num_stripes; 4915 max_errors = nr_parity_stripes(map); 4916 4917 raid_map = kmalloc_array(num_stripes, sizeof(u64), 4918 GFP_NOFS); 4919 if (!raid_map) { 4920 ret = -ENOMEM; 4921 goto out; 4922 } 4923 4924 /* Work out the disk rotation on this stripe-set */ 4925 tmp = stripe_nr; 4926 rot = do_div(tmp, num_stripes); 4927 4928 /* Fill in the logical address of each stripe */ 4929 tmp = stripe_nr * nr_data_stripes(map); 4930 for (i = 0; i < nr_data_stripes(map); i++) 4931 raid_map[(i+rot) % num_stripes] = 4932 em->start + (tmp + i) * map->stripe_len; 4933 4934 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 4935 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4936 raid_map[(i+rot+1) % num_stripes] = 4937 RAID6_Q_STRIPE; 4938 4939 *length = map->stripe_len; 4940 stripe_index = 0; 4941 stripe_offset = 0; 4942 } else { 4943 /* 4944 * Mirror #0 or #1 means the original data block. 4945 * Mirror #2 is RAID5 parity block. 4946 * Mirror #3 is RAID6 Q block. 4947 */ 4948 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4949 if (mirror_num > 1) 4950 stripe_index = nr_data_stripes(map) + 4951 mirror_num - 2; 4952 4953 /* We distribute the parity blocks across stripes */ 4954 tmp = stripe_nr + stripe_index; 4955 stripe_index = do_div(tmp, map->num_stripes); 4956 } 4957 } else { 4958 /* 4959 * after this do_div call, stripe_nr is the number of stripes 4960 * on this device we have to walk to find the data, and 4961 * stripe_index is the number of our device in the stripe array 4962 */ 4963 stripe_index = do_div(stripe_nr, map->num_stripes); 4964 mirror_num = stripe_index + 1; 4965 } 4966 BUG_ON(stripe_index >= map->num_stripes); 4967 4968 num_alloc_stripes = num_stripes; 4969 if (dev_replace_is_ongoing) { 4970 if (rw & (REQ_WRITE | REQ_DISCARD)) 4971 num_alloc_stripes <<= 1; 4972 if (rw & REQ_GET_READ_MIRRORS) 4973 num_alloc_stripes++; 4974 } 4975 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 4976 if (!bbio) { 4977 kfree(raid_map); 4978 ret = -ENOMEM; 4979 goto out; 4980 } 4981 atomic_set(&bbio->error, 0); 4982 4983 if (rw & REQ_DISCARD) { 4984 int factor = 0; 4985 int sub_stripes = 0; 4986 u64 stripes_per_dev = 0; 4987 u32 remaining_stripes = 0; 4988 u32 last_stripe = 0; 4989 4990 if (map->type & 4991 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 4992 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4993 sub_stripes = 1; 4994 else 4995 sub_stripes = map->sub_stripes; 4996 4997 factor = map->num_stripes / sub_stripes; 4998 stripes_per_dev = div_u64_rem(stripe_nr_end - 4999 stripe_nr_orig, 5000 factor, 5001 &remaining_stripes); 5002 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5003 last_stripe *= sub_stripes; 5004 } 5005 5006 for (i = 0; i < num_stripes; i++) { 5007 bbio->stripes[i].physical = 5008 map->stripes[stripe_index].physical + 5009 stripe_offset + stripe_nr * map->stripe_len; 
5010 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5011 5012 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5013 BTRFS_BLOCK_GROUP_RAID10)) { 5014 bbio->stripes[i].length = stripes_per_dev * 5015 map->stripe_len; 5016 5017 if (i / sub_stripes < remaining_stripes) 5018 bbio->stripes[i].length += 5019 map->stripe_len; 5020 5021 /* 5022 * Special for the first stripe and 5023 * the last stripe: 5024 * 5025 * |-------|...|-------| 5026 * |----------| 5027 * off end_off 5028 */ 5029 if (i < sub_stripes) 5030 bbio->stripes[i].length -= 5031 stripe_offset; 5032 5033 if (stripe_index >= last_stripe && 5034 stripe_index <= (last_stripe + 5035 sub_stripes - 1)) 5036 bbio->stripes[i].length -= 5037 stripe_end_offset; 5038 5039 if (i == sub_stripes - 1) 5040 stripe_offset = 0; 5041 } else 5042 bbio->stripes[i].length = *length; 5043 5044 stripe_index++; 5045 if (stripe_index == map->num_stripes) { 5046 /* This could only happen for RAID0/10 */ 5047 stripe_index = 0; 5048 stripe_nr++; 5049 } 5050 } 5051 } else { 5052 for (i = 0; i < num_stripes; i++) { 5053 bbio->stripes[i].physical = 5054 map->stripes[stripe_index].physical + 5055 stripe_offset + 5056 stripe_nr * map->stripe_len; 5057 bbio->stripes[i].dev = 5058 map->stripes[stripe_index].dev; 5059 stripe_index++; 5060 } 5061 } 5062 5063 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5064 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5065 BTRFS_BLOCK_GROUP_RAID10 | 5066 BTRFS_BLOCK_GROUP_RAID5 | 5067 BTRFS_BLOCK_GROUP_DUP)) { 5068 max_errors = 1; 5069 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 5070 max_errors = 2; 5071 } 5072 } 5073 5074 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5075 dev_replace->tgtdev != NULL) { 5076 int index_where_to_add; 5077 u64 srcdev_devid = dev_replace->srcdev->devid; 5078 5079 /* 5080 * duplicate the write operations while the dev replace 5081 * procedure is running. Since the copying of the old disk 5082 * to the new disk takes place at run time while the 5083 * filesystem is mounted writable, the regular write 5084 * operations to the old disk have to be duplicated to go 5085 * to the new disk as well. 5086 * Note that device->missing is handled by the caller, and 5087 * that the write to the old disk is already set up in the 5088 * stripes array. 5089 */ 5090 index_where_to_add = num_stripes; 5091 for (i = 0; i < num_stripes; i++) { 5092 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5093 /* write to new disk, too */ 5094 struct btrfs_bio_stripe *new = 5095 bbio->stripes + index_where_to_add; 5096 struct btrfs_bio_stripe *old = 5097 bbio->stripes + i; 5098 5099 new->physical = old->physical; 5100 new->length = old->length; 5101 new->dev = dev_replace->tgtdev; 5102 index_where_to_add++; 5103 max_errors++; 5104 } 5105 } 5106 num_stripes = index_where_to_add; 5107 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 5108 dev_replace->tgtdev != NULL) { 5109 u64 srcdev_devid = dev_replace->srcdev->devid; 5110 int index_srcdev = 0; 5111 int found = 0; 5112 u64 physical_of_found = 0; 5113 5114 /* 5115 * During the dev-replace procedure, the target drive can 5116 * also be used to read data in case it is needed to repair 5117 * a corrupt block elsewhere. This is possible if the 5118 * requested area is left of the left cursor. In this area, 5119 * the target drive is a full copy of the source drive. 
5120 */ 5121 for (i = 0; i < num_stripes; i++) { 5122 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5123 /* 5124 * In case of DUP, in order to keep it 5125 * simple, only add the mirror with the 5126 * lowest physical address 5127 */ 5128 if (found && 5129 physical_of_found <= 5130 bbio->stripes[i].physical) 5131 continue; 5132 index_srcdev = i; 5133 found = 1; 5134 physical_of_found = bbio->stripes[i].physical; 5135 } 5136 } 5137 if (found) { 5138 u64 length = map->stripe_len; 5139 5140 if (physical_of_found + length <= 5141 dev_replace->cursor_left) { 5142 struct btrfs_bio_stripe *tgtdev_stripe = 5143 bbio->stripes + num_stripes; 5144 5145 tgtdev_stripe->physical = physical_of_found; 5146 tgtdev_stripe->length = 5147 bbio->stripes[index_srcdev].length; 5148 tgtdev_stripe->dev = dev_replace->tgtdev; 5149 5150 num_stripes++; 5151 } 5152 } 5153 } 5154 5155 *bbio_ret = bbio; 5156 bbio->num_stripes = num_stripes; 5157 bbio->max_errors = max_errors; 5158 bbio->mirror_num = mirror_num; 5159 5160 /* 5161 * this is the case that REQ_READ && dev_replace_is_ongoing && 5162 * mirror_num == num_stripes + 1 && dev_replace target drive is 5163 * available as a mirror 5164 */ 5165 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5166 WARN_ON(num_stripes > 1); 5167 bbio->stripes[0].dev = dev_replace->tgtdev; 5168 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5169 bbio->mirror_num = map->num_stripes + 1; 5170 } 5171 if (raid_map) { 5172 sort_parity_stripes(bbio, raid_map); 5173 *raid_map_ret = raid_map; 5174 } 5175 out: 5176 if (dev_replace_is_ongoing) 5177 btrfs_dev_replace_unlock(dev_replace); 5178 free_extent_map(em); 5179 return ret; 5180 } 5181 5182 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 5183 u64 logical, u64 *length, 5184 struct btrfs_bio **bbio_ret, int mirror_num) 5185 { 5186 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5187 mirror_num, NULL); 5188 } 5189 5190 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5191 u64 chunk_start, u64 physical, u64 devid, 5192 u64 **logical, int *naddrs, int *stripe_len) 5193 { 5194 struct extent_map_tree *em_tree = &map_tree->map_tree; 5195 struct extent_map *em; 5196 struct map_lookup *map; 5197 u64 *buf; 5198 u64 bytenr; 5199 u64 length; 5200 u64 stripe_nr; 5201 u64 rmap_len; 5202 int i, j, nr = 0; 5203 5204 read_lock(&em_tree->lock); 5205 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5206 read_unlock(&em_tree->lock); 5207 5208 if (!em) { 5209 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", 5210 chunk_start); 5211 return -EIO; 5212 } 5213 5214 if (em->start != chunk_start) { 5215 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", 5216 em->start, chunk_start); 5217 free_extent_map(em); 5218 return -EIO; 5219 } 5220 map = (struct map_lookup *)em->bdev; 5221 5222 length = em->len; 5223 rmap_len = map->stripe_len; 5224 5225 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5226 do_div(length, map->num_stripes / map->sub_stripes); 5227 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5228 do_div(length, map->num_stripes); 5229 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5230 BTRFS_BLOCK_GROUP_RAID6)) { 5231 do_div(length, nr_data_stripes(map)); 5232 rmap_len = map->stripe_len * nr_data_stripes(map); 5233 } 5234 5235 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 5236 BUG_ON(!buf); /* -ENOMEM */ 5237 5238 for (i = 0; i < map->num_stripes; i++) { 5239 if (devid && map->stripes[i].dev->devid != devid) 5240 continue; 5241 if (map->stripes[i].physical > 
physical || 5242 map->stripes[i].physical + length <= physical) 5243 continue; 5244 5245 stripe_nr = physical - map->stripes[i].physical; 5246 do_div(stripe_nr, map->stripe_len); 5247 5248 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5249 stripe_nr = stripe_nr * map->num_stripes + i; 5250 do_div(stripe_nr, map->sub_stripes); 5251 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5252 stripe_nr = stripe_nr * map->num_stripes + i; 5253 } /* else if RAID[56], multiply by nr_data_stripes(). 5254 * Alternatively, just use rmap_len below instead of 5255 * map->stripe_len */ 5256 5257 bytenr = chunk_start + stripe_nr * rmap_len; 5258 WARN_ON(nr >= map->num_stripes); 5259 for (j = 0; j < nr; j++) { 5260 if (buf[j] == bytenr) 5261 break; 5262 } 5263 if (j == nr) { 5264 WARN_ON(nr >= map->num_stripes); 5265 buf[nr++] = bytenr; 5266 } 5267 } 5268 5269 *logical = buf; 5270 *naddrs = nr; 5271 *stripe_len = rmap_len; 5272 5273 free_extent_map(em); 5274 return 0; 5275 } 5276 5277 static void btrfs_end_bio(struct bio *bio, int err) 5278 { 5279 struct btrfs_bio *bbio = bio->bi_private; 5280 struct btrfs_device *dev = bbio->stripes[0].dev; 5281 int is_orig_bio = 0; 5282 5283 if (err) { 5284 atomic_inc(&bbio->error); 5285 if (err == -EIO || err == -EREMOTEIO) { 5286 unsigned int stripe_index = 5287 btrfs_io_bio(bio)->stripe_index; 5288 5289 BUG_ON(stripe_index >= bbio->num_stripes); 5290 dev = bbio->stripes[stripe_index].dev; 5291 if (dev->bdev) { 5292 if (bio->bi_rw & WRITE) 5293 btrfs_dev_stat_inc(dev, 5294 BTRFS_DEV_STAT_WRITE_ERRS); 5295 else 5296 btrfs_dev_stat_inc(dev, 5297 BTRFS_DEV_STAT_READ_ERRS); 5298 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 5299 btrfs_dev_stat_inc(dev, 5300 BTRFS_DEV_STAT_FLUSH_ERRS); 5301 btrfs_dev_stat_print_on_error(dev); 5302 } 5303 } 5304 } 5305 5306 if (bio == bbio->orig_bio) 5307 is_orig_bio = 1; 5308 5309 btrfs_bio_counter_dec(bbio->fs_info); 5310 5311 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5312 if (!is_orig_bio) { 5313 bio_put(bio); 5314 bio = bbio->orig_bio; 5315 } 5316 5317 /* 5318 * We have original bio now. So increment bi_remaining to 5319 * account for it in endio 5320 */ 5321 atomic_inc(&bio->bi_remaining); 5322 5323 bio->bi_private = bbio->private; 5324 bio->bi_end_io = bbio->end_io; 5325 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5326 /* only send an error to the higher layers if it is 5327 * beyond the tolerance of the btrfs bio 5328 */ 5329 if (atomic_read(&bbio->error) > bbio->max_errors) { 5330 err = -EIO; 5331 } else { 5332 /* 5333 * this bio is actually up to date, we didn't 5334 * go over the max number of errors 5335 */ 5336 set_bit(BIO_UPTODATE, &bio->bi_flags); 5337 err = 0; 5338 } 5339 kfree(bbio); 5340 5341 bio_endio(bio, err); 5342 } else if (!is_orig_bio) { 5343 bio_put(bio); 5344 } 5345 } 5346 5347 /* 5348 * see run_scheduled_bios for a description of why bios are collected for 5349 * async submit. 5350 * 5351 * This will add one bio to the pending list for a device and make sure 5352 * the work struct is scheduled. 
5353 */ 5354 static noinline void btrfs_schedule_bio(struct btrfs_root *root, 5355 struct btrfs_device *device, 5356 int rw, struct bio *bio) 5357 { 5358 int should_queue = 1; 5359 struct btrfs_pending_bios *pending_bios; 5360 5361 if (device->missing || !device->bdev) { 5362 bio_endio(bio, -EIO); 5363 return; 5364 } 5365 5366 /* don't bother with additional async steps for reads, right now */ 5367 if (!(rw & REQ_WRITE)) { 5368 bio_get(bio); 5369 btrfsic_submit_bio(rw, bio); 5370 bio_put(bio); 5371 return; 5372 } 5373 5374 /* 5375 * nr_async_bios allows us to reliably return congestion to the 5376 * higher layers. Otherwise, the async bio makes it appear we have 5377 * made progress against dirty pages when we've really just put it 5378 * on a queue for later 5379 */ 5380 atomic_inc(&root->fs_info->nr_async_bios); 5381 WARN_ON(bio->bi_next); 5382 bio->bi_next = NULL; 5383 bio->bi_rw |= rw; 5384 5385 spin_lock(&device->io_lock); 5386 if (bio->bi_rw & REQ_SYNC) 5387 pending_bios = &device->pending_sync_bios; 5388 else 5389 pending_bios = &device->pending_bios; 5390 5391 if (pending_bios->tail) 5392 pending_bios->tail->bi_next = bio; 5393 5394 pending_bios->tail = bio; 5395 if (!pending_bios->head) 5396 pending_bios->head = bio; 5397 if (device->running_pending) 5398 should_queue = 0; 5399 5400 spin_unlock(&device->io_lock); 5401 5402 if (should_queue) 5403 btrfs_queue_work(root->fs_info->submit_workers, 5404 &device->work); 5405 } 5406 5407 static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5408 sector_t sector) 5409 { 5410 struct bio_vec *prev; 5411 struct request_queue *q = bdev_get_queue(bdev); 5412 unsigned int max_sectors = queue_max_sectors(q); 5413 struct bvec_merge_data bvm = { 5414 .bi_bdev = bdev, 5415 .bi_sector = sector, 5416 .bi_rw = bio->bi_rw, 5417 }; 5418 5419 if (WARN_ON(bio->bi_vcnt == 0)) 5420 return 1; 5421 5422 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5423 if (bio_sectors(bio) > max_sectors) 5424 return 0; 5425 5426 if (!q->merge_bvec_fn) 5427 return 1; 5428 5429 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; 5430 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5431 return 0; 5432 return 1; 5433 } 5434 5435 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5436 struct bio *bio, u64 physical, int dev_nr, 5437 int rw, int async) 5438 { 5439 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 5440 5441 bio->bi_private = bbio; 5442 btrfs_io_bio(bio)->stripe_index = dev_nr; 5443 bio->bi_end_io = btrfs_end_bio; 5444 bio->bi_iter.bi_sector = physical >> 9; 5445 #ifdef DEBUG 5446 { 5447 struct rcu_string *name; 5448 5449 rcu_read_lock(); 5450 name = rcu_dereference(dev->name); 5451 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5452 "(%s id %llu), size=%u\n", rw, 5453 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5454 name->str, dev->devid, bio->bi_size); 5455 rcu_read_unlock(); 5456 } 5457 #endif 5458 bio->bi_bdev = dev->bdev; 5459 5460 btrfs_bio_counter_inc_noblocked(root->fs_info); 5461 5462 if (async) 5463 btrfs_schedule_bio(root, dev, rw, bio); 5464 else 5465 btrfsic_submit_bio(rw, bio); 5466 } 5467 5468 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5469 struct bio *first_bio, struct btrfs_device *dev, 5470 int dev_nr, int rw, int async) 5471 { 5472 struct bio_vec *bvec = first_bio->bi_io_vec; 5473 struct bio *bio; 5474 int nr_vecs = bio_get_nr_vecs(dev->bdev); 5475 u64 physical = bbio->stripes[dev_nr].physical; 5476 5477 again: 5478 bio = btrfs_bio_alloc(dev->bdev, physical 
>> 9, nr_vecs, GFP_NOFS); 5479 if (!bio) 5480 return -ENOMEM; 5481 5482 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5483 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5484 bvec->bv_offset) < bvec->bv_len) { 5485 u64 len = bio->bi_iter.bi_size; 5486 5487 atomic_inc(&bbio->stripes_pending); 5488 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5489 rw, async); 5490 physical += len; 5491 goto again; 5492 } 5493 bvec++; 5494 } 5495 5496 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 5497 return 0; 5498 } 5499 5500 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 5501 { 5502 atomic_inc(&bbio->error); 5503 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5504 bio->bi_private = bbio->private; 5505 bio->bi_end_io = bbio->end_io; 5506 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5507 bio->bi_iter.bi_sector = logical >> 9; 5508 kfree(bbio); 5509 bio_endio(bio, -EIO); 5510 } 5511 } 5512 5513 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 5514 int mirror_num, int async_submit) 5515 { 5516 struct btrfs_device *dev; 5517 struct bio *first_bio = bio; 5518 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5519 u64 length = 0; 5520 u64 map_length; 5521 u64 *raid_map = NULL; 5522 int ret; 5523 int dev_nr = 0; 5524 int total_devs = 1; 5525 struct btrfs_bio *bbio = NULL; 5526 5527 length = bio->bi_iter.bi_size; 5528 map_length = length; 5529 5530 btrfs_bio_counter_inc_blocked(root->fs_info); 5531 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5532 mirror_num, &raid_map); 5533 if (ret) { 5534 btrfs_bio_counter_dec(root->fs_info); 5535 return ret; 5536 } 5537 5538 total_devs = bbio->num_stripes; 5539 bbio->orig_bio = first_bio; 5540 bbio->private = first_bio->bi_private; 5541 bbio->end_io = first_bio->bi_end_io; 5542 bbio->fs_info = root->fs_info; 5543 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5544 5545 if (raid_map) { 5546 /* In this case, map_length has been set to the length of 5547 a single stripe; not the whole write */ 5548 if (rw & WRITE) { 5549 ret = raid56_parity_write(root, bio, bbio, 5550 raid_map, map_length); 5551 } else { 5552 ret = raid56_parity_recover(root, bio, bbio, 5553 raid_map, map_length, 5554 mirror_num); 5555 } 5556 /* 5557 * FIXME: replace doesn't support raid56 yet, please fix 5558 * it in the future. 5559 */ 5560 btrfs_bio_counter_dec(root->fs_info); 5561 return ret; 5562 } 5563 5564 if (map_length < length) { 5565 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", 5566 logical, length, map_length); 5567 BUG(); 5568 } 5569 5570 while (dev_nr < total_devs) { 5571 dev = bbio->stripes[dev_nr].dev; 5572 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5573 bbio_error(bbio, first_bio, logical); 5574 dev_nr++; 5575 continue; 5576 } 5577 5578 /* 5579 * Check and see if we're ok with this bio based on its size 5580 * and offset with the given device.
5581 */ 5582 if (!bio_size_ok(dev->bdev, first_bio, 5583 bbio->stripes[dev_nr].physical >> 9)) { 5584 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5585 dev_nr, rw, async_submit); 5586 BUG_ON(ret); 5587 dev_nr++; 5588 continue; 5589 } 5590 5591 if (dev_nr < total_devs - 1) { 5592 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 5593 BUG_ON(!bio); /* -ENOMEM */ 5594 } else { 5595 bio = first_bio; 5596 } 5597 5598 submit_stripe_bio(root, bbio, bio, 5599 bbio->stripes[dev_nr].physical, dev_nr, rw, 5600 async_submit); 5601 dev_nr++; 5602 } 5603 btrfs_bio_counter_dec(root->fs_info); 5604 return 0; 5605 } 5606 5607 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 5608 u8 *uuid, u8 *fsid) 5609 { 5610 struct btrfs_device *device; 5611 struct btrfs_fs_devices *cur_devices; 5612 5613 cur_devices = fs_info->fs_devices; 5614 while (cur_devices) { 5615 if (!fsid || 5616 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5617 device = __find_device(&cur_devices->devices, 5618 devid, uuid); 5619 if (device) 5620 return device; 5621 } 5622 cur_devices = cur_devices->seed; 5623 } 5624 return NULL; 5625 } 5626 5627 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5628 u64 devid, u8 *dev_uuid) 5629 { 5630 struct btrfs_device *device; 5631 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 5632 5633 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5634 if (IS_ERR(device)) 5635 return NULL; 5636 5637 list_add(&device->dev_list, &fs_devices->devices); 5638 device->fs_devices = fs_devices; 5639 fs_devices->num_devices++; 5640 5641 device->missing = 1; 5642 fs_devices->missing_devices++; 5643 5644 return device; 5645 } 5646 5647 /** 5648 * btrfs_alloc_device - allocate struct btrfs_device 5649 * @fs_info: used only for generating a new devid, can be NULL if 5650 * devid is provided (i.e. @devid != NULL). 5651 * @devid: a pointer to devid for this device. If NULL a new devid 5652 * is generated. 5653 * @uuid: a pointer to UUID for this device. If NULL a new UUID 5654 * is generated. 5655 * 5656 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 5657 * on error. Returned struct is not linked onto any lists and can be 5658 * destroyed with kfree() right away. 
5659 */ 5660 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 5661 const u64 *devid, 5662 const u8 *uuid) 5663 { 5664 struct btrfs_device *dev; 5665 u64 tmp; 5666 5667 if (WARN_ON(!devid && !fs_info)) 5668 return ERR_PTR(-EINVAL); 5669 5670 dev = __alloc_device(); 5671 if (IS_ERR(dev)) 5672 return dev; 5673 5674 if (devid) 5675 tmp = *devid; 5676 else { 5677 int ret; 5678 5679 ret = find_next_devid(fs_info, &tmp); 5680 if (ret) { 5681 kfree(dev); 5682 return ERR_PTR(ret); 5683 } 5684 } 5685 dev->devid = tmp; 5686 5687 if (uuid) 5688 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 5689 else 5690 generate_random_uuid(dev->uuid); 5691 5692 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5693 5694 return dev; 5695 } 5696 5697 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 5698 struct extent_buffer *leaf, 5699 struct btrfs_chunk *chunk) 5700 { 5701 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5702 struct map_lookup *map; 5703 struct extent_map *em; 5704 u64 logical; 5705 u64 length; 5706 u64 devid; 5707 u8 uuid[BTRFS_UUID_SIZE]; 5708 int num_stripes; 5709 int ret; 5710 int i; 5711 5712 logical = key->offset; 5713 length = btrfs_chunk_length(leaf, chunk); 5714 5715 read_lock(&map_tree->map_tree.lock); 5716 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 5717 read_unlock(&map_tree->map_tree.lock); 5718 5719 /* already mapped? */ 5720 if (em && em->start <= logical && em->start + em->len > logical) { 5721 free_extent_map(em); 5722 return 0; 5723 } else if (em) { 5724 free_extent_map(em); 5725 } 5726 5727 em = alloc_extent_map(); 5728 if (!em) 5729 return -ENOMEM; 5730 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 5731 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 5732 if (!map) { 5733 free_extent_map(em); 5734 return -ENOMEM; 5735 } 5736 5737 em->bdev = (struct block_device *)map; 5738 em->start = logical; 5739 em->len = length; 5740 em->orig_start = 0; 5741 em->block_start = 0; 5742 em->block_len = em->len; 5743 5744 map->num_stripes = num_stripes; 5745 map->io_width = btrfs_chunk_io_width(leaf, chunk); 5746 map->io_align = btrfs_chunk_io_align(leaf, chunk); 5747 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 5748 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 5749 map->type = btrfs_chunk_type(leaf, chunk); 5750 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 5751 for (i = 0; i < num_stripes; i++) { 5752 map->stripes[i].physical = 5753 btrfs_stripe_offset_nr(leaf, chunk, i); 5754 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 5755 read_extent_buffer(leaf, uuid, (unsigned long) 5756 btrfs_stripe_dev_uuid_nr(chunk, i), 5757 BTRFS_UUID_SIZE); 5758 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 5759 uuid, NULL); 5760 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 5761 kfree(map); 5762 free_extent_map(em); 5763 return -EIO; 5764 } 5765 if (!map->stripes[i].dev) { 5766 map->stripes[i].dev = 5767 add_missing_dev(root, devid, uuid); 5768 if (!map->stripes[i].dev) { 5769 kfree(map); 5770 free_extent_map(em); 5771 return -EIO; 5772 } 5773 } 5774 map->stripes[i].dev->in_fs_metadata = 1; 5775 } 5776 5777 write_lock(&map_tree->map_tree.lock); 5778 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 5779 write_unlock(&map_tree->map_tree.lock); 5780 BUG_ON(ret); /* Tree corruption */ 5781 free_extent_map(em); 5782 5783 return 0; 5784 } 5785 5786 static void fill_device_from_item(struct extent_buffer *leaf, 5787 struct btrfs_dev_item *dev_item, 5788 
struct btrfs_device *device) 5789 { 5790 unsigned long ptr; 5791 5792 device->devid = btrfs_device_id(leaf, dev_item); 5793 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 5794 device->total_bytes = device->disk_total_bytes; 5795 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 5796 device->type = btrfs_device_type(leaf, dev_item); 5797 device->io_align = btrfs_device_io_align(leaf, dev_item); 5798 device->io_width = btrfs_device_io_width(leaf, dev_item); 5799 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 5800 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 5801 device->is_tgtdev_for_dev_replace = 0; 5802 5803 ptr = btrfs_device_uuid(dev_item); 5804 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 5805 } 5806 5807 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 5808 { 5809 struct btrfs_fs_devices *fs_devices; 5810 int ret; 5811 5812 BUG_ON(!mutex_is_locked(&uuid_mutex)); 5813 5814 fs_devices = root->fs_info->fs_devices->seed; 5815 while (fs_devices) { 5816 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5817 ret = 0; 5818 goto out; 5819 } 5820 fs_devices = fs_devices->seed; 5821 } 5822 5823 fs_devices = find_fsid(fsid); 5824 if (!fs_devices) { 5825 ret = -ENOENT; 5826 goto out; 5827 } 5828 5829 fs_devices = clone_fs_devices(fs_devices); 5830 if (IS_ERR(fs_devices)) { 5831 ret = PTR_ERR(fs_devices); 5832 goto out; 5833 } 5834 5835 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 5836 root->fs_info->bdev_holder); 5837 if (ret) { 5838 free_fs_devices(fs_devices); 5839 goto out; 5840 } 5841 5842 if (!fs_devices->seeding) { 5843 __btrfs_close_devices(fs_devices); 5844 free_fs_devices(fs_devices); 5845 ret = -EINVAL; 5846 goto out; 5847 } 5848 5849 fs_devices->seed = root->fs_info->fs_devices->seed; 5850 root->fs_info->fs_devices->seed = fs_devices; 5851 out: 5852 return ret; 5853 } 5854 5855 static int read_one_dev(struct btrfs_root *root, 5856 struct extent_buffer *leaf, 5857 struct btrfs_dev_item *dev_item) 5858 { 5859 struct btrfs_device *device; 5860 u64 devid; 5861 int ret; 5862 u8 fs_uuid[BTRFS_UUID_SIZE]; 5863 u8 dev_uuid[BTRFS_UUID_SIZE]; 5864 5865 devid = btrfs_device_id(leaf, dev_item); 5866 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 5867 BTRFS_UUID_SIZE); 5868 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 5869 BTRFS_UUID_SIZE); 5870 5871 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 5872 ret = open_seed_devices(root, fs_uuid); 5873 if (ret && !btrfs_test_opt(root, DEGRADED)) 5874 return ret; 5875 } 5876 5877 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 5878 if (!device || !device->bdev) { 5879 if (!btrfs_test_opt(root, DEGRADED)) 5880 return -EIO; 5881 5882 if (!device) { 5883 btrfs_warn(root->fs_info, "devid %llu missing", devid); 5884 device = add_missing_dev(root, devid, dev_uuid); 5885 if (!device) 5886 return -ENOMEM; 5887 } else if (!device->missing) { 5888 /* 5889 * this happens when a device that was properly setup 5890 * in the device info lists suddenly goes bad. 
5891 * device->bdev is NULL, and so we have to set 5892 * device->missing to one here 5893 */ 5894 root->fs_info->fs_devices->missing_devices++; 5895 device->missing = 1; 5896 } 5897 } 5898 5899 if (device->fs_devices != root->fs_info->fs_devices) { 5900 BUG_ON(device->writeable); 5901 if (device->generation != 5902 btrfs_device_generation(leaf, dev_item)) 5903 return -EINVAL; 5904 } 5905 5906 fill_device_from_item(leaf, dev_item, device); 5907 device->in_fs_metadata = 1; 5908 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5909 device->fs_devices->total_rw_bytes += device->total_bytes; 5910 spin_lock(&root->fs_info->free_chunk_lock); 5911 root->fs_info->free_chunk_space += device->total_bytes - 5912 device->bytes_used; 5913 spin_unlock(&root->fs_info->free_chunk_lock); 5914 } 5915 ret = 0; 5916 return ret; 5917 } 5918 5919 int btrfs_read_sys_array(struct btrfs_root *root) 5920 { 5921 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 5922 struct extent_buffer *sb; 5923 struct btrfs_disk_key *disk_key; 5924 struct btrfs_chunk *chunk; 5925 u8 *ptr; 5926 unsigned long sb_ptr; 5927 int ret = 0; 5928 u32 num_stripes; 5929 u32 array_size; 5930 u32 len = 0; 5931 u32 cur; 5932 struct btrfs_key key; 5933 5934 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 5935 BTRFS_SUPER_INFO_SIZE); 5936 if (!sb) 5937 return -ENOMEM; 5938 btrfs_set_buffer_uptodate(sb); 5939 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 5940 /* 5941 * The sb extent buffer is artificial and just used to read the system array. 5942 * btrfs_set_buffer_uptodate() call does not properly mark all its 5943 * pages up-to-date when the page is larger: extent does not cover the 5944 * whole page and consequently check_page_uptodate does not find all 5945 * the page's extents up-to-date (the hole beyond sb), 5946 * write_extent_buffer then triggers a WARN_ON. 5947 * 5948 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 5949 * but sb spans only this function. Add an explicit SetPageUptodate call 5950 * to silence the warning e.g. on PowerPC 64.
5951 */ 5952 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) 5953 SetPageUptodate(sb->pages[0]); 5954 5955 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 5956 array_size = btrfs_super_sys_array_size(super_copy); 5957 5958 ptr = super_copy->sys_chunk_array; 5959 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 5960 cur = 0; 5961 5962 while (cur < array_size) { 5963 disk_key = (struct btrfs_disk_key *)ptr; 5964 btrfs_disk_key_to_cpu(&key, disk_key); 5965 5966 len = sizeof(*disk_key); ptr += len; 5967 sb_ptr += len; 5968 cur += len; 5969 5970 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 5971 chunk = (struct btrfs_chunk *)sb_ptr; 5972 ret = read_one_chunk(root, &key, sb, chunk); 5973 if (ret) 5974 break; 5975 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 5976 len = btrfs_chunk_item_size(num_stripes); 5977 } else { 5978 ret = -EIO; 5979 break; 5980 } 5981 ptr += len; 5982 sb_ptr += len; 5983 cur += len; 5984 } 5985 free_extent_buffer(sb); 5986 return ret; 5987 } 5988 5989 int btrfs_read_chunk_tree(struct btrfs_root *root) 5990 { 5991 struct btrfs_path *path; 5992 struct extent_buffer *leaf; 5993 struct btrfs_key key; 5994 struct btrfs_key found_key; 5995 int ret; 5996 int slot; 5997 5998 root = root->fs_info->chunk_root; 5999 6000 path = btrfs_alloc_path(); 6001 if (!path) 6002 return -ENOMEM; 6003 6004 mutex_lock(&uuid_mutex); 6005 lock_chunks(root); 6006 6007 /* 6008 * Read all device items, and then all the chunk items. All 6009 * device items are found before any chunk item (their object id 6010 * is smaller than the lowest possible object id for a chunk 6011 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 6012 */ 6013 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 6014 key.offset = 0; 6015 key.type = 0; 6016 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6017 if (ret < 0) 6018 goto error; 6019 while (1) { 6020 leaf = path->nodes[0]; 6021 slot = path->slots[0]; 6022 if (slot >= btrfs_header_nritems(leaf)) { 6023 ret = btrfs_next_leaf(root, path); 6024 if (ret == 0) 6025 continue; 6026 if (ret < 0) 6027 goto error; 6028 break; 6029 } 6030 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6031 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 6032 struct btrfs_dev_item *dev_item; 6033 dev_item = btrfs_item_ptr(leaf, slot, 6034 struct btrfs_dev_item); 6035 ret = read_one_dev(root, leaf, dev_item); 6036 if (ret) 6037 goto error; 6038 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 6039 struct btrfs_chunk *chunk; 6040 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 6041 ret = read_one_chunk(root, &found_key, leaf, chunk); 6042 if (ret) 6043 goto error; 6044 } 6045 path->slots[0]++; 6046 } 6047 ret = 0; 6048 error: 6049 unlock_chunks(root); 6050 mutex_unlock(&uuid_mutex); 6051 6052 btrfs_free_path(path); 6053 return ret; 6054 } 6055 6056 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 6057 { 6058 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6059 struct btrfs_device *device; 6060 6061 mutex_lock(&fs_devices->device_list_mutex); 6062 list_for_each_entry(device, &fs_devices->devices, dev_list) 6063 device->dev_root = fs_info->dev_root; 6064 mutex_unlock(&fs_devices->device_list_mutex); 6065 } 6066 6067 static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 6068 { 6069 int i; 6070 6071 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 6072 btrfs_dev_stat_reset(dev, i); 6073 } 6074 6075 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 6076 { 6077 struct btrfs_key key; 6078 struct btrfs_key found_key; 6079 struct btrfs_root *dev_root = 

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	lock_chunks(root);

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;

			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(root, leaf, dev_item);
			if (ret)
				goto error;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	ret = 0;
error:
	unlock_chunks(root);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->dev_root = fs_info->dev_root;
	mutex_unlock(&fs_devices->device_list_mutex);
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = 0;
		key.type = BTRFS_DEV_STATS_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
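
/*
 * Illustrative sketch, not part of the original file: dev_stats items
 * written by older kernels may hold fewer than BTRFS_DEV_STAT_VALUES_MAX
 * counters, so btrfs_init_dev_stats() above only reads as many values as
 * the on-disk item size allows and resets the rest. The helper below
 * (hypothetical name) computes that count from an item size.
 */
static int __maybe_unused dev_stats_values_in_item(u32 item_size)
{
	int nr = item_size / sizeof(__le64);

	return min_t(int, nr, BTRFS_DEV_STAT_VALUES_MAX);
}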

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *dev_root,
				struct btrfs_device *device)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = 0;
	key.type = BTRFS_DEV_STATS_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		printk_in_rcu(KERN_WARNING "BTRFS: "
			      "error %d while searching for dev_stats item for device %s!\n",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				      "delete too small dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				      "insert dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction(). Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->dev_stats_valid || !device->dev_stats_dirty)
			continue;

		ret = update_dev_stat_item(trans, dev_root, device);
		if (!ret)
			device->dev_stats_dirty = 0;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	printk_in_rcu(KERN_INFO "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_root *root,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
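
/*
 * Illustrative sketch, not part of the original file: a debugging helper
 * (hypothetical name) showing how a reply filled by btrfs_get_dev_stats()
 * above could be consumed - at most nr_items counters are valid, and with
 * BTRFS_DEV_STATS_RESET the in-memory counters were zeroed as they were
 * read.
 */
static void __maybe_unused
dump_dev_stats_reply(const struct btrfs_ioctl_get_dev_stats *stats)
{
	u64 i;

	for (i = 0; i < stats->nr_items && i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		pr_info("BTRFS: devid %llu stat %llu: %llu\n",
			(unsigned long long)stats->devid,
			(unsigned long long)i,
			(unsigned long long)stats->values[i]);
}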

int btrfs_scratch_superblock(struct btrfs_device *device)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;

	bh = btrfs_read_dev_super(device->bdev);
	if (!bh)
		return -EINVAL;
	disk_super = (struct btrfs_super_block *)bh->b_data;

	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
	set_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	brelse(bh);

	return 0;
}
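
/*
 * Illustrative sketch, not part of the original file: once
 * btrfs_scratch_superblock() above has zeroed disk_super->magic, a check
 * like the one below (hypothetical helper name; it assumes BTRFS_MAGIC is
 * the on-disk magic constant) no longer matches, so device scans stop
 * recognizing the block device as btrfs.
 */
static bool __maybe_unused
disk_super_has_btrfs_magic(const struct btrfs_super_block *disk_super)
{
	return disk_super->magic == cpu_to_le64(BTRFS_MAGIC);
}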