/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
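/*
 * Allocate a bare btrfs_fs_devices structure and initialize its mutex and
 * list heads.  The caller is responsible for filling in the fsid.
 */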
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->resized_devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->list);

	return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
 *		generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error.  Returned struct is not linked onto any lists and
 * can be destroyed with kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = __alloc_fs_devices();
	if (IS_ERR(fs_devs))
		return fs_devs;

	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
	else
		generate_random_uuid(fs_devs->fsid);

	return fs_devs;
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_NOFS);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->resized_list);

	spin_lock_init(&dev->io_lock);

	spin_lock_init(&dev->reada_lock);
	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);

	return dev;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, 4096);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (!*bh) {
		ret = -EINVAL;
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
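/*
 * Put a chain of bios that was pulled off a device back at the head of the
 * device's pending list so it is processed again later.
 */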
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);
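	/*
	 * Submit the bios we detached above.  We may re-queue what is left
	 * and yield if the sync list needs attention or the device becomes
	 * congested.
	 */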
	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things: it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_queue_work(fs_info->submit_workers,
					 &device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}
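/*
 * Work-queue entry point: drain the bios that were scheduled on a single
 * device.
 */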
static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);

		list_add(&fs_devices->list, &fs_uuids);

		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}

	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			/* we can safely leave the fs_devices entry around */
			return PTR_ERR(device);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			kfree(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);

		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         a different name, or
		 *      b. The missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above.  But 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid.  We keep the
			 * one with the larger generation number or the
			 * last-in if generations are equal.
			 */
			return -EEXIST;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members.  So just update
	 * it back.  We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened)
		device->generation = found_transid;

	*fs_devices_ret = fs_devices;

	return ret;
}
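/*
 * Create an in-memory copy of @orig and of every device on its list.  The
 * copies carry over devid, uuid and name, but are not opened.
 */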
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	/* We hold the volume lock; it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device))
			goto error;

		/*
		 * This is ok to do without the rcu read lock because we hold
		 * the uuid mutex, so nothing we touch in here is going to
		 * disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
			if (!name) {
				kfree(device);
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path; it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
636 * In the second step, the dev_replace state is 637 * read from the device tree and it is known 638 * whether the procedure is really active or 639 * not, which means whether this device is 640 * used or whether it should be removed. 641 */ 642 if (step == 0 || device->is_tgtdev_for_dev_replace) { 643 continue; 644 } 645 } 646 if (device->bdev) { 647 blkdev_put(device->bdev, device->mode); 648 device->bdev = NULL; 649 fs_devices->open_devices--; 650 } 651 if (device->writeable) { 652 list_del_init(&device->dev_alloc_list); 653 device->writeable = 0; 654 if (!device->is_tgtdev_for_dev_replace) 655 fs_devices->rw_devices--; 656 } 657 list_del_init(&device->dev_list); 658 fs_devices->num_devices--; 659 rcu_string_free(device->name); 660 kfree(device); 661 } 662 663 if (fs_devices->seed) { 664 fs_devices = fs_devices->seed; 665 goto again; 666 } 667 668 fs_devices->latest_bdev = latest_dev->bdev; 669 670 mutex_unlock(&uuid_mutex); 671 } 672 673 static void __free_device(struct work_struct *work) 674 { 675 struct btrfs_device *device; 676 677 device = container_of(work, struct btrfs_device, rcu_work); 678 679 if (device->bdev) 680 blkdev_put(device->bdev, device->mode); 681 682 rcu_string_free(device->name); 683 kfree(device); 684 } 685 686 static void free_device(struct rcu_head *head) 687 { 688 struct btrfs_device *device; 689 690 device = container_of(head, struct btrfs_device, rcu); 691 692 INIT_WORK(&device->rcu_work, __free_device); 693 schedule_work(&device->rcu_work); 694 } 695 696 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 697 { 698 struct btrfs_device *device; 699 700 if (--fs_devices->opened > 0) 701 return 0; 702 703 mutex_lock(&fs_devices->device_list_mutex); 704 list_for_each_entry(device, &fs_devices->devices, dev_list) { 705 struct btrfs_device *new_device; 706 struct rcu_string *name; 707 708 if (device->bdev) 709 fs_devices->open_devices--; 710 711 if (device->writeable && 712 device->devid != BTRFS_DEV_REPLACE_DEVID) { 713 list_del_init(&device->dev_alloc_list); 714 fs_devices->rw_devices--; 715 } 716 717 if (device->missing) 718 fs_devices->missing_devices--; 719 720 new_device = btrfs_alloc_device(NULL, &device->devid, 721 device->uuid); 722 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ 723 724 /* Safe because we are under uuid_mutex */ 725 if (device->name) { 726 name = rcu_string_strdup(device->name->str, GFP_NOFS); 727 BUG_ON(!name); /* -ENOMEM */ 728 rcu_assign_pointer(new_device->name, name); 729 } 730 731 list_replace_rcu(&device->dev_list, &new_device->dev_list); 732 new_device->fs_devices = device->fs_devices; 733 734 call_rcu(&device->rcu, free_device); 735 } 736 mutex_unlock(&fs_devices->device_list_mutex); 737 738 WARN_ON(fs_devices->open_devices); 739 WARN_ON(fs_devices->rw_devices); 740 fs_devices->opened = 0; 741 fs_devices->seeding = 0; 742 743 return 0; 744 } 745 746 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 747 { 748 struct btrfs_fs_devices *seed_devices = NULL; 749 int ret; 750 751 mutex_lock(&uuid_mutex); 752 ret = __btrfs_close_devices(fs_devices); 753 if (!fs_devices->opened) { 754 seed_devices = fs_devices->seed; 755 fs_devices->seed = NULL; 756 } 757 mutex_unlock(&uuid_mutex); 758 759 while (seed_devices) { 760 fs_devices = seed_devices; 761 seed_devices = fs_devices->seed; 762 __btrfs_close_devices(fs_devices); 763 free_fs_devices(fs_devices); 764 } 765 /* 766 * Wait for rcu kworkers under __btrfs_close_devices 767 * to finish all blkdev_puts so device is really 768 * free when umount is done. 
769 */ 770 rcu_barrier(); 771 return ret; 772 } 773 774 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 775 fmode_t flags, void *holder) 776 { 777 struct request_queue *q; 778 struct block_device *bdev; 779 struct list_head *head = &fs_devices->devices; 780 struct btrfs_device *device; 781 struct btrfs_device *latest_dev = NULL; 782 struct buffer_head *bh; 783 struct btrfs_super_block *disk_super; 784 u64 devid; 785 int seeding = 1; 786 int ret = 0; 787 788 flags |= FMODE_EXCL; 789 790 list_for_each_entry(device, head, dev_list) { 791 if (device->bdev) 792 continue; 793 if (!device->name) 794 continue; 795 796 /* Just open everything we can; ignore failures here */ 797 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 798 &bdev, &bh)) 799 continue; 800 801 disk_super = (struct btrfs_super_block *)bh->b_data; 802 devid = btrfs_stack_device_id(&disk_super->dev_item); 803 if (devid != device->devid) 804 goto error_brelse; 805 806 if (memcmp(device->uuid, disk_super->dev_item.uuid, 807 BTRFS_UUID_SIZE)) 808 goto error_brelse; 809 810 device->generation = btrfs_super_generation(disk_super); 811 if (!latest_dev || 812 device->generation > latest_dev->generation) 813 latest_dev = device; 814 815 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 816 device->writeable = 0; 817 } else { 818 device->writeable = !bdev_read_only(bdev); 819 seeding = 0; 820 } 821 822 q = bdev_get_queue(bdev); 823 if (blk_queue_discard(q)) 824 device->can_discard = 1; 825 826 device->bdev = bdev; 827 device->in_fs_metadata = 0; 828 device->mode = flags; 829 830 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 831 fs_devices->rotating = 1; 832 833 fs_devices->open_devices++; 834 if (device->writeable && 835 device->devid != BTRFS_DEV_REPLACE_DEVID) { 836 fs_devices->rw_devices++; 837 list_add(&device->dev_alloc_list, 838 &fs_devices->alloc_list); 839 } 840 brelse(bh); 841 continue; 842 843 error_brelse: 844 brelse(bh); 845 blkdev_put(bdev, flags); 846 continue; 847 } 848 if (fs_devices->open_devices == 0) { 849 ret = -EINVAL; 850 goto out; 851 } 852 fs_devices->seeding = seeding; 853 fs_devices->opened = 1; 854 fs_devices->latest_bdev = latest_dev->bdev; 855 fs_devices->total_rw_bytes = 0; 856 out: 857 return ret; 858 } 859 860 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 861 fmode_t flags, void *holder) 862 { 863 int ret; 864 865 mutex_lock(&uuid_mutex); 866 if (fs_devices->opened) { 867 fs_devices->opened++; 868 ret = 0; 869 } else { 870 ret = __btrfs_open_devices(fs_devices, flags, holder); 871 } 872 mutex_unlock(&uuid_mutex); 873 return ret; 874 } 875 876 /* 877 * Look for a btrfs signature on a device. This may be called out of the mount path 878 * and we are not allowed to call set_blocksize during the scan. The superblock 879 * is read via pagecache 880 */ 881 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 882 struct btrfs_fs_devices **fs_devices_ret) 883 { 884 struct btrfs_super_block *disk_super; 885 struct block_device *bdev; 886 struct page *page; 887 void *p; 888 int ret = -EINVAL; 889 u64 devid; 890 u64 transid; 891 u64 total_devices; 892 u64 bytenr; 893 pgoff_t index; 894 895 /* 896 * we would like to check all the supers, but that would make 897 * a btrfs mount succeed after a mkfs from a different FS. 
898 * So, we need to add a special mount option to scan for 899 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 900 */ 901 bytenr = btrfs_sb_offset(0); 902 flags |= FMODE_EXCL; 903 mutex_lock(&uuid_mutex); 904 905 bdev = blkdev_get_by_path(path, flags, holder); 906 907 if (IS_ERR(bdev)) { 908 ret = PTR_ERR(bdev); 909 goto error; 910 } 911 912 /* make sure our super fits in the device */ 913 if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) 914 goto error_bdev_put; 915 916 /* make sure our super fits in the page */ 917 if (sizeof(*disk_super) > PAGE_CACHE_SIZE) 918 goto error_bdev_put; 919 920 /* make sure our super doesn't straddle pages on disk */ 921 index = bytenr >> PAGE_CACHE_SHIFT; 922 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) 923 goto error_bdev_put; 924 925 /* pull in the page with our super */ 926 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, 927 index, GFP_NOFS); 928 929 if (IS_ERR_OR_NULL(page)) 930 goto error_bdev_put; 931 932 p = kmap(page); 933 934 /* align our pointer to the offset of the super block */ 935 disk_super = p + (bytenr & ~PAGE_CACHE_MASK); 936 937 if (btrfs_super_bytenr(disk_super) != bytenr || 938 btrfs_super_magic(disk_super) != BTRFS_MAGIC) 939 goto error_unmap; 940 941 devid = btrfs_stack_device_id(&disk_super->dev_item); 942 transid = btrfs_super_generation(disk_super); 943 total_devices = btrfs_super_num_devices(disk_super); 944 945 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 946 if (ret > 0) { 947 if (disk_super->label[0]) { 948 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 949 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 950 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); 951 } else { 952 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); 953 } 954 955 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 956 ret = 0; 957 } 958 if (!ret && fs_devices_ret) 959 (*fs_devices_ret)->total_devices = total_devices; 960 961 error_unmap: 962 kunmap(page); 963 page_cache_release(page); 964 965 error_bdev_put: 966 blkdev_put(bdev, flags); 967 error: 968 mutex_unlock(&uuid_mutex); 969 return ret; 970 } 971 972 /* helper to account the used device space in the range */ 973 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 974 u64 end, u64 *length) 975 { 976 struct btrfs_key key; 977 struct btrfs_root *root = device->dev_root; 978 struct btrfs_dev_extent *dev_extent; 979 struct btrfs_path *path; 980 u64 extent_end; 981 int ret; 982 int slot; 983 struct extent_buffer *l; 984 985 *length = 0; 986 987 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace) 988 return 0; 989 990 path = btrfs_alloc_path(); 991 if (!path) 992 return -ENOMEM; 993 path->reada = 2; 994 995 key.objectid = device->devid; 996 key.offset = start; 997 key.type = BTRFS_DEV_EXTENT_KEY; 998 999 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1000 if (ret < 0) 1001 goto out; 1002 if (ret > 0) { 1003 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1004 if (ret < 0) 1005 goto out; 1006 } 1007 1008 while (1) { 1009 l = path->nodes[0]; 1010 slot = path->slots[0]; 1011 if (slot >= btrfs_header_nritems(l)) { 1012 ret = btrfs_next_leaf(root, path); 1013 if (ret == 0) 1014 continue; 1015 if (ret < 0) 1016 goto out; 1017 1018 break; 1019 } 1020 btrfs_item_key_to_cpu(l, &key, slot); 1021 1022 if (key.objectid < device->devid) 1023 goto next; 1024 1025 if (key.objectid > device->devid) 1026 break; 1027 1028 if (key.type != 
BTRFS_DEV_EXTENT_KEY) 1029 goto next; 1030 1031 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1032 extent_end = key.offset + btrfs_dev_extent_length(l, 1033 dev_extent); 1034 if (key.offset <= start && extent_end > end) { 1035 *length = end - start + 1; 1036 break; 1037 } else if (key.offset <= start && extent_end > start) 1038 *length += extent_end - start; 1039 else if (key.offset > start && extent_end <= end) 1040 *length += extent_end - key.offset; 1041 else if (key.offset > start && key.offset <= end) { 1042 *length += end - key.offset + 1; 1043 break; 1044 } else if (key.offset > end) 1045 break; 1046 1047 next: 1048 path->slots[0]++; 1049 } 1050 ret = 0; 1051 out: 1052 btrfs_free_path(path); 1053 return ret; 1054 } 1055 1056 static int contains_pending_extent(struct btrfs_trans_handle *trans, 1057 struct btrfs_device *device, 1058 u64 *start, u64 len) 1059 { 1060 struct extent_map *em; 1061 struct list_head *search_list = &trans->transaction->pending_chunks; 1062 int ret = 0; 1063 1064 again: 1065 list_for_each_entry(em, search_list, list) { 1066 struct map_lookup *map; 1067 int i; 1068 1069 map = (struct map_lookup *)em->bdev; 1070 for (i = 0; i < map->num_stripes; i++) { 1071 if (map->stripes[i].dev != device) 1072 continue; 1073 if (map->stripes[i].physical >= *start + len || 1074 map->stripes[i].physical + em->orig_block_len <= 1075 *start) 1076 continue; 1077 *start = map->stripes[i].physical + 1078 em->orig_block_len; 1079 ret = 1; 1080 } 1081 } 1082 if (search_list == &trans->transaction->pending_chunks) { 1083 search_list = &trans->root->fs_info->pinned_chunks; 1084 goto again; 1085 } 1086 1087 return ret; 1088 } 1089 1090 1091 /* 1092 * find_free_dev_extent - find free space in the specified device 1093 * @device: the device which we search the free space in 1094 * @num_bytes: the size of the free space that we need 1095 * @start: store the start of the free space. 1096 * @len: the size of the free space. that we find, or the size of the max 1097 * free space if we don't find suitable free space 1098 * 1099 * this uses a pretty simple search, the expectation is that it is 1100 * called very infrequently and that a given device has a small number 1101 * of extents 1102 * 1103 * @start is used to store the start of the free space if we find. But if we 1104 * don't find suitable free space, it will be used to store the start position 1105 * of the max free space. 1106 * 1107 * @len is used to store the size of the free space that we find. 1108 * But if we don't find suitable free space, it is used to store the size of 1109 * the max free space. 
1110 */ 1111 int find_free_dev_extent(struct btrfs_trans_handle *trans, 1112 struct btrfs_device *device, u64 num_bytes, 1113 u64 *start, u64 *len) 1114 { 1115 struct btrfs_key key; 1116 struct btrfs_root *root = device->dev_root; 1117 struct btrfs_dev_extent *dev_extent; 1118 struct btrfs_path *path; 1119 u64 hole_size; 1120 u64 max_hole_start; 1121 u64 max_hole_size; 1122 u64 extent_end; 1123 u64 search_start; 1124 u64 search_end = device->total_bytes; 1125 int ret; 1126 int slot; 1127 struct extent_buffer *l; 1128 1129 /* FIXME use last free of some kind */ 1130 1131 /* we don't want to overwrite the superblock on the drive, 1132 * so we make sure to start at an offset of at least 1MB 1133 */ 1134 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1135 1136 path = btrfs_alloc_path(); 1137 if (!path) 1138 return -ENOMEM; 1139 again: 1140 max_hole_start = search_start; 1141 max_hole_size = 0; 1142 hole_size = 0; 1143 1144 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1145 ret = -ENOSPC; 1146 goto out; 1147 } 1148 1149 path->reada = 2; 1150 path->search_commit_root = 1; 1151 path->skip_locking = 1; 1152 1153 key.objectid = device->devid; 1154 key.offset = search_start; 1155 key.type = BTRFS_DEV_EXTENT_KEY; 1156 1157 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1158 if (ret < 0) 1159 goto out; 1160 if (ret > 0) { 1161 ret = btrfs_previous_item(root, path, key.objectid, key.type); 1162 if (ret < 0) 1163 goto out; 1164 } 1165 1166 while (1) { 1167 l = path->nodes[0]; 1168 slot = path->slots[0]; 1169 if (slot >= btrfs_header_nritems(l)) { 1170 ret = btrfs_next_leaf(root, path); 1171 if (ret == 0) 1172 continue; 1173 if (ret < 0) 1174 goto out; 1175 1176 break; 1177 } 1178 btrfs_item_key_to_cpu(l, &key, slot); 1179 1180 if (key.objectid < device->devid) 1181 goto next; 1182 1183 if (key.objectid > device->devid) 1184 break; 1185 1186 if (key.type != BTRFS_DEV_EXTENT_KEY) 1187 goto next; 1188 1189 if (key.offset > search_start) { 1190 hole_size = key.offset - search_start; 1191 1192 /* 1193 * Have to check before we set max_hole_start, otherwise 1194 * we could end up sending back this offset anyway. 1195 */ 1196 if (contains_pending_extent(trans, device, 1197 &search_start, 1198 hole_size)) 1199 hole_size = 0; 1200 1201 if (hole_size > max_hole_size) { 1202 max_hole_start = search_start; 1203 max_hole_size = hole_size; 1204 } 1205 1206 /* 1207 * If this free space is greater than which we need, 1208 * it must be the max free space that we have found 1209 * until now, so max_hole_start must point to the start 1210 * of this free space and the length of this free space 1211 * is stored in max_hole_size. Thus, we return 1212 * max_hole_start and max_hole_size and go back to the 1213 * caller. 1214 */ 1215 if (hole_size >= num_bytes) { 1216 ret = 0; 1217 goto out; 1218 } 1219 } 1220 1221 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1222 extent_end = key.offset + btrfs_dev_extent_length(l, 1223 dev_extent); 1224 if (extent_end > search_start) 1225 search_start = extent_end; 1226 next: 1227 path->slots[0]++; 1228 cond_resched(); 1229 } 1230 1231 /* 1232 * At this point, search_start should be the end of 1233 * allocated dev extents, and when shrinking the device, 1234 * search_end may be smaller than search_start. 
1235 */ 1236 if (search_end > search_start) 1237 hole_size = search_end - search_start; 1238 1239 if (hole_size > max_hole_size) { 1240 max_hole_start = search_start; 1241 max_hole_size = hole_size; 1242 } 1243 1244 if (contains_pending_extent(trans, device, &search_start, hole_size)) { 1245 btrfs_release_path(path); 1246 goto again; 1247 } 1248 1249 /* See above. */ 1250 if (hole_size < num_bytes) 1251 ret = -ENOSPC; 1252 else 1253 ret = 0; 1254 1255 out: 1256 btrfs_free_path(path); 1257 *start = max_hole_start; 1258 if (len) 1259 *len = max_hole_size; 1260 return ret; 1261 } 1262 1263 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1264 struct btrfs_device *device, 1265 u64 start, u64 *dev_extent_len) 1266 { 1267 int ret; 1268 struct btrfs_path *path; 1269 struct btrfs_root *root = device->dev_root; 1270 struct btrfs_key key; 1271 struct btrfs_key found_key; 1272 struct extent_buffer *leaf = NULL; 1273 struct btrfs_dev_extent *extent = NULL; 1274 1275 path = btrfs_alloc_path(); 1276 if (!path) 1277 return -ENOMEM; 1278 1279 key.objectid = device->devid; 1280 key.offset = start; 1281 key.type = BTRFS_DEV_EXTENT_KEY; 1282 again: 1283 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1284 if (ret > 0) { 1285 ret = btrfs_previous_item(root, path, key.objectid, 1286 BTRFS_DEV_EXTENT_KEY); 1287 if (ret) 1288 goto out; 1289 leaf = path->nodes[0]; 1290 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1291 extent = btrfs_item_ptr(leaf, path->slots[0], 1292 struct btrfs_dev_extent); 1293 BUG_ON(found_key.offset > start || found_key.offset + 1294 btrfs_dev_extent_length(leaf, extent) < start); 1295 key = found_key; 1296 btrfs_release_path(path); 1297 goto again; 1298 } else if (ret == 0) { 1299 leaf = path->nodes[0]; 1300 extent = btrfs_item_ptr(leaf, path->slots[0], 1301 struct btrfs_dev_extent); 1302 } else { 1303 btrfs_error(root->fs_info, ret, "Slot search failed"); 1304 goto out; 1305 } 1306 1307 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1308 1309 ret = btrfs_del_item(trans, root, path); 1310 if (ret) { 1311 btrfs_error(root->fs_info, ret, 1312 "Failed to remove dev extent item"); 1313 } 1314 out: 1315 btrfs_free_path(path); 1316 return ret; 1317 } 1318 1319 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1320 struct btrfs_device *device, 1321 u64 chunk_tree, u64 chunk_objectid, 1322 u64 chunk_offset, u64 start, u64 num_bytes) 1323 { 1324 int ret; 1325 struct btrfs_path *path; 1326 struct btrfs_root *root = device->dev_root; 1327 struct btrfs_dev_extent *extent; 1328 struct extent_buffer *leaf; 1329 struct btrfs_key key; 1330 1331 WARN_ON(!device->in_fs_metadata); 1332 WARN_ON(device->is_tgtdev_for_dev_replace); 1333 path = btrfs_alloc_path(); 1334 if (!path) 1335 return -ENOMEM; 1336 1337 key.objectid = device->devid; 1338 key.offset = start; 1339 key.type = BTRFS_DEV_EXTENT_KEY; 1340 ret = btrfs_insert_empty_item(trans, root, path, &key, 1341 sizeof(*extent)); 1342 if (ret) 1343 goto out; 1344 1345 leaf = path->nodes[0]; 1346 extent = btrfs_item_ptr(leaf, path->slots[0], 1347 struct btrfs_dev_extent); 1348 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1349 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1350 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1351 1352 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1353 btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); 1354 1355 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1356 
btrfs_mark_buffer_dirty(leaf); 1357 out: 1358 btrfs_free_path(path); 1359 return ret; 1360 } 1361 1362 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1363 { 1364 struct extent_map_tree *em_tree; 1365 struct extent_map *em; 1366 struct rb_node *n; 1367 u64 ret = 0; 1368 1369 em_tree = &fs_info->mapping_tree.map_tree; 1370 read_lock(&em_tree->lock); 1371 n = rb_last(&em_tree->map); 1372 if (n) { 1373 em = rb_entry(n, struct extent_map, rb_node); 1374 ret = em->start + em->len; 1375 } 1376 read_unlock(&em_tree->lock); 1377 1378 return ret; 1379 } 1380 1381 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1382 u64 *devid_ret) 1383 { 1384 int ret; 1385 struct btrfs_key key; 1386 struct btrfs_key found_key; 1387 struct btrfs_path *path; 1388 1389 path = btrfs_alloc_path(); 1390 if (!path) 1391 return -ENOMEM; 1392 1393 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1394 key.type = BTRFS_DEV_ITEM_KEY; 1395 key.offset = (u64)-1; 1396 1397 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1398 if (ret < 0) 1399 goto error; 1400 1401 BUG_ON(ret == 0); /* Corruption */ 1402 1403 ret = btrfs_previous_item(fs_info->chunk_root, path, 1404 BTRFS_DEV_ITEMS_OBJECTID, 1405 BTRFS_DEV_ITEM_KEY); 1406 if (ret) { 1407 *devid_ret = 1; 1408 } else { 1409 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1410 path->slots[0]); 1411 *devid_ret = found_key.offset + 1; 1412 } 1413 ret = 0; 1414 error: 1415 btrfs_free_path(path); 1416 return ret; 1417 } 1418 1419 /* 1420 * the device information is stored in the chunk root 1421 * the btrfs_device struct should be fully filled in 1422 */ 1423 static int btrfs_add_device(struct btrfs_trans_handle *trans, 1424 struct btrfs_root *root, 1425 struct btrfs_device *device) 1426 { 1427 int ret; 1428 struct btrfs_path *path; 1429 struct btrfs_dev_item *dev_item; 1430 struct extent_buffer *leaf; 1431 struct btrfs_key key; 1432 unsigned long ptr; 1433 1434 root = root->fs_info->chunk_root; 1435 1436 path = btrfs_alloc_path(); 1437 if (!path) 1438 return -ENOMEM; 1439 1440 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1441 key.type = BTRFS_DEV_ITEM_KEY; 1442 key.offset = device->devid; 1443 1444 ret = btrfs_insert_empty_item(trans, root, path, &key, 1445 sizeof(*dev_item)); 1446 if (ret) 1447 goto out; 1448 1449 leaf = path->nodes[0]; 1450 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1451 1452 btrfs_set_device_id(leaf, dev_item, device->devid); 1453 btrfs_set_device_generation(leaf, dev_item, 0); 1454 btrfs_set_device_type(leaf, dev_item, device->type); 1455 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1456 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1457 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1458 btrfs_set_device_total_bytes(leaf, dev_item, 1459 btrfs_device_get_disk_total_bytes(device)); 1460 btrfs_set_device_bytes_used(leaf, dev_item, 1461 btrfs_device_get_bytes_used(device)); 1462 btrfs_set_device_group(leaf, dev_item, 0); 1463 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1464 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1465 btrfs_set_device_start_offset(leaf, dev_item, 0); 1466 1467 ptr = btrfs_device_uuid(dev_item); 1468 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1469 ptr = btrfs_device_fsid(dev_item); 1470 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1471 btrfs_mark_buffer_dirty(leaf); 1472 1473 ret = 0; 1474 out: 1475 btrfs_free_path(path); 1476 return ret; 1477 } 1478 1479 /* 1480 * Function to 
update ctime/mtime for a given device path. 1481 * Mainly used for ctime/mtime based probe like libblkid. 1482 */ 1483 static void update_dev_time(char *path_name) 1484 { 1485 struct file *filp; 1486 1487 filp = filp_open(path_name, O_RDWR, 0); 1488 if (IS_ERR(filp)) 1489 return; 1490 file_update_time(filp); 1491 filp_close(filp, NULL); 1492 return; 1493 } 1494 1495 static int btrfs_rm_dev_item(struct btrfs_root *root, 1496 struct btrfs_device *device) 1497 { 1498 int ret; 1499 struct btrfs_path *path; 1500 struct btrfs_key key; 1501 struct btrfs_trans_handle *trans; 1502 1503 root = root->fs_info->chunk_root; 1504 1505 path = btrfs_alloc_path(); 1506 if (!path) 1507 return -ENOMEM; 1508 1509 trans = btrfs_start_transaction(root, 0); 1510 if (IS_ERR(trans)) { 1511 btrfs_free_path(path); 1512 return PTR_ERR(trans); 1513 } 1514 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1515 key.type = BTRFS_DEV_ITEM_KEY; 1516 key.offset = device->devid; 1517 1518 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1519 if (ret < 0) 1520 goto out; 1521 1522 if (ret > 0) { 1523 ret = -ENOENT; 1524 goto out; 1525 } 1526 1527 ret = btrfs_del_item(trans, root, path); 1528 if (ret) 1529 goto out; 1530 out: 1531 btrfs_free_path(path); 1532 btrfs_commit_transaction(trans, root); 1533 return ret; 1534 } 1535 1536 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1537 { 1538 struct btrfs_device *device; 1539 struct btrfs_device *next_device; 1540 struct block_device *bdev; 1541 struct buffer_head *bh = NULL; 1542 struct btrfs_super_block *disk_super; 1543 struct btrfs_fs_devices *cur_devices; 1544 u64 all_avail; 1545 u64 devid; 1546 u64 num_devices; 1547 u8 *dev_uuid; 1548 unsigned seq; 1549 int ret = 0; 1550 bool clear_super = false; 1551 1552 mutex_lock(&uuid_mutex); 1553 1554 do { 1555 seq = read_seqbegin(&root->fs_info->profiles_lock); 1556 1557 all_avail = root->fs_info->avail_data_alloc_bits | 1558 root->fs_info->avail_system_alloc_bits | 1559 root->fs_info->avail_metadata_alloc_bits; 1560 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 1561 1562 num_devices = root->fs_info->fs_devices->num_devices; 1563 btrfs_dev_replace_lock(&root->fs_info->dev_replace); 1564 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1565 WARN_ON(num_devices < 1); 1566 num_devices--; 1567 } 1568 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1569 1570 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1571 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1572 goto out; 1573 } 1574 1575 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1576 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; 1577 goto out; 1578 } 1579 1580 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1581 root->fs_info->fs_devices->rw_devices <= 2) { 1582 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1583 goto out; 1584 } 1585 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1586 root->fs_info->fs_devices->rw_devices <= 3) { 1587 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; 1588 goto out; 1589 } 1590 1591 if (strcmp(device_path, "missing") == 0) { 1592 struct list_head *devices; 1593 struct btrfs_device *tmp; 1594 1595 device = NULL; 1596 devices = &root->fs_info->fs_devices->devices; 1597 /* 1598 * It is safe to read the devices since the volume_mutex 1599 * is held. 
1600 */ 1601 list_for_each_entry(tmp, devices, dev_list) { 1602 if (tmp->in_fs_metadata && 1603 !tmp->is_tgtdev_for_dev_replace && 1604 !tmp->bdev) { 1605 device = tmp; 1606 break; 1607 } 1608 } 1609 bdev = NULL; 1610 bh = NULL; 1611 disk_super = NULL; 1612 if (!device) { 1613 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 1614 goto out; 1615 } 1616 } else { 1617 ret = btrfs_get_bdev_and_sb(device_path, 1618 FMODE_WRITE | FMODE_EXCL, 1619 root->fs_info->bdev_holder, 0, 1620 &bdev, &bh); 1621 if (ret) 1622 goto out; 1623 disk_super = (struct btrfs_super_block *)bh->b_data; 1624 devid = btrfs_stack_device_id(&disk_super->dev_item); 1625 dev_uuid = disk_super->dev_item.uuid; 1626 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1627 disk_super->fsid); 1628 if (!device) { 1629 ret = -ENOENT; 1630 goto error_brelse; 1631 } 1632 } 1633 1634 if (device->is_tgtdev_for_dev_replace) { 1635 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1636 goto error_brelse; 1637 } 1638 1639 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1640 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1641 goto error_brelse; 1642 } 1643 1644 if (device->writeable) { 1645 lock_chunks(root); 1646 list_del_init(&device->dev_alloc_list); 1647 device->fs_devices->rw_devices--; 1648 unlock_chunks(root); 1649 clear_super = true; 1650 } 1651 1652 mutex_unlock(&uuid_mutex); 1653 ret = btrfs_shrink_device(device, 0); 1654 mutex_lock(&uuid_mutex); 1655 if (ret) 1656 goto error_undo; 1657 1658 /* 1659 * TODO: the superblock still includes this device in its num_devices 1660 * counter although write_all_supers() is not locked out. This 1661 * could give a filesystem state which requires a degraded mount. 1662 */ 1663 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1664 if (ret) 1665 goto error_undo; 1666 1667 device->in_fs_metadata = 0; 1668 btrfs_scrub_cancel_dev(root->fs_info, device); 1669 1670 /* 1671 * the device list mutex makes sure that we don't change 1672 * the device list while someone else is writing out all 1673 * the device supers. Whoever is writing all supers, should 1674 * lock the device list mutex before getting the number of 1675 * devices in the super block (super_copy). Conversely, 1676 * whoever updates the number of devices in the super block 1677 * (super_copy) should hold the device list mutex. 
1678 */ 1679 1680 cur_devices = device->fs_devices; 1681 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1682 list_del_rcu(&device->dev_list); 1683 1684 device->fs_devices->num_devices--; 1685 device->fs_devices->total_devices--; 1686 1687 if (device->missing) 1688 device->fs_devices->missing_devices--; 1689 1690 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1691 struct btrfs_device, dev_list); 1692 if (device->bdev == root->fs_info->sb->s_bdev) 1693 root->fs_info->sb->s_bdev = next_device->bdev; 1694 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1695 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1696 1697 if (device->bdev) { 1698 device->fs_devices->open_devices--; 1699 /* remove sysfs entry */ 1700 btrfs_kobj_rm_device(root->fs_info, device); 1701 } 1702 1703 call_rcu(&device->rcu, free_device); 1704 1705 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1706 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1707 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1708 1709 if (cur_devices->open_devices == 0) { 1710 struct btrfs_fs_devices *fs_devices; 1711 fs_devices = root->fs_info->fs_devices; 1712 while (fs_devices) { 1713 if (fs_devices->seed == cur_devices) { 1714 fs_devices->seed = cur_devices->seed; 1715 break; 1716 } 1717 fs_devices = fs_devices->seed; 1718 } 1719 cur_devices->seed = NULL; 1720 __btrfs_close_devices(cur_devices); 1721 free_fs_devices(cur_devices); 1722 } 1723 1724 root->fs_info->num_tolerated_disk_barrier_failures = 1725 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 1726 1727 /* 1728 * at this point, the device is zero sized. We want to 1729 * remove it from the devices list and zero out the old super 1730 */ 1731 if (clear_super && disk_super) { 1732 u64 bytenr; 1733 int i; 1734 1735 /* make sure this device isn't detected as part of 1736 * the FS anymore 1737 */ 1738 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1739 set_buffer_dirty(bh); 1740 sync_dirty_buffer(bh); 1741 1742 /* clear the mirror copies of super block on the disk 1743 * being removed, 0th copy is been taken care above and 1744 * the below would take of the rest 1745 */ 1746 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1747 bytenr = btrfs_sb_offset(i); 1748 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 1749 i_size_read(bdev->bd_inode)) 1750 break; 1751 1752 brelse(bh); 1753 bh = __bread(bdev, bytenr / 4096, 1754 BTRFS_SUPER_INFO_SIZE); 1755 if (!bh) 1756 continue; 1757 1758 disk_super = (struct btrfs_super_block *)bh->b_data; 1759 1760 if (btrfs_super_bytenr(disk_super) != bytenr || 1761 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1762 continue; 1763 } 1764 memset(&disk_super->magic, 0, 1765 sizeof(disk_super->magic)); 1766 set_buffer_dirty(bh); 1767 sync_dirty_buffer(bh); 1768 } 1769 } 1770 1771 ret = 0; 1772 1773 if (bdev) { 1774 /* Notify udev that device has changed */ 1775 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 1776 1777 /* Update ctime/mtime for device path for libblkid */ 1778 update_dev_time(device_path); 1779 } 1780 1781 error_brelse: 1782 brelse(bh); 1783 if (bdev) 1784 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1785 out: 1786 mutex_unlock(&uuid_mutex); 1787 return ret; 1788 error_undo: 1789 if (device->writeable) { 1790 lock_chunks(root); 1791 list_add(&device->dev_alloc_list, 1792 &root->fs_info->fs_devices->alloc_list); 1793 device->fs_devices->rw_devices++; 1794 unlock_chunks(root); 1795 } 1796 goto error_brelse; 1797 } 1798 1799 void 
btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 1800 struct btrfs_device *srcdev) 1801 { 1802 struct btrfs_fs_devices *fs_devices; 1803 1804 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1805 1806 /* 1807 * in case of fs with no seed, srcdev->fs_devices will point 1808 * to fs_devices of fs_info. However when the dev being replaced is 1809 * a seed dev it will point to the seed's local fs_devices. In short 1810 * srcdev will have its correct fs_devices in both the cases. 1811 */ 1812 fs_devices = srcdev->fs_devices; 1813 1814 list_del_rcu(&srcdev->dev_list); 1815 list_del_rcu(&srcdev->dev_alloc_list); 1816 fs_devices->num_devices--; 1817 if (srcdev->missing) 1818 fs_devices->missing_devices--; 1819 1820 if (srcdev->writeable) { 1821 fs_devices->rw_devices--; 1822 /* zero out the old super if it is writable */ 1823 btrfs_scratch_superblock(srcdev); 1824 } 1825 1826 if (srcdev->bdev) 1827 fs_devices->open_devices--; 1828 } 1829 1830 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, 1831 struct btrfs_device *srcdev) 1832 { 1833 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 1834 1835 call_rcu(&srcdev->rcu, free_device); 1836 1837 /* 1838 * unless fs_devices is seed fs, num_devices shouldn't go 1839 * zero 1840 */ 1841 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); 1842 1843 /* if this is no devs we rather delete the fs_devices */ 1844 if (!fs_devices->num_devices) { 1845 struct btrfs_fs_devices *tmp_fs_devices; 1846 1847 tmp_fs_devices = fs_info->fs_devices; 1848 while (tmp_fs_devices) { 1849 if (tmp_fs_devices->seed == fs_devices) { 1850 tmp_fs_devices->seed = fs_devices->seed; 1851 break; 1852 } 1853 tmp_fs_devices = tmp_fs_devices->seed; 1854 } 1855 fs_devices->seed = NULL; 1856 __btrfs_close_devices(fs_devices); 1857 free_fs_devices(fs_devices); 1858 } 1859 } 1860 1861 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1862 struct btrfs_device *tgtdev) 1863 { 1864 struct btrfs_device *next_device; 1865 1866 mutex_lock(&uuid_mutex); 1867 WARN_ON(!tgtdev); 1868 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1869 if (tgtdev->bdev) { 1870 btrfs_scratch_superblock(tgtdev); 1871 fs_info->fs_devices->open_devices--; 1872 } 1873 fs_info->fs_devices->num_devices--; 1874 1875 next_device = list_entry(fs_info->fs_devices->devices.next, 1876 struct btrfs_device, dev_list); 1877 if (tgtdev->bdev == fs_info->sb->s_bdev) 1878 fs_info->sb->s_bdev = next_device->bdev; 1879 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) 1880 fs_info->fs_devices->latest_bdev = next_device->bdev; 1881 list_del_rcu(&tgtdev->dev_list); 1882 1883 call_rcu(&tgtdev->rcu, free_device); 1884 1885 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1886 mutex_unlock(&uuid_mutex); 1887 } 1888 1889 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1890 struct btrfs_device **device) 1891 { 1892 int ret = 0; 1893 struct btrfs_super_block *disk_super; 1894 u64 devid; 1895 u8 *dev_uuid; 1896 struct block_device *bdev; 1897 struct buffer_head *bh; 1898 1899 *device = NULL; 1900 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 1901 root->fs_info->bdev_holder, 0, &bdev, &bh); 1902 if (ret) 1903 return ret; 1904 disk_super = (struct btrfs_super_block *)bh->b_data; 1905 devid = btrfs_stack_device_id(&disk_super->dev_item); 1906 dev_uuid = disk_super->dev_item.uuid; 1907 *device = btrfs_find_device(root->fs_info, devid, dev_uuid, 1908 disk_super->fsid); 1909 brelse(bh); 1910 if (!*device) 1911 ret 
= -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}

int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
					 char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device) {
			btrfs_err(root->fs_info, "no missing device found");
			return -ENOENT;
		}

		return 0;
	} else {
		return btrfs_find_device_by_path(root, device_path, device);
	}
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = __alloc_fs_devices();
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	lock_chunks(root);
	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	unlock_chunks(root);

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
2013 */ 2014 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 2015 struct btrfs_root *root) 2016 { 2017 struct btrfs_path *path; 2018 struct extent_buffer *leaf; 2019 struct btrfs_dev_item *dev_item; 2020 struct btrfs_device *device; 2021 struct btrfs_key key; 2022 u8 fs_uuid[BTRFS_UUID_SIZE]; 2023 u8 dev_uuid[BTRFS_UUID_SIZE]; 2024 u64 devid; 2025 int ret; 2026 2027 path = btrfs_alloc_path(); 2028 if (!path) 2029 return -ENOMEM; 2030 2031 root = root->fs_info->chunk_root; 2032 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2033 key.offset = 0; 2034 key.type = BTRFS_DEV_ITEM_KEY; 2035 2036 while (1) { 2037 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2038 if (ret < 0) 2039 goto error; 2040 2041 leaf = path->nodes[0]; 2042 next_slot: 2043 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2044 ret = btrfs_next_leaf(root, path); 2045 if (ret > 0) 2046 break; 2047 if (ret < 0) 2048 goto error; 2049 leaf = path->nodes[0]; 2050 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2051 btrfs_release_path(path); 2052 continue; 2053 } 2054 2055 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2056 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2057 key.type != BTRFS_DEV_ITEM_KEY) 2058 break; 2059 2060 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2061 struct btrfs_dev_item); 2062 devid = btrfs_device_id(leaf, dev_item); 2063 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2064 BTRFS_UUID_SIZE); 2065 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2066 BTRFS_UUID_SIZE); 2067 device = btrfs_find_device(root->fs_info, devid, dev_uuid, 2068 fs_uuid); 2069 BUG_ON(!device); /* Logic error */ 2070 2071 if (device->fs_devices->seeding) { 2072 btrfs_set_device_generation(leaf, dev_item, 2073 device->generation); 2074 btrfs_mark_buffer_dirty(leaf); 2075 } 2076 2077 path->slots[0]++; 2078 goto next_slot; 2079 } 2080 ret = 0; 2081 error: 2082 btrfs_free_path(path); 2083 return ret; 2084 } 2085 2086 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 2087 { 2088 struct request_queue *q; 2089 struct btrfs_trans_handle *trans; 2090 struct btrfs_device *device; 2091 struct block_device *bdev; 2092 struct list_head *devices; 2093 struct super_block *sb = root->fs_info->sb; 2094 struct rcu_string *name; 2095 u64 tmp; 2096 int seeding_dev = 0; 2097 int ret = 0; 2098 2099 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 2100 return -EROFS; 2101 2102 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2103 root->fs_info->bdev_holder); 2104 if (IS_ERR(bdev)) 2105 return PTR_ERR(bdev); 2106 2107 if (root->fs_info->fs_devices->seeding) { 2108 seeding_dev = 1; 2109 down_write(&sb->s_umount); 2110 mutex_lock(&uuid_mutex); 2111 } 2112 2113 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2114 2115 devices = &root->fs_info->fs_devices->devices; 2116 2117 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2118 list_for_each_entry(device, devices, dev_list) { 2119 if (device->bdev == bdev) { 2120 ret = -EEXIST; 2121 mutex_unlock( 2122 &root->fs_info->fs_devices->device_list_mutex); 2123 goto error; 2124 } 2125 } 2126 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2127 2128 device = btrfs_alloc_device(root->fs_info, NULL, NULL); 2129 if (IS_ERR(device)) { 2130 /* we can safely leave the fs_devices entry around */ 2131 ret = PTR_ERR(device); 2132 goto error; 2133 } 2134 2135 name = rcu_string_strdup(device_path, GFP_NOFS); 2136 if (!name) { 2137 kfree(device); 2138 ret = -ENOMEM; 2139 goto error; 
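/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * the transaction handle obtained just below is checked with IS_ERR(),
 * the kernel convention of encoding a small negative errno directly in
 * the pointer value. A userspace sketch of that idea; the demo_ names
 * are stand-ins, not the real kernel helpers.
 */
#include <stdint.h>

#define DEMO_MAX_ERRNO 4095UL

static inline void *demo_err_ptr(long err)
{
	return (void *)(intptr_t)err;	/* e.g. demo_err_ptr(-ENOMEM) */
}

static inline long demo_ptr_err(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static inline int demo_is_err(const void *ptr)
{
	/* the top DEMO_MAX_ERRNO addresses are reserved for error codes */
	return (uintptr_t)ptr >= (uintptr_t)-DEMO_MAX_ERRNO;
}
/* End of illustrative aside; the original source continues below. */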
2140 } 2141 rcu_assign_pointer(device->name, name); 2142 2143 trans = btrfs_start_transaction(root, 0); 2144 if (IS_ERR(trans)) { 2145 rcu_string_free(device->name); 2146 kfree(device); 2147 ret = PTR_ERR(trans); 2148 goto error; 2149 } 2150 2151 q = bdev_get_queue(bdev); 2152 if (blk_queue_discard(q)) 2153 device->can_discard = 1; 2154 device->writeable = 1; 2155 device->generation = trans->transid; 2156 device->io_width = root->sectorsize; 2157 device->io_align = root->sectorsize; 2158 device->sector_size = root->sectorsize; 2159 device->total_bytes = i_size_read(bdev->bd_inode); 2160 device->disk_total_bytes = device->total_bytes; 2161 device->commit_total_bytes = device->total_bytes; 2162 device->dev_root = root->fs_info->dev_root; 2163 device->bdev = bdev; 2164 device->in_fs_metadata = 1; 2165 device->is_tgtdev_for_dev_replace = 0; 2166 device->mode = FMODE_EXCL; 2167 device->dev_stats_valid = 1; 2168 set_blocksize(device->bdev, 4096); 2169 2170 if (seeding_dev) { 2171 sb->s_flags &= ~MS_RDONLY; 2172 ret = btrfs_prepare_sprout(root); 2173 BUG_ON(ret); /* -ENOMEM */ 2174 } 2175 2176 device->fs_devices = root->fs_info->fs_devices; 2177 2178 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2179 lock_chunks(root); 2180 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2181 list_add(&device->dev_alloc_list, 2182 &root->fs_info->fs_devices->alloc_list); 2183 root->fs_info->fs_devices->num_devices++; 2184 root->fs_info->fs_devices->open_devices++; 2185 root->fs_info->fs_devices->rw_devices++; 2186 root->fs_info->fs_devices->total_devices++; 2187 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2188 2189 spin_lock(&root->fs_info->free_chunk_lock); 2190 root->fs_info->free_chunk_space += device->total_bytes; 2191 spin_unlock(&root->fs_info->free_chunk_lock); 2192 2193 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2194 root->fs_info->fs_devices->rotating = 1; 2195 2196 tmp = btrfs_super_total_bytes(root->fs_info->super_copy); 2197 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2198 tmp + device->total_bytes); 2199 2200 tmp = btrfs_super_num_devices(root->fs_info->super_copy); 2201 btrfs_set_super_num_devices(root->fs_info->super_copy, 2202 tmp + 1); 2203 2204 /* add sysfs device entry */ 2205 btrfs_kobj_add_device(root->fs_info, device); 2206 2207 /* 2208 * we've got more storage, clear any full flags on the space 2209 * infos 2210 */ 2211 btrfs_clear_space_info_full(root->fs_info); 2212 2213 unlock_chunks(root); 2214 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2215 2216 if (seeding_dev) { 2217 lock_chunks(root); 2218 ret = init_first_rw_device(trans, root, device); 2219 unlock_chunks(root); 2220 if (ret) { 2221 btrfs_abort_transaction(trans, root, ret); 2222 goto error_trans; 2223 } 2224 } 2225 2226 ret = btrfs_add_device(trans, root, device); 2227 if (ret) { 2228 btrfs_abort_transaction(trans, root, ret); 2229 goto error_trans; 2230 } 2231 2232 if (seeding_dev) { 2233 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2234 2235 ret = btrfs_finish_sprout(trans, root); 2236 if (ret) { 2237 btrfs_abort_transaction(trans, root, ret); 2238 goto error_trans; 2239 } 2240 2241 /* Sprouting would change fsid of the mounted root, 2242 * so rename the fsid on the sysfs 2243 */ 2244 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2245 root->fs_info->fsid); 2246 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2247 goto error_trans; 2248 } 2249 2250 root->fs_info->num_tolerated_disk_barrier_failures = 2251 
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2252 ret = btrfs_commit_transaction(trans, root); 2253 2254 if (seeding_dev) { 2255 mutex_unlock(&uuid_mutex); 2256 up_write(&sb->s_umount); 2257 2258 if (ret) /* transaction commit */ 2259 return ret; 2260 2261 ret = btrfs_relocate_sys_chunks(root); 2262 if (ret < 0) 2263 btrfs_error(root->fs_info, ret, 2264 "Failed to relocate sys chunks after " 2265 "device initialization. This can be fixed " 2266 "using the \"btrfs balance\" command."); 2267 trans = btrfs_attach_transaction(root); 2268 if (IS_ERR(trans)) { 2269 if (PTR_ERR(trans) == -ENOENT) 2270 return 0; 2271 return PTR_ERR(trans); 2272 } 2273 ret = btrfs_commit_transaction(trans, root); 2274 } 2275 2276 /* Update ctime/mtime for libblkid */ 2277 update_dev_time(device_path); 2278 return ret; 2279 2280 error_trans: 2281 btrfs_end_transaction(trans, root); 2282 rcu_string_free(device->name); 2283 btrfs_kobj_rm_device(root->fs_info, device); 2284 kfree(device); 2285 error: 2286 blkdev_put(bdev, FMODE_EXCL); 2287 if (seeding_dev) { 2288 mutex_unlock(&uuid_mutex); 2289 up_write(&sb->s_umount); 2290 } 2291 return ret; 2292 } 2293 2294 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2295 struct btrfs_device *srcdev, 2296 struct btrfs_device **device_out) 2297 { 2298 struct request_queue *q; 2299 struct btrfs_device *device; 2300 struct block_device *bdev; 2301 struct btrfs_fs_info *fs_info = root->fs_info; 2302 struct list_head *devices; 2303 struct rcu_string *name; 2304 u64 devid = BTRFS_DEV_REPLACE_DEVID; 2305 int ret = 0; 2306 2307 *device_out = NULL; 2308 if (fs_info->fs_devices->seeding) { 2309 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 2310 return -EINVAL; 2311 } 2312 2313 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2314 fs_info->bdev_holder); 2315 if (IS_ERR(bdev)) { 2316 btrfs_err(fs_info, "target device %s is invalid!", device_path); 2317 return PTR_ERR(bdev); 2318 } 2319 2320 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2321 2322 devices = &fs_info->fs_devices->devices; 2323 list_for_each_entry(device, devices, dev_list) { 2324 if (device->bdev == bdev) { 2325 btrfs_err(fs_info, "target device is in the filesystem!"); 2326 ret = -EEXIST; 2327 goto error; 2328 } 2329 } 2330 2331 2332 if (i_size_read(bdev->bd_inode) < 2333 btrfs_device_get_total_bytes(srcdev)) { 2334 btrfs_err(fs_info, "target device is smaller than source device!"); 2335 ret = -EINVAL; 2336 goto error; 2337 } 2338 2339 2340 device = btrfs_alloc_device(NULL, &devid, NULL); 2341 if (IS_ERR(device)) { 2342 ret = PTR_ERR(device); 2343 goto error; 2344 } 2345 2346 name = rcu_string_strdup(device_path, GFP_NOFS); 2347 if (!name) { 2348 kfree(device); 2349 ret = -ENOMEM; 2350 goto error; 2351 } 2352 rcu_assign_pointer(device->name, name); 2353 2354 q = bdev_get_queue(bdev); 2355 if (blk_queue_discard(q)) 2356 device->can_discard = 1; 2357 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2358 device->writeable = 1; 2359 device->generation = 0; 2360 device->io_width = root->sectorsize; 2361 device->io_align = root->sectorsize; 2362 device->sector_size = root->sectorsize; 2363 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 2364 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 2365 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 2366 ASSERT(list_empty(&srcdev->resized_list)); 2367 device->commit_total_bytes = srcdev->commit_total_bytes; 2368 device->commit_bytes_used = 
device->bytes_used; 2369 device->dev_root = fs_info->dev_root; 2370 device->bdev = bdev; 2371 device->in_fs_metadata = 1; 2372 device->is_tgtdev_for_dev_replace = 1; 2373 device->mode = FMODE_EXCL; 2374 device->dev_stats_valid = 1; 2375 set_blocksize(device->bdev, 4096); 2376 device->fs_devices = fs_info->fs_devices; 2377 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2378 fs_info->fs_devices->num_devices++; 2379 fs_info->fs_devices->open_devices++; 2380 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2381 2382 *device_out = device; 2383 return ret; 2384 2385 error: 2386 blkdev_put(bdev, FMODE_EXCL); 2387 return ret; 2388 } 2389 2390 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 2391 struct btrfs_device *tgtdev) 2392 { 2393 WARN_ON(fs_info->fs_devices->rw_devices == 0); 2394 tgtdev->io_width = fs_info->dev_root->sectorsize; 2395 tgtdev->io_align = fs_info->dev_root->sectorsize; 2396 tgtdev->sector_size = fs_info->dev_root->sectorsize; 2397 tgtdev->dev_root = fs_info->dev_root; 2398 tgtdev->in_fs_metadata = 1; 2399 } 2400 2401 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2402 struct btrfs_device *device) 2403 { 2404 int ret; 2405 struct btrfs_path *path; 2406 struct btrfs_root *root; 2407 struct btrfs_dev_item *dev_item; 2408 struct extent_buffer *leaf; 2409 struct btrfs_key key; 2410 2411 root = device->dev_root->fs_info->chunk_root; 2412 2413 path = btrfs_alloc_path(); 2414 if (!path) 2415 return -ENOMEM; 2416 2417 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2418 key.type = BTRFS_DEV_ITEM_KEY; 2419 key.offset = device->devid; 2420 2421 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2422 if (ret < 0) 2423 goto out; 2424 2425 if (ret > 0) { 2426 ret = -ENOENT; 2427 goto out; 2428 } 2429 2430 leaf = path->nodes[0]; 2431 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2432 2433 btrfs_set_device_id(leaf, dev_item, device->devid); 2434 btrfs_set_device_type(leaf, dev_item, device->type); 2435 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2436 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2437 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2438 btrfs_set_device_total_bytes(leaf, dev_item, 2439 btrfs_device_get_disk_total_bytes(device)); 2440 btrfs_set_device_bytes_used(leaf, dev_item, 2441 btrfs_device_get_bytes_used(device)); 2442 btrfs_mark_buffer_dirty(leaf); 2443 2444 out: 2445 btrfs_free_path(path); 2446 return ret; 2447 } 2448 2449 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2450 struct btrfs_device *device, u64 new_size) 2451 { 2452 struct btrfs_super_block *super_copy = 2453 device->dev_root->fs_info->super_copy; 2454 struct btrfs_fs_devices *fs_devices; 2455 u64 old_total; 2456 u64 diff; 2457 2458 if (!device->writeable) 2459 return -EACCES; 2460 2461 lock_chunks(device->dev_root); 2462 old_total = btrfs_super_total_bytes(super_copy); 2463 diff = new_size - device->total_bytes; 2464 2465 if (new_size <= device->total_bytes || 2466 device->is_tgtdev_for_dev_replace) { 2467 unlock_chunks(device->dev_root); 2468 return -EINVAL; 2469 } 2470 2471 fs_devices = device->dev_root->fs_info->fs_devices; 2472 2473 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2474 device->fs_devices->total_rw_bytes += diff; 2475 2476 btrfs_device_set_total_bytes(device, new_size); 2477 btrfs_device_set_disk_total_bytes(device, new_size); 2478 btrfs_clear_space_info_full(device->dev_root->fs_info); 2479 if 
(list_empty(&device->resized_list)) 2480 list_add_tail(&device->resized_list, 2481 &fs_devices->resized_devices); 2482 unlock_chunks(device->dev_root); 2483 2484 return btrfs_update_device(trans, device); 2485 } 2486 2487 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2488 struct btrfs_root *root, 2489 u64 chunk_tree, u64 chunk_objectid, 2490 u64 chunk_offset) 2491 { 2492 int ret; 2493 struct btrfs_path *path; 2494 struct btrfs_key key; 2495 2496 root = root->fs_info->chunk_root; 2497 path = btrfs_alloc_path(); 2498 if (!path) 2499 return -ENOMEM; 2500 2501 key.objectid = chunk_objectid; 2502 key.offset = chunk_offset; 2503 key.type = BTRFS_CHUNK_ITEM_KEY; 2504 2505 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2506 if (ret < 0) 2507 goto out; 2508 else if (ret > 0) { /* Logic error or corruption */ 2509 btrfs_error(root->fs_info, -ENOENT, 2510 "Failed lookup while freeing chunk."); 2511 ret = -ENOENT; 2512 goto out; 2513 } 2514 2515 ret = btrfs_del_item(trans, root, path); 2516 if (ret < 0) 2517 btrfs_error(root->fs_info, ret, 2518 "Failed to delete chunk item."); 2519 out: 2520 btrfs_free_path(path); 2521 return ret; 2522 } 2523 2524 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 2525 chunk_offset) 2526 { 2527 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 2528 struct btrfs_disk_key *disk_key; 2529 struct btrfs_chunk *chunk; 2530 u8 *ptr; 2531 int ret = 0; 2532 u32 num_stripes; 2533 u32 array_size; 2534 u32 len = 0; 2535 u32 cur; 2536 struct btrfs_key key; 2537 2538 lock_chunks(root); 2539 array_size = btrfs_super_sys_array_size(super_copy); 2540 2541 ptr = super_copy->sys_chunk_array; 2542 cur = 0; 2543 2544 while (cur < array_size) { 2545 disk_key = (struct btrfs_disk_key *)ptr; 2546 btrfs_disk_key_to_cpu(&key, disk_key); 2547 2548 len = sizeof(*disk_key); 2549 2550 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2551 chunk = (struct btrfs_chunk *)(ptr + len); 2552 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2553 len += btrfs_chunk_item_size(num_stripes); 2554 } else { 2555 ret = -EIO; 2556 break; 2557 } 2558 if (key.objectid == chunk_objectid && 2559 key.offset == chunk_offset) { 2560 memmove(ptr, ptr + len, array_size - (cur + len)); 2561 array_size -= len; 2562 btrfs_set_super_sys_array_size(super_copy, array_size); 2563 } else { 2564 ptr += len; 2565 cur += len; 2566 } 2567 } 2568 unlock_chunks(root); 2569 return ret; 2570 } 2571 2572 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, 2573 struct btrfs_root *root, u64 chunk_offset) 2574 { 2575 struct extent_map_tree *em_tree; 2576 struct extent_map *em; 2577 struct btrfs_root *extent_root = root->fs_info->extent_root; 2578 struct map_lookup *map; 2579 u64 dev_extent_len = 0; 2580 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2581 u64 chunk_tree = root->fs_info->chunk_root->objectid; 2582 int i, ret = 0; 2583 2584 /* Just in case */ 2585 root = root->fs_info->chunk_root; 2586 em_tree = &root->fs_info->mapping_tree.map_tree; 2587 2588 read_lock(&em_tree->lock); 2589 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2590 read_unlock(&em_tree->lock); 2591 2592 if (!em || em->start > chunk_offset || 2593 em->start + em->len < chunk_offset) { 2594 /* 2595 * This is a logic error, but we don't want to just rely on the 2596 * user having built with ASSERT enabled, so if ASSERT doens't 2597 * do anything we still error out. 
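/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * btrfs_del_sys_chunk() above walks the superblock's sys_chunk_array,
 * which packs variable-length (disk key, chunk item, stripes) records
 * back to back, and deletes a record by memmove()ing the tail over it.
 * A standalone sketch of that walk with a simplified record layout.
 */
#include <stddef.h>
#include <string.h>

struct demo_key   { unsigned long long objectid, offset; };
struct demo_chunk { unsigned int num_stripes; };	/* stripes follow */

#define DEMO_STRIPE_SIZE 32u

static size_t demo_record_len(const struct demo_chunk *chunk)
{
	return sizeof(struct demo_key) + sizeof(struct demo_chunk) +
	       (size_t)chunk->num_stripes * DEMO_STRIPE_SIZE;
}

/* Delete the record matching (objectid, offset); returns the new size. */
static size_t demo_del_sys_chunk(unsigned char *array, size_t size,
				 unsigned long long objectid,
				 unsigned long long offset)
{
	size_t cur = 0;

	while (cur < size) {
		struct demo_key *key = (struct demo_key *)(array + cur);
		struct demo_chunk *chunk =
			(struct demo_chunk *)(array + cur + sizeof(*key));
		size_t len = demo_record_len(chunk);

		if (key->objectid == objectid && key->offset == offset) {
			/* pull the rest of the array over this record */
			memmove(array + cur, array + cur + len,
				size - (cur + len));
			size -= len;
		} else {
			cur += len;
		}
	}
	return size;
}
/* End of illustrative aside; the original source continues below. */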
2598 */ 2599 ASSERT(0); 2600 if (em) 2601 free_extent_map(em); 2602 return -EINVAL; 2603 } 2604 map = (struct map_lookup *)em->bdev; 2605 2606 for (i = 0; i < map->num_stripes; i++) { 2607 struct btrfs_device *device = map->stripes[i].dev; 2608 ret = btrfs_free_dev_extent(trans, device, 2609 map->stripes[i].physical, 2610 &dev_extent_len); 2611 if (ret) { 2612 btrfs_abort_transaction(trans, root, ret); 2613 goto out; 2614 } 2615 2616 if (device->bytes_used > 0) { 2617 lock_chunks(root); 2618 btrfs_device_set_bytes_used(device, 2619 device->bytes_used - dev_extent_len); 2620 spin_lock(&root->fs_info->free_chunk_lock); 2621 root->fs_info->free_chunk_space += dev_extent_len; 2622 spin_unlock(&root->fs_info->free_chunk_lock); 2623 btrfs_clear_space_info_full(root->fs_info); 2624 unlock_chunks(root); 2625 } 2626 2627 if (map->stripes[i].dev) { 2628 ret = btrfs_update_device(trans, map->stripes[i].dev); 2629 if (ret) { 2630 btrfs_abort_transaction(trans, root, ret); 2631 goto out; 2632 } 2633 } 2634 } 2635 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2636 chunk_offset); 2637 if (ret) { 2638 btrfs_abort_transaction(trans, root, ret); 2639 goto out; 2640 } 2641 2642 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2643 2644 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2645 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2646 if (ret) { 2647 btrfs_abort_transaction(trans, root, ret); 2648 goto out; 2649 } 2650 } 2651 2652 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); 2653 if (ret) { 2654 btrfs_abort_transaction(trans, extent_root, ret); 2655 goto out; 2656 } 2657 2658 out: 2659 /* once for us */ 2660 free_extent_map(em); 2661 return ret; 2662 } 2663 2664 static int btrfs_relocate_chunk(struct btrfs_root *root, 2665 u64 chunk_tree, u64 chunk_objectid, 2666 u64 chunk_offset) 2667 { 2668 struct btrfs_root *extent_root; 2669 struct btrfs_trans_handle *trans; 2670 int ret; 2671 2672 root = root->fs_info->chunk_root; 2673 extent_root = root->fs_info->extent_root; 2674 2675 ret = btrfs_can_relocate(extent_root, chunk_offset); 2676 if (ret) 2677 return -ENOSPC; 2678 2679 /* step one, relocate all the extents inside this chunk */ 2680 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2681 if (ret) 2682 return ret; 2683 2684 trans = btrfs_start_transaction(root, 0); 2685 if (IS_ERR(trans)) { 2686 ret = PTR_ERR(trans); 2687 btrfs_std_error(root->fs_info, ret); 2688 return ret; 2689 } 2690 2691 /* 2692 * step two, delete the device extents and the 2693 * chunk tree entries 2694 */ 2695 ret = btrfs_remove_chunk(trans, root, chunk_offset); 2696 btrfs_end_transaction(trans, root); 2697 return ret; 2698 } 2699 2700 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2701 { 2702 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2703 struct btrfs_path *path; 2704 struct extent_buffer *leaf; 2705 struct btrfs_chunk *chunk; 2706 struct btrfs_key key; 2707 struct btrfs_key found_key; 2708 u64 chunk_tree = chunk_root->root_key.objectid; 2709 u64 chunk_type; 2710 bool retried = false; 2711 int failed = 0; 2712 int ret; 2713 2714 path = btrfs_alloc_path(); 2715 if (!path) 2716 return -ENOMEM; 2717 2718 again: 2719 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2720 key.offset = (u64)-1; 2721 key.type = BTRFS_CHUNK_ITEM_KEY; 2722 2723 while (1) { 2724 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2725 if (ret < 0) 2726 goto error; 2727 BUG_ON(ret == 0); /* Corruption */ 2728 2729 ret = 
btrfs_previous_item(chunk_root, path, key.objectid, 2730 key.type); 2731 if (ret < 0) 2732 goto error; 2733 if (ret > 0) 2734 break; 2735 2736 leaf = path->nodes[0]; 2737 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2738 2739 chunk = btrfs_item_ptr(leaf, path->slots[0], 2740 struct btrfs_chunk); 2741 chunk_type = btrfs_chunk_type(leaf, chunk); 2742 btrfs_release_path(path); 2743 2744 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2745 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2746 found_key.objectid, 2747 found_key.offset); 2748 if (ret == -ENOSPC) 2749 failed++; 2750 else 2751 BUG_ON(ret); 2752 } 2753 2754 if (found_key.offset == 0) 2755 break; 2756 key.offset = found_key.offset - 1; 2757 } 2758 ret = 0; 2759 if (failed && !retried) { 2760 failed = 0; 2761 retried = true; 2762 goto again; 2763 } else if (WARN_ON(failed && retried)) { 2764 ret = -ENOSPC; 2765 } 2766 error: 2767 btrfs_free_path(path); 2768 return ret; 2769 } 2770 2771 static int insert_balance_item(struct btrfs_root *root, 2772 struct btrfs_balance_control *bctl) 2773 { 2774 struct btrfs_trans_handle *trans; 2775 struct btrfs_balance_item *item; 2776 struct btrfs_disk_balance_args disk_bargs; 2777 struct btrfs_path *path; 2778 struct extent_buffer *leaf; 2779 struct btrfs_key key; 2780 int ret, err; 2781 2782 path = btrfs_alloc_path(); 2783 if (!path) 2784 return -ENOMEM; 2785 2786 trans = btrfs_start_transaction(root, 0); 2787 if (IS_ERR(trans)) { 2788 btrfs_free_path(path); 2789 return PTR_ERR(trans); 2790 } 2791 2792 key.objectid = BTRFS_BALANCE_OBJECTID; 2793 key.type = BTRFS_BALANCE_ITEM_KEY; 2794 key.offset = 0; 2795 2796 ret = btrfs_insert_empty_item(trans, root, path, &key, 2797 sizeof(*item)); 2798 if (ret) 2799 goto out; 2800 2801 leaf = path->nodes[0]; 2802 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2803 2804 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2805 2806 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 2807 btrfs_set_balance_data(leaf, item, &disk_bargs); 2808 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2809 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2810 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2811 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2812 2813 btrfs_set_balance_flags(leaf, item, bctl->flags); 2814 2815 btrfs_mark_buffer_dirty(leaf); 2816 out: 2817 btrfs_free_path(path); 2818 err = btrfs_commit_transaction(trans, root); 2819 if (err && !ret) 2820 ret = err; 2821 return ret; 2822 } 2823 2824 static int del_balance_item(struct btrfs_root *root) 2825 { 2826 struct btrfs_trans_handle *trans; 2827 struct btrfs_path *path; 2828 struct btrfs_key key; 2829 int ret, err; 2830 2831 path = btrfs_alloc_path(); 2832 if (!path) 2833 return -ENOMEM; 2834 2835 trans = btrfs_start_transaction(root, 0); 2836 if (IS_ERR(trans)) { 2837 btrfs_free_path(path); 2838 return PTR_ERR(trans); 2839 } 2840 2841 key.objectid = BTRFS_BALANCE_OBJECTID; 2842 key.type = BTRFS_BALANCE_ITEM_KEY; 2843 key.offset = 0; 2844 2845 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2846 if (ret < 0) 2847 goto out; 2848 if (ret > 0) { 2849 ret = -ENOENT; 2850 goto out; 2851 } 2852 2853 ret = btrfs_del_item(trans, root, path); 2854 out: 2855 btrfs_free_path(path); 2856 err = btrfs_commit_transaction(trans, root); 2857 if (err && !ret) 2858 ret = err; 2859 return ret; 2860 } 2861 2862 /* 2863 * This is a heuristic used to reduce the number of chunks balanced on 2864 * resume after balance was interrupted. 
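/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * btrfs_relocate_sys_chunks() above (and btrfs_shrink_device() later in
 * this file) walk items from the highest key downwards: seed the search
 * key with offset = (u64)-1, step back to the previous item, then resume
 * from found_offset - 1. The same descending walk over a plain sorted
 * array of offsets, as a standalone sketch.
 */
#include <stdio.h>

/* Index of the last offset <= target, or -1 if none; a stand-in for
 * btrfs_search_slot() followed by btrfs_previous_item(). */
static int demo_prev_item(const unsigned long long *offsets, int n,
			  unsigned long long target)
{
	int i;

	for (i = n - 1; i >= 0; i--)
		if (offsets[i] <= target)
			return i;
	return -1;
}

static void demo_walk_descending(const unsigned long long *offsets, int n)
{
	unsigned long long key = (unsigned long long)-1;
	int slot;

	while ((slot = demo_prev_item(offsets, n, key)) >= 0) {
		printf("visit chunk at offset %llu\n", offsets[slot]);
		if (offsets[slot] == 0)
			break;			/* lowest possible key reached */
		key = offsets[slot] - 1;	/* continue strictly below it */
	}
}
/* End of illustrative aside; the original source continues below. */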
2865 */ 2866 static void update_balance_args(struct btrfs_balance_control *bctl) 2867 { 2868 /* 2869 * Turn on soft mode for chunk types that were being converted. 2870 */ 2871 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2872 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2873 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2874 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2875 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2876 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2877 2878 /* 2879 * Turn on usage filter if is not already used. The idea is 2880 * that chunks that we have already balanced should be 2881 * reasonably full. Don't do it for chunks that are being 2882 * converted - that will keep us from relocating unconverted 2883 * (albeit full) chunks. 2884 */ 2885 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2886 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2887 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2888 bctl->data.usage = 90; 2889 } 2890 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2891 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2892 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2893 bctl->sys.usage = 90; 2894 } 2895 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2896 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2897 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2898 bctl->meta.usage = 90; 2899 } 2900 } 2901 2902 /* 2903 * Should be called with both balance and volume mutexes held to 2904 * serialize other volume operations (add_dev/rm_dev/resize) with 2905 * restriper. Same goes for unset_balance_control. 2906 */ 2907 static void set_balance_control(struct btrfs_balance_control *bctl) 2908 { 2909 struct btrfs_fs_info *fs_info = bctl->fs_info; 2910 2911 BUG_ON(fs_info->balance_ctl); 2912 2913 spin_lock(&fs_info->balance_lock); 2914 fs_info->balance_ctl = bctl; 2915 spin_unlock(&fs_info->balance_lock); 2916 } 2917 2918 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2919 { 2920 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2921 2922 BUG_ON(!fs_info->balance_ctl); 2923 2924 spin_lock(&fs_info->balance_lock); 2925 fs_info->balance_ctl = NULL; 2926 spin_unlock(&fs_info->balance_lock); 2927 2928 kfree(bctl); 2929 } 2930 2931 /* 2932 * Balance filters. Return 1 if chunk should be filtered out 2933 * (should not be balanced). 
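/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * update_balance_args() above defaults the usage filter to 90 on resume,
 * and chunk_usage_filter() below compares a block group's used bytes
 * against size * usage / 100 (what div_factor_fine() computes). A
 * standalone sketch of that threshold test; returns 1 when the chunk
 * should be filtered out, i.e. left alone by balance.
 */
static int demo_usage_filter(unsigned long long chunk_size,
			     unsigned long long chunk_used,
			     unsigned int usage_percent)
{
	unsigned long long thresh;

	if (usage_percent == 0)
		thresh = 1;		/* only completely empty chunks pass */
	else if (usage_percent > 100)
		thresh = chunk_size;	/* everything short of full passes */
	else
		thresh = chunk_size * usage_percent / 100;

	return chunk_used >= thresh;
}
/* End of illustrative aside; the original source continues below. */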
2934 */ 2935 static int chunk_profiles_filter(u64 chunk_type, 2936 struct btrfs_balance_args *bargs) 2937 { 2938 chunk_type = chunk_to_extended(chunk_type) & 2939 BTRFS_EXTENDED_PROFILE_MASK; 2940 2941 if (bargs->profiles & chunk_type) 2942 return 0; 2943 2944 return 1; 2945 } 2946 2947 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2948 struct btrfs_balance_args *bargs) 2949 { 2950 struct btrfs_block_group_cache *cache; 2951 u64 chunk_used, user_thresh; 2952 int ret = 1; 2953 2954 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2955 chunk_used = btrfs_block_group_used(&cache->item); 2956 2957 if (bargs->usage == 0) 2958 user_thresh = 1; 2959 else if (bargs->usage > 100) 2960 user_thresh = cache->key.offset; 2961 else 2962 user_thresh = div_factor_fine(cache->key.offset, 2963 bargs->usage); 2964 2965 if (chunk_used < user_thresh) 2966 ret = 0; 2967 2968 btrfs_put_block_group(cache); 2969 return ret; 2970 } 2971 2972 static int chunk_devid_filter(struct extent_buffer *leaf, 2973 struct btrfs_chunk *chunk, 2974 struct btrfs_balance_args *bargs) 2975 { 2976 struct btrfs_stripe *stripe; 2977 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2978 int i; 2979 2980 for (i = 0; i < num_stripes; i++) { 2981 stripe = btrfs_stripe_nr(chunk, i); 2982 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2983 return 0; 2984 } 2985 2986 return 1; 2987 } 2988 2989 /* [pstart, pend) */ 2990 static int chunk_drange_filter(struct extent_buffer *leaf, 2991 struct btrfs_chunk *chunk, 2992 u64 chunk_offset, 2993 struct btrfs_balance_args *bargs) 2994 { 2995 struct btrfs_stripe *stripe; 2996 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2997 u64 stripe_offset; 2998 u64 stripe_length; 2999 int factor; 3000 int i; 3001 3002 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3003 return 0; 3004 3005 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3006 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3007 factor = num_stripes / 2; 3008 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 3009 factor = num_stripes - 1; 3010 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 3011 factor = num_stripes - 2; 3012 } else { 3013 factor = num_stripes; 3014 } 3015 3016 for (i = 0; i < num_stripes; i++) { 3017 stripe = btrfs_stripe_nr(chunk, i); 3018 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3019 continue; 3020 3021 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3022 stripe_length = btrfs_chunk_length(leaf, chunk); 3023 do_div(stripe_length, factor); 3024 3025 if (stripe_offset < bargs->pend && 3026 stripe_offset + stripe_length > bargs->pstart) 3027 return 0; 3028 } 3029 3030 return 1; 3031 } 3032 3033 /* [vstart, vend) */ 3034 static int chunk_vrange_filter(struct extent_buffer *leaf, 3035 struct btrfs_chunk *chunk, 3036 u64 chunk_offset, 3037 struct btrfs_balance_args *bargs) 3038 { 3039 if (chunk_offset < bargs->vend && 3040 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3041 /* at least part of the chunk is inside this vrange */ 3042 return 0; 3043 3044 return 1; 3045 } 3046 3047 static int chunk_soft_convert_filter(u64 chunk_type, 3048 struct btrfs_balance_args *bargs) 3049 { 3050 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3051 return 0; 3052 3053 chunk_type = chunk_to_extended(chunk_type) & 3054 BTRFS_EXTENDED_PROFILE_MASK; 3055 3056 if (bargs->target == chunk_type) 3057 return 1; 3058 3059 return 0; 3060 } 3061 3062 static int should_balance_chunk(struct btrfs_root 
*root, 3063 struct extent_buffer *leaf, 3064 struct btrfs_chunk *chunk, u64 chunk_offset) 3065 { 3066 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 3067 struct btrfs_balance_args *bargs = NULL; 3068 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3069 3070 /* type filter */ 3071 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3072 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3073 return 0; 3074 } 3075 3076 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3077 bargs = &bctl->data; 3078 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3079 bargs = &bctl->sys; 3080 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3081 bargs = &bctl->meta; 3082 3083 /* profiles filter */ 3084 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3085 chunk_profiles_filter(chunk_type, bargs)) { 3086 return 0; 3087 } 3088 3089 /* usage filter */ 3090 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3091 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 3092 return 0; 3093 } 3094 3095 /* devid filter */ 3096 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3097 chunk_devid_filter(leaf, chunk, bargs)) { 3098 return 0; 3099 } 3100 3101 /* drange filter, makes sense only with devid filter */ 3102 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3103 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 3104 return 0; 3105 } 3106 3107 /* vrange filter */ 3108 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3109 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3110 return 0; 3111 } 3112 3113 /* soft profile changing mode */ 3114 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3115 chunk_soft_convert_filter(chunk_type, bargs)) { 3116 return 0; 3117 } 3118 3119 /* 3120 * limited by count, must be the last filter 3121 */ 3122 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3123 if (bargs->limit == 0) 3124 return 0; 3125 else 3126 bargs->limit--; 3127 } 3128 3129 return 1; 3130 } 3131 3132 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3133 { 3134 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3135 struct btrfs_root *chunk_root = fs_info->chunk_root; 3136 struct btrfs_root *dev_root = fs_info->dev_root; 3137 struct list_head *devices; 3138 struct btrfs_device *device; 3139 u64 old_size; 3140 u64 size_to_free; 3141 struct btrfs_chunk *chunk; 3142 struct btrfs_path *path; 3143 struct btrfs_key key; 3144 struct btrfs_key found_key; 3145 struct btrfs_trans_handle *trans; 3146 struct extent_buffer *leaf; 3147 int slot; 3148 int ret; 3149 int enospc_errors = 0; 3150 bool counting = true; 3151 u64 limit_data = bctl->data.limit; 3152 u64 limit_meta = bctl->meta.limit; 3153 u64 limit_sys = bctl->sys.limit; 3154 3155 /* step one make some room on all the devices */ 3156 devices = &fs_info->fs_devices->devices; 3157 list_for_each_entry(device, devices, dev_list) { 3158 old_size = btrfs_device_get_total_bytes(device); 3159 size_to_free = div_factor(old_size, 1); 3160 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3161 if (!device->writeable || 3162 btrfs_device_get_total_bytes(device) - 3163 btrfs_device_get_bytes_used(device) > size_to_free || 3164 device->is_tgtdev_for_dev_replace) 3165 continue; 3166 3167 ret = btrfs_shrink_device(device, old_size - size_to_free); 3168 if (ret == -ENOSPC) 3169 break; 3170 BUG_ON(ret); 3171 3172 trans = btrfs_start_transaction(dev_root, 0); 3173 BUG_ON(IS_ERR(trans)); 3174 3175 ret = btrfs_grow_device(trans, device, old_size); 3176 BUG_ON(ret); 3177 3178 btrfs_end_transaction(trans, dev_root); 3179 } 3180 3181 /* step two, relocate 
all the chunks */ 3182 path = btrfs_alloc_path(); 3183 if (!path) { 3184 ret = -ENOMEM; 3185 goto error; 3186 } 3187 3188 /* zero out stat counters */ 3189 spin_lock(&fs_info->balance_lock); 3190 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3191 spin_unlock(&fs_info->balance_lock); 3192 again: 3193 if (!counting) { 3194 bctl->data.limit = limit_data; 3195 bctl->meta.limit = limit_meta; 3196 bctl->sys.limit = limit_sys; 3197 } 3198 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3199 key.offset = (u64)-1; 3200 key.type = BTRFS_CHUNK_ITEM_KEY; 3201 3202 while (1) { 3203 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3204 atomic_read(&fs_info->balance_cancel_req)) { 3205 ret = -ECANCELED; 3206 goto error; 3207 } 3208 3209 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3210 if (ret < 0) 3211 goto error; 3212 3213 /* 3214 * this shouldn't happen, it means the last relocate 3215 * failed 3216 */ 3217 if (ret == 0) 3218 BUG(); /* FIXME break ? */ 3219 3220 ret = btrfs_previous_item(chunk_root, path, 0, 3221 BTRFS_CHUNK_ITEM_KEY); 3222 if (ret) { 3223 ret = 0; 3224 break; 3225 } 3226 3227 leaf = path->nodes[0]; 3228 slot = path->slots[0]; 3229 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3230 3231 if (found_key.objectid != key.objectid) 3232 break; 3233 3234 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3235 3236 if (!counting) { 3237 spin_lock(&fs_info->balance_lock); 3238 bctl->stat.considered++; 3239 spin_unlock(&fs_info->balance_lock); 3240 } 3241 3242 ret = should_balance_chunk(chunk_root, leaf, chunk, 3243 found_key.offset); 3244 btrfs_release_path(path); 3245 if (!ret) 3246 goto loop; 3247 3248 if (counting) { 3249 spin_lock(&fs_info->balance_lock); 3250 bctl->stat.expected++; 3251 spin_unlock(&fs_info->balance_lock); 3252 goto loop; 3253 } 3254 3255 ret = btrfs_relocate_chunk(chunk_root, 3256 chunk_root->root_key.objectid, 3257 found_key.objectid, 3258 found_key.offset); 3259 if (ret && ret != -ENOSPC) 3260 goto error; 3261 if (ret == -ENOSPC) { 3262 enospc_errors++; 3263 } else { 3264 spin_lock(&fs_info->balance_lock); 3265 bctl->stat.completed++; 3266 spin_unlock(&fs_info->balance_lock); 3267 } 3268 loop: 3269 if (found_key.offset == 0) 3270 break; 3271 key.offset = found_key.offset - 1; 3272 } 3273 3274 if (counting) { 3275 btrfs_release_path(path); 3276 counting = false; 3277 goto again; 3278 } 3279 error: 3280 btrfs_free_path(path); 3281 if (enospc_errors) { 3282 btrfs_info(fs_info, "%d enospc errors during balance", 3283 enospc_errors); 3284 if (!ret) 3285 ret = -ENOSPC; 3286 } 3287 3288 return ret; 3289 } 3290 3291 /** 3292 * alloc_profile_is_valid - see if a given profile is valid and reduced 3293 * @flags: profile to validate 3294 * @extended: if true @flags is treated as an extended profile 3295 */ 3296 static int alloc_profile_is_valid(u64 flags, int extended) 3297 { 3298 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3299 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3300 3301 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3302 3303 /* 1) check that all other bits are zeroed */ 3304 if (flags & ~mask) 3305 return 0; 3306 3307 /* 2) see if profile is reduced */ 3308 if (flags == 0) 3309 return !extended; /* "0" is valid for usual profiles */ 3310 3311 /* true if exactly one bit set */ 3312 return (flags & (flags - 1)) == 0; 3313 } 3314 3315 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3316 { 3317 /* cancel requested || normal exit path */ 3318 return atomic_read(&fs_info->balance_cancel_req) || 3319 (atomic_read(&fs_info->balance_pause_req) == 0 && 3320 atomic_read(&fs_info->balance_cancel_req) == 0); 3321 } 3322 3323 static void __cancel_balance(struct btrfs_fs_info *fs_info) 3324 { 3325 int ret; 3326 3327 unset_balance_control(fs_info); 3328 ret = del_balance_item(fs_info->tree_root); 3329 if (ret) 3330 btrfs_std_error(fs_info, ret); 3331 3332 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3333 } 3334 3335 /* 3336 * Should be called with both balance and volume mutexes held 3337 */ 3338 int btrfs_balance(struct btrfs_balance_control *bctl, 3339 struct btrfs_ioctl_balance_args *bargs) 3340 { 3341 struct btrfs_fs_info *fs_info = bctl->fs_info; 3342 u64 allowed; 3343 int mixed = 0; 3344 int ret; 3345 u64 num_devices; 3346 unsigned seq; 3347 3348 if (btrfs_fs_closing(fs_info) || 3349 atomic_read(&fs_info->balance_pause_req) || 3350 atomic_read(&fs_info->balance_cancel_req)) { 3351 ret = -EINVAL; 3352 goto out; 3353 } 3354 3355 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 3356 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 3357 mixed = 1; 3358 3359 /* 3360 * In case of mixed groups both data and meta should be picked, 3361 * and identical options should be given for both of them. 
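/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * the "reduced" test in alloc_profile_is_valid() above uses the classic
 * trick that x & (x - 1) clears the lowest set bit, so the expression is
 * zero exactly when at most one bit is set. A standalone demonstration.
 */
#include <assert.h>

/* Nonzero when at most one bit of profile_bits is set (0 or a power of two). */
static int demo_is_reduced(unsigned long long profile_bits)
{
	return (profile_bits & (profile_bits - 1)) == 0;
}

static void demo_is_reduced_examples(void)
{
	assert(demo_is_reduced(0x8));		/* single profile bit */
	assert(!demo_is_reduced(0x8 | 0x10));	/* two bits: not reduced */
}
/* End of illustrative aside; the original source continues below. */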
3362 */ 3363 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 3364 if (mixed && (bctl->flags & allowed)) { 3365 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3366 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3367 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3368 btrfs_err(fs_info, "with mixed groups data and " 3369 "metadata balance options must be the same"); 3370 ret = -EINVAL; 3371 goto out; 3372 } 3373 } 3374 3375 num_devices = fs_info->fs_devices->num_devices; 3376 btrfs_dev_replace_lock(&fs_info->dev_replace); 3377 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 3378 BUG_ON(num_devices < 1); 3379 num_devices--; 3380 } 3381 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3382 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3383 if (num_devices == 1) 3384 allowed |= BTRFS_BLOCK_GROUP_DUP; 3385 else if (num_devices > 1) 3386 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3387 if (num_devices > 2) 3388 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3389 if (num_devices > 3) 3390 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3391 BTRFS_BLOCK_GROUP_RAID6); 3392 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3393 (!alloc_profile_is_valid(bctl->data.target, 1) || 3394 (bctl->data.target & ~allowed))) { 3395 btrfs_err(fs_info, "unable to start balance with target " 3396 "data profile %llu", 3397 bctl->data.target); 3398 ret = -EINVAL; 3399 goto out; 3400 } 3401 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3402 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3403 (bctl->meta.target & ~allowed))) { 3404 btrfs_err(fs_info, 3405 "unable to start balance with target metadata profile %llu", 3406 bctl->meta.target); 3407 ret = -EINVAL; 3408 goto out; 3409 } 3410 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3411 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3412 (bctl->sys.target & ~allowed))) { 3413 btrfs_err(fs_info, 3414 "unable to start balance with target system profile %llu", 3415 bctl->sys.target); 3416 ret = -EINVAL; 3417 goto out; 3418 } 3419 3420 /* allow dup'ed data chunks only in mixed mode */ 3421 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3422 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3423 btrfs_err(fs_info, "dup for data is not allowed"); 3424 ret = -EINVAL; 3425 goto out; 3426 } 3427 3428 /* allow to reduce meta or sys integrity only if force set */ 3429 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3430 BTRFS_BLOCK_GROUP_RAID10 | 3431 BTRFS_BLOCK_GROUP_RAID5 | 3432 BTRFS_BLOCK_GROUP_RAID6; 3433 do { 3434 seq = read_seqbegin(&fs_info->profiles_lock); 3435 3436 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3437 (fs_info->avail_system_alloc_bits & allowed) && 3438 !(bctl->sys.target & allowed)) || 3439 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3440 (fs_info->avail_metadata_alloc_bits & allowed) && 3441 !(bctl->meta.target & allowed))) { 3442 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3443 btrfs_info(fs_info, "force reducing metadata integrity"); 3444 } else { 3445 btrfs_err(fs_info, "balance will reduce metadata " 3446 "integrity, use force if you want this"); 3447 ret = -EINVAL; 3448 goto out; 3449 } 3450 } 3451 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3452 3453 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3454 int num_tolerated_disk_barrier_failures; 3455 u64 target = bctl->sys.target; 3456 3457 num_tolerated_disk_barrier_failures = 3458 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3459 if (num_tolerated_disk_barrier_failures > 0 && 3460 (target & 
3461 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3462 BTRFS_AVAIL_ALLOC_BIT_SINGLE))) 3463 num_tolerated_disk_barrier_failures = 0; 3464 else if (num_tolerated_disk_barrier_failures > 1 && 3465 (target & 3466 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) 3467 num_tolerated_disk_barrier_failures = 1; 3468 3469 fs_info->num_tolerated_disk_barrier_failures = 3470 num_tolerated_disk_barrier_failures; 3471 } 3472 3473 ret = insert_balance_item(fs_info->tree_root, bctl); 3474 if (ret && ret != -EEXIST) 3475 goto out; 3476 3477 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 3478 BUG_ON(ret == -EEXIST); 3479 set_balance_control(bctl); 3480 } else { 3481 BUG_ON(ret != -EEXIST); 3482 spin_lock(&fs_info->balance_lock); 3483 update_balance_args(bctl); 3484 spin_unlock(&fs_info->balance_lock); 3485 } 3486 3487 atomic_inc(&fs_info->balance_running); 3488 mutex_unlock(&fs_info->balance_mutex); 3489 3490 ret = __btrfs_balance(fs_info); 3491 3492 mutex_lock(&fs_info->balance_mutex); 3493 atomic_dec(&fs_info->balance_running); 3494 3495 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3496 fs_info->num_tolerated_disk_barrier_failures = 3497 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3498 } 3499 3500 if (bargs) { 3501 memset(bargs, 0, sizeof(*bargs)); 3502 update_ioctl_balance_args(fs_info, 0, bargs); 3503 } 3504 3505 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3506 balance_need_close(fs_info)) { 3507 __cancel_balance(fs_info); 3508 } 3509 3510 wake_up(&fs_info->balance_wait_q); 3511 3512 return ret; 3513 out: 3514 if (bctl->flags & BTRFS_BALANCE_RESUME) 3515 __cancel_balance(fs_info); 3516 else { 3517 kfree(bctl); 3518 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3519 } 3520 return ret; 3521 } 3522 3523 static int balance_kthread(void *data) 3524 { 3525 struct btrfs_fs_info *fs_info = data; 3526 int ret = 0; 3527 3528 mutex_lock(&fs_info->volume_mutex); 3529 mutex_lock(&fs_info->balance_mutex); 3530 3531 if (fs_info->balance_ctl) { 3532 btrfs_info(fs_info, "continuing balance"); 3533 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3534 } 3535 3536 mutex_unlock(&fs_info->balance_mutex); 3537 mutex_unlock(&fs_info->volume_mutex); 3538 3539 return ret; 3540 } 3541 3542 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 3543 { 3544 struct task_struct *tsk; 3545 3546 spin_lock(&fs_info->balance_lock); 3547 if (!fs_info->balance_ctl) { 3548 spin_unlock(&fs_info->balance_lock); 3549 return 0; 3550 } 3551 spin_unlock(&fs_info->balance_lock); 3552 3553 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3554 btrfs_info(fs_info, "force skipping balance"); 3555 return 0; 3556 } 3557 3558 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3559 return PTR_ERR_OR_ZERO(tsk); 3560 } 3561 3562 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3563 { 3564 struct btrfs_balance_control *bctl; 3565 struct btrfs_balance_item *item; 3566 struct btrfs_disk_balance_args disk_bargs; 3567 struct btrfs_path *path; 3568 struct extent_buffer *leaf; 3569 struct btrfs_key key; 3570 int ret; 3571 3572 path = btrfs_alloc_path(); 3573 if (!path) 3574 return -ENOMEM; 3575 3576 key.objectid = BTRFS_BALANCE_OBJECTID; 3577 key.type = BTRFS_BALANCE_ITEM_KEY; 3578 key.offset = 0; 3579 3580 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 3581 if (ret < 0) 3582 goto out; 3583 if (ret > 0) { /* ret = -ENOENT; */ 3584 ret = 0; 3585 goto out; 3586 } 3587 3588 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 3589 if (!bctl) { 3590 ret = -ENOMEM; 3591 
goto out; 3592 } 3593 3594 leaf = path->nodes[0]; 3595 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3596 3597 bctl->fs_info = fs_info; 3598 bctl->flags = btrfs_balance_flags(leaf, item); 3599 bctl->flags |= BTRFS_BALANCE_RESUME; 3600 3601 btrfs_balance_data(leaf, item, &disk_bargs); 3602 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 3603 btrfs_balance_meta(leaf, item, &disk_bargs); 3604 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 3605 btrfs_balance_sys(leaf, item, &disk_bargs); 3606 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3607 3608 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); 3609 3610 mutex_lock(&fs_info->volume_mutex); 3611 mutex_lock(&fs_info->balance_mutex); 3612 3613 set_balance_control(bctl); 3614 3615 mutex_unlock(&fs_info->balance_mutex); 3616 mutex_unlock(&fs_info->volume_mutex); 3617 out: 3618 btrfs_free_path(path); 3619 return ret; 3620 } 3621 3622 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 3623 { 3624 int ret = 0; 3625 3626 mutex_lock(&fs_info->balance_mutex); 3627 if (!fs_info->balance_ctl) { 3628 mutex_unlock(&fs_info->balance_mutex); 3629 return -ENOTCONN; 3630 } 3631 3632 if (atomic_read(&fs_info->balance_running)) { 3633 atomic_inc(&fs_info->balance_pause_req); 3634 mutex_unlock(&fs_info->balance_mutex); 3635 3636 wait_event(fs_info->balance_wait_q, 3637 atomic_read(&fs_info->balance_running) == 0); 3638 3639 mutex_lock(&fs_info->balance_mutex); 3640 /* we are good with balance_ctl ripped off from under us */ 3641 BUG_ON(atomic_read(&fs_info->balance_running)); 3642 atomic_dec(&fs_info->balance_pause_req); 3643 } else { 3644 ret = -ENOTCONN; 3645 } 3646 3647 mutex_unlock(&fs_info->balance_mutex); 3648 return ret; 3649 } 3650 3651 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 3652 { 3653 if (fs_info->sb->s_flags & MS_RDONLY) 3654 return -EROFS; 3655 3656 mutex_lock(&fs_info->balance_mutex); 3657 if (!fs_info->balance_ctl) { 3658 mutex_unlock(&fs_info->balance_mutex); 3659 return -ENOTCONN; 3660 } 3661 3662 atomic_inc(&fs_info->balance_cancel_req); 3663 /* 3664 * if we are running just wait and return, balance item is 3665 * deleted in btrfs_balance in this case 3666 */ 3667 if (atomic_read(&fs_info->balance_running)) { 3668 mutex_unlock(&fs_info->balance_mutex); 3669 wait_event(fs_info->balance_wait_q, 3670 atomic_read(&fs_info->balance_running) == 0); 3671 mutex_lock(&fs_info->balance_mutex); 3672 } else { 3673 /* __cancel_balance needs volume_mutex */ 3674 mutex_unlock(&fs_info->balance_mutex); 3675 mutex_lock(&fs_info->volume_mutex); 3676 mutex_lock(&fs_info->balance_mutex); 3677 3678 if (fs_info->balance_ctl) 3679 __cancel_balance(fs_info); 3680 3681 mutex_unlock(&fs_info->volume_mutex); 3682 } 3683 3684 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3685 atomic_dec(&fs_info->balance_cancel_req); 3686 mutex_unlock(&fs_info->balance_mutex); 3687 return 0; 3688 } 3689 3690 static int btrfs_uuid_scan_kthread(void *data) 3691 { 3692 struct btrfs_fs_info *fs_info = data; 3693 struct btrfs_root *root = fs_info->tree_root; 3694 struct btrfs_key key; 3695 struct btrfs_key max_key; 3696 struct btrfs_path *path = NULL; 3697 int ret = 0; 3698 struct extent_buffer *eb; 3699 int slot; 3700 struct btrfs_root_item root_item; 3701 u32 item_size; 3702 struct btrfs_trans_handle *trans = NULL; 3703 3704 path = btrfs_alloc_path(); 3705 if (!path) { 3706 ret = -ENOMEM; 3707 goto out; 3708 } 3709 3710 key.objectid = 0; 3711 key.type = 
BTRFS_ROOT_ITEM_KEY; 3712 key.offset = 0; 3713 3714 max_key.objectid = (u64)-1; 3715 max_key.type = BTRFS_ROOT_ITEM_KEY; 3716 max_key.offset = (u64)-1; 3717 3718 while (1) { 3719 ret = btrfs_search_forward(root, &key, path, 0); 3720 if (ret) { 3721 if (ret > 0) 3722 ret = 0; 3723 break; 3724 } 3725 3726 if (key.type != BTRFS_ROOT_ITEM_KEY || 3727 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 3728 key.objectid != BTRFS_FS_TREE_OBJECTID) || 3729 key.objectid > BTRFS_LAST_FREE_OBJECTID) 3730 goto skip; 3731 3732 eb = path->nodes[0]; 3733 slot = path->slots[0]; 3734 item_size = btrfs_item_size_nr(eb, slot); 3735 if (item_size < sizeof(root_item)) 3736 goto skip; 3737 3738 read_extent_buffer(eb, &root_item, 3739 btrfs_item_ptr_offset(eb, slot), 3740 (int)sizeof(root_item)); 3741 if (btrfs_root_refs(&root_item) == 0) 3742 goto skip; 3743 3744 if (!btrfs_is_empty_uuid(root_item.uuid) || 3745 !btrfs_is_empty_uuid(root_item.received_uuid)) { 3746 if (trans) 3747 goto update_tree; 3748 3749 btrfs_release_path(path); 3750 /* 3751 * 1 - subvol uuid item 3752 * 1 - received_subvol uuid item 3753 */ 3754 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 3755 if (IS_ERR(trans)) { 3756 ret = PTR_ERR(trans); 3757 break; 3758 } 3759 continue; 3760 } else { 3761 goto skip; 3762 } 3763 update_tree: 3764 if (!btrfs_is_empty_uuid(root_item.uuid)) { 3765 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3766 root_item.uuid, 3767 BTRFS_UUID_KEY_SUBVOL, 3768 key.objectid); 3769 if (ret < 0) { 3770 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3771 ret); 3772 break; 3773 } 3774 } 3775 3776 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 3777 ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, 3778 root_item.received_uuid, 3779 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3780 key.objectid); 3781 if (ret < 0) { 3782 btrfs_warn(fs_info, "uuid_tree_add failed %d", 3783 ret); 3784 break; 3785 } 3786 } 3787 3788 skip: 3789 if (trans) { 3790 ret = btrfs_end_transaction(trans, fs_info->uuid_root); 3791 trans = NULL; 3792 if (ret) 3793 break; 3794 } 3795 3796 btrfs_release_path(path); 3797 if (key.offset < (u64)-1) { 3798 key.offset++; 3799 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 3800 key.offset = 0; 3801 key.type = BTRFS_ROOT_ITEM_KEY; 3802 } else if (key.objectid < (u64)-1) { 3803 key.offset = 0; 3804 key.type = BTRFS_ROOT_ITEM_KEY; 3805 key.objectid++; 3806 } else { 3807 break; 3808 } 3809 cond_resched(); 3810 } 3811 3812 out: 3813 btrfs_free_path(path); 3814 if (trans && !IS_ERR(trans)) 3815 btrfs_end_transaction(trans, fs_info->uuid_root); 3816 if (ret) 3817 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 3818 else 3819 fs_info->update_uuid_tree_gen = 1; 3820 up(&fs_info->uuid_tree_rescan_sem); 3821 return 0; 3822 } 3823 3824 /* 3825 * Callback for btrfs_uuid_tree_iterate(). 3826 * returns: 3827 * 0 check succeeded, the entry is not outdated. 3828 * < 0 if an error occured. 3829 * > 0 if the check failed, which means the caller shall remove the entry. 
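/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * the scan loop above must hand btrfs_search_forward() a key strictly
 * greater than the one it just processed or it would revisit the same
 * item forever. The kernel loop shortcuts the type step straight to
 * BTRFS_ROOT_ITEM_KEY because that is the only type it cares about; the
 * fully general advance of an (objectid, type, offset) triple looks like
 * this standalone sketch. Returns 0 once the key space is exhausted.
 */
#include <stdint.h>

struct demo_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

static int demo_next_key(struct demo_key *key)
{
	if (key->offset < UINT64_MAX) {
		key->offset++;
	} else if (key->type < UINT8_MAX) {
		key->offset = 0;
		key->type++;
	} else if (key->objectid < UINT64_MAX) {
		key->offset = 0;
		key->type = 0;
		key->objectid++;
	} else {
		return 0;		/* already at the very last key */
	}
	return 1;
}
/* End of illustrative aside; the original source continues below. */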
3830 */ 3831 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, 3832 u8 *uuid, u8 type, u64 subid) 3833 { 3834 struct btrfs_key key; 3835 int ret = 0; 3836 struct btrfs_root *subvol_root; 3837 3838 if (type != BTRFS_UUID_KEY_SUBVOL && 3839 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 3840 goto out; 3841 3842 key.objectid = subid; 3843 key.type = BTRFS_ROOT_ITEM_KEY; 3844 key.offset = (u64)-1; 3845 subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); 3846 if (IS_ERR(subvol_root)) { 3847 ret = PTR_ERR(subvol_root); 3848 if (ret == -ENOENT) 3849 ret = 1; 3850 goto out; 3851 } 3852 3853 switch (type) { 3854 case BTRFS_UUID_KEY_SUBVOL: 3855 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) 3856 ret = 1; 3857 break; 3858 case BTRFS_UUID_KEY_RECEIVED_SUBVOL: 3859 if (memcmp(uuid, subvol_root->root_item.received_uuid, 3860 BTRFS_UUID_SIZE)) 3861 ret = 1; 3862 break; 3863 } 3864 3865 out: 3866 return ret; 3867 } 3868 3869 static int btrfs_uuid_rescan_kthread(void *data) 3870 { 3871 struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3872 int ret; 3873 3874 /* 3875 * 1st step is to iterate through the existing UUID tree and 3876 * to delete all entries that contain outdated data. 3877 * 2nd step is to add all missing entries to the UUID tree. 3878 */ 3879 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3880 if (ret < 0) { 3881 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); 3882 up(&fs_info->uuid_tree_rescan_sem); 3883 return ret; 3884 } 3885 return btrfs_uuid_scan_kthread(data); 3886 } 3887 3888 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 3889 { 3890 struct btrfs_trans_handle *trans; 3891 struct btrfs_root *tree_root = fs_info->tree_root; 3892 struct btrfs_root *uuid_root; 3893 struct task_struct *task; 3894 int ret; 3895 3896 /* 3897 * 1 - root node 3898 * 1 - root item 3899 */ 3900 trans = btrfs_start_transaction(tree_root, 2); 3901 if (IS_ERR(trans)) 3902 return PTR_ERR(trans); 3903 3904 uuid_root = btrfs_create_tree(trans, fs_info, 3905 BTRFS_UUID_TREE_OBJECTID); 3906 if (IS_ERR(uuid_root)) { 3907 btrfs_abort_transaction(trans, tree_root, 3908 PTR_ERR(uuid_root)); 3909 return PTR_ERR(uuid_root); 3910 } 3911 3912 fs_info->uuid_root = uuid_root; 3913 3914 ret = btrfs_commit_transaction(trans, tree_root); 3915 if (ret) 3916 return ret; 3917 3918 down(&fs_info->uuid_tree_rescan_sem); 3919 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3920 if (IS_ERR(task)) { 3921 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3922 btrfs_warn(fs_info, "failed to start uuid_scan task"); 3923 up(&fs_info->uuid_tree_rescan_sem); 3924 return PTR_ERR(task); 3925 } 3926 3927 return 0; 3928 } 3929 3930 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3931 { 3932 struct task_struct *task; 3933 3934 down(&fs_info->uuid_tree_rescan_sem); 3935 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3936 if (IS_ERR(task)) { 3937 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3938 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 3939 up(&fs_info->uuid_tree_rescan_sem); 3940 return PTR_ERR(task); 3941 } 3942 3943 return 0; 3944 } 3945 3946 /* 3947 * shrinking a device means finding all of the device extents past 3948 * the new size, and then following the back refs to the chunks. 
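/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * btrfs_check_uuid_tree_entry() above follows the callback convention
 * documented in its comment: 0 keeps the entry, > 0 tells the caller to
 * delete it, < 0 aborts. A standalone sketch of a driver loop with that
 * shape over a plain array; the demo_ names are stand-ins, not the real
 * btrfs_uuid_tree_iterate() interface.
 */
typedef int (*demo_check_fn)(unsigned long long entry);

/* Compacts the surviving entries in place; returns how many survived,
 * or -1 if the callback reported an error. */
static int demo_prune(unsigned long long *entries, int count,
		      demo_check_fn check)
{
	int i, kept = 0;

	for (i = 0; i < count; i++) {
		int ret = check(entries[i]);

		if (ret < 0)
			return -1;			/* abort the walk */
		if (ret == 0)
			entries[kept++] = entries[i];	/* still valid: keep */
		/* ret > 0: drop the entry */
	}
	return kept;
}
/* End of illustrative aside; the original source continues below. */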
3949 * The chunk relocation code actually frees the device extent 3950 */ 3951 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3952 { 3953 struct btrfs_trans_handle *trans; 3954 struct btrfs_root *root = device->dev_root; 3955 struct btrfs_dev_extent *dev_extent = NULL; 3956 struct btrfs_path *path; 3957 u64 length; 3958 u64 chunk_tree; 3959 u64 chunk_objectid; 3960 u64 chunk_offset; 3961 int ret; 3962 int slot; 3963 int failed = 0; 3964 bool retried = false; 3965 struct extent_buffer *l; 3966 struct btrfs_key key; 3967 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3968 u64 old_total = btrfs_super_total_bytes(super_copy); 3969 u64 old_size = btrfs_device_get_total_bytes(device); 3970 u64 diff = old_size - new_size; 3971 3972 if (device->is_tgtdev_for_dev_replace) 3973 return -EINVAL; 3974 3975 path = btrfs_alloc_path(); 3976 if (!path) 3977 return -ENOMEM; 3978 3979 path->reada = 2; 3980 3981 lock_chunks(root); 3982 3983 btrfs_device_set_total_bytes(device, new_size); 3984 if (device->writeable) { 3985 device->fs_devices->total_rw_bytes -= diff; 3986 spin_lock(&root->fs_info->free_chunk_lock); 3987 root->fs_info->free_chunk_space -= diff; 3988 spin_unlock(&root->fs_info->free_chunk_lock); 3989 } 3990 unlock_chunks(root); 3991 3992 again: 3993 key.objectid = device->devid; 3994 key.offset = (u64)-1; 3995 key.type = BTRFS_DEV_EXTENT_KEY; 3996 3997 do { 3998 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3999 if (ret < 0) 4000 goto done; 4001 4002 ret = btrfs_previous_item(root, path, 0, key.type); 4003 if (ret < 0) 4004 goto done; 4005 if (ret) { 4006 ret = 0; 4007 btrfs_release_path(path); 4008 break; 4009 } 4010 4011 l = path->nodes[0]; 4012 slot = path->slots[0]; 4013 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4014 4015 if (key.objectid != device->devid) { 4016 btrfs_release_path(path); 4017 break; 4018 } 4019 4020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4021 length = btrfs_dev_extent_length(l, dev_extent); 4022 4023 if (key.offset + length <= new_size) { 4024 btrfs_release_path(path); 4025 break; 4026 } 4027 4028 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 4029 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 4030 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4031 btrfs_release_path(path); 4032 4033 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 4034 chunk_offset); 4035 if (ret && ret != -ENOSPC) 4036 goto done; 4037 if (ret == -ENOSPC) 4038 failed++; 4039 } while (key.offset-- > 0); 4040 4041 if (failed && !retried) { 4042 failed = 0; 4043 retried = true; 4044 goto again; 4045 } else if (failed && retried) { 4046 ret = -ENOSPC; 4047 lock_chunks(root); 4048 4049 btrfs_device_set_total_bytes(device, old_size); 4050 if (device->writeable) 4051 device->fs_devices->total_rw_bytes += diff; 4052 spin_lock(&root->fs_info->free_chunk_lock); 4053 root->fs_info->free_chunk_space += diff; 4054 spin_unlock(&root->fs_info->free_chunk_lock); 4055 unlock_chunks(root); 4056 goto done; 4057 } 4058 4059 /* Shrinking succeeded, else we would be at "done". 
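/*
 * Illustrative aside (editor's sketch, not part of the original file):
 * the shrink loop above visits device extents from the end of the device
 * downwards and stops at the first one that fits entirely below the new
 * size; every extent seen before that point reaches into the area being
 * cut off, so its chunk has to be relocated (and after a second round of
 * ENOSPC the whole shrink is rolled back). The fit test, stated on its
 * own:
 */
#include <stdbool.h>
#include <stdint.h>

/* True when [start, start + len) sticks out past new_size and therefore
 * forces its chunk to be relocated before the device can shrink. */
static bool demo_extent_must_move(uint64_t start, uint64_t len,
				  uint64_t new_size)
{
	return start + len > new_size;
}
/* End of illustrative aside; the original source continues below. */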
*/ 4060 trans = btrfs_start_transaction(root, 0); 4061 if (IS_ERR(trans)) { 4062 ret = PTR_ERR(trans); 4063 goto done; 4064 } 4065 4066 lock_chunks(root); 4067 btrfs_device_set_disk_total_bytes(device, new_size); 4068 if (list_empty(&device->resized_list)) 4069 list_add_tail(&device->resized_list, 4070 &root->fs_info->fs_devices->resized_devices); 4071 4072 WARN_ON(diff > old_total); 4073 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4074 unlock_chunks(root); 4075 4076 /* Now btrfs_update_device() will change the on-disk size. */ 4077 ret = btrfs_update_device(trans, device); 4078 btrfs_end_transaction(trans, root); 4079 done: 4080 btrfs_free_path(path); 4081 return ret; 4082 } 4083 4084 static int btrfs_add_system_chunk(struct btrfs_root *root, 4085 struct btrfs_key *key, 4086 struct btrfs_chunk *chunk, int item_size) 4087 { 4088 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 4089 struct btrfs_disk_key disk_key; 4090 u32 array_size; 4091 u8 *ptr; 4092 4093 lock_chunks(root); 4094 array_size = btrfs_super_sys_array_size(super_copy); 4095 if (array_size + item_size + sizeof(disk_key) 4096 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4097 unlock_chunks(root); 4098 return -EFBIG; 4099 } 4100 4101 ptr = super_copy->sys_chunk_array + array_size; 4102 btrfs_cpu_key_to_disk(&disk_key, key); 4103 memcpy(ptr, &disk_key, sizeof(disk_key)); 4104 ptr += sizeof(disk_key); 4105 memcpy(ptr, chunk, item_size); 4106 item_size += sizeof(disk_key); 4107 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4108 unlock_chunks(root); 4109 4110 return 0; 4111 } 4112 4113 /* 4114 * sort the devices in descending order by max_avail, total_avail 4115 */ 4116 static int btrfs_cmp_device_info(const void *a, const void *b) 4117 { 4118 const struct btrfs_device_info *di_a = a; 4119 const struct btrfs_device_info *di_b = b; 4120 4121 if (di_a->max_avail > di_b->max_avail) 4122 return -1; 4123 if (di_a->max_avail < di_b->max_avail) 4124 return 1; 4125 if (di_a->total_avail > di_b->total_avail) 4126 return -1; 4127 if (di_a->total_avail < di_b->total_avail) 4128 return 1; 4129 return 0; 4130 } 4131 4132 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 4133 [BTRFS_RAID_RAID10] = { 4134 .sub_stripes = 2, 4135 .dev_stripes = 1, 4136 .devs_max = 0, /* 0 == as many as possible */ 4137 .devs_min = 4, 4138 .devs_increment = 2, 4139 .ncopies = 2, 4140 }, 4141 [BTRFS_RAID_RAID1] = { 4142 .sub_stripes = 1, 4143 .dev_stripes = 1, 4144 .devs_max = 2, 4145 .devs_min = 2, 4146 .devs_increment = 2, 4147 .ncopies = 2, 4148 }, 4149 [BTRFS_RAID_DUP] = { 4150 .sub_stripes = 1, 4151 .dev_stripes = 2, 4152 .devs_max = 1, 4153 .devs_min = 1, 4154 .devs_increment = 1, 4155 .ncopies = 2, 4156 }, 4157 [BTRFS_RAID_RAID0] = { 4158 .sub_stripes = 1, 4159 .dev_stripes = 1, 4160 .devs_max = 0, 4161 .devs_min = 2, 4162 .devs_increment = 1, 4163 .ncopies = 1, 4164 }, 4165 [BTRFS_RAID_SINGLE] = { 4166 .sub_stripes = 1, 4167 .dev_stripes = 1, 4168 .devs_max = 1, 4169 .devs_min = 1, 4170 .devs_increment = 1, 4171 .ncopies = 1, 4172 }, 4173 [BTRFS_RAID_RAID5] = { 4174 .sub_stripes = 1, 4175 .dev_stripes = 1, 4176 .devs_max = 0, 4177 .devs_min = 2, 4178 .devs_increment = 1, 4179 .ncopies = 2, 4180 }, 4181 [BTRFS_RAID_RAID6] = { 4182 .sub_stripes = 1, 4183 .dev_stripes = 1, 4184 .devs_max = 0, 4185 .devs_min = 3, 4186 .devs_increment = 1, 4187 .ncopies = 3, 4188 }, 4189 }; 4190 4191 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 4192 { 4193 /* TODO allow them to set a 
preferred stripe size */ 4194 return 64 * 1024; 4195 } 4196 4197 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4198 { 4199 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 4200 return; 4201 4202 btrfs_set_fs_incompat(info, RAID56); 4203 } 4204 4205 #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ 4206 - sizeof(struct btrfs_item) \ 4207 - sizeof(struct btrfs_chunk)) \ 4208 / sizeof(struct btrfs_stripe) + 1) 4209 4210 #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ 4211 - 2 * sizeof(struct btrfs_disk_key) \ 4212 - 2 * sizeof(struct btrfs_chunk)) \ 4213 / sizeof(struct btrfs_stripe) + 1) 4214 4215 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4216 struct btrfs_root *extent_root, u64 start, 4217 u64 type) 4218 { 4219 struct btrfs_fs_info *info = extent_root->fs_info; 4220 struct btrfs_fs_devices *fs_devices = info->fs_devices; 4221 struct list_head *cur; 4222 struct map_lookup *map = NULL; 4223 struct extent_map_tree *em_tree; 4224 struct extent_map *em; 4225 struct btrfs_device_info *devices_info = NULL; 4226 u64 total_avail; 4227 int num_stripes; /* total number of stripes to allocate */ 4228 int data_stripes; /* number of stripes that count for 4229 block group size */ 4230 int sub_stripes; /* sub_stripes info for map */ 4231 int dev_stripes; /* stripes per dev */ 4232 int devs_max; /* max devs to use */ 4233 int devs_min; /* min devs needed */ 4234 int devs_increment; /* ndevs has to be a multiple of this */ 4235 int ncopies; /* how many copies to data has */ 4236 int ret; 4237 u64 max_stripe_size; 4238 u64 max_chunk_size; 4239 u64 stripe_size; 4240 u64 num_bytes; 4241 u64 raid_stripe_len = BTRFS_STRIPE_LEN; 4242 int ndevs; 4243 int i; 4244 int j; 4245 int index; 4246 4247 BUG_ON(!alloc_profile_is_valid(type, 0)); 4248 4249 if (list_empty(&fs_devices->alloc_list)) 4250 return -ENOSPC; 4251 4252 index = __get_raid_index(type); 4253 4254 sub_stripes = btrfs_raid_array[index].sub_stripes; 4255 dev_stripes = btrfs_raid_array[index].dev_stripes; 4256 devs_max = btrfs_raid_array[index].devs_max; 4257 devs_min = btrfs_raid_array[index].devs_min; 4258 devs_increment = btrfs_raid_array[index].devs_increment; 4259 ncopies = btrfs_raid_array[index].ncopies; 4260 4261 if (type & BTRFS_BLOCK_GROUP_DATA) { 4262 max_stripe_size = 1024 * 1024 * 1024; 4263 max_chunk_size = 10 * max_stripe_size; 4264 if (!devs_max) 4265 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4266 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4267 /* for larger filesystems, use larger metadata chunks */ 4268 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 4269 max_stripe_size = 1024 * 1024 * 1024; 4270 else 4271 max_stripe_size = 256 * 1024 * 1024; 4272 max_chunk_size = max_stripe_size; 4273 if (!devs_max) 4274 devs_max = BTRFS_MAX_DEVS(info->chunk_root); 4275 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4276 max_stripe_size = 32 * 1024 * 1024; 4277 max_chunk_size = 2 * max_stripe_size; 4278 if (!devs_max) 4279 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; 4280 } else { 4281 btrfs_err(info, "invalid chunk type 0x%llx requested", 4282 type); 4283 BUG_ON(1); 4284 } 4285 4286 /* we don't want a chunk larger than 10% of writeable space */ 4287 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4288 max_chunk_size); 4289 4290 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 4291 GFP_NOFS); 4292 if (!devices_info) 4293 return -ENOMEM; 4294 4295 cur = fs_devices->alloc_list.next; 4296 4297 /* 4298 * in the 
first pass through the devices list, we gather information 4299 * about the available holes on each device. 4300 */ 4301 ndevs = 0; 4302 while (cur != &fs_devices->alloc_list) { 4303 struct btrfs_device *device; 4304 u64 max_avail; 4305 u64 dev_offset; 4306 4307 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 4308 4309 cur = cur->next; 4310 4311 if (!device->writeable) { 4312 WARN(1, KERN_ERR 4313 "BTRFS: read-only device in alloc_list\n"); 4314 continue; 4315 } 4316 4317 if (!device->in_fs_metadata || 4318 device->is_tgtdev_for_dev_replace) 4319 continue; 4320 4321 if (device->total_bytes > device->bytes_used) 4322 total_avail = device->total_bytes - device->bytes_used; 4323 else 4324 total_avail = 0; 4325 4326 /* If there is no space on this device, skip it. */ 4327 if (total_avail == 0) 4328 continue; 4329 4330 ret = find_free_dev_extent(trans, device, 4331 max_stripe_size * dev_stripes, 4332 &dev_offset, &max_avail); 4333 if (ret && ret != -ENOSPC) 4334 goto error; 4335 4336 if (ret == 0) 4337 max_avail = max_stripe_size * dev_stripes; 4338 4339 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 4340 continue; 4341 4342 if (ndevs == fs_devices->rw_devices) { 4343 WARN(1, "%s: found more than %llu devices\n", 4344 __func__, fs_devices->rw_devices); 4345 break; 4346 } 4347 devices_info[ndevs].dev_offset = dev_offset; 4348 devices_info[ndevs].max_avail = max_avail; 4349 devices_info[ndevs].total_avail = total_avail; 4350 devices_info[ndevs].dev = device; 4351 ++ndevs; 4352 } 4353 4354 /* 4355 * now sort the devices by hole size / available space 4356 */ 4357 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4358 btrfs_cmp_device_info, NULL); 4359 4360 /* round down to number of usable stripes */ 4361 ndevs -= ndevs % devs_increment; 4362 4363 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 4364 ret = -ENOSPC; 4365 goto error; 4366 } 4367 4368 if (devs_max && ndevs > devs_max) 4369 ndevs = devs_max; 4370 /* 4371 * the primary goal is to maximize the number of stripes, so use as many 4372 * devices as possible, even if the stripes are not maximum sized. 
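 *
 * A worked example, purely for illustration: RAID10 data over six equally
 * sized, large enough devices gives ndevs = 6; with dev_stripes = 1 and
 * ncopies = 2 that means num_stripes = 6 and data_stripes = 3, so a 1 GiB
 * stripe_size becomes a 3 GiB chunk of usable logical space backed by
 * 6 GiB of raw disk space.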
4373 */ 4374 stripe_size = devices_info[ndevs-1].max_avail; 4375 num_stripes = ndevs * dev_stripes; 4376 4377 /* 4378 * this will have to be fixed for RAID1 and RAID10 over 4379 * more drives 4380 */ 4381 data_stripes = num_stripes / ncopies; 4382 4383 if (type & BTRFS_BLOCK_GROUP_RAID5) { 4384 raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 4385 btrfs_super_stripesize(info->super_copy)); 4386 data_stripes = num_stripes - 1; 4387 } 4388 if (type & BTRFS_BLOCK_GROUP_RAID6) { 4389 raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 4390 btrfs_super_stripesize(info->super_copy)); 4391 data_stripes = num_stripes - 2; 4392 } 4393 4394 /* 4395 * Use the number of data stripes to figure out how big this chunk 4396 * is really going to be in terms of logical address space, 4397 * and compare that answer with the max chunk size 4398 */ 4399 if (stripe_size * data_stripes > max_chunk_size) { 4400 u64 mask = (1ULL << 24) - 1; 4401 stripe_size = max_chunk_size; 4402 do_div(stripe_size, data_stripes); 4403 4404 /* bump the answer up to a 16MB boundary */ 4405 stripe_size = (stripe_size + mask) & ~mask; 4406 4407 /* but don't go higher than the limits we found 4408 * while searching for free extents 4409 */ 4410 if (stripe_size > devices_info[ndevs-1].max_avail) 4411 stripe_size = devices_info[ndevs-1].max_avail; 4412 } 4413 4414 do_div(stripe_size, dev_stripes); 4415 4416 /* align to BTRFS_STRIPE_LEN */ 4417 do_div(stripe_size, raid_stripe_len); 4418 stripe_size *= raid_stripe_len; 4419 4420 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4421 if (!map) { 4422 ret = -ENOMEM; 4423 goto error; 4424 } 4425 map->num_stripes = num_stripes; 4426 4427 for (i = 0; i < ndevs; ++i) { 4428 for (j = 0; j < dev_stripes; ++j) { 4429 int s = i * dev_stripes + j; 4430 map->stripes[s].dev = devices_info[i].dev; 4431 map->stripes[s].physical = devices_info[i].dev_offset + 4432 j * stripe_size; 4433 } 4434 } 4435 map->sector_size = extent_root->sectorsize; 4436 map->stripe_len = raid_stripe_len; 4437 map->io_align = raid_stripe_len; 4438 map->io_width = raid_stripe_len; 4439 map->type = type; 4440 map->sub_stripes = sub_stripes; 4441 4442 num_bytes = stripe_size * data_stripes; 4443 4444 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 4445 4446 em = alloc_extent_map(); 4447 if (!em) { 4448 kfree(map); 4449 ret = -ENOMEM; 4450 goto error; 4451 } 4452 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 4453 em->bdev = (struct block_device *)map; 4454 em->start = start; 4455 em->len = num_bytes; 4456 em->block_start = 0; 4457 em->block_len = em->len; 4458 em->orig_block_len = stripe_size; 4459 4460 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4461 write_lock(&em_tree->lock); 4462 ret = add_extent_mapping(em_tree, em, 0); 4463 if (!ret) { 4464 list_add_tail(&em->list, &trans->transaction->pending_chunks); 4465 atomic_inc(&em->refs); 4466 } 4467 write_unlock(&em_tree->lock); 4468 if (ret) { 4469 free_extent_map(em); 4470 goto error; 4471 } 4472 4473 ret = btrfs_make_block_group(trans, extent_root, 0, type, 4474 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4475 start, num_bytes); 4476 if (ret) 4477 goto error_del_extent; 4478 4479 for (i = 0; i < map->num_stripes; i++) { 4480 num_bytes = map->stripes[i].dev->bytes_used + stripe_size; 4481 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4482 } 4483 4484 spin_lock(&extent_root->fs_info->free_chunk_lock); 4485 extent_root->fs_info->free_chunk_space -= (stripe_size * 4486 map->num_stripes); 4487 spin_unlock(&extent_root->fs_info->free_chunk_lock); 
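	/*
	 * Drop the reference taken by alloc_extent_map() above; the mapping
	 * tree and the transaction's pending_chunks list each still hold
	 * their own reference to this em (see the error path below).
	 */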
4488 4489 free_extent_map(em); 4490 check_raid56_incompat_flag(extent_root->fs_info, type); 4491 4492 kfree(devices_info); 4493 return 0; 4494 4495 error_del_extent: 4496 write_lock(&em_tree->lock); 4497 remove_extent_mapping(em_tree, em); 4498 write_unlock(&em_tree->lock); 4499 4500 /* One for our allocation */ 4501 free_extent_map(em); 4502 /* One for the tree reference */ 4503 free_extent_map(em); 4504 /* One for the pending_chunks list reference */ 4505 free_extent_map(em); 4506 error: 4507 kfree(devices_info); 4508 return ret; 4509 } 4510 4511 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 4512 struct btrfs_root *extent_root, 4513 u64 chunk_offset, u64 chunk_size) 4514 { 4515 struct btrfs_key key; 4516 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 4517 struct btrfs_device *device; 4518 struct btrfs_chunk *chunk; 4519 struct btrfs_stripe *stripe; 4520 struct extent_map_tree *em_tree; 4521 struct extent_map *em; 4522 struct map_lookup *map; 4523 size_t item_size; 4524 u64 dev_offset; 4525 u64 stripe_size; 4526 int i = 0; 4527 int ret; 4528 4529 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 4530 read_lock(&em_tree->lock); 4531 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); 4532 read_unlock(&em_tree->lock); 4533 4534 if (!em) { 4535 btrfs_crit(extent_root->fs_info, "unable to find logical " 4536 "%Lu len %Lu", chunk_offset, chunk_size); 4537 return -EINVAL; 4538 } 4539 4540 if (em->start != chunk_offset || em->len != chunk_size) { 4541 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" 4542 " %Lu-%Lu, found %Lu-%Lu", chunk_offset, 4543 chunk_size, em->start, em->len); 4544 free_extent_map(em); 4545 return -EINVAL; 4546 } 4547 4548 map = (struct map_lookup *)em->bdev; 4549 item_size = btrfs_chunk_item_size(map->num_stripes); 4550 stripe_size = em->orig_block_len; 4551 4552 chunk = kzalloc(item_size, GFP_NOFS); 4553 if (!chunk) { 4554 ret = -ENOMEM; 4555 goto out; 4556 } 4557 4558 for (i = 0; i < map->num_stripes; i++) { 4559 device = map->stripes[i].dev; 4560 dev_offset = map->stripes[i].physical; 4561 4562 ret = btrfs_update_device(trans, device); 4563 if (ret) 4564 goto out; 4565 ret = btrfs_alloc_dev_extent(trans, device, 4566 chunk_root->root_key.objectid, 4567 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 4568 chunk_offset, dev_offset, 4569 stripe_size); 4570 if (ret) 4571 goto out; 4572 } 4573 4574 stripe = &chunk->stripe; 4575 for (i = 0; i < map->num_stripes; i++) { 4576 device = map->stripes[i].dev; 4577 dev_offset = map->stripes[i].physical; 4578 4579 btrfs_set_stack_stripe_devid(stripe, device->devid); 4580 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4581 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4582 stripe++; 4583 } 4584 4585 btrfs_set_stack_chunk_length(chunk, chunk_size); 4586 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 4587 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 4588 btrfs_set_stack_chunk_type(chunk, map->type); 4589 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 4590 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 4591 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 4592 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 4593 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 4594 4595 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4596 key.type = BTRFS_CHUNK_ITEM_KEY; 4597 key.offset = chunk_offset; 4598 4599 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4600 if (ret == 0 
&& map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4601 /* 4602 * TODO: Cleanup of inserted chunk root in case of 4603 * failure. 4604 */ 4605 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 4606 item_size); 4607 } 4608 4609 out: 4610 kfree(chunk); 4611 free_extent_map(em); 4612 return ret; 4613 } 4614 4615 /* 4616 * Chunk allocation falls into two parts. The first part does works 4617 * that make the new allocated chunk useable, but not do any operation 4618 * that modifies the chunk tree. The second part does the works that 4619 * require modifying the chunk tree. This division is important for the 4620 * bootstrap process of adding storage to a seed btrfs. 4621 */ 4622 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 4623 struct btrfs_root *extent_root, u64 type) 4624 { 4625 u64 chunk_offset; 4626 4627 chunk_offset = find_next_chunk(extent_root->fs_info); 4628 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); 4629 } 4630 4631 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4632 struct btrfs_root *root, 4633 struct btrfs_device *device) 4634 { 4635 u64 chunk_offset; 4636 u64 sys_chunk_offset; 4637 u64 alloc_profile; 4638 struct btrfs_fs_info *fs_info = root->fs_info; 4639 struct btrfs_root *extent_root = fs_info->extent_root; 4640 int ret; 4641 4642 chunk_offset = find_next_chunk(fs_info); 4643 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4644 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, 4645 alloc_profile); 4646 if (ret) 4647 return ret; 4648 4649 sys_chunk_offset = find_next_chunk(root->fs_info); 4650 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4651 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4652 alloc_profile); 4653 return ret; 4654 } 4655 4656 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 4657 { 4658 int max_errors; 4659 4660 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4661 BTRFS_BLOCK_GROUP_RAID10 | 4662 BTRFS_BLOCK_GROUP_RAID5 | 4663 BTRFS_BLOCK_GROUP_DUP)) { 4664 max_errors = 1; 4665 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 4666 max_errors = 2; 4667 } else { 4668 max_errors = 0; 4669 } 4670 4671 return max_errors; 4672 } 4673 4674 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4675 { 4676 struct extent_map *em; 4677 struct map_lookup *map; 4678 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4679 int readonly = 0; 4680 int miss_ndevs = 0; 4681 int i; 4682 4683 read_lock(&map_tree->map_tree.lock); 4684 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 4685 read_unlock(&map_tree->map_tree.lock); 4686 if (!em) 4687 return 1; 4688 4689 map = (struct map_lookup *)em->bdev; 4690 for (i = 0; i < map->num_stripes; i++) { 4691 if (map->stripes[i].dev->missing) { 4692 miss_ndevs++; 4693 continue; 4694 } 4695 4696 if (!map->stripes[i].dev->writeable) { 4697 readonly = 1; 4698 goto end; 4699 } 4700 } 4701 4702 /* 4703 * If the number of missing devices is larger than max errors, 4704 * we can not write the data into that chunk successfully, so 4705 * set it readonly. 
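 * (btrfs_chunk_max_errors() above tolerates one missing device for the
 * RAID1, RAID10, RAID5 and DUP profiles, two for RAID6 and none for the
 * remaining profiles.)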
4706 */ 4707 if (miss_ndevs > btrfs_chunk_max_errors(map)) 4708 readonly = 1; 4709 end: 4710 free_extent_map(em); 4711 return readonly; 4712 } 4713 4714 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 4715 { 4716 extent_map_tree_init(&tree->map_tree); 4717 } 4718 4719 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 4720 { 4721 struct extent_map *em; 4722 4723 while (1) { 4724 write_lock(&tree->map_tree.lock); 4725 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 4726 if (em) 4727 remove_extent_mapping(&tree->map_tree, em); 4728 write_unlock(&tree->map_tree.lock); 4729 if (!em) 4730 break; 4731 /* once for us */ 4732 free_extent_map(em); 4733 /* once for the tree */ 4734 free_extent_map(em); 4735 } 4736 } 4737 4738 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 4739 { 4740 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4741 struct extent_map *em; 4742 struct map_lookup *map; 4743 struct extent_map_tree *em_tree = &map_tree->map_tree; 4744 int ret; 4745 4746 read_lock(&em_tree->lock); 4747 em = lookup_extent_mapping(em_tree, logical, len); 4748 read_unlock(&em_tree->lock); 4749 4750 /* 4751 * We could return errors for these cases, but that could get ugly and 4752 * we'd probably do the same thing which is just not do anything else 4753 * and exit, so return 1 so the callers don't try to use other copies. 4754 */ 4755 if (!em) { 4756 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, 4757 logical+len); 4758 return 1; 4759 } 4760 4761 if (em->start > logical || em->start + em->len < logical) { 4762 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " 4763 "%Lu-%Lu", logical, logical+len, em->start, 4764 em->start + em->len); 4765 free_extent_map(em); 4766 return 1; 4767 } 4768 4769 map = (struct map_lookup *)em->bdev; 4770 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 4771 ret = map->num_stripes; 4772 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4773 ret = map->sub_stripes; 4774 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4775 ret = 2; 4776 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4777 ret = 3; 4778 else 4779 ret = 1; 4780 free_extent_map(em); 4781 4782 btrfs_dev_replace_lock(&fs_info->dev_replace); 4783 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) 4784 ret++; 4785 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4786 4787 return ret; 4788 } 4789 4790 unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4791 struct btrfs_mapping_tree *map_tree, 4792 u64 logical) 4793 { 4794 struct extent_map *em; 4795 struct map_lookup *map; 4796 struct extent_map_tree *em_tree = &map_tree->map_tree; 4797 unsigned long len = root->sectorsize; 4798 4799 read_lock(&em_tree->lock); 4800 em = lookup_extent_mapping(em_tree, logical, len); 4801 read_unlock(&em_tree->lock); 4802 BUG_ON(!em); 4803 4804 BUG_ON(em->start > logical || em->start + em->len < logical); 4805 map = (struct map_lookup *)em->bdev; 4806 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4807 BTRFS_BLOCK_GROUP_RAID6)) { 4808 len = map->stripe_len * nr_data_stripes(map); 4809 } 4810 free_extent_map(em); 4811 return len; 4812 } 4813 4814 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4815 u64 logical, u64 len, int mirror_num) 4816 { 4817 struct extent_map *em; 4818 struct map_lookup *map; 4819 struct extent_map_tree *em_tree = &map_tree->map_tree; 4820 int ret = 0; 4821 4822 read_lock(&em_tree->lock); 4823 em = lookup_extent_mapping(em_tree, logical, len); 4824 read_unlock(&em_tree->lock); 4825 BUG_ON(!em); 
4826 4827 BUG_ON(em->start > logical || em->start + em->len < logical); 4828 map = (struct map_lookup *)em->bdev; 4829 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4830 BTRFS_BLOCK_GROUP_RAID6)) 4831 ret = 1; 4832 free_extent_map(em); 4833 return ret; 4834 } 4835 4836 static int find_live_mirror(struct btrfs_fs_info *fs_info, 4837 struct map_lookup *map, int first, int num, 4838 int optimal, int dev_replace_is_ongoing) 4839 { 4840 int i; 4841 int tolerance; 4842 struct btrfs_device *srcdev; 4843 4844 if (dev_replace_is_ongoing && 4845 fs_info->dev_replace.cont_reading_from_srcdev_mode == 4846 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 4847 srcdev = fs_info->dev_replace.srcdev; 4848 else 4849 srcdev = NULL; 4850 4851 /* 4852 * try to avoid the drive that is the source drive for a 4853 * dev-replace procedure, only choose it if no other non-missing 4854 * mirror is available 4855 */ 4856 for (tolerance = 0; tolerance < 2; tolerance++) { 4857 if (map->stripes[optimal].dev->bdev && 4858 (tolerance || map->stripes[optimal].dev != srcdev)) 4859 return optimal; 4860 for (i = first; i < first + num; i++) { 4861 if (map->stripes[i].dev->bdev && 4862 (tolerance || map->stripes[i].dev != srcdev)) 4863 return i; 4864 } 4865 } 4866 4867 /* we couldn't find one that doesn't fail. Just return something 4868 * and the io error handling code will clean up eventually 4869 */ 4870 return optimal; 4871 } 4872 4873 static inline int parity_smaller(u64 a, u64 b) 4874 { 4875 return a > b; 4876 } 4877 4878 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4879 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4880 { 4881 struct btrfs_bio_stripe s; 4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; 4883 int i; 4884 u64 l; 4885 int again = 1; 4886 int m; 4887 4888 while (again) { 4889 again = 0; 4890 for (i = 0; i < real_stripes - 1; i++) { 4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4892 s = bbio->stripes[i]; 4893 l = raid_map[i]; 4894 bbio->stripes[i] = bbio->stripes[i+1]; 4895 raid_map[i] = raid_map[i+1]; 4896 bbio->stripes[i+1] = s; 4897 raid_map[i+1] = l; 4898 4899 if (bbio->tgtdev_map) { 4900 m = bbio->tgtdev_map[i]; 4901 bbio->tgtdev_map[i] = 4902 bbio->tgtdev_map[i + 1]; 4903 bbio->tgtdev_map[i + 1] = m; 4904 } 4905 4906 again = 1; 4907 } 4908 } 4909 } 4910 } 4911 4912 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4913 u64 logical, u64 *length, 4914 struct btrfs_bio **bbio_ret, 4915 int mirror_num, u64 **raid_map_ret) 4916 { 4917 struct extent_map *em; 4918 struct map_lookup *map; 4919 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 4920 struct extent_map_tree *em_tree = &map_tree->map_tree; 4921 u64 offset; 4922 u64 stripe_offset; 4923 u64 stripe_end_offset; 4924 u64 stripe_nr; 4925 u64 stripe_nr_orig; 4926 u64 stripe_nr_end; 4927 u64 stripe_len; 4928 u64 *raid_map = NULL; 4929 int stripe_index; 4930 int i; 4931 int ret = 0; 4932 int num_stripes; 4933 int max_errors = 0; 4934 int tgtdev_indexes = 0; 4935 struct btrfs_bio *bbio = NULL; 4936 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4937 int dev_replace_is_ongoing = 0; 4938 int num_alloc_stripes; 4939 int patch_the_first_stripe_for_dev_replace = 0; 4940 u64 physical_to_patch_in_first_stripe = 0; 4941 u64 raid56_full_stripe_start = (u64)-1; 4942 4943 read_lock(&em_tree->lock); 4944 em = lookup_extent_mapping(em_tree, logical, *length); 4945 read_unlock(&em_tree->lock); 4946 4947 if (!em) { 4948 btrfs_crit(fs_info, "unable 
to find logical %llu len %llu", 4949 logical, *length); 4950 return -EINVAL; 4951 } 4952 4953 if (em->start > logical || em->start + em->len < logical) { 4954 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " 4955 "found %Lu-%Lu", logical, em->start, 4956 em->start + em->len); 4957 free_extent_map(em); 4958 return -EINVAL; 4959 } 4960 4961 map = (struct map_lookup *)em->bdev; 4962 offset = logical - em->start; 4963 4964 stripe_len = map->stripe_len; 4965 stripe_nr = offset; 4966 /* 4967 * stripe_nr counts the total number of stripes we have to stride 4968 * to get to this block 4969 */ 4970 do_div(stripe_nr, stripe_len); 4971 4972 stripe_offset = stripe_nr * stripe_len; 4973 BUG_ON(offset < stripe_offset); 4974 4975 /* stripe_offset is the offset of this block in its stripe*/ 4976 stripe_offset = offset - stripe_offset; 4977 4978 /* if we're here for raid56, we need to know the stripe aligned start */ 4979 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4980 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4981 raid56_full_stripe_start = offset; 4982 4983 /* allow a write of a full stripe, but make sure we don't 4984 * allow straddling of stripes 4985 */ 4986 do_div(raid56_full_stripe_start, full_stripe_len); 4987 raid56_full_stripe_start *= full_stripe_len; 4988 } 4989 4990 if (rw & REQ_DISCARD) { 4991 /* we don't discard raid56 yet */ 4992 if (map->type & 4993 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4994 ret = -EOPNOTSUPP; 4995 goto out; 4996 } 4997 *length = min_t(u64, em->len - offset, *length); 4998 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4999 u64 max_len; 5000 /* For writes to RAID[56], allow a full stripeset across all disks. 5001 For other RAID types and for RAID[56] reads, just allow a single 5002 stripe (on a single disk). */ 5003 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 5004 (rw & REQ_WRITE)) { 5005 max_len = stripe_len * nr_data_stripes(map) - 5006 (offset - raid56_full_stripe_start); 5007 } else { 5008 /* we limit the length of each bio to what fits in a stripe */ 5009 max_len = stripe_len - stripe_offset; 5010 } 5011 *length = min_t(u64, em->len - offset, max_len); 5012 } else { 5013 *length = em->len - offset; 5014 } 5015 5016 /* This is for when we're called from btrfs_merge_bio_hook() and all 5017 it cares about is the length */ 5018 if (!bbio_ret) 5019 goto out; 5020 5021 btrfs_dev_replace_lock(dev_replace); 5022 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 5023 if (!dev_replace_is_ongoing) 5024 btrfs_dev_replace_unlock(dev_replace); 5025 5026 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 5027 !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && 5028 dev_replace->tgtdev != NULL) { 5029 /* 5030 * in dev-replace case, for repair case (that's the only 5031 * case where the mirror is selected explicitly when 5032 * calling btrfs_map_block), blocks left of the left cursor 5033 * can also be read from the target drive. 5034 * For REQ_GET_READ_MIRRORS, the target drive is added as 5035 * the last one to the array of stripes. For READ, it also 5036 * needs to be supported using the same mirror number. 5037 * If the requested block is not left of the left cursor, 5038 * EIO is returned. This can happen because btrfs_num_copies() 5039 * returns one more in the dev-replace case. 
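 * To find that mirror, the block is mapped once more with
 * REQ_GET_READ_MIRRORS, the stripe that lives on the source drive is
 * looked up, and the first stripe of the result is later patched to
 * point at the target drive instead.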
5040 */ 5041 u64 tmp_length = *length; 5042 struct btrfs_bio *tmp_bbio = NULL; 5043 int tmp_num_stripes; 5044 u64 srcdev_devid = dev_replace->srcdev->devid; 5045 int index_srcdev = 0; 5046 int found = 0; 5047 u64 physical_of_found = 0; 5048 5049 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 5050 logical, &tmp_length, &tmp_bbio, 0, NULL); 5051 if (ret) { 5052 WARN_ON(tmp_bbio != NULL); 5053 goto out; 5054 } 5055 5056 tmp_num_stripes = tmp_bbio->num_stripes; 5057 if (mirror_num > tmp_num_stripes) { 5058 /* 5059 * REQ_GET_READ_MIRRORS does not contain this 5060 * mirror, that means that the requested area 5061 * is not left of the left cursor 5062 */ 5063 ret = -EIO; 5064 kfree(tmp_bbio); 5065 goto out; 5066 } 5067 5068 /* 5069 * process the rest of the function using the mirror_num 5070 * of the source drive. Therefore look it up first. 5071 * At the end, patch the device pointer to the one of the 5072 * target drive. 5073 */ 5074 for (i = 0; i < tmp_num_stripes; i++) { 5075 if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { 5076 /* 5077 * In case of DUP, in order to keep it 5078 * simple, only add the mirror with the 5079 * lowest physical address 5080 */ 5081 if (found && 5082 physical_of_found <= 5083 tmp_bbio->stripes[i].physical) 5084 continue; 5085 index_srcdev = i; 5086 found = 1; 5087 physical_of_found = 5088 tmp_bbio->stripes[i].physical; 5089 } 5090 } 5091 5092 if (found) { 5093 mirror_num = index_srcdev + 1; 5094 patch_the_first_stripe_for_dev_replace = 1; 5095 physical_to_patch_in_first_stripe = physical_of_found; 5096 } else { 5097 WARN_ON(1); 5098 ret = -EIO; 5099 kfree(tmp_bbio); 5100 goto out; 5101 } 5102 5103 kfree(tmp_bbio); 5104 } else if (mirror_num > map->num_stripes) { 5105 mirror_num = 0; 5106 } 5107 5108 num_stripes = 1; 5109 stripe_index = 0; 5110 stripe_nr_orig = stripe_nr; 5111 stripe_nr_end = ALIGN(offset + *length, map->stripe_len); 5112 do_div(stripe_nr_end, map->stripe_len); 5113 stripe_end_offset = stripe_nr_end * map->stripe_len - 5114 (offset + *length); 5115 5116 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5117 if (rw & REQ_DISCARD) 5118 num_stripes = min_t(u64, map->num_stripes, 5119 stripe_nr_end - stripe_nr_orig); 5120 stripe_index = do_div(stripe_nr, map->num_stripes); 5121 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) 5122 mirror_num = 1; 5123 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5124 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5125 num_stripes = map->num_stripes; 5126 else if (mirror_num) 5127 stripe_index = mirror_num - 1; 5128 else { 5129 stripe_index = find_live_mirror(fs_info, map, 0, 5130 map->num_stripes, 5131 current->pid % map->num_stripes, 5132 dev_replace_is_ongoing); 5133 mirror_num = stripe_index + 1; 5134 } 5135 5136 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 5137 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { 5138 num_stripes = map->num_stripes; 5139 } else if (mirror_num) { 5140 stripe_index = mirror_num - 1; 5141 } else { 5142 mirror_num = 1; 5143 } 5144 5145 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5146 int factor = map->num_stripes / map->sub_stripes; 5147 5148 stripe_index = do_div(stripe_nr, factor); 5149 stripe_index *= map->sub_stripes; 5150 5151 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5152 num_stripes = map->sub_stripes; 5153 else if (rw & REQ_DISCARD) 5154 num_stripes = min_t(u64, map->sub_stripes * 5155 (stripe_nr_end - stripe_nr_orig), 5156 map->num_stripes); 5157 else if (mirror_num) 5158 stripe_index += mirror_num - 1; 5159 else { 5160 
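			/*
			 * No specific mirror was requested: pick a live copy
			 * within this sub-stripe group, using current->pid to
			 * spread readers across the copies.
			 */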
int old_stripe_index = stripe_index; 5161 stripe_index = find_live_mirror(fs_info, map, 5162 stripe_index, 5163 map->sub_stripes, stripe_index + 5164 current->pid % map->sub_stripes, 5165 dev_replace_is_ongoing); 5166 mirror_num = stripe_index - old_stripe_index + 1; 5167 } 5168 5169 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5170 BTRFS_BLOCK_GROUP_RAID6)) { 5171 u64 tmp; 5172 5173 if (raid_map_ret && 5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || 5175 mirror_num > 1)) { 5176 int i, rot; 5177 5178 /* push stripe_nr back to the start of the full stripe */ 5179 stripe_nr = raid56_full_stripe_start; 5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map)); 5181 5182 /* RAID[56] write or recovery. Return all stripes */ 5183 num_stripes = map->num_stripes; 5184 max_errors = nr_parity_stripes(map); 5185 5186 raid_map = kmalloc_array(num_stripes, sizeof(u64), 5187 GFP_NOFS); 5188 if (!raid_map) { 5189 ret = -ENOMEM; 5190 goto out; 5191 } 5192 5193 /* Work out the disk rotation on this stripe-set */ 5194 tmp = stripe_nr; 5195 rot = do_div(tmp, num_stripes); 5196 5197 /* Fill in the logical address of each stripe */ 5198 tmp = stripe_nr * nr_data_stripes(map); 5199 for (i = 0; i < nr_data_stripes(map); i++) 5200 raid_map[(i+rot) % num_stripes] = 5201 em->start + (tmp + i) * map->stripe_len; 5202 5203 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 5204 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5205 raid_map[(i+rot+1) % num_stripes] = 5206 RAID6_Q_STRIPE; 5207 5208 *length = map->stripe_len; 5209 stripe_index = 0; 5210 stripe_offset = 0; 5211 } else { 5212 /* 5213 * Mirror #0 or #1 means the original data block. 5214 * Mirror #2 is RAID5 parity block. 5215 * Mirror #3 is RAID6 Q block. 5216 */ 5217 stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 5218 if (mirror_num > 1) 5219 stripe_index = nr_data_stripes(map) + 5220 mirror_num - 2; 5221 5222 /* We distribute the parity blocks across stripes */ 5223 tmp = stripe_nr + stripe_index; 5224 stripe_index = do_div(tmp, map->num_stripes); 5225 if (!(rw & (REQ_WRITE | REQ_DISCARD | 5226 REQ_GET_READ_MIRRORS)) && mirror_num <= 1) 5227 mirror_num = 1; 5228 } 5229 } else { 5230 /* 5231 * after this do_div call, stripe_nr is the number of stripes 5232 * on this device we have to walk to find the data, and 5233 * stripe_index is the number of our device in the stripe array 5234 */ 5235 stripe_index = do_div(stripe_nr, map->num_stripes); 5236 mirror_num = stripe_index + 1; 5237 } 5238 BUG_ON(stripe_index >= map->num_stripes); 5239 5240 num_alloc_stripes = num_stripes; 5241 if (dev_replace_is_ongoing) { 5242 if (rw & (REQ_WRITE | REQ_DISCARD)) 5243 num_alloc_stripes <<= 1; 5244 if (rw & REQ_GET_READ_MIRRORS) 5245 num_alloc_stripes++; 5246 tgtdev_indexes = num_stripes; 5247 } 5248 5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), 5250 GFP_NOFS); 5251 if (!bbio) { 5252 kfree(raid_map); 5253 ret = -ENOMEM; 5254 goto out; 5255 } 5256 atomic_set(&bbio->error, 0); 5257 if (dev_replace_is_ongoing) 5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5259 5260 if (rw & REQ_DISCARD) { 5261 int factor = 0; 5262 int sub_stripes = 0; 5263 u64 stripes_per_dev = 0; 5264 u32 remaining_stripes = 0; 5265 u32 last_stripe = 0; 5266 5267 if (map->type & 5268 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 5269 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5270 sub_stripes = 1; 5271 else 5272 sub_stripes = map->sub_stripes; 5273 5274 factor = map->num_stripes / sub_stripes; 5275 stripes_per_dev = div_u64_rem(stripe_nr_end 
- 5276 stripe_nr_orig, 5277 factor, 5278 &remaining_stripes); 5279 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5280 last_stripe *= sub_stripes; 5281 } 5282 5283 for (i = 0; i < num_stripes; i++) { 5284 bbio->stripes[i].physical = 5285 map->stripes[stripe_index].physical + 5286 stripe_offset + stripe_nr * map->stripe_len; 5287 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5288 5289 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5290 BTRFS_BLOCK_GROUP_RAID10)) { 5291 bbio->stripes[i].length = stripes_per_dev * 5292 map->stripe_len; 5293 5294 if (i / sub_stripes < remaining_stripes) 5295 bbio->stripes[i].length += 5296 map->stripe_len; 5297 5298 /* 5299 * Special for the first stripe and 5300 * the last stripe: 5301 * 5302 * |-------|...|-------| 5303 * |----------| 5304 * off end_off 5305 */ 5306 if (i < sub_stripes) 5307 bbio->stripes[i].length -= 5308 stripe_offset; 5309 5310 if (stripe_index >= last_stripe && 5311 stripe_index <= (last_stripe + 5312 sub_stripes - 1)) 5313 bbio->stripes[i].length -= 5314 stripe_end_offset; 5315 5316 if (i == sub_stripes - 1) 5317 stripe_offset = 0; 5318 } else 5319 bbio->stripes[i].length = *length; 5320 5321 stripe_index++; 5322 if (stripe_index == map->num_stripes) { 5323 /* This could only happen for RAID0/10 */ 5324 stripe_index = 0; 5325 stripe_nr++; 5326 } 5327 } 5328 } else { 5329 for (i = 0; i < num_stripes; i++) { 5330 bbio->stripes[i].physical = 5331 map->stripes[stripe_index].physical + 5332 stripe_offset + 5333 stripe_nr * map->stripe_len; 5334 bbio->stripes[i].dev = 5335 map->stripes[stripe_index].dev; 5336 stripe_index++; 5337 } 5338 } 5339 5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5341 max_errors = btrfs_chunk_max_errors(map); 5342 5343 tgtdev_indexes = 0; 5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5345 dev_replace->tgtdev != NULL) { 5346 int index_where_to_add; 5347 u64 srcdev_devid = dev_replace->srcdev->devid; 5348 5349 /* 5350 * duplicate the write operations while the dev replace 5351 * procedure is running. Since the copying of the old disk 5352 * to the new disk takes place at run time while the 5353 * filesystem is mounted writable, the regular write 5354 * operations to the old disk have to be duplicated to go 5355 * to the new disk as well. 5356 * Note that device->missing is handled by the caller, and 5357 * that the write to the old disk is already set up in the 5358 * stripes array. 5359 */ 5360 index_where_to_add = num_stripes; 5361 for (i = 0; i < num_stripes; i++) { 5362 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5363 /* write to new disk, too */ 5364 struct btrfs_bio_stripe *new = 5365 bbio->stripes + index_where_to_add; 5366 struct btrfs_bio_stripe *old = 5367 bbio->stripes + i; 5368 5369 new->physical = old->physical; 5370 new->length = old->length; 5371 new->dev = dev_replace->tgtdev; 5372 bbio->tgtdev_map[i] = index_where_to_add; 5373 index_where_to_add++; 5374 max_errors++; 5375 tgtdev_indexes++; 5376 } 5377 } 5378 num_stripes = index_where_to_add; 5379 } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && 5380 dev_replace->tgtdev != NULL) { 5381 u64 srcdev_devid = dev_replace->srcdev->devid; 5382 int index_srcdev = 0; 5383 int found = 0; 5384 u64 physical_of_found = 0; 5385 5386 /* 5387 * During the dev-replace procedure, the target drive can 5388 * also be used to read data in case it is needed to repair 5389 * a corrupt block elsewhere. This is possible if the 5390 * requested area is left of the left cursor. 
In this area, 5391 * the target drive is a full copy of the source drive. 5392 */ 5393 for (i = 0; i < num_stripes; i++) { 5394 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5395 /* 5396 * In case of DUP, in order to keep it 5397 * simple, only add the mirror with the 5398 * lowest physical address 5399 */ 5400 if (found && 5401 physical_of_found <= 5402 bbio->stripes[i].physical) 5403 continue; 5404 index_srcdev = i; 5405 found = 1; 5406 physical_of_found = bbio->stripes[i].physical; 5407 } 5408 } 5409 if (found) { 5410 u64 length = map->stripe_len; 5411 5412 if (physical_of_found + length <= 5413 dev_replace->cursor_left) { 5414 struct btrfs_bio_stripe *tgtdev_stripe = 5415 bbio->stripes + num_stripes; 5416 5417 tgtdev_stripe->physical = physical_of_found; 5418 tgtdev_stripe->length = 5419 bbio->stripes[index_srcdev].length; 5420 tgtdev_stripe->dev = dev_replace->tgtdev; 5421 bbio->tgtdev_map[index_srcdev] = num_stripes; 5422 5423 tgtdev_indexes++; 5424 num_stripes++; 5425 } 5426 } 5427 } 5428 5429 *bbio_ret = bbio; 5430 bbio->num_stripes = num_stripes; 5431 bbio->max_errors = max_errors; 5432 bbio->mirror_num = mirror_num; 5433 bbio->num_tgtdevs = tgtdev_indexes; 5434 5435 /* 5436 * this is the case that REQ_READ && dev_replace_is_ongoing && 5437 * mirror_num == num_stripes + 1 && dev_replace target drive is 5438 * available as a mirror 5439 */ 5440 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 5441 WARN_ON(num_stripes > 1); 5442 bbio->stripes[0].dev = dev_replace->tgtdev; 5443 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5444 bbio->mirror_num = map->num_stripes + 1; 5445 } 5446 if (raid_map) { 5447 sort_parity_stripes(bbio, raid_map); 5448 *raid_map_ret = raid_map; 5449 } 5450 out: 5451 if (dev_replace_is_ongoing) 5452 btrfs_dev_replace_unlock(dev_replace); 5453 free_extent_map(em); 5454 return ret; 5455 } 5456 5457 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 5458 u64 logical, u64 *length, 5459 struct btrfs_bio **bbio_ret, int mirror_num) 5460 { 5461 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5462 mirror_num, NULL); 5463 } 5464 5465 /* For Scrub/replace */ 5466 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 5467 u64 logical, u64 *length, 5468 struct btrfs_bio **bbio_ret, int mirror_num, 5469 u64 **raid_map_ret) 5470 { 5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5472 mirror_num, raid_map_ret); 5473 } 5474 5475 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5476 u64 chunk_start, u64 physical, u64 devid, 5477 u64 **logical, int *naddrs, int *stripe_len) 5478 { 5479 struct extent_map_tree *em_tree = &map_tree->map_tree; 5480 struct extent_map *em; 5481 struct map_lookup *map; 5482 u64 *buf; 5483 u64 bytenr; 5484 u64 length; 5485 u64 stripe_nr; 5486 u64 rmap_len; 5487 int i, j, nr = 0; 5488 5489 read_lock(&em_tree->lock); 5490 em = lookup_extent_mapping(em_tree, chunk_start, 1); 5491 read_unlock(&em_tree->lock); 5492 5493 if (!em) { 5494 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n", 5495 chunk_start); 5496 return -EIO; 5497 } 5498 5499 if (em->start != chunk_start) { 5500 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n", 5501 em->start, chunk_start); 5502 free_extent_map(em); 5503 return -EIO; 5504 } 5505 map = (struct map_lookup *)em->bdev; 5506 5507 length = em->len; 5508 rmap_len = map->stripe_len; 5509 5510 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5511 do_div(length, map->num_stripes / map->sub_stripes); 5512 else if (map->type & 
BTRFS_BLOCK_GROUP_RAID0) 5513 do_div(length, map->num_stripes); 5514 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5515 BTRFS_BLOCK_GROUP_RAID6)) { 5516 do_div(length, nr_data_stripes(map)); 5517 rmap_len = map->stripe_len * nr_data_stripes(map); 5518 } 5519 5520 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 5521 BUG_ON(!buf); /* -ENOMEM */ 5522 5523 for (i = 0; i < map->num_stripes; i++) { 5524 if (devid && map->stripes[i].dev->devid != devid) 5525 continue; 5526 if (map->stripes[i].physical > physical || 5527 map->stripes[i].physical + length <= physical) 5528 continue; 5529 5530 stripe_nr = physical - map->stripes[i].physical; 5531 do_div(stripe_nr, map->stripe_len); 5532 5533 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5534 stripe_nr = stripe_nr * map->num_stripes + i; 5535 do_div(stripe_nr, map->sub_stripes); 5536 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5537 stripe_nr = stripe_nr * map->num_stripes + i; 5538 } /* else if RAID[56], multiply by nr_data_stripes(). 5539 * Alternatively, just use rmap_len below instead of 5540 * map->stripe_len */ 5541 5542 bytenr = chunk_start + stripe_nr * rmap_len; 5543 WARN_ON(nr >= map->num_stripes); 5544 for (j = 0; j < nr; j++) { 5545 if (buf[j] == bytenr) 5546 break; 5547 } 5548 if (j == nr) { 5549 WARN_ON(nr >= map->num_stripes); 5550 buf[nr++] = bytenr; 5551 } 5552 } 5553 5554 *logical = buf; 5555 *naddrs = nr; 5556 *stripe_len = rmap_len; 5557 5558 free_extent_map(em); 5559 return 0; 5560 } 5561 5562 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) 5563 { 5564 if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) 5565 bio_endio_nodec(bio, err); 5566 else 5567 bio_endio(bio, err); 5568 kfree(bbio); 5569 } 5570 5571 static void btrfs_end_bio(struct bio *bio, int err) 5572 { 5573 struct btrfs_bio *bbio = bio->bi_private; 5574 struct btrfs_device *dev = bbio->stripes[0].dev; 5575 int is_orig_bio = 0; 5576 5577 if (err) { 5578 atomic_inc(&bbio->error); 5579 if (err == -EIO || err == -EREMOTEIO) { 5580 unsigned int stripe_index = 5581 btrfs_io_bio(bio)->stripe_index; 5582 5583 BUG_ON(stripe_index >= bbio->num_stripes); 5584 dev = bbio->stripes[stripe_index].dev; 5585 if (dev->bdev) { 5586 if (bio->bi_rw & WRITE) 5587 btrfs_dev_stat_inc(dev, 5588 BTRFS_DEV_STAT_WRITE_ERRS); 5589 else 5590 btrfs_dev_stat_inc(dev, 5591 BTRFS_DEV_STAT_READ_ERRS); 5592 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 5593 btrfs_dev_stat_inc(dev, 5594 BTRFS_DEV_STAT_FLUSH_ERRS); 5595 btrfs_dev_stat_print_on_error(dev); 5596 } 5597 } 5598 } 5599 5600 if (bio == bbio->orig_bio) 5601 is_orig_bio = 1; 5602 5603 btrfs_bio_counter_dec(bbio->fs_info); 5604 5605 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5606 if (!is_orig_bio) { 5607 bio_put(bio); 5608 bio = bbio->orig_bio; 5609 } 5610 5611 bio->bi_private = bbio->private; 5612 bio->bi_end_io = bbio->end_io; 5613 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5614 /* only send an error to the higher layers if it is 5615 * beyond the tolerance of the btrfs bio 5616 */ 5617 if (atomic_read(&bbio->error) > bbio->max_errors) { 5618 err = -EIO; 5619 } else { 5620 /* 5621 * this bio is actually up to date, we didn't 5622 * go over the max number of errors 5623 */ 5624 set_bit(BIO_UPTODATE, &bio->bi_flags); 5625 err = 0; 5626 } 5627 5628 btrfs_end_bbio(bbio, bio, err); 5629 } else if (!is_orig_bio) { 5630 bio_put(bio); 5631 } 5632 } 5633 5634 /* 5635 * see run_scheduled_bios for a description of why bios are collected for 5636 * async submit. 
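 * Reads are not deferred at all: btrfs_schedule_bio() submits them
 * directly and only queues writes on the per-device pending lists.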
5637 * 5638 * This will add one bio to the pending list for a device and make sure 5639 * the work struct is scheduled. 5640 */ 5641 static noinline void btrfs_schedule_bio(struct btrfs_root *root, 5642 struct btrfs_device *device, 5643 int rw, struct bio *bio) 5644 { 5645 int should_queue = 1; 5646 struct btrfs_pending_bios *pending_bios; 5647 5648 if (device->missing || !device->bdev) { 5649 bio_endio(bio, -EIO); 5650 return; 5651 } 5652 5653 /* don't bother with additional async steps for reads, right now */ 5654 if (!(rw & REQ_WRITE)) { 5655 bio_get(bio); 5656 btrfsic_submit_bio(rw, bio); 5657 bio_put(bio); 5658 return; 5659 } 5660 5661 /* 5662 * nr_async_bios allows us to reliably return congestion to the 5663 * higher layers. Otherwise, the async bio makes it appear we have 5664 * made progress against dirty pages when we've really just put it 5665 * on a queue for later 5666 */ 5667 atomic_inc(&root->fs_info->nr_async_bios); 5668 WARN_ON(bio->bi_next); 5669 bio->bi_next = NULL; 5670 bio->bi_rw |= rw; 5671 5672 spin_lock(&device->io_lock); 5673 if (bio->bi_rw & REQ_SYNC) 5674 pending_bios = &device->pending_sync_bios; 5675 else 5676 pending_bios = &device->pending_bios; 5677 5678 if (pending_bios->tail) 5679 pending_bios->tail->bi_next = bio; 5680 5681 pending_bios->tail = bio; 5682 if (!pending_bios->head) 5683 pending_bios->head = bio; 5684 if (device->running_pending) 5685 should_queue = 0; 5686 5687 spin_unlock(&device->io_lock); 5688 5689 if (should_queue) 5690 btrfs_queue_work(root->fs_info->submit_workers, 5691 &device->work); 5692 } 5693 5694 static int bio_size_ok(struct block_device *bdev, struct bio *bio, 5695 sector_t sector) 5696 { 5697 struct bio_vec *prev; 5698 struct request_queue *q = bdev_get_queue(bdev); 5699 unsigned int max_sectors = queue_max_sectors(q); 5700 struct bvec_merge_data bvm = { 5701 .bi_bdev = bdev, 5702 .bi_sector = sector, 5703 .bi_rw = bio->bi_rw, 5704 }; 5705 5706 if (WARN_ON(bio->bi_vcnt == 0)) 5707 return 1; 5708 5709 prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 5710 if (bio_sectors(bio) > max_sectors) 5711 return 0; 5712 5713 if (!q->merge_bvec_fn) 5714 return 1; 5715 5716 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; 5717 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5718 return 0; 5719 return 1; 5720 } 5721 5722 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5723 struct bio *bio, u64 physical, int dev_nr, 5724 int rw, int async) 5725 { 5726 struct btrfs_device *dev = bbio->stripes[dev_nr].dev; 5727 5728 bio->bi_private = bbio; 5729 btrfs_io_bio(bio)->stripe_index = dev_nr; 5730 bio->bi_end_io = btrfs_end_bio; 5731 bio->bi_iter.bi_sector = physical >> 9; 5732 #ifdef DEBUG 5733 { 5734 struct rcu_string *name; 5735 5736 rcu_read_lock(); 5737 name = rcu_dereference(dev->name); 5738 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5739 "(%s id %llu), size=%u\n", rw, 5740 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, 5741 name->str, dev->devid, bio->bi_iter.bi_size); 5742 rcu_read_unlock(); 5743 } 5744 #endif 5745 bio->bi_bdev = dev->bdev; 5746 5747 btrfs_bio_counter_inc_noblocked(root->fs_info); 5748 5749 if (async) 5750 btrfs_schedule_bio(root, dev, rw, bio); 5751 else 5752 btrfsic_submit_bio(rw, bio); 5753 } 5754 5755 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, 5756 struct bio *first_bio, struct btrfs_device *dev, 5757 int dev_nr, int rw, int async) 5758 { 5759 struct bio_vec *bvec = first_bio->bi_io_vec; 5760 struct bio *bio; 5761 int nr_vecs = 
bio_get_nr_vecs(dev->bdev); 5762 u64 physical = bbio->stripes[dev_nr].physical; 5763 5764 again: 5765 bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); 5766 if (!bio) 5767 return -ENOMEM; 5768 5769 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5770 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5771 bvec->bv_offset) < bvec->bv_len) { 5772 u64 len = bio->bi_iter.bi_size; 5773 5774 atomic_inc(&bbio->stripes_pending); 5775 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5776 rw, async); 5777 physical += len; 5778 goto again; 5779 } 5780 bvec++; 5781 } 5782 5783 submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); 5784 return 0; 5785 } 5786 5787 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 5788 { 5789 atomic_inc(&bbio->error); 5790 if (atomic_dec_and_test(&bbio->stripes_pending)) { 5791 /* Shoud be the original bio. */ 5792 WARN_ON(bio != bbio->orig_bio); 5793 5794 bio->bi_private = bbio->private; 5795 bio->bi_end_io = bbio->end_io; 5796 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5797 bio->bi_iter.bi_sector = logical >> 9; 5798 5799 btrfs_end_bbio(bbio, bio, -EIO); 5800 } 5801 } 5802 5803 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 5804 int mirror_num, int async_submit) 5805 { 5806 struct btrfs_device *dev; 5807 struct bio *first_bio = bio; 5808 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5809 u64 length = 0; 5810 u64 map_length; 5811 u64 *raid_map = NULL; 5812 int ret; 5813 int dev_nr = 0; 5814 int total_devs = 1; 5815 struct btrfs_bio *bbio = NULL; 5816 5817 length = bio->bi_iter.bi_size; 5818 map_length = length; 5819 5820 btrfs_bio_counter_inc_blocked(root->fs_info); 5821 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5822 mirror_num, &raid_map); 5823 if (ret) { 5824 btrfs_bio_counter_dec(root->fs_info); 5825 return ret; 5826 } 5827 5828 total_devs = bbio->num_stripes; 5829 bbio->orig_bio = first_bio; 5830 bbio->private = first_bio->bi_private; 5831 bbio->end_io = first_bio->bi_end_io; 5832 bbio->fs_info = root->fs_info; 5833 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5834 5835 if (raid_map) { 5836 /* In this case, map_length has been set to the length of 5837 a single stripe; not the whole write */ 5838 if (rw & WRITE) { 5839 ret = raid56_parity_write(root, bio, bbio, 5840 raid_map, map_length); 5841 } else { 5842 ret = raid56_parity_recover(root, bio, bbio, 5843 raid_map, map_length, 5844 mirror_num, 1); 5845 } 5846 5847 btrfs_bio_counter_dec(root->fs_info); 5848 return ret; 5849 } 5850 5851 if (map_length < length) { 5852 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", 5853 logical, length, map_length); 5854 BUG(); 5855 } 5856 5857 while (dev_nr < total_devs) { 5858 dev = bbio->stripes[dev_nr].dev; 5859 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5860 bbio_error(bbio, first_bio, logical); 5861 dev_nr++; 5862 continue; 5863 } 5864 5865 /* 5866 * Check and see if we're ok with this bio based on it's size 5867 * and offset with the given device. 
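 * If it does not fit, breakup_stripe_bio() below splits it into smaller
 * bios that do.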
5868 */ 5869 if (!bio_size_ok(dev->bdev, first_bio, 5870 bbio->stripes[dev_nr].physical >> 9)) { 5871 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5872 dev_nr, rw, async_submit); 5873 BUG_ON(ret); 5874 dev_nr++; 5875 continue; 5876 } 5877 5878 if (dev_nr < total_devs - 1) { 5879 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 5880 BUG_ON(!bio); /* -ENOMEM */ 5881 } else { 5882 bio = first_bio; 5883 bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; 5884 } 5885 5886 submit_stripe_bio(root, bbio, bio, 5887 bbio->stripes[dev_nr].physical, dev_nr, rw, 5888 async_submit); 5889 dev_nr++; 5890 } 5891 btrfs_bio_counter_dec(root->fs_info); 5892 return 0; 5893 } 5894 5895 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, 5896 u8 *uuid, u8 *fsid) 5897 { 5898 struct btrfs_device *device; 5899 struct btrfs_fs_devices *cur_devices; 5900 5901 cur_devices = fs_info->fs_devices; 5902 while (cur_devices) { 5903 if (!fsid || 5904 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 5905 device = __find_device(&cur_devices->devices, 5906 devid, uuid); 5907 if (device) 5908 return device; 5909 } 5910 cur_devices = cur_devices->seed; 5911 } 5912 return NULL; 5913 } 5914 5915 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5916 struct btrfs_fs_devices *fs_devices, 5917 u64 devid, u8 *dev_uuid) 5918 { 5919 struct btrfs_device *device; 5920 5921 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5922 if (IS_ERR(device)) 5923 return NULL; 5924 5925 list_add(&device->dev_list, &fs_devices->devices); 5926 device->fs_devices = fs_devices; 5927 fs_devices->num_devices++; 5928 5929 device->missing = 1; 5930 fs_devices->missing_devices++; 5931 5932 return device; 5933 } 5934 5935 /** 5936 * btrfs_alloc_device - allocate struct btrfs_device 5937 * @fs_info: used only for generating a new devid, can be NULL if 5938 * devid is provided (i.e. @devid != NULL). 5939 * @devid: a pointer to devid for this device. If NULL a new devid 5940 * is generated. 5941 * @uuid: a pointer to UUID for this device. If NULL a new UUID 5942 * is generated. 5943 * 5944 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 5945 * on error. Returned struct is not linked onto any lists and can be 5946 * destroyed with kfree() right away. 
5947 */ 5948 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 5949 const u64 *devid, 5950 const u8 *uuid) 5951 { 5952 struct btrfs_device *dev; 5953 u64 tmp; 5954 5955 if (WARN_ON(!devid && !fs_info)) 5956 return ERR_PTR(-EINVAL); 5957 5958 dev = __alloc_device(); 5959 if (IS_ERR(dev)) 5960 return dev; 5961 5962 if (devid) 5963 tmp = *devid; 5964 else { 5965 int ret; 5966 5967 ret = find_next_devid(fs_info, &tmp); 5968 if (ret) { 5969 kfree(dev); 5970 return ERR_PTR(ret); 5971 } 5972 } 5973 dev->devid = tmp; 5974 5975 if (uuid) 5976 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 5977 else 5978 generate_random_uuid(dev->uuid); 5979 5980 btrfs_init_work(&dev->work, btrfs_submit_helper, 5981 pending_bios_fn, NULL, NULL); 5982 5983 return dev; 5984 } 5985 5986 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 5987 struct extent_buffer *leaf, 5988 struct btrfs_chunk *chunk) 5989 { 5990 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 5991 struct map_lookup *map; 5992 struct extent_map *em; 5993 u64 logical; 5994 u64 length; 5995 u64 devid; 5996 u8 uuid[BTRFS_UUID_SIZE]; 5997 int num_stripes; 5998 int ret; 5999 int i; 6000 6001 logical = key->offset; 6002 length = btrfs_chunk_length(leaf, chunk); 6003 6004 read_lock(&map_tree->map_tree.lock); 6005 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6006 read_unlock(&map_tree->map_tree.lock); 6007 6008 /* already mapped? */ 6009 if (em && em->start <= logical && em->start + em->len > logical) { 6010 free_extent_map(em); 6011 return 0; 6012 } else if (em) { 6013 free_extent_map(em); 6014 } 6015 6016 em = alloc_extent_map(); 6017 if (!em) 6018 return -ENOMEM; 6019 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6020 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6021 if (!map) { 6022 free_extent_map(em); 6023 return -ENOMEM; 6024 } 6025 6026 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6027 em->bdev = (struct block_device *)map; 6028 em->start = logical; 6029 em->len = length; 6030 em->orig_start = 0; 6031 em->block_start = 0; 6032 em->block_len = em->len; 6033 6034 map->num_stripes = num_stripes; 6035 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6036 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6037 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 6038 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6039 map->type = btrfs_chunk_type(leaf, chunk); 6040 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6041 for (i = 0; i < num_stripes; i++) { 6042 map->stripes[i].physical = 6043 btrfs_stripe_offset_nr(leaf, chunk, i); 6044 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6045 read_extent_buffer(leaf, uuid, (unsigned long) 6046 btrfs_stripe_dev_uuid_nr(chunk, i), 6047 BTRFS_UUID_SIZE); 6048 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, 6049 uuid, NULL); 6050 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 6051 free_extent_map(em); 6052 return -EIO; 6053 } 6054 if (!map->stripes[i].dev) { 6055 map->stripes[i].dev = 6056 add_missing_dev(root, root->fs_info->fs_devices, 6057 devid, uuid); 6058 if (!map->stripes[i].dev) { 6059 free_extent_map(em); 6060 return -EIO; 6061 } 6062 } 6063 map->stripes[i].dev->in_fs_metadata = 1; 6064 } 6065 6066 write_lock(&map_tree->map_tree.lock); 6067 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6068 write_unlock(&map_tree->map_tree.lock); 6069 BUG_ON(ret); /* Tree corruption */ 6070 free_extent_map(em); 6071 6072 return 0; 6073 } 6074 6075 static void 
static void fill_device_from_item(struct extent_buffer *leaf,
				  struct btrfs_dev_item *dev_item,
				  struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->commit_total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->commit_bytes_used = device->bytes_used;
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	device->is_tgtdev_for_dev_replace = 0;

	ptr = btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}

static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	BUG_ON(!mutex_is_locked(&uuid_mutex));

	fs_devices = root->fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
			return fs_devices;

		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		if (!btrfs_test_opt(root, DEGRADED))
			return ERR_PTR(-ENOENT);

		fs_devices = alloc_fs_devices(fsid);
		if (IS_ERR(fs_devices))
			return fs_devices;

		fs_devices->seeding = 1;
		fs_devices->opened = 1;
		return fs_devices;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices))
		return fs_devices;

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   root->fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(ret);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		fs_devices = ERR_PTR(-EINVAL);
		goto out;
	}

	fs_devices->seed = root->fs_info->fs_devices->seed;
	root->fs_info->fs_devices->seed = fs_devices;
out:
	return fs_devices;
}

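/*
 * Process one device item from the chunk tree: look up (or, on degraded
 * mounts, create a stub for) the matching btrfs_device, handle devices that
 * belong to a seed filesystem, and copy the on-disk fields into the
 * in-memory structure.
 */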
static int read_one_dev(struct btrfs_root *root,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_UUID_SIZE);

	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
		fs_devices = open_seed_devices(root, fs_uuid);
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
	if (!device) {
		if (!btrfs_test_opt(root, DEGRADED))
			return -EIO;

		btrfs_warn(root->fs_info, "devid %llu missing", devid);
		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
		if (!device)
			return -ENOMEM;
	} else {
		if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
			return -EIO;

		if (!device->bdev && !device->missing) {
			/*
			 * This happens when a device that was properly set up
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, so we have to set
			 * device->missing to one here.
			 */
			device->fs_devices->missing_devices++;
			device->missing = 1;
		}

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
			ASSERT(device->missing);

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

	if (device->fs_devices != root->fs_info->fs_devices) {
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	device->in_fs_metadata = 1;
	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += device->total_bytes -
			device->bytes_used;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = 0;
	return ret;
}

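/*
 * Read the chunks stored in the superblock's sys_chunk_array.  These
 * describe the system chunks that hold the chunk tree itself and therefore
 * must be mapped before the chunk tree can be read.
 */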
int btrfs_read_sys_array(struct btrfs_root *root)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the
	 * system array.  btrfs_set_buffer_uptodate() does not properly
	 * mark all of its pages up-to-date when the page is larger: the
	 * extent does not cover the whole page and consequently
	 * check_page_uptodate does not find all the page's extents
	 * up-to-date (the hole beyond sb), and write_extent_buffer then
	 * triggers a WARN_ON.
	 *
	 * Regular short extents go through the
	 * mark_extent_buffer_dirty/writeback cycle, but sb spans only
	 * this function.  Add an explicit SetPageUptodate call to silence
	 * the warning, e.g. on PowerPC 64.
	 */
	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);
		ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	lock_chunks(root);

	/*
	 * Read all device items, and then all the chunk items.  All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(root, leaf, dev_item);
			if (ret)
				goto error;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	ret = 0;
error:
	unlock_chunks(root);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	while (fs_devices) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry(device, &fs_devices->devices, dev_list)
			device->dev_root = fs_info->dev_root;
		mutex_unlock(&fs_devices->device_list_mutex);

		fs_devices = fs_devices->seed;
	}
}

static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}

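/*
 * Load the persistent per-device I/O error counters from the device tree at
 * mount time.  Each device's counters are stored in an item addressed by
 * (objectid = 0, type = BTRFS_DEV_STATS_KEY, offset = devid); a device with
 * no such item simply starts with all counters reset.
 */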
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = 0;
		key.type = BTRFS_DEV_STATS_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *dev_root,
				struct btrfs_device *device)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = 0;
	key.type = BTRFS_DEV_STATS_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		printk_in_rcu(KERN_WARNING "BTRFS: "
			"error %d while searching for dev_stats item for device %s!\n",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				"delete too small dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			printk_in_rcu(KERN_WARNING "BTRFS: "
				"insert dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

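/*
 * Note: the dev_stats item stores one __le64 counter per
 * BTRFS_DEV_STAT_VALUES_MAX slot (write/read/flush/corruption/generation
 * errors); see struct btrfs_dev_stats_item.  Items shorter than expected are
 * treated as having zero in the missing slots when loaded and are replaced
 * with a full-size item on the next commit.
 */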
/*
 * Called from commit_transaction.  Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
			continue;

		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		ret = update_dev_stat_item(trans, dev_root, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	printk_in_rcu(KERN_INFO "BTRFS: "
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
		rcu_str_deref(dev->name),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_root *root,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

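/*
 * Clear the superblock magic on a device so that it is no longer detected as
 * part of a btrfs filesystem, e.g. after the device has been removed or
 * replaced.
 */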
int btrfs_scratch_superblock(struct btrfs_device *device)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;

	bh = btrfs_read_dev_super(device->bdev);
	if (!bh)
		return -EINVAL;
	disk_super = (struct btrfs_super_block *)bh->b_data;

	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
	set_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	brelse(bh);

	return 0;
}

/*
 * Update the size of all devices, which is used for writing out the
 * super blocks.
 */
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *curr, *next;

	if (list_empty(&fs_devices->resized_devices))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	lock_chunks(fs_info->dev_root);
	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&curr->resized_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
	}
	unlock_chunks(fs_info->dev_root);
	mutex_unlock(&fs_devices->device_list_mutex);
}

/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
					struct btrfs_transaction *transaction)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	int i;

	if (list_empty(&transaction->pending_chunks))
		return;

	/* In order to kick the device replace finish process */
	lock_chunks(root);
	list_for_each_entry(em, &transaction->pending_chunks, list) {
		map = (struct map_lookup *)em->bdev;

		for (i = 0; i < map->num_stripes; i++) {
			dev = map->stripes[i].dev;
			dev->commit_bytes_used = dev->bytes_used;
		}
	}
	unlock_chunks(root);
}