1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/bio.h> 20 #include <linux/slab.h> 21 #include <linux/buffer_head.h> 22 #include <linux/blkdev.h> 23 #include <linux/random.h> 24 #include <linux/iocontext.h> 25 #include <linux/capability.h> 26 #include <linux/ratelimit.h> 27 #include <linux/kthread.h> 28 #include <asm/div64.h> 29 #include "compat.h" 30 #include "ctree.h" 31 #include "extent_map.h" 32 #include "disk-io.h" 33 #include "transaction.h" 34 #include "print-tree.h" 35 #include "volumes.h" 36 #include "async-thread.h" 37 #include "check-integrity.h" 38 #include "rcu-string.h" 39 40 static int init_first_rw_device(struct btrfs_trans_handle *trans, 41 struct btrfs_root *root, 42 struct btrfs_device *device); 43 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 44 static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 45 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 46 47 static DEFINE_MUTEX(uuid_mutex); 48 static LIST_HEAD(fs_uuids); 49 50 static void lock_chunks(struct btrfs_root *root) 51 { 52 mutex_lock(&root->fs_info->chunk_mutex); 53 } 54 55 static void unlock_chunks(struct btrfs_root *root) 56 { 57 mutex_unlock(&root->fs_info->chunk_mutex); 58 } 59 60 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 61 { 62 struct btrfs_device *device; 63 WARN_ON(fs_devices->opened); 64 while (!list_empty(&fs_devices->devices)) { 65 device = list_entry(fs_devices->devices.next, 66 struct btrfs_device, dev_list); 67 list_del(&device->dev_list); 68 rcu_string_free(device->name); 69 kfree(device); 70 } 71 kfree(fs_devices); 72 } 73 74 void btrfs_cleanup_fs_uuids(void) 75 { 76 struct btrfs_fs_devices *fs_devices; 77 78 while (!list_empty(&fs_uuids)) { 79 fs_devices = list_entry(fs_uuids.next, 80 struct btrfs_fs_devices, list); 81 list_del(&fs_devices->list); 82 free_fs_devices(fs_devices); 83 } 84 } 85 86 static noinline struct btrfs_device *__find_device(struct list_head *head, 87 u64 devid, u8 *uuid) 88 { 89 struct btrfs_device *dev; 90 91 list_for_each_entry(dev, head, dev_list) { 92 if (dev->devid == devid && 93 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 94 return dev; 95 } 96 } 97 return NULL; 98 } 99 100 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 101 { 102 struct btrfs_fs_devices *fs_devices; 103 104 list_for_each_entry(fs_devices, &fs_uuids, list) { 105 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 106 return fs_devices; 107 } 108 return NULL; 109 } 110 111 static void requeue_list(struct btrfs_pending_bios *pending_bios, 112 struct bio *head, struct bio *tail) 113 { 114 115 struct bio *old_head; 116 117 old_head = pending_bios->head; 118 pending_bios->head = head; 119 if (pending_bios->tail) 120 tail->bi_next = old_head; 121 else 122 pending_bios->tail = tail; 123 } 124 
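
/*
 * Note on requeue_list() above: it splices the bio chain running from
 * @head to @tail back onto the front of @pending_bios, so requeued bios
 * are resubmitted ahead of anything queued later; if the list was empty,
 * @tail also becomes the new list tail.
 */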
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
196 */ 197 if (device->pending_sync_bios.head == NULL && 198 device->pending_bios.head == NULL) { 199 again = 0; 200 device->running_pending = 0; 201 } else { 202 again = 1; 203 device->running_pending = 1; 204 } 205 206 pending_bios->head = NULL; 207 pending_bios->tail = NULL; 208 209 spin_unlock(&device->io_lock); 210 211 while (pending) { 212 213 rmb(); 214 /* we want to work on both lists, but do more bios on the 215 * sync list than the regular list 216 */ 217 if ((num_run > 32 && 218 pending_bios != &device->pending_sync_bios && 219 device->pending_sync_bios.head) || 220 (num_run > 64 && pending_bios == &device->pending_sync_bios && 221 device->pending_bios.head)) { 222 spin_lock(&device->io_lock); 223 requeue_list(pending_bios, pending, tail); 224 goto loop_lock; 225 } 226 227 cur = pending; 228 pending = pending->bi_next; 229 cur->bi_next = NULL; 230 atomic_dec(&fs_info->nr_async_bios); 231 232 if (atomic_read(&fs_info->nr_async_bios) < limit && 233 waitqueue_active(&fs_info->async_submit_wait)) 234 wake_up(&fs_info->async_submit_wait); 235 236 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 237 238 /* 239 * if we're doing the sync list, record that our 240 * plug has some sync requests on it 241 * 242 * If we're doing the regular list and there are 243 * sync requests sitting around, unplug before 244 * we add more 245 */ 246 if (pending_bios == &device->pending_sync_bios) { 247 sync_pending = 1; 248 } else if (sync_pending) { 249 blk_finish_plug(&plug); 250 blk_start_plug(&plug); 251 sync_pending = 0; 252 } 253 254 btrfsic_submit_bio(cur->bi_rw, cur); 255 num_run++; 256 batch_run++; 257 if (need_resched()) 258 cond_resched(); 259 260 /* 261 * we made progress, there is more work to do and the bdi 262 * is now congested. Back off and let other work structs 263 * run instead 264 */ 265 if (pending && bdi_write_congested(bdi) && batch_run > 8 && 266 fs_info->fs_devices->open_devices > 1) { 267 struct io_context *ioc; 268 269 ioc = current->io_context; 270 271 /* 272 * the main goal here is that we don't want to 273 * block if we're going to be able to submit 274 * more requests without blocking. 275 * 276 * This code does two great things, it pokes into 277 * the elevator code from a filesystem _and_ 278 * it makes assumptions about how batching works. 279 */ 280 if (ioc && ioc->nr_batch_requests > 0 && 281 time_before(jiffies, ioc->last_waited + HZ/50UL) && 282 (last_waited == 0 || 283 ioc->last_waited == last_waited)) { 284 /* 285 * we want to go through our batch of 286 * requests and stop. 
So, we copy out 287 * the ioc->last_waited time and test 288 * against it before looping 289 */ 290 last_waited = ioc->last_waited; 291 if (need_resched()) 292 cond_resched(); 293 continue; 294 } 295 spin_lock(&device->io_lock); 296 requeue_list(pending_bios, pending, tail); 297 device->running_pending = 1; 298 299 spin_unlock(&device->io_lock); 300 btrfs_requeue_work(&device->work); 301 goto done; 302 } 303 /* unplug every 64 requests just for good measure */ 304 if (batch_run % 64 == 0) { 305 blk_finish_plug(&plug); 306 blk_start_plug(&plug); 307 sync_pending = 0; 308 } 309 } 310 311 cond_resched(); 312 if (again) 313 goto loop; 314 315 spin_lock(&device->io_lock); 316 if (device->pending_bios.head || device->pending_sync_bios.head) 317 goto loop_lock; 318 spin_unlock(&device->io_lock); 319 320 done: 321 blk_finish_plug(&plug); 322 } 323 324 static void pending_bios_fn(struct btrfs_work *work) 325 { 326 struct btrfs_device *device; 327 328 device = container_of(work, struct btrfs_device, work); 329 run_scheduled_bios(device); 330 } 331 332 static noinline int device_list_add(const char *path, 333 struct btrfs_super_block *disk_super, 334 u64 devid, struct btrfs_fs_devices **fs_devices_ret) 335 { 336 struct btrfs_device *device; 337 struct btrfs_fs_devices *fs_devices; 338 struct rcu_string *name; 339 u64 found_transid = btrfs_super_generation(disk_super); 340 341 fs_devices = find_fsid(disk_super->fsid); 342 if (!fs_devices) { 343 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 344 if (!fs_devices) 345 return -ENOMEM; 346 INIT_LIST_HEAD(&fs_devices->devices); 347 INIT_LIST_HEAD(&fs_devices->alloc_list); 348 list_add(&fs_devices->list, &fs_uuids); 349 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 350 fs_devices->latest_devid = devid; 351 fs_devices->latest_trans = found_transid; 352 mutex_init(&fs_devices->device_list_mutex); 353 device = NULL; 354 } else { 355 device = __find_device(&fs_devices->devices, devid, 356 disk_super->dev_item.uuid); 357 } 358 if (!device) { 359 if (fs_devices->opened) 360 return -EBUSY; 361 362 device = kzalloc(sizeof(*device), GFP_NOFS); 363 if (!device) { 364 /* we can safely leave the fs_devices entry around */ 365 return -ENOMEM; 366 } 367 device->devid = devid; 368 device->dev_stats_valid = 0; 369 device->work.func = pending_bios_fn; 370 memcpy(device->uuid, disk_super->dev_item.uuid, 371 BTRFS_UUID_SIZE); 372 spin_lock_init(&device->io_lock); 373 374 name = rcu_string_strdup(path, GFP_NOFS); 375 if (!name) { 376 kfree(device); 377 return -ENOMEM; 378 } 379 rcu_assign_pointer(device->name, name); 380 INIT_LIST_HEAD(&device->dev_alloc_list); 381 382 /* init readahead state */ 383 spin_lock_init(&device->reada_lock); 384 device->reada_curr_zone = NULL; 385 atomic_set(&device->reada_in_flight, 0); 386 device->reada_next = 0; 387 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); 388 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); 389 390 mutex_lock(&fs_devices->device_list_mutex); 391 list_add_rcu(&device->dev_list, &fs_devices->devices); 392 mutex_unlock(&fs_devices->device_list_mutex); 393 394 device->fs_devices = fs_devices; 395 fs_devices->num_devices++; 396 } else if (!device->name || strcmp(device->name->str, path)) { 397 name = rcu_string_strdup(path, GFP_NOFS); 398 if (!name) 399 return -ENOMEM; 400 rcu_string_free(device->name); 401 rcu_assign_pointer(device->name, name); 402 if (device->missing) { 403 fs_devices->missing_devices--; 404 device->missing = 0; 405 } 406 } 407 408 if (found_transid > 
fs_devices->latest_trans) { 409 fs_devices->latest_devid = devid; 410 fs_devices->latest_trans = found_transid; 411 } 412 *fs_devices_ret = fs_devices; 413 return 0; 414 } 415 416 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 417 { 418 struct btrfs_fs_devices *fs_devices; 419 struct btrfs_device *device; 420 struct btrfs_device *orig_dev; 421 422 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); 423 if (!fs_devices) 424 return ERR_PTR(-ENOMEM); 425 426 INIT_LIST_HEAD(&fs_devices->devices); 427 INIT_LIST_HEAD(&fs_devices->alloc_list); 428 INIT_LIST_HEAD(&fs_devices->list); 429 mutex_init(&fs_devices->device_list_mutex); 430 fs_devices->latest_devid = orig->latest_devid; 431 fs_devices->latest_trans = orig->latest_trans; 432 fs_devices->total_devices = orig->total_devices; 433 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 434 435 /* We have held the volume lock, it is safe to get the devices. */ 436 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 437 struct rcu_string *name; 438 439 device = kzalloc(sizeof(*device), GFP_NOFS); 440 if (!device) 441 goto error; 442 443 /* 444 * This is ok to do without rcu read locked because we hold the 445 * uuid mutex so nothing we touch in here is going to disappear. 446 */ 447 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); 448 if (!name) { 449 kfree(device); 450 goto error; 451 } 452 rcu_assign_pointer(device->name, name); 453 454 device->devid = orig_dev->devid; 455 device->work.func = pending_bios_fn; 456 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 457 spin_lock_init(&device->io_lock); 458 INIT_LIST_HEAD(&device->dev_list); 459 INIT_LIST_HEAD(&device->dev_alloc_list); 460 461 list_add(&device->dev_list, &fs_devices->devices); 462 device->fs_devices = fs_devices; 463 fs_devices->num_devices++; 464 } 465 return fs_devices; 466 error: 467 free_fs_devices(fs_devices); 468 return ERR_PTR(-ENOMEM); 469 } 470 471 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 472 { 473 struct btrfs_device *device, *next; 474 475 struct block_device *latest_bdev = NULL; 476 u64 latest_devid = 0; 477 u64 latest_transid = 0; 478 479 mutex_lock(&uuid_mutex); 480 again: 481 /* This is the initialized path, it is safe to release the devices. 
*/ 482 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 483 if (device->in_fs_metadata) { 484 if (!latest_transid || 485 device->generation > latest_transid) { 486 latest_devid = device->devid; 487 latest_transid = device->generation; 488 latest_bdev = device->bdev; 489 } 490 continue; 491 } 492 493 if (device->bdev) { 494 blkdev_put(device->bdev, device->mode); 495 device->bdev = NULL; 496 fs_devices->open_devices--; 497 } 498 if (device->writeable) { 499 list_del_init(&device->dev_alloc_list); 500 device->writeable = 0; 501 fs_devices->rw_devices--; 502 } 503 list_del_init(&device->dev_list); 504 fs_devices->num_devices--; 505 rcu_string_free(device->name); 506 kfree(device); 507 } 508 509 if (fs_devices->seed) { 510 fs_devices = fs_devices->seed; 511 goto again; 512 } 513 514 fs_devices->latest_bdev = latest_bdev; 515 fs_devices->latest_devid = latest_devid; 516 fs_devices->latest_trans = latest_transid; 517 518 mutex_unlock(&uuid_mutex); 519 } 520 521 static void __free_device(struct work_struct *work) 522 { 523 struct btrfs_device *device; 524 525 device = container_of(work, struct btrfs_device, rcu_work); 526 527 if (device->bdev) 528 blkdev_put(device->bdev, device->mode); 529 530 rcu_string_free(device->name); 531 kfree(device); 532 } 533 534 static void free_device(struct rcu_head *head) 535 { 536 struct btrfs_device *device; 537 538 device = container_of(head, struct btrfs_device, rcu); 539 540 INIT_WORK(&device->rcu_work, __free_device); 541 schedule_work(&device->rcu_work); 542 } 543 544 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 545 { 546 struct btrfs_device *device; 547 548 if (--fs_devices->opened > 0) 549 return 0; 550 551 mutex_lock(&fs_devices->device_list_mutex); 552 list_for_each_entry(device, &fs_devices->devices, dev_list) { 553 struct btrfs_device *new_device; 554 struct rcu_string *name; 555 556 if (device->bdev) 557 fs_devices->open_devices--; 558 559 if (device->writeable) { 560 list_del_init(&device->dev_alloc_list); 561 fs_devices->rw_devices--; 562 } 563 564 if (device->can_discard) 565 fs_devices->num_can_discard--; 566 567 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 568 BUG_ON(!new_device); /* -ENOMEM */ 569 memcpy(new_device, device, sizeof(*new_device)); 570 571 /* Safe because we are under uuid_mutex */ 572 name = rcu_string_strdup(device->name->str, GFP_NOFS); 573 BUG_ON(device->name && !name); /* -ENOMEM */ 574 rcu_assign_pointer(new_device->name, name); 575 new_device->bdev = NULL; 576 new_device->writeable = 0; 577 new_device->in_fs_metadata = 0; 578 new_device->can_discard = 0; 579 list_replace_rcu(&device->dev_list, &new_device->dev_list); 580 581 call_rcu(&device->rcu, free_device); 582 } 583 mutex_unlock(&fs_devices->device_list_mutex); 584 585 WARN_ON(fs_devices->open_devices); 586 WARN_ON(fs_devices->rw_devices); 587 fs_devices->opened = 0; 588 fs_devices->seeding = 0; 589 590 return 0; 591 } 592 593 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 594 { 595 struct btrfs_fs_devices *seed_devices = NULL; 596 int ret; 597 598 mutex_lock(&uuid_mutex); 599 ret = __btrfs_close_devices(fs_devices); 600 if (!fs_devices->opened) { 601 seed_devices = fs_devices->seed; 602 fs_devices->seed = NULL; 603 } 604 mutex_unlock(&uuid_mutex); 605 606 while (seed_devices) { 607 fs_devices = seed_devices; 608 seed_devices = fs_devices->seed; 609 __btrfs_close_devices(fs_devices); 610 free_fs_devices(fs_devices); 611 } 612 return ret; 613 } 614 615 static int __btrfs_open_devices(struct 
btrfs_fs_devices *fs_devices, 616 fmode_t flags, void *holder) 617 { 618 struct request_queue *q; 619 struct block_device *bdev; 620 struct list_head *head = &fs_devices->devices; 621 struct btrfs_device *device; 622 struct block_device *latest_bdev = NULL; 623 struct buffer_head *bh; 624 struct btrfs_super_block *disk_super; 625 u64 latest_devid = 0; 626 u64 latest_transid = 0; 627 u64 devid; 628 int seeding = 1; 629 int ret = 0; 630 631 flags |= FMODE_EXCL; 632 633 list_for_each_entry(device, head, dev_list) { 634 if (device->bdev) 635 continue; 636 if (!device->name) 637 continue; 638 639 bdev = blkdev_get_by_path(device->name->str, flags, holder); 640 if (IS_ERR(bdev)) { 641 printk(KERN_INFO "open %s failed\n", device->name->str); 642 goto error; 643 } 644 filemap_write_and_wait(bdev->bd_inode->i_mapping); 645 invalidate_bdev(bdev); 646 set_blocksize(bdev, 4096); 647 648 bh = btrfs_read_dev_super(bdev); 649 if (!bh) 650 goto error_close; 651 652 disk_super = (struct btrfs_super_block *)bh->b_data; 653 devid = btrfs_stack_device_id(&disk_super->dev_item); 654 if (devid != device->devid) 655 goto error_brelse; 656 657 if (memcmp(device->uuid, disk_super->dev_item.uuid, 658 BTRFS_UUID_SIZE)) 659 goto error_brelse; 660 661 device->generation = btrfs_super_generation(disk_super); 662 if (!latest_transid || device->generation > latest_transid) { 663 latest_devid = devid; 664 latest_transid = device->generation; 665 latest_bdev = bdev; 666 } 667 668 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 669 device->writeable = 0; 670 } else { 671 device->writeable = !bdev_read_only(bdev); 672 seeding = 0; 673 } 674 675 q = bdev_get_queue(bdev); 676 if (blk_queue_discard(q)) { 677 device->can_discard = 1; 678 fs_devices->num_can_discard++; 679 } 680 681 device->bdev = bdev; 682 device->in_fs_metadata = 0; 683 device->mode = flags; 684 685 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 686 fs_devices->rotating = 1; 687 688 fs_devices->open_devices++; 689 if (device->writeable) { 690 fs_devices->rw_devices++; 691 list_add(&device->dev_alloc_list, 692 &fs_devices->alloc_list); 693 } 694 brelse(bh); 695 continue; 696 697 error_brelse: 698 brelse(bh); 699 error_close: 700 blkdev_put(bdev, flags); 701 error: 702 continue; 703 } 704 if (fs_devices->open_devices == 0) { 705 ret = -EINVAL; 706 goto out; 707 } 708 fs_devices->seeding = seeding; 709 fs_devices->opened = 1; 710 fs_devices->latest_bdev = latest_bdev; 711 fs_devices->latest_devid = latest_devid; 712 fs_devices->latest_trans = latest_transid; 713 fs_devices->total_rw_bytes = 0; 714 out: 715 return ret; 716 } 717 718 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 719 fmode_t flags, void *holder) 720 { 721 int ret; 722 723 mutex_lock(&uuid_mutex); 724 if (fs_devices->opened) { 725 fs_devices->opened++; 726 ret = 0; 727 } else { 728 ret = __btrfs_open_devices(fs_devices, flags, holder); 729 } 730 mutex_unlock(&uuid_mutex); 731 return ret; 732 } 733 734 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 735 struct btrfs_fs_devices **fs_devices_ret) 736 { 737 struct btrfs_super_block *disk_super; 738 struct block_device *bdev; 739 struct buffer_head *bh; 740 int ret; 741 u64 devid; 742 u64 transid; 743 u64 total_devices; 744 745 flags |= FMODE_EXCL; 746 bdev = blkdev_get_by_path(path, flags, holder); 747 748 if (IS_ERR(bdev)) { 749 ret = PTR_ERR(bdev); 750 goto error; 751 } 752 753 mutex_lock(&uuid_mutex); 754 ret = set_blocksize(bdev, 4096); 755 if (ret) 756 goto error_close; 757 bh = 
btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
        brelse(bh);
error_close:
        mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
error:
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one.
 * But if we don't find suitable free space, it will be used to store the
 * start position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is larger than what we need,
                         * it must be the max free space that we have found
                         * so far, so max_hole_start must point to the start
                         * of this free space and the length of this free
                         * space is stored in max_hole_size.  Thus, we return
                         * max_hole_start and max_hole_size and go back to
                         * the caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above.
*/ 1010 if (hole_size < num_bytes) 1011 ret = -ENOSPC; 1012 else 1013 ret = 0; 1014 1015 out: 1016 btrfs_free_path(path); 1017 error: 1018 *start = max_hole_start; 1019 if (len) 1020 *len = max_hole_size; 1021 return ret; 1022 } 1023 1024 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1025 struct btrfs_device *device, 1026 u64 start) 1027 { 1028 int ret; 1029 struct btrfs_path *path; 1030 struct btrfs_root *root = device->dev_root; 1031 struct btrfs_key key; 1032 struct btrfs_key found_key; 1033 struct extent_buffer *leaf = NULL; 1034 struct btrfs_dev_extent *extent = NULL; 1035 1036 path = btrfs_alloc_path(); 1037 if (!path) 1038 return -ENOMEM; 1039 1040 key.objectid = device->devid; 1041 key.offset = start; 1042 key.type = BTRFS_DEV_EXTENT_KEY; 1043 again: 1044 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1045 if (ret > 0) { 1046 ret = btrfs_previous_item(root, path, key.objectid, 1047 BTRFS_DEV_EXTENT_KEY); 1048 if (ret) 1049 goto out; 1050 leaf = path->nodes[0]; 1051 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1052 extent = btrfs_item_ptr(leaf, path->slots[0], 1053 struct btrfs_dev_extent); 1054 BUG_ON(found_key.offset > start || found_key.offset + 1055 btrfs_dev_extent_length(leaf, extent) < start); 1056 key = found_key; 1057 btrfs_release_path(path); 1058 goto again; 1059 } else if (ret == 0) { 1060 leaf = path->nodes[0]; 1061 extent = btrfs_item_ptr(leaf, path->slots[0], 1062 struct btrfs_dev_extent); 1063 } else { 1064 btrfs_error(root->fs_info, ret, "Slot search failed"); 1065 goto out; 1066 } 1067 1068 if (device->bytes_used > 0) { 1069 u64 len = btrfs_dev_extent_length(leaf, extent); 1070 device->bytes_used -= len; 1071 spin_lock(&root->fs_info->free_chunk_lock); 1072 root->fs_info->free_chunk_space += len; 1073 spin_unlock(&root->fs_info->free_chunk_lock); 1074 } 1075 ret = btrfs_del_item(trans, root, path); 1076 if (ret) { 1077 btrfs_error(root->fs_info, ret, 1078 "Failed to remove dev extent item"); 1079 } 1080 out: 1081 btrfs_free_path(path); 1082 return ret; 1083 } 1084 1085 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 1086 struct btrfs_device *device, 1087 u64 chunk_tree, u64 chunk_objectid, 1088 u64 chunk_offset, u64 start, u64 num_bytes) 1089 { 1090 int ret; 1091 struct btrfs_path *path; 1092 struct btrfs_root *root = device->dev_root; 1093 struct btrfs_dev_extent *extent; 1094 struct extent_buffer *leaf; 1095 struct btrfs_key key; 1096 1097 WARN_ON(!device->in_fs_metadata); 1098 path = btrfs_alloc_path(); 1099 if (!path) 1100 return -ENOMEM; 1101 1102 key.objectid = device->devid; 1103 key.offset = start; 1104 key.type = BTRFS_DEV_EXTENT_KEY; 1105 ret = btrfs_insert_empty_item(trans, root, path, &key, 1106 sizeof(*extent)); 1107 if (ret) 1108 goto out; 1109 1110 leaf = path->nodes[0]; 1111 extent = btrfs_item_ptr(leaf, path->slots[0], 1112 struct btrfs_dev_extent); 1113 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); 1114 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); 1115 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 1116 1117 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, 1118 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), 1119 BTRFS_UUID_SIZE); 1120 1121 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 1122 btrfs_mark_buffer_dirty(leaf); 1123 out: 1124 btrfs_free_path(path); 1125 return ret; 1126 } 1127 1128 static noinline int find_next_chunk(struct btrfs_root *root, 1129 u64 objectid, u64 *offset) 1130 { 1131 struct btrfs_path 
*path; 1132 int ret; 1133 struct btrfs_key key; 1134 struct btrfs_chunk *chunk; 1135 struct btrfs_key found_key; 1136 1137 path = btrfs_alloc_path(); 1138 if (!path) 1139 return -ENOMEM; 1140 1141 key.objectid = objectid; 1142 key.offset = (u64)-1; 1143 key.type = BTRFS_CHUNK_ITEM_KEY; 1144 1145 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1146 if (ret < 0) 1147 goto error; 1148 1149 BUG_ON(ret == 0); /* Corruption */ 1150 1151 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1152 if (ret) { 1153 *offset = 0; 1154 } else { 1155 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1156 path->slots[0]); 1157 if (found_key.objectid != objectid) 1158 *offset = 0; 1159 else { 1160 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], 1161 struct btrfs_chunk); 1162 *offset = found_key.offset + 1163 btrfs_chunk_length(path->nodes[0], chunk); 1164 } 1165 } 1166 ret = 0; 1167 error: 1168 btrfs_free_path(path); 1169 return ret; 1170 } 1171 1172 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) 1173 { 1174 int ret; 1175 struct btrfs_key key; 1176 struct btrfs_key found_key; 1177 struct btrfs_path *path; 1178 1179 root = root->fs_info->chunk_root; 1180 1181 path = btrfs_alloc_path(); 1182 if (!path) 1183 return -ENOMEM; 1184 1185 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1186 key.type = BTRFS_DEV_ITEM_KEY; 1187 key.offset = (u64)-1; 1188 1189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1190 if (ret < 0) 1191 goto error; 1192 1193 BUG_ON(ret == 0); /* Corruption */ 1194 1195 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, 1196 BTRFS_DEV_ITEM_KEY); 1197 if (ret) { 1198 *objectid = 1; 1199 } else { 1200 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1201 path->slots[0]); 1202 *objectid = found_key.offset + 1; 1203 } 1204 ret = 0; 1205 error: 1206 btrfs_free_path(path); 1207 return ret; 1208 } 1209 1210 /* 1211 * the device information is stored in the chunk root 1212 * the btrfs_device struct should be fully filled in 1213 */ 1214 int btrfs_add_device(struct btrfs_trans_handle *trans, 1215 struct btrfs_root *root, 1216 struct btrfs_device *device) 1217 { 1218 int ret; 1219 struct btrfs_path *path; 1220 struct btrfs_dev_item *dev_item; 1221 struct extent_buffer *leaf; 1222 struct btrfs_key key; 1223 unsigned long ptr; 1224 1225 root = root->fs_info->chunk_root; 1226 1227 path = btrfs_alloc_path(); 1228 if (!path) 1229 return -ENOMEM; 1230 1231 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1232 key.type = BTRFS_DEV_ITEM_KEY; 1233 key.offset = device->devid; 1234 1235 ret = btrfs_insert_empty_item(trans, root, path, &key, 1236 sizeof(*dev_item)); 1237 if (ret) 1238 goto out; 1239 1240 leaf = path->nodes[0]; 1241 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1242 1243 btrfs_set_device_id(leaf, dev_item, device->devid); 1244 btrfs_set_device_generation(leaf, dev_item, 0); 1245 btrfs_set_device_type(leaf, dev_item, device->type); 1246 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1247 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1248 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1249 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1250 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1251 btrfs_set_device_group(leaf, dev_item, 0); 1252 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1253 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1254 btrfs_set_device_start_offset(leaf, dev_item, 0); 1255 1256 ptr = (unsigned 
long)btrfs_device_uuid(dev_item); 1257 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1258 ptr = (unsigned long)btrfs_device_fsid(dev_item); 1259 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); 1260 btrfs_mark_buffer_dirty(leaf); 1261 1262 ret = 0; 1263 out: 1264 btrfs_free_path(path); 1265 return ret; 1266 } 1267 1268 static int btrfs_rm_dev_item(struct btrfs_root *root, 1269 struct btrfs_device *device) 1270 { 1271 int ret; 1272 struct btrfs_path *path; 1273 struct btrfs_key key; 1274 struct btrfs_trans_handle *trans; 1275 1276 root = root->fs_info->chunk_root; 1277 1278 path = btrfs_alloc_path(); 1279 if (!path) 1280 return -ENOMEM; 1281 1282 trans = btrfs_start_transaction(root, 0); 1283 if (IS_ERR(trans)) { 1284 btrfs_free_path(path); 1285 return PTR_ERR(trans); 1286 } 1287 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1288 key.type = BTRFS_DEV_ITEM_KEY; 1289 key.offset = device->devid; 1290 lock_chunks(root); 1291 1292 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1293 if (ret < 0) 1294 goto out; 1295 1296 if (ret > 0) { 1297 ret = -ENOENT; 1298 goto out; 1299 } 1300 1301 ret = btrfs_del_item(trans, root, path); 1302 if (ret) 1303 goto out; 1304 out: 1305 btrfs_free_path(path); 1306 unlock_chunks(root); 1307 btrfs_commit_transaction(trans, root); 1308 return ret; 1309 } 1310 1311 int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1312 { 1313 struct btrfs_device *device; 1314 struct btrfs_device *next_device; 1315 struct block_device *bdev; 1316 struct buffer_head *bh = NULL; 1317 struct btrfs_super_block *disk_super; 1318 struct btrfs_fs_devices *cur_devices; 1319 u64 all_avail; 1320 u64 devid; 1321 u64 num_devices; 1322 u8 *dev_uuid; 1323 int ret = 0; 1324 bool clear_super = false; 1325 1326 mutex_lock(&uuid_mutex); 1327 1328 all_avail = root->fs_info->avail_data_alloc_bits | 1329 root->fs_info->avail_system_alloc_bits | 1330 root->fs_info->avail_metadata_alloc_bits; 1331 1332 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1333 root->fs_info->fs_devices->num_devices <= 4) { 1334 printk(KERN_ERR "btrfs: unable to go below four devices " 1335 "on raid10\n"); 1336 ret = -EINVAL; 1337 goto out; 1338 } 1339 1340 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1341 root->fs_info->fs_devices->num_devices <= 2) { 1342 printk(KERN_ERR "btrfs: unable to go below two " 1343 "devices on raid1\n"); 1344 ret = -EINVAL; 1345 goto out; 1346 } 1347 1348 if (strcmp(device_path, "missing") == 0) { 1349 struct list_head *devices; 1350 struct btrfs_device *tmp; 1351 1352 device = NULL; 1353 devices = &root->fs_info->fs_devices->devices; 1354 /* 1355 * It is safe to read the devices since the volume_mutex 1356 * is held. 
1357 */ 1358 list_for_each_entry(tmp, devices, dev_list) { 1359 if (tmp->in_fs_metadata && !tmp->bdev) { 1360 device = tmp; 1361 break; 1362 } 1363 } 1364 bdev = NULL; 1365 bh = NULL; 1366 disk_super = NULL; 1367 if (!device) { 1368 printk(KERN_ERR "btrfs: no missing devices found to " 1369 "remove\n"); 1370 goto out; 1371 } 1372 } else { 1373 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1374 root->fs_info->bdev_holder); 1375 if (IS_ERR(bdev)) { 1376 ret = PTR_ERR(bdev); 1377 goto out; 1378 } 1379 1380 set_blocksize(bdev, 4096); 1381 invalidate_bdev(bdev); 1382 bh = btrfs_read_dev_super(bdev); 1383 if (!bh) { 1384 ret = -EINVAL; 1385 goto error_close; 1386 } 1387 disk_super = (struct btrfs_super_block *)bh->b_data; 1388 devid = btrfs_stack_device_id(&disk_super->dev_item); 1389 dev_uuid = disk_super->dev_item.uuid; 1390 device = btrfs_find_device(root, devid, dev_uuid, 1391 disk_super->fsid); 1392 if (!device) { 1393 ret = -ENOENT; 1394 goto error_brelse; 1395 } 1396 } 1397 1398 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1399 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1400 "device\n"); 1401 ret = -EINVAL; 1402 goto error_brelse; 1403 } 1404 1405 if (device->writeable) { 1406 lock_chunks(root); 1407 list_del_init(&device->dev_alloc_list); 1408 unlock_chunks(root); 1409 root->fs_info->fs_devices->rw_devices--; 1410 clear_super = true; 1411 } 1412 1413 ret = btrfs_shrink_device(device, 0); 1414 if (ret) 1415 goto error_undo; 1416 1417 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1418 if (ret) 1419 goto error_undo; 1420 1421 spin_lock(&root->fs_info->free_chunk_lock); 1422 root->fs_info->free_chunk_space = device->total_bytes - 1423 device->bytes_used; 1424 spin_unlock(&root->fs_info->free_chunk_lock); 1425 1426 device->in_fs_metadata = 0; 1427 btrfs_scrub_cancel_dev(root, device); 1428 1429 /* 1430 * the device list mutex makes sure that we don't change 1431 * the device list while someone else is writing out all 1432 * the device supers. 
1433 */ 1434 1435 cur_devices = device->fs_devices; 1436 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1437 list_del_rcu(&device->dev_list); 1438 1439 device->fs_devices->num_devices--; 1440 device->fs_devices->total_devices--; 1441 1442 if (device->missing) 1443 root->fs_info->fs_devices->missing_devices--; 1444 1445 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1446 struct btrfs_device, dev_list); 1447 if (device->bdev == root->fs_info->sb->s_bdev) 1448 root->fs_info->sb->s_bdev = next_device->bdev; 1449 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1450 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1451 1452 if (device->bdev) 1453 device->fs_devices->open_devices--; 1454 1455 call_rcu(&device->rcu, free_device); 1456 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1457 1458 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1459 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); 1460 1461 if (cur_devices->open_devices == 0) { 1462 struct btrfs_fs_devices *fs_devices; 1463 fs_devices = root->fs_info->fs_devices; 1464 while (fs_devices) { 1465 if (fs_devices->seed == cur_devices) 1466 break; 1467 fs_devices = fs_devices->seed; 1468 } 1469 fs_devices->seed = cur_devices->seed; 1470 cur_devices->seed = NULL; 1471 lock_chunks(root); 1472 __btrfs_close_devices(cur_devices); 1473 unlock_chunks(root); 1474 free_fs_devices(cur_devices); 1475 } 1476 1477 /* 1478 * at this point, the device is zero sized. We want to 1479 * remove it from the devices list and zero out the old super 1480 */ 1481 if (clear_super) { 1482 /* make sure this device isn't detected as part of 1483 * the FS anymore 1484 */ 1485 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1486 set_buffer_dirty(bh); 1487 sync_dirty_buffer(bh); 1488 } 1489 1490 ret = 0; 1491 1492 error_brelse: 1493 brelse(bh); 1494 error_close: 1495 if (bdev) 1496 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1497 out: 1498 mutex_unlock(&uuid_mutex); 1499 return ret; 1500 error_undo: 1501 if (device->writeable) { 1502 lock_chunks(root); 1503 list_add(&device->dev_alloc_list, 1504 &root->fs_info->fs_devices->alloc_list); 1505 unlock_chunks(root); 1506 root->fs_info->fs_devices->rw_devices++; 1507 } 1508 goto error_brelse; 1509 } 1510 1511 /* 1512 * does all the dirty work required for changing file system's UUID. 
 */
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);

        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
                             synchronize_rcu);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->total_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

/*
 * store the expected generation for seed devices in device items.
1573 */ 1574 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, 1575 struct btrfs_root *root) 1576 { 1577 struct btrfs_path *path; 1578 struct extent_buffer *leaf; 1579 struct btrfs_dev_item *dev_item; 1580 struct btrfs_device *device; 1581 struct btrfs_key key; 1582 u8 fs_uuid[BTRFS_UUID_SIZE]; 1583 u8 dev_uuid[BTRFS_UUID_SIZE]; 1584 u64 devid; 1585 int ret; 1586 1587 path = btrfs_alloc_path(); 1588 if (!path) 1589 return -ENOMEM; 1590 1591 root = root->fs_info->chunk_root; 1592 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1593 key.offset = 0; 1594 key.type = BTRFS_DEV_ITEM_KEY; 1595 1596 while (1) { 1597 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1598 if (ret < 0) 1599 goto error; 1600 1601 leaf = path->nodes[0]; 1602 next_slot: 1603 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1604 ret = btrfs_next_leaf(root, path); 1605 if (ret > 0) 1606 break; 1607 if (ret < 0) 1608 goto error; 1609 leaf = path->nodes[0]; 1610 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1611 btrfs_release_path(path); 1612 continue; 1613 } 1614 1615 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1616 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 1617 key.type != BTRFS_DEV_ITEM_KEY) 1618 break; 1619 1620 dev_item = btrfs_item_ptr(leaf, path->slots[0], 1621 struct btrfs_dev_item); 1622 devid = btrfs_device_id(leaf, dev_item); 1623 read_extent_buffer(leaf, dev_uuid, 1624 (unsigned long)btrfs_device_uuid(dev_item), 1625 BTRFS_UUID_SIZE); 1626 read_extent_buffer(leaf, fs_uuid, 1627 (unsigned long)btrfs_device_fsid(dev_item), 1628 BTRFS_UUID_SIZE); 1629 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1630 BUG_ON(!device); /* Logic error */ 1631 1632 if (device->fs_devices->seeding) { 1633 btrfs_set_device_generation(leaf, dev_item, 1634 device->generation); 1635 btrfs_mark_buffer_dirty(leaf); 1636 } 1637 1638 path->slots[0]++; 1639 goto next_slot; 1640 } 1641 ret = 0; 1642 error: 1643 btrfs_free_path(path); 1644 return ret; 1645 } 1646 1647 int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1648 { 1649 struct request_queue *q; 1650 struct btrfs_trans_handle *trans; 1651 struct btrfs_device *device; 1652 struct block_device *bdev; 1653 struct list_head *devices; 1654 struct super_block *sb = root->fs_info->sb; 1655 struct rcu_string *name; 1656 u64 total_bytes; 1657 int seeding_dev = 0; 1658 int ret = 0; 1659 1660 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1661 return -EROFS; 1662 1663 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1664 root->fs_info->bdev_holder); 1665 if (IS_ERR(bdev)) 1666 return PTR_ERR(bdev); 1667 1668 if (root->fs_info->fs_devices->seeding) { 1669 seeding_dev = 1; 1670 down_write(&sb->s_umount); 1671 mutex_lock(&uuid_mutex); 1672 } 1673 1674 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1675 1676 devices = &root->fs_info->fs_devices->devices; 1677 /* 1678 * we have the volume lock, so we don't need the extra 1679 * device list mutex while reading the list here. 
1680 */ 1681 list_for_each_entry(device, devices, dev_list) { 1682 if (device->bdev == bdev) { 1683 ret = -EEXIST; 1684 goto error; 1685 } 1686 } 1687 1688 device = kzalloc(sizeof(*device), GFP_NOFS); 1689 if (!device) { 1690 /* we can safely leave the fs_devices entry around */ 1691 ret = -ENOMEM; 1692 goto error; 1693 } 1694 1695 name = rcu_string_strdup(device_path, GFP_NOFS); 1696 if (!name) { 1697 kfree(device); 1698 ret = -ENOMEM; 1699 goto error; 1700 } 1701 rcu_assign_pointer(device->name, name); 1702 1703 ret = find_next_devid(root, &device->devid); 1704 if (ret) { 1705 rcu_string_free(device->name); 1706 kfree(device); 1707 goto error; 1708 } 1709 1710 trans = btrfs_start_transaction(root, 0); 1711 if (IS_ERR(trans)) { 1712 rcu_string_free(device->name); 1713 kfree(device); 1714 ret = PTR_ERR(trans); 1715 goto error; 1716 } 1717 1718 lock_chunks(root); 1719 1720 q = bdev_get_queue(bdev); 1721 if (blk_queue_discard(q)) 1722 device->can_discard = 1; 1723 device->writeable = 1; 1724 device->work.func = pending_bios_fn; 1725 generate_random_uuid(device->uuid); 1726 spin_lock_init(&device->io_lock); 1727 device->generation = trans->transid; 1728 device->io_width = root->sectorsize; 1729 device->io_align = root->sectorsize; 1730 device->sector_size = root->sectorsize; 1731 device->total_bytes = i_size_read(bdev->bd_inode); 1732 device->disk_total_bytes = device->total_bytes; 1733 device->dev_root = root->fs_info->dev_root; 1734 device->bdev = bdev; 1735 device->in_fs_metadata = 1; 1736 device->mode = FMODE_EXCL; 1737 set_blocksize(device->bdev, 4096); 1738 1739 if (seeding_dev) { 1740 sb->s_flags &= ~MS_RDONLY; 1741 ret = btrfs_prepare_sprout(root); 1742 BUG_ON(ret); /* -ENOMEM */ 1743 } 1744 1745 device->fs_devices = root->fs_info->fs_devices; 1746 1747 /* 1748 * we don't want write_supers to jump in here with our device 1749 * half setup 1750 */ 1751 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1752 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 1753 list_add(&device->dev_alloc_list, 1754 &root->fs_info->fs_devices->alloc_list); 1755 root->fs_info->fs_devices->num_devices++; 1756 root->fs_info->fs_devices->open_devices++; 1757 root->fs_info->fs_devices->rw_devices++; 1758 root->fs_info->fs_devices->total_devices++; 1759 if (device->can_discard) 1760 root->fs_info->fs_devices->num_can_discard++; 1761 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1762 1763 spin_lock(&root->fs_info->free_chunk_lock); 1764 root->fs_info->free_chunk_space += device->total_bytes; 1765 spin_unlock(&root->fs_info->free_chunk_lock); 1766 1767 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1768 root->fs_info->fs_devices->rotating = 1; 1769 1770 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 1771 btrfs_set_super_total_bytes(root->fs_info->super_copy, 1772 total_bytes + device->total_bytes); 1773 1774 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 1775 btrfs_set_super_num_devices(root->fs_info->super_copy, 1776 total_bytes + 1); 1777 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1778 1779 if (seeding_dev) { 1780 ret = init_first_rw_device(trans, root, device); 1781 if (ret) 1782 goto error_trans; 1783 ret = btrfs_finish_sprout(trans, root); 1784 if (ret) 1785 goto error_trans; 1786 } else { 1787 ret = btrfs_add_device(trans, root, device); 1788 if (ret) 1789 goto error_trans; 1790 } 1791 1792 /* 1793 * we've got more storage, clear any full flags on the space 1794 * infos 1795 */ 1796 
btrfs_clear_space_info_full(root->fs_info); 1797 1798 unlock_chunks(root); 1799 ret = btrfs_commit_transaction(trans, root); 1800 1801 if (seeding_dev) { 1802 mutex_unlock(&uuid_mutex); 1803 up_write(&sb->s_umount); 1804 1805 if (ret) /* transaction commit */ 1806 return ret; 1807 1808 ret = btrfs_relocate_sys_chunks(root); 1809 if (ret < 0) 1810 btrfs_error(root->fs_info, ret, 1811 "Failed to relocate sys chunks after " 1812 "device initialization. This can be fixed " 1813 "using the \"btrfs balance\" command."); 1814 } 1815 1816 return ret; 1817 1818 error_trans: 1819 unlock_chunks(root); 1820 btrfs_abort_transaction(trans, root, ret); 1821 btrfs_end_transaction(trans, root); 1822 rcu_string_free(device->name); 1823 kfree(device); 1824 error: 1825 blkdev_put(bdev, FMODE_EXCL); 1826 if (seeding_dev) { 1827 mutex_unlock(&uuid_mutex); 1828 up_write(&sb->s_umount); 1829 } 1830 return ret; 1831 } 1832 1833 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1834 struct btrfs_device *device) 1835 { 1836 int ret; 1837 struct btrfs_path *path; 1838 struct btrfs_root *root; 1839 struct btrfs_dev_item *dev_item; 1840 struct extent_buffer *leaf; 1841 struct btrfs_key key; 1842 1843 root = device->dev_root->fs_info->chunk_root; 1844 1845 path = btrfs_alloc_path(); 1846 if (!path) 1847 return -ENOMEM; 1848 1849 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1850 key.type = BTRFS_DEV_ITEM_KEY; 1851 key.offset = device->devid; 1852 1853 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1854 if (ret < 0) 1855 goto out; 1856 1857 if (ret > 0) { 1858 ret = -ENOENT; 1859 goto out; 1860 } 1861 1862 leaf = path->nodes[0]; 1863 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1864 1865 btrfs_set_device_id(leaf, dev_item, device->devid); 1866 btrfs_set_device_type(leaf, dev_item, device->type); 1867 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1868 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1869 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1870 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 1871 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1872 btrfs_mark_buffer_dirty(leaf); 1873 1874 out: 1875 btrfs_free_path(path); 1876 return ret; 1877 } 1878 1879 static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 1880 struct btrfs_device *device, u64 new_size) 1881 { 1882 struct btrfs_super_block *super_copy = 1883 device->dev_root->fs_info->super_copy; 1884 u64 old_total = btrfs_super_total_bytes(super_copy); 1885 u64 diff = new_size - device->total_bytes; 1886 1887 if (!device->writeable) 1888 return -EACCES; 1889 if (new_size <= device->total_bytes) 1890 return -EINVAL; 1891 1892 btrfs_set_super_total_bytes(super_copy, old_total + diff); 1893 device->fs_devices->total_rw_bytes += diff; 1894 1895 device->total_bytes = new_size; 1896 device->disk_total_bytes = new_size; 1897 btrfs_clear_space_info_full(device->dev_root->fs_info); 1898 1899 return btrfs_update_device(trans, device); 1900 } 1901 1902 int btrfs_grow_device(struct btrfs_trans_handle *trans, 1903 struct btrfs_device *device, u64 new_size) 1904 { 1905 int ret; 1906 lock_chunks(device->dev_root); 1907 ret = __btrfs_grow_device(trans, device, new_size); 1908 unlock_chunks(device->dev_root); 1909 return ret; 1910 } 1911 1912 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 1913 struct btrfs_root *root, 1914 u64 chunk_tree, u64 chunk_objectid, 1915 u64 chunk_offset) 1916 { 1917 int ret; 1918 
struct btrfs_path *path; 1919 struct btrfs_key key; 1920 1921 root = root->fs_info->chunk_root; 1922 path = btrfs_alloc_path(); 1923 if (!path) 1924 return -ENOMEM; 1925 1926 key.objectid = chunk_objectid; 1927 key.offset = chunk_offset; 1928 key.type = BTRFS_CHUNK_ITEM_KEY; 1929 1930 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1931 if (ret < 0) 1932 goto out; 1933 else if (ret > 0) { /* Logic error or corruption */ 1934 btrfs_error(root->fs_info, -ENOENT, 1935 "Failed lookup while freeing chunk."); 1936 ret = -ENOENT; 1937 goto out; 1938 } 1939 1940 ret = btrfs_del_item(trans, root, path); 1941 if (ret < 0) 1942 btrfs_error(root->fs_info, ret, 1943 "Failed to delete chunk item."); 1944 out: 1945 btrfs_free_path(path); 1946 return ret; 1947 } 1948 1949 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1950 chunk_offset) 1951 { 1952 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 1953 struct btrfs_disk_key *disk_key; 1954 struct btrfs_chunk *chunk; 1955 u8 *ptr; 1956 int ret = 0; 1957 u32 num_stripes; 1958 u32 array_size; 1959 u32 len = 0; 1960 u32 cur; 1961 struct btrfs_key key; 1962 1963 array_size = btrfs_super_sys_array_size(super_copy); 1964 1965 ptr = super_copy->sys_chunk_array; 1966 cur = 0; 1967 1968 while (cur < array_size) { 1969 disk_key = (struct btrfs_disk_key *)ptr; 1970 btrfs_disk_key_to_cpu(&key, disk_key); 1971 1972 len = sizeof(*disk_key); 1973 1974 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 1975 chunk = (struct btrfs_chunk *)(ptr + len); 1976 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 1977 len += btrfs_chunk_item_size(num_stripes); 1978 } else { 1979 ret = -EIO; 1980 break; 1981 } 1982 if (key.objectid == chunk_objectid && 1983 key.offset == chunk_offset) { 1984 memmove(ptr, ptr + len, array_size - (cur + len)); 1985 array_size -= len; 1986 btrfs_set_super_sys_array_size(super_copy, array_size); 1987 } else { 1988 ptr += len; 1989 cur += len; 1990 } 1991 } 1992 return ret; 1993 } 1994 1995 static int btrfs_relocate_chunk(struct btrfs_root *root, 1996 u64 chunk_tree, u64 chunk_objectid, 1997 u64 chunk_offset) 1998 { 1999 struct extent_map_tree *em_tree; 2000 struct btrfs_root *extent_root; 2001 struct btrfs_trans_handle *trans; 2002 struct extent_map *em; 2003 struct map_lookup *map; 2004 int ret; 2005 int i; 2006 2007 root = root->fs_info->chunk_root; 2008 extent_root = root->fs_info->extent_root; 2009 em_tree = &root->fs_info->mapping_tree.map_tree; 2010 2011 ret = btrfs_can_relocate(extent_root, chunk_offset); 2012 if (ret) 2013 return -ENOSPC; 2014 2015 /* step one, relocate all the extents inside this chunk */ 2016 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 2017 if (ret) 2018 return ret; 2019 2020 trans = btrfs_start_transaction(root, 0); 2021 BUG_ON(IS_ERR(trans)); 2022 2023 lock_chunks(root); 2024 2025 /* 2026 * step two, delete the device extents and the 2027 * chunk tree entries 2028 */ 2029 read_lock(&em_tree->lock); 2030 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2031 read_unlock(&em_tree->lock); 2032 2033 BUG_ON(!em || em->start > chunk_offset || 2034 em->start + em->len < chunk_offset); 2035 map = (struct map_lookup *)em->bdev; 2036 2037 for (i = 0; i < map->num_stripes; i++) { 2038 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2039 map->stripes[i].physical); 2040 BUG_ON(ret); 2041 2042 if (map->stripes[i].dev) { 2043 ret = btrfs_update_device(trans, map->stripes[i].dev); 2044 BUG_ON(ret); 2045 } 2046 } 2047 ret = btrfs_free_chunk(trans, root, chunk_tree, 
chunk_objectid, 2048 chunk_offset); 2049 2050 BUG_ON(ret); 2051 2052 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2053 2054 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2055 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2056 BUG_ON(ret); 2057 } 2058 2059 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2060 BUG_ON(ret); 2061 2062 write_lock(&em_tree->lock); 2063 remove_extent_mapping(em_tree, em); 2064 write_unlock(&em_tree->lock); 2065 2066 kfree(map); 2067 em->bdev = NULL; 2068 2069 /* once for the tree */ 2070 free_extent_map(em); 2071 /* once for us */ 2072 free_extent_map(em); 2073 2074 unlock_chunks(root); 2075 btrfs_end_transaction(trans, root); 2076 return 0; 2077 } 2078 2079 static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2080 { 2081 struct btrfs_root *chunk_root = root->fs_info->chunk_root; 2082 struct btrfs_path *path; 2083 struct extent_buffer *leaf; 2084 struct btrfs_chunk *chunk; 2085 struct btrfs_key key; 2086 struct btrfs_key found_key; 2087 u64 chunk_tree = chunk_root->root_key.objectid; 2088 u64 chunk_type; 2089 bool retried = false; 2090 int failed = 0; 2091 int ret; 2092 2093 path = btrfs_alloc_path(); 2094 if (!path) 2095 return -ENOMEM; 2096 2097 again: 2098 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2099 key.offset = (u64)-1; 2100 key.type = BTRFS_CHUNK_ITEM_KEY; 2101 2102 while (1) { 2103 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2104 if (ret < 0) 2105 goto error; 2106 BUG_ON(ret == 0); /* Corruption */ 2107 2108 ret = btrfs_previous_item(chunk_root, path, key.objectid, 2109 key.type); 2110 if (ret < 0) 2111 goto error; 2112 if (ret > 0) 2113 break; 2114 2115 leaf = path->nodes[0]; 2116 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2117 2118 chunk = btrfs_item_ptr(leaf, path->slots[0], 2119 struct btrfs_chunk); 2120 chunk_type = btrfs_chunk_type(leaf, chunk); 2121 btrfs_release_path(path); 2122 2123 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 2124 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 2125 found_key.objectid, 2126 found_key.offset); 2127 if (ret == -ENOSPC) 2128 failed++; 2129 else if (ret) 2130 BUG(); 2131 } 2132 2133 if (found_key.offset == 0) 2134 break; 2135 key.offset = found_key.offset - 1; 2136 } 2137 ret = 0; 2138 if (failed && !retried) { 2139 failed = 0; 2140 retried = true; 2141 goto again; 2142 } else if (failed && retried) { 2143 WARN_ON(1); 2144 ret = -ENOSPC; 2145 } 2146 error: 2147 btrfs_free_path(path); 2148 return ret; 2149 } 2150 2151 static int insert_balance_item(struct btrfs_root *root, 2152 struct btrfs_balance_control *bctl) 2153 { 2154 struct btrfs_trans_handle *trans; 2155 struct btrfs_balance_item *item; 2156 struct btrfs_disk_balance_args disk_bargs; 2157 struct btrfs_path *path; 2158 struct extent_buffer *leaf; 2159 struct btrfs_key key; 2160 int ret, err; 2161 2162 path = btrfs_alloc_path(); 2163 if (!path) 2164 return -ENOMEM; 2165 2166 trans = btrfs_start_transaction(root, 0); 2167 if (IS_ERR(trans)) { 2168 btrfs_free_path(path); 2169 return PTR_ERR(trans); 2170 } 2171 2172 key.objectid = BTRFS_BALANCE_OBJECTID; 2173 key.type = BTRFS_BALANCE_ITEM_KEY; 2174 key.offset = 0; 2175 2176 ret = btrfs_insert_empty_item(trans, root, path, &key, 2177 sizeof(*item)); 2178 if (ret) 2179 goto out; 2180 2181 leaf = path->nodes[0]; 2182 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2183 2184 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); 2185 2186 btrfs_cpu_balance_args_to_disk(&disk_bargs, 
&bctl->data); 2187 btrfs_set_balance_data(leaf, item, &disk_bargs); 2188 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 2189 btrfs_set_balance_meta(leaf, item, &disk_bargs); 2190 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 2191 btrfs_set_balance_sys(leaf, item, &disk_bargs); 2192 2193 btrfs_set_balance_flags(leaf, item, bctl->flags); 2194 2195 btrfs_mark_buffer_dirty(leaf); 2196 out: 2197 btrfs_free_path(path); 2198 err = btrfs_commit_transaction(trans, root); 2199 if (err && !ret) 2200 ret = err; 2201 return ret; 2202 } 2203 2204 static int del_balance_item(struct btrfs_root *root) 2205 { 2206 struct btrfs_trans_handle *trans; 2207 struct btrfs_path *path; 2208 struct btrfs_key key; 2209 int ret, err; 2210 2211 path = btrfs_alloc_path(); 2212 if (!path) 2213 return -ENOMEM; 2214 2215 trans = btrfs_start_transaction(root, 0); 2216 if (IS_ERR(trans)) { 2217 btrfs_free_path(path); 2218 return PTR_ERR(trans); 2219 } 2220 2221 key.objectid = BTRFS_BALANCE_OBJECTID; 2222 key.type = BTRFS_BALANCE_ITEM_KEY; 2223 key.offset = 0; 2224 2225 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2226 if (ret < 0) 2227 goto out; 2228 if (ret > 0) { 2229 ret = -ENOENT; 2230 goto out; 2231 } 2232 2233 ret = btrfs_del_item(trans, root, path); 2234 out: 2235 btrfs_free_path(path); 2236 err = btrfs_commit_transaction(trans, root); 2237 if (err && !ret) 2238 ret = err; 2239 return ret; 2240 } 2241 2242 /* 2243 * This is a heuristic used to reduce the number of chunks balanced on 2244 * resume after balance was interrupted. 2245 */ 2246 static void update_balance_args(struct btrfs_balance_control *bctl) 2247 { 2248 /* 2249 * Turn on soft mode for chunk types that were being converted. 2250 */ 2251 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 2252 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 2253 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 2254 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 2255 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 2256 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 2257 2258 /* 2259 * Turn on usage filter if is not already used. The idea is 2260 * that chunks that we have already balanced should be 2261 * reasonably full. Don't do it for chunks that are being 2262 * converted - that will keep us from relocating unconverted 2263 * (albeit full) chunks. 2264 */ 2265 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 2266 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2267 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 2268 bctl->data.usage = 90; 2269 } 2270 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 2271 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2272 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 2273 bctl->sys.usage = 90; 2274 } 2275 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 2276 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 2277 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 2278 bctl->meta.usage = 90; 2279 } 2280 } 2281 2282 /* 2283 * Should be called with both balance and volume mutexes held to 2284 * serialize other volume operations (add_dev/rm_dev/resize) with 2285 * restriper. Same goes for unset_balance_control. 
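 * (The balance_lock spinlock taken below only protects the balance_ctl
 * pointer itself; it is the two mutexes that keep add_dev/rm_dev/resize
 * from running while a balance control is installed or torn down.)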
2286 */ 2287 static void set_balance_control(struct btrfs_balance_control *bctl) 2288 { 2289 struct btrfs_fs_info *fs_info = bctl->fs_info; 2290 2291 BUG_ON(fs_info->balance_ctl); 2292 2293 spin_lock(&fs_info->balance_lock); 2294 fs_info->balance_ctl = bctl; 2295 spin_unlock(&fs_info->balance_lock); 2296 } 2297 2298 static void unset_balance_control(struct btrfs_fs_info *fs_info) 2299 { 2300 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2301 2302 BUG_ON(!fs_info->balance_ctl); 2303 2304 spin_lock(&fs_info->balance_lock); 2305 fs_info->balance_ctl = NULL; 2306 spin_unlock(&fs_info->balance_lock); 2307 2308 kfree(bctl); 2309 } 2310 2311 /* 2312 * Balance filters. Return 1 if chunk should be filtered out 2313 * (should not be balanced). 2314 */ 2315 static int chunk_profiles_filter(u64 chunk_type, 2316 struct btrfs_balance_args *bargs) 2317 { 2318 chunk_type = chunk_to_extended(chunk_type) & 2319 BTRFS_EXTENDED_PROFILE_MASK; 2320 2321 if (bargs->profiles & chunk_type) 2322 return 0; 2323 2324 return 1; 2325 } 2326 2327 static u64 div_factor_fine(u64 num, int factor) 2328 { 2329 if (factor <= 0) 2330 return 0; 2331 if (factor >= 100) 2332 return num; 2333 2334 num *= factor; 2335 do_div(num, 100); 2336 return num; 2337 } 2338 2339 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2340 struct btrfs_balance_args *bargs) 2341 { 2342 struct btrfs_block_group_cache *cache; 2343 u64 chunk_used, user_thresh; 2344 int ret = 1; 2345 2346 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2347 chunk_used = btrfs_block_group_used(&cache->item); 2348 2349 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2350 if (chunk_used < user_thresh) 2351 ret = 0; 2352 2353 btrfs_put_block_group(cache); 2354 return ret; 2355 } 2356 2357 static int chunk_devid_filter(struct extent_buffer *leaf, 2358 struct btrfs_chunk *chunk, 2359 struct btrfs_balance_args *bargs) 2360 { 2361 struct btrfs_stripe *stripe; 2362 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2363 int i; 2364 2365 for (i = 0; i < num_stripes; i++) { 2366 stripe = btrfs_stripe_nr(chunk, i); 2367 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 2368 return 0; 2369 } 2370 2371 return 1; 2372 } 2373 2374 /* [pstart, pend) */ 2375 static int chunk_drange_filter(struct extent_buffer *leaf, 2376 struct btrfs_chunk *chunk, 2377 u64 chunk_offset, 2378 struct btrfs_balance_args *bargs) 2379 { 2380 struct btrfs_stripe *stripe; 2381 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 2382 u64 stripe_offset; 2383 u64 stripe_length; 2384 int factor; 2385 int i; 2386 2387 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 2388 return 0; 2389 2390 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2391 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2392 factor = 2; 2393 else 2394 factor = 1; 2395 factor = num_stripes / factor; 2396 2397 for (i = 0; i < num_stripes; i++) { 2398 stripe = btrfs_stripe_nr(chunk, i); 2399 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 2400 continue; 2401 2402 stripe_offset = btrfs_stripe_offset(leaf, stripe); 2403 stripe_length = btrfs_chunk_length(leaf, chunk); 2404 do_div(stripe_length, factor); 2405 2406 if (stripe_offset < bargs->pend && 2407 stripe_offset + stripe_length > bargs->pstart) 2408 return 0; 2409 } 2410 2411 return 1; 2412 } 2413 2414 /* [vstart, vend) */ 2415 static int chunk_vrange_filter(struct extent_buffer *leaf, 2416 struct btrfs_chunk *chunk, 2417 u64 chunk_offset, 2418 struct btrfs_balance_args *bargs) 2419 { 2420 if 
(chunk_offset < bargs->vend && 2421 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 2422 /* at least part of the chunk is inside this vrange */ 2423 return 0; 2424 2425 return 1; 2426 } 2427 2428 static int chunk_soft_convert_filter(u64 chunk_type, 2429 struct btrfs_balance_args *bargs) 2430 { 2431 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 2432 return 0; 2433 2434 chunk_type = chunk_to_extended(chunk_type) & 2435 BTRFS_EXTENDED_PROFILE_MASK; 2436 2437 if (bargs->target == chunk_type) 2438 return 1; 2439 2440 return 0; 2441 } 2442 2443 static int should_balance_chunk(struct btrfs_root *root, 2444 struct extent_buffer *leaf, 2445 struct btrfs_chunk *chunk, u64 chunk_offset) 2446 { 2447 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; 2448 struct btrfs_balance_args *bargs = NULL; 2449 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 2450 2451 /* type filter */ 2452 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 2453 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 2454 return 0; 2455 } 2456 2457 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 2458 bargs = &bctl->data; 2459 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 2460 bargs = &bctl->sys; 2461 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 2462 bargs = &bctl->meta; 2463 2464 /* profiles filter */ 2465 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 2466 chunk_profiles_filter(chunk_type, bargs)) { 2467 return 0; 2468 } 2469 2470 /* usage filter */ 2471 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 2472 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 2473 return 0; 2474 } 2475 2476 /* devid filter */ 2477 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 2478 chunk_devid_filter(leaf, chunk, bargs)) { 2479 return 0; 2480 } 2481 2482 /* drange filter, makes sense only with devid filter */ 2483 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 2484 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { 2485 return 0; 2486 } 2487 2488 /* vrange filter */ 2489 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 2490 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 2491 return 0; 2492 } 2493 2494 /* soft profile changing mode */ 2495 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 2496 chunk_soft_convert_filter(chunk_type, bargs)) { 2497 return 0; 2498 } 2499 2500 return 1; 2501 } 2502 2503 static u64 div_factor(u64 num, int factor) 2504 { 2505 if (factor == 10) 2506 return num; 2507 num *= factor; 2508 do_div(num, 10); 2509 return num; 2510 } 2511 2512 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2513 { 2514 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2515 struct btrfs_root *chunk_root = fs_info->chunk_root; 2516 struct btrfs_root *dev_root = fs_info->dev_root; 2517 struct list_head *devices; 2518 struct btrfs_device *device; 2519 u64 old_size; 2520 u64 size_to_free; 2521 struct btrfs_chunk *chunk; 2522 struct btrfs_path *path; 2523 struct btrfs_key key; 2524 struct btrfs_key found_key; 2525 struct btrfs_trans_handle *trans; 2526 struct extent_buffer *leaf; 2527 int slot; 2528 int ret; 2529 int enospc_errors = 0; 2530 bool counting = true; 2531 2532 /* step one make some room on all the devices */ 2533 devices = &fs_info->fs_devices->devices; 2534 list_for_each_entry(device, devices, dev_list) { 2535 old_size = device->total_bytes; 2536 size_to_free = div_factor(old_size, 1); 2537 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2538 if (!device->writeable || 2539 device->total_bytes - device->bytes_used > size_to_free) 2540 continue; 2541 2542 ret = 
btrfs_shrink_device(device, old_size - size_to_free); 2543 if (ret == -ENOSPC) 2544 break; 2545 BUG_ON(ret); 2546 2547 trans = btrfs_start_transaction(dev_root, 0); 2548 BUG_ON(IS_ERR(trans)); 2549 2550 ret = btrfs_grow_device(trans, device, old_size); 2551 BUG_ON(ret); 2552 2553 btrfs_end_transaction(trans, dev_root); 2554 } 2555 2556 /* step two, relocate all the chunks */ 2557 path = btrfs_alloc_path(); 2558 if (!path) { 2559 ret = -ENOMEM; 2560 goto error; 2561 } 2562 2563 /* zero out stat counters */ 2564 spin_lock(&fs_info->balance_lock); 2565 memset(&bctl->stat, 0, sizeof(bctl->stat)); 2566 spin_unlock(&fs_info->balance_lock); 2567 again: 2568 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2569 key.offset = (u64)-1; 2570 key.type = BTRFS_CHUNK_ITEM_KEY; 2571 2572 while (1) { 2573 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 2574 atomic_read(&fs_info->balance_cancel_req)) { 2575 ret = -ECANCELED; 2576 goto error; 2577 } 2578 2579 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2580 if (ret < 0) 2581 goto error; 2582 2583 /* 2584 * this shouldn't happen, it means the last relocate 2585 * failed 2586 */ 2587 if (ret == 0) 2588 BUG(); /* FIXME break ? */ 2589 2590 ret = btrfs_previous_item(chunk_root, path, 0, 2591 BTRFS_CHUNK_ITEM_KEY); 2592 if (ret) { 2593 ret = 0; 2594 break; 2595 } 2596 2597 leaf = path->nodes[0]; 2598 slot = path->slots[0]; 2599 btrfs_item_key_to_cpu(leaf, &found_key, slot); 2600 2601 if (found_key.objectid != key.objectid) 2602 break; 2603 2604 /* chunk zero is special */ 2605 if (found_key.offset == 0) 2606 break; 2607 2608 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 2609 2610 if (!counting) { 2611 spin_lock(&fs_info->balance_lock); 2612 bctl->stat.considered++; 2613 spin_unlock(&fs_info->balance_lock); 2614 } 2615 2616 ret = should_balance_chunk(chunk_root, leaf, chunk, 2617 found_key.offset); 2618 btrfs_release_path(path); 2619 if (!ret) 2620 goto loop; 2621 2622 if (counting) { 2623 spin_lock(&fs_info->balance_lock); 2624 bctl->stat.expected++; 2625 spin_unlock(&fs_info->balance_lock); 2626 goto loop; 2627 } 2628 2629 ret = btrfs_relocate_chunk(chunk_root, 2630 chunk_root->root_key.objectid, 2631 found_key.objectid, 2632 found_key.offset); 2633 if (ret && ret != -ENOSPC) 2634 goto error; 2635 if (ret == -ENOSPC) { 2636 enospc_errors++; 2637 } else { 2638 spin_lock(&fs_info->balance_lock); 2639 bctl->stat.completed++; 2640 spin_unlock(&fs_info->balance_lock); 2641 } 2642 loop: 2643 key.offset = found_key.offset - 1; 2644 } 2645 2646 if (counting) { 2647 btrfs_release_path(path); 2648 counting = false; 2649 goto again; 2650 } 2651 error: 2652 btrfs_free_path(path); 2653 if (enospc_errors) { 2654 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 2655 enospc_errors); 2656 if (!ret) 2657 ret = -ENOSPC; 2658 } 2659 2660 return ret; 2661 } 2662 2663 /** 2664 * alloc_profile_is_valid - see if a given profile is valid and reduced 2665 * @flags: profile to validate 2666 * @extended: if true @flags is treated as an extended profile 2667 */ 2668 static int alloc_profile_is_valid(u64 flags, int extended) 2669 { 2670 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 2671 BTRFS_BLOCK_GROUP_PROFILE_MASK); 2672 2673 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 2674 2675 /* 1) check that all other bits are zeroed */ 2676 if (flags & ~mask) 2677 return 0; 2678 2679 /* 2) see if profile is reduced */ 2680 if (flags == 0) 2681 return !extended; /* "0" is valid for usual profiles */ 2682 2683 /* true if exactly one bit set */ 2684 return (flags & (flags - 1)) == 0; 2685 } 2686 2687 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 2688 { 2689 /* cancel requested || normal exit path */ 2690 return atomic_read(&fs_info->balance_cancel_req) || 2691 (atomic_read(&fs_info->balance_pause_req) == 0 && 2692 atomic_read(&fs_info->balance_cancel_req) == 0); 2693 } 2694 2695 static void __cancel_balance(struct btrfs_fs_info *fs_info) 2696 { 2697 int ret; 2698 2699 unset_balance_control(fs_info); 2700 ret = del_balance_item(fs_info->tree_root); 2701 BUG_ON(ret); 2702 } 2703 2704 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2705 struct btrfs_ioctl_balance_args *bargs); 2706 2707 /* 2708 * Should be called with both balance and volume mutexes held 2709 */ 2710 int btrfs_balance(struct btrfs_balance_control *bctl, 2711 struct btrfs_ioctl_balance_args *bargs) 2712 { 2713 struct btrfs_fs_info *fs_info = bctl->fs_info; 2714 u64 allowed; 2715 int mixed = 0; 2716 int ret; 2717 2718 if (btrfs_fs_closing(fs_info) || 2719 atomic_read(&fs_info->balance_pause_req) || 2720 atomic_read(&fs_info->balance_cancel_req)) { 2721 ret = -EINVAL; 2722 goto out; 2723 } 2724 2725 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 2726 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 2727 mixed = 1; 2728 2729 /* 2730 * In case of mixed groups both data and meta should be picked, 2731 * and identical options should be given for both of them. 
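 * (With the mixed-groups incompat flag set, data and metadata share the
 * same block groups, so balancing or converting one of them without the
 * other would not be a well-defined operation.)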
2732 */ 2733 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 2734 if (mixed && (bctl->flags & allowed)) { 2735 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 2736 !(bctl->flags & BTRFS_BALANCE_METADATA) || 2737 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 2738 printk(KERN_ERR "btrfs: with mixed groups data and " 2739 "metadata balance options must be the same\n"); 2740 ret = -EINVAL; 2741 goto out; 2742 } 2743 } 2744 2745 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 2746 if (fs_info->fs_devices->num_devices == 1) 2747 allowed |= BTRFS_BLOCK_GROUP_DUP; 2748 else if (fs_info->fs_devices->num_devices < 4) 2749 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 2750 else 2751 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2752 BTRFS_BLOCK_GROUP_RAID10); 2753 2754 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2755 (!alloc_profile_is_valid(bctl->data.target, 1) || 2756 (bctl->data.target & ~allowed))) { 2757 printk(KERN_ERR "btrfs: unable to start balance with target " 2758 "data profile %llu\n", 2759 (unsigned long long)bctl->data.target); 2760 ret = -EINVAL; 2761 goto out; 2762 } 2763 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2764 (!alloc_profile_is_valid(bctl->meta.target, 1) || 2765 (bctl->meta.target & ~allowed))) { 2766 printk(KERN_ERR "btrfs: unable to start balance with target " 2767 "metadata profile %llu\n", 2768 (unsigned long long)bctl->meta.target); 2769 ret = -EINVAL; 2770 goto out; 2771 } 2772 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2773 (!alloc_profile_is_valid(bctl->sys.target, 1) || 2774 (bctl->sys.target & ~allowed))) { 2775 printk(KERN_ERR "btrfs: unable to start balance with target " 2776 "system profile %llu\n", 2777 (unsigned long long)bctl->sys.target); 2778 ret = -EINVAL; 2779 goto out; 2780 } 2781 2782 /* allow dup'ed data chunks only in mixed mode */ 2783 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2784 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 2785 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 2786 ret = -EINVAL; 2787 goto out; 2788 } 2789 2790 /* allow to reduce meta or sys integrity only if force set */ 2791 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 2792 BTRFS_BLOCK_GROUP_RAID10; 2793 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2794 (fs_info->avail_system_alloc_bits & allowed) && 2795 !(bctl->sys.target & allowed)) || 2796 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 2797 (fs_info->avail_metadata_alloc_bits & allowed) && 2798 !(bctl->meta.target & allowed))) { 2799 if (bctl->flags & BTRFS_BALANCE_FORCE) { 2800 printk(KERN_INFO "btrfs: force reducing metadata " 2801 "integrity\n"); 2802 } else { 2803 printk(KERN_ERR "btrfs: balance will reduce metadata " 2804 "integrity, use force if you want this\n"); 2805 ret = -EINVAL; 2806 goto out; 2807 } 2808 } 2809 2810 ret = insert_balance_item(fs_info->tree_root, bctl); 2811 if (ret && ret != -EEXIST) 2812 goto out; 2813 2814 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 2815 BUG_ON(ret == -EEXIST); 2816 set_balance_control(bctl); 2817 } else { 2818 BUG_ON(ret != -EEXIST); 2819 spin_lock(&fs_info->balance_lock); 2820 update_balance_args(bctl); 2821 spin_unlock(&fs_info->balance_lock); 2822 } 2823 2824 atomic_inc(&fs_info->balance_running); 2825 mutex_unlock(&fs_info->balance_mutex); 2826 2827 ret = __btrfs_balance(fs_info); 2828 2829 mutex_lock(&fs_info->balance_mutex); 2830 atomic_dec(&fs_info->balance_running); 2831 2832 if (bargs) { 2833 memset(bargs, 0, sizeof(*bargs)); 2834 
update_ioctl_balance_args(fs_info, 0, bargs); 2835 } 2836 2837 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 2838 balance_need_close(fs_info)) { 2839 __cancel_balance(fs_info); 2840 } 2841 2842 wake_up(&fs_info->balance_wait_q); 2843 2844 return ret; 2845 out: 2846 if (bctl->flags & BTRFS_BALANCE_RESUME) 2847 __cancel_balance(fs_info); 2848 else 2849 kfree(bctl); 2850 return ret; 2851 } 2852 2853 static int balance_kthread(void *data) 2854 { 2855 struct btrfs_fs_info *fs_info = data; 2856 int ret = 0; 2857 2858 mutex_lock(&fs_info->volume_mutex); 2859 mutex_lock(&fs_info->balance_mutex); 2860 2861 if (fs_info->balance_ctl) { 2862 printk(KERN_INFO "btrfs: continuing balance\n"); 2863 ret = btrfs_balance(fs_info->balance_ctl, NULL); 2864 } 2865 2866 mutex_unlock(&fs_info->balance_mutex); 2867 mutex_unlock(&fs_info->volume_mutex); 2868 2869 return ret; 2870 } 2871 2872 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 2873 { 2874 struct task_struct *tsk; 2875 2876 spin_lock(&fs_info->balance_lock); 2877 if (!fs_info->balance_ctl) { 2878 spin_unlock(&fs_info->balance_lock); 2879 return 0; 2880 } 2881 spin_unlock(&fs_info->balance_lock); 2882 2883 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 2884 printk(KERN_INFO "btrfs: force skipping balance\n"); 2885 return 0; 2886 } 2887 2888 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 2889 if (IS_ERR(tsk)) 2890 return PTR_ERR(tsk); 2891 2892 return 0; 2893 } 2894 2895 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 2896 { 2897 struct btrfs_balance_control *bctl; 2898 struct btrfs_balance_item *item; 2899 struct btrfs_disk_balance_args disk_bargs; 2900 struct btrfs_path *path; 2901 struct extent_buffer *leaf; 2902 struct btrfs_key key; 2903 int ret; 2904 2905 path = btrfs_alloc_path(); 2906 if (!path) 2907 return -ENOMEM; 2908 2909 key.objectid = BTRFS_BALANCE_OBJECTID; 2910 key.type = BTRFS_BALANCE_ITEM_KEY; 2911 key.offset = 0; 2912 2913 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 2914 if (ret < 0) 2915 goto out; 2916 if (ret > 0) { /* ret = -ENOENT; */ 2917 ret = 0; 2918 goto out; 2919 } 2920 2921 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 2922 if (!bctl) { 2923 ret = -ENOMEM; 2924 goto out; 2925 } 2926 2927 leaf = path->nodes[0]; 2928 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 2929 2930 bctl->fs_info = fs_info; 2931 bctl->flags = btrfs_balance_flags(leaf, item); 2932 bctl->flags |= BTRFS_BALANCE_RESUME; 2933 2934 btrfs_balance_data(leaf, item, &disk_bargs); 2935 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 2936 btrfs_balance_meta(leaf, item, &disk_bargs); 2937 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 2938 btrfs_balance_sys(leaf, item, &disk_bargs); 2939 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 2940 2941 mutex_lock(&fs_info->volume_mutex); 2942 mutex_lock(&fs_info->balance_mutex); 2943 2944 set_balance_control(bctl); 2945 2946 mutex_unlock(&fs_info->balance_mutex); 2947 mutex_unlock(&fs_info->volume_mutex); 2948 out: 2949 btrfs_free_path(path); 2950 return ret; 2951 } 2952 2953 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 2954 { 2955 int ret = 0; 2956 2957 mutex_lock(&fs_info->balance_mutex); 2958 if (!fs_info->balance_ctl) { 2959 mutex_unlock(&fs_info->balance_mutex); 2960 return -ENOTCONN; 2961 } 2962 2963 if (atomic_read(&fs_info->balance_running)) { 2964 atomic_inc(&fs_info->balance_pause_req); 2965 mutex_unlock(&fs_info->balance_mutex); 2966 2967 wait_event(fs_info->balance_wait_q, 2968 
atomic_read(&fs_info->balance_running) == 0); 2969 2970 mutex_lock(&fs_info->balance_mutex); 2971 /* we are good with balance_ctl ripped off from under us */ 2972 BUG_ON(atomic_read(&fs_info->balance_running)); 2973 atomic_dec(&fs_info->balance_pause_req); 2974 } else { 2975 ret = -ENOTCONN; 2976 } 2977 2978 mutex_unlock(&fs_info->balance_mutex); 2979 return ret; 2980 } 2981 2982 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 2983 { 2984 mutex_lock(&fs_info->balance_mutex); 2985 if (!fs_info->balance_ctl) { 2986 mutex_unlock(&fs_info->balance_mutex); 2987 return -ENOTCONN; 2988 } 2989 2990 atomic_inc(&fs_info->balance_cancel_req); 2991 /* 2992 * if we are running just wait and return, balance item is 2993 * deleted in btrfs_balance in this case 2994 */ 2995 if (atomic_read(&fs_info->balance_running)) { 2996 mutex_unlock(&fs_info->balance_mutex); 2997 wait_event(fs_info->balance_wait_q, 2998 atomic_read(&fs_info->balance_running) == 0); 2999 mutex_lock(&fs_info->balance_mutex); 3000 } else { 3001 /* __cancel_balance needs volume_mutex */ 3002 mutex_unlock(&fs_info->balance_mutex); 3003 mutex_lock(&fs_info->volume_mutex); 3004 mutex_lock(&fs_info->balance_mutex); 3005 3006 if (fs_info->balance_ctl) 3007 __cancel_balance(fs_info); 3008 3009 mutex_unlock(&fs_info->volume_mutex); 3010 } 3011 3012 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); 3013 atomic_dec(&fs_info->balance_cancel_req); 3014 mutex_unlock(&fs_info->balance_mutex); 3015 return 0; 3016 } 3017 3018 /* 3019 * shrinking a device means finding all of the device extents past 3020 * the new size, and then following the back refs to the chunks. 3021 * The chunk relocation code actually frees the device extent 3022 */ 3023 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 3024 { 3025 struct btrfs_trans_handle *trans; 3026 struct btrfs_root *root = device->dev_root; 3027 struct btrfs_dev_extent *dev_extent = NULL; 3028 struct btrfs_path *path; 3029 u64 length; 3030 u64 chunk_tree; 3031 u64 chunk_objectid; 3032 u64 chunk_offset; 3033 int ret; 3034 int slot; 3035 int failed = 0; 3036 bool retried = false; 3037 struct extent_buffer *l; 3038 struct btrfs_key key; 3039 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3040 u64 old_total = btrfs_super_total_bytes(super_copy); 3041 u64 old_size = device->total_bytes; 3042 u64 diff = device->total_bytes - new_size; 3043 3044 if (new_size >= device->total_bytes) 3045 return -EINVAL; 3046 3047 path = btrfs_alloc_path(); 3048 if (!path) 3049 return -ENOMEM; 3050 3051 path->reada = 2; 3052 3053 lock_chunks(root); 3054 3055 device->total_bytes = new_size; 3056 if (device->writeable) { 3057 device->fs_devices->total_rw_bytes -= diff; 3058 spin_lock(&root->fs_info->free_chunk_lock); 3059 root->fs_info->free_chunk_space -= diff; 3060 spin_unlock(&root->fs_info->free_chunk_lock); 3061 } 3062 unlock_chunks(root); 3063 3064 again: 3065 key.objectid = device->devid; 3066 key.offset = (u64)-1; 3067 key.type = BTRFS_DEV_EXTENT_KEY; 3068 3069 do { 3070 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3071 if (ret < 0) 3072 goto done; 3073 3074 ret = btrfs_previous_item(root, path, 0, key.type); 3075 if (ret < 0) 3076 goto done; 3077 if (ret) { 3078 ret = 0; 3079 btrfs_release_path(path); 3080 break; 3081 } 3082 3083 l = path->nodes[0]; 3084 slot = path->slots[0]; 3085 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 3086 3087 if (key.objectid != device->devid) { 3088 btrfs_release_path(path); 3089 break; 3090 } 3091 3092 dev_extent = 
btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3093 length = btrfs_dev_extent_length(l, dev_extent); 3094 3095 if (key.offset + length <= new_size) { 3096 btrfs_release_path(path); 3097 break; 3098 } 3099 3100 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3101 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3102 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3103 btrfs_release_path(path); 3104 3105 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 3106 chunk_offset); 3107 if (ret && ret != -ENOSPC) 3108 goto done; 3109 if (ret == -ENOSPC) 3110 failed++; 3111 } while (key.offset-- > 0); 3112 3113 if (failed && !retried) { 3114 failed = 0; 3115 retried = true; 3116 goto again; 3117 } else if (failed && retried) { 3118 ret = -ENOSPC; 3119 lock_chunks(root); 3120 3121 device->total_bytes = old_size; 3122 if (device->writeable) 3123 device->fs_devices->total_rw_bytes += diff; 3124 spin_lock(&root->fs_info->free_chunk_lock); 3125 root->fs_info->free_chunk_space += diff; 3126 spin_unlock(&root->fs_info->free_chunk_lock); 3127 unlock_chunks(root); 3128 goto done; 3129 } 3130 3131 /* Shrinking succeeded, else we would be at "done". */ 3132 trans = btrfs_start_transaction(root, 0); 3133 if (IS_ERR(trans)) { 3134 ret = PTR_ERR(trans); 3135 goto done; 3136 } 3137 3138 lock_chunks(root); 3139 3140 device->disk_total_bytes = new_size; 3141 /* Now btrfs_update_device() will change the on-disk size. */ 3142 ret = btrfs_update_device(trans, device); 3143 if (ret) { 3144 unlock_chunks(root); 3145 btrfs_end_transaction(trans, root); 3146 goto done; 3147 } 3148 WARN_ON(diff > old_total); 3149 btrfs_set_super_total_bytes(super_copy, old_total - diff); 3150 unlock_chunks(root); 3151 btrfs_end_transaction(trans, root); 3152 done: 3153 btrfs_free_path(path); 3154 return ret; 3155 } 3156 3157 static int btrfs_add_system_chunk(struct btrfs_root *root, 3158 struct btrfs_key *key, 3159 struct btrfs_chunk *chunk, int item_size) 3160 { 3161 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3162 struct btrfs_disk_key disk_key; 3163 u32 array_size; 3164 u8 *ptr; 3165 3166 array_size = btrfs_super_sys_array_size(super_copy); 3167 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 3168 return -EFBIG; 3169 3170 ptr = super_copy->sys_chunk_array + array_size; 3171 btrfs_cpu_key_to_disk(&disk_key, key); 3172 memcpy(ptr, &disk_key, sizeof(disk_key)); 3173 ptr += sizeof(disk_key); 3174 memcpy(ptr, chunk, item_size); 3175 item_size += sizeof(disk_key); 3176 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 3177 return 0; 3178 } 3179 3180 /* 3181 * sort the devices in descending order by max_avail, total_avail 3182 */ 3183 static int btrfs_cmp_device_info(const void *a, const void *b) 3184 { 3185 const struct btrfs_device_info *di_a = a; 3186 const struct btrfs_device_info *di_b = b; 3187 3188 if (di_a->max_avail > di_b->max_avail) 3189 return -1; 3190 if (di_a->max_avail < di_b->max_avail) 3191 return 1; 3192 if (di_a->total_avail > di_b->total_avail) 3193 return -1; 3194 if (di_a->total_avail < di_b->total_avail) 3195 return 1; 3196 return 0; 3197 } 3198 3199 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3200 struct btrfs_root *extent_root, 3201 struct map_lookup **map_ret, 3202 u64 *num_bytes_out, u64 *stripe_size_out, 3203 u64 start, u64 type) 3204 { 3205 struct btrfs_fs_info *info = extent_root->fs_info; 3206 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3207 struct list_head *cur; 3208 struct 
map_lookup *map = NULL; 3209 struct extent_map_tree *em_tree; 3210 struct extent_map *em; 3211 struct btrfs_device_info *devices_info = NULL; 3212 u64 total_avail; 3213 int num_stripes; /* total number of stripes to allocate */ 3214 int sub_stripes; /* sub_stripes info for map */ 3215 int dev_stripes; /* stripes per dev */ 3216 int devs_max; /* max devs to use */ 3217 int devs_min; /* min devs needed */ 3218 int devs_increment; /* ndevs has to be a multiple of this */ 3219 int ncopies; /* how many copies to data has */ 3220 int ret; 3221 u64 max_stripe_size; 3222 u64 max_chunk_size; 3223 u64 stripe_size; 3224 u64 num_bytes; 3225 int ndevs; 3226 int i; 3227 int j; 3228 3229 BUG_ON(!alloc_profile_is_valid(type, 0)); 3230 3231 if (list_empty(&fs_devices->alloc_list)) 3232 return -ENOSPC; 3233 3234 sub_stripes = 1; 3235 dev_stripes = 1; 3236 devs_increment = 1; 3237 ncopies = 1; 3238 devs_max = 0; /* 0 == as many as possible */ 3239 devs_min = 1; 3240 3241 /* 3242 * define the properties of each RAID type. 3243 * FIXME: move this to a global table and use it in all RAID 3244 * calculation code 3245 */ 3246 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 3247 dev_stripes = 2; 3248 ncopies = 2; 3249 devs_max = 1; 3250 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 3251 devs_min = 2; 3252 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 3253 devs_increment = 2; 3254 ncopies = 2; 3255 devs_max = 2; 3256 devs_min = 2; 3257 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 3258 sub_stripes = 2; 3259 devs_increment = 2; 3260 ncopies = 2; 3261 devs_min = 4; 3262 } else { 3263 devs_max = 1; 3264 } 3265 3266 if (type & BTRFS_BLOCK_GROUP_DATA) { 3267 max_stripe_size = 1024 * 1024 * 1024; 3268 max_chunk_size = 10 * max_stripe_size; 3269 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3270 /* for larger filesystems, use larger metadata chunks */ 3271 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) 3272 max_stripe_size = 1024 * 1024 * 1024; 3273 else 3274 max_stripe_size = 256 * 1024 * 1024; 3275 max_chunk_size = max_stripe_size; 3276 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3277 max_stripe_size = 32 * 1024 * 1024; 3278 max_chunk_size = 2 * max_stripe_size; 3279 } else { 3280 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3281 type); 3282 BUG_ON(1); 3283 } 3284 3285 /* we don't want a chunk larger than 10% of writeable space */ 3286 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 3287 max_chunk_size); 3288 3289 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, 3290 GFP_NOFS); 3291 if (!devices_info) 3292 return -ENOMEM; 3293 3294 cur = fs_devices->alloc_list.next; 3295 3296 /* 3297 * in the first pass through the devices list, we gather information 3298 * about the available holes on each device. 3299 */ 3300 ndevs = 0; 3301 while (cur != &fs_devices->alloc_list) { 3302 struct btrfs_device *device; 3303 u64 max_avail; 3304 u64 dev_offset; 3305 3306 device = list_entry(cur, struct btrfs_device, dev_alloc_list); 3307 3308 cur = cur->next; 3309 3310 if (!device->writeable) { 3311 printk(KERN_ERR 3312 "btrfs: read-only device in alloc_list\n"); 3313 WARN_ON(1); 3314 continue; 3315 } 3316 3317 if (!device->in_fs_metadata) 3318 continue; 3319 3320 if (device->total_bytes > device->bytes_used) 3321 total_avail = device->total_bytes - device->bytes_used; 3322 else 3323 total_avail = 0; 3324 3325 /* If there is no space on this device, skip it. 
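 * Skipping such devices here also spares find_free_dev_extent() below a
 * search for a free extent that cannot exist.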
*/ 3326 if (total_avail == 0) 3327 continue; 3328 3329 ret = find_free_dev_extent(device, 3330 max_stripe_size * dev_stripes, 3331 &dev_offset, &max_avail); 3332 if (ret && ret != -ENOSPC) 3333 goto error; 3334 3335 if (ret == 0) 3336 max_avail = max_stripe_size * dev_stripes; 3337 3338 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) 3339 continue; 3340 3341 devices_info[ndevs].dev_offset = dev_offset; 3342 devices_info[ndevs].max_avail = max_avail; 3343 devices_info[ndevs].total_avail = total_avail; 3344 devices_info[ndevs].dev = device; 3345 ++ndevs; 3346 } 3347 3348 /* 3349 * now sort the devices by hole size / available space 3350 */ 3351 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 3352 btrfs_cmp_device_info, NULL); 3353 3354 /* round down to number of usable stripes */ 3355 ndevs -= ndevs % devs_increment; 3356 3357 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { 3358 ret = -ENOSPC; 3359 goto error; 3360 } 3361 3362 if (devs_max && ndevs > devs_max) 3363 ndevs = devs_max; 3364 /* 3365 * the primary goal is to maximize the number of stripes, so use as many 3366 * devices as possible, even if the stripes are not maximum sized. 3367 */ 3368 stripe_size = devices_info[ndevs-1].max_avail; 3369 num_stripes = ndevs * dev_stripes; 3370 3371 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3372 stripe_size = max_chunk_size * ncopies; 3373 do_div(stripe_size, ndevs); 3374 } 3375 3376 do_div(stripe_size, dev_stripes); 3377 3378 /* align to BTRFS_STRIPE_LEN */ 3379 do_div(stripe_size, BTRFS_STRIPE_LEN); 3380 stripe_size *= BTRFS_STRIPE_LEN; 3381 3382 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3383 if (!map) { 3384 ret = -ENOMEM; 3385 goto error; 3386 } 3387 map->num_stripes = num_stripes; 3388 3389 for (i = 0; i < ndevs; ++i) { 3390 for (j = 0; j < dev_stripes; ++j) { 3391 int s = i * dev_stripes + j; 3392 map->stripes[s].dev = devices_info[i].dev; 3393 map->stripes[s].physical = devices_info[i].dev_offset + 3394 j * stripe_size; 3395 } 3396 } 3397 map->sector_size = extent_root->sectorsize; 3398 map->stripe_len = BTRFS_STRIPE_LEN; 3399 map->io_align = BTRFS_STRIPE_LEN; 3400 map->io_width = BTRFS_STRIPE_LEN; 3401 map->type = type; 3402 map->sub_stripes = sub_stripes; 3403 3404 *map_ret = map; 3405 num_bytes = stripe_size * (num_stripes / ncopies); 3406 3407 *stripe_size_out = stripe_size; 3408 *num_bytes_out = num_bytes; 3409 3410 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3411 3412 em = alloc_extent_map(); 3413 if (!em) { 3414 ret = -ENOMEM; 3415 goto error; 3416 } 3417 em->bdev = (struct block_device *)map; 3418 em->start = start; 3419 em->len = num_bytes; 3420 em->block_start = 0; 3421 em->block_len = em->len; 3422 3423 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3424 write_lock(&em_tree->lock); 3425 ret = add_extent_mapping(em_tree, em); 3426 write_unlock(&em_tree->lock); 3427 free_extent_map(em); 3428 if (ret) 3429 goto error; 3430 3431 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3432 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3433 start, num_bytes); 3434 if (ret) 3435 goto error; 3436 3437 for (i = 0; i < map->num_stripes; ++i) { 3438 struct btrfs_device *device; 3439 u64 dev_offset; 3440 3441 device = map->stripes[i].dev; 3442 dev_offset = map->stripes[i].physical; 3443 3444 ret = btrfs_alloc_dev_extent(trans, device, 3445 info->chunk_root->root_key.objectid, 3446 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3447 start, dev_offset, stripe_size); 3448 if (ret) { 3449 btrfs_abort_transaction(trans, extent_root, ret); 
3450 goto error; 3451 } 3452 } 3453 3454 kfree(devices_info); 3455 return 0; 3456 3457 error: 3458 kfree(map); 3459 kfree(devices_info); 3460 return ret; 3461 } 3462 3463 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3464 struct btrfs_root *extent_root, 3465 struct map_lookup *map, u64 chunk_offset, 3466 u64 chunk_size, u64 stripe_size) 3467 { 3468 u64 dev_offset; 3469 struct btrfs_key key; 3470 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3471 struct btrfs_device *device; 3472 struct btrfs_chunk *chunk; 3473 struct btrfs_stripe *stripe; 3474 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3475 int index = 0; 3476 int ret; 3477 3478 chunk = kzalloc(item_size, GFP_NOFS); 3479 if (!chunk) 3480 return -ENOMEM; 3481 3482 index = 0; 3483 while (index < map->num_stripes) { 3484 device = map->stripes[index].dev; 3485 device->bytes_used += stripe_size; 3486 ret = btrfs_update_device(trans, device); 3487 if (ret) 3488 goto out_free; 3489 index++; 3490 } 3491 3492 spin_lock(&extent_root->fs_info->free_chunk_lock); 3493 extent_root->fs_info->free_chunk_space -= (stripe_size * 3494 map->num_stripes); 3495 spin_unlock(&extent_root->fs_info->free_chunk_lock); 3496 3497 index = 0; 3498 stripe = &chunk->stripe; 3499 while (index < map->num_stripes) { 3500 device = map->stripes[index].dev; 3501 dev_offset = map->stripes[index].physical; 3502 3503 btrfs_set_stack_stripe_devid(stripe, device->devid); 3504 btrfs_set_stack_stripe_offset(stripe, dev_offset); 3505 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 3506 stripe++; 3507 index++; 3508 } 3509 3510 btrfs_set_stack_chunk_length(chunk, chunk_size); 3511 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 3512 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 3513 btrfs_set_stack_chunk_type(chunk, map->type); 3514 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 3515 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 3516 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 3517 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); 3518 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 3519 3520 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3521 key.type = BTRFS_CHUNK_ITEM_KEY; 3522 key.offset = chunk_offset; 3523 3524 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 3525 3526 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3527 /* 3528 * TODO: Cleanup of inserted chunk root in case of 3529 * failure. 3530 */ 3531 ret = btrfs_add_system_chunk(chunk_root, &key, chunk, 3532 item_size); 3533 } 3534 3535 out_free: 3536 kfree(chunk); 3537 return ret; 3538 } 3539 3540 /* 3541 * Chunk allocation falls into two parts. The first part does works 3542 * that make the new allocated chunk useable, but not do any operation 3543 * that modifies the chunk tree. The second part does the works that 3544 * require modifying the chunk tree. This division is important for the 3545 * bootstrap process of adding storage to a seed btrfs. 
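 * In this file the first part is __btrfs_alloc_chunk() (pick the devices,
 * build the map_lookup, insert the extent mapping, create the block group
 * and the device extents) and the second part is __finish_chunk_alloc()
 * (update the device items and insert the chunk item, plus the
 * sys_chunk_array entry for SYSTEM chunks).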
3546 */ 3547 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3548 struct btrfs_root *extent_root, u64 type) 3549 { 3550 u64 chunk_offset; 3551 u64 chunk_size; 3552 u64 stripe_size; 3553 struct map_lookup *map; 3554 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3555 int ret; 3556 3557 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3558 &chunk_offset); 3559 if (ret) 3560 return ret; 3561 3562 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3563 &stripe_size, chunk_offset, type); 3564 if (ret) 3565 return ret; 3566 3567 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3568 chunk_size, stripe_size); 3569 if (ret) 3570 return ret; 3571 return 0; 3572 } 3573 3574 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 3575 struct btrfs_root *root, 3576 struct btrfs_device *device) 3577 { 3578 u64 chunk_offset; 3579 u64 sys_chunk_offset; 3580 u64 chunk_size; 3581 u64 sys_chunk_size; 3582 u64 stripe_size; 3583 u64 sys_stripe_size; 3584 u64 alloc_profile; 3585 struct map_lookup *map; 3586 struct map_lookup *sys_map; 3587 struct btrfs_fs_info *fs_info = root->fs_info; 3588 struct btrfs_root *extent_root = fs_info->extent_root; 3589 int ret; 3590 3591 ret = find_next_chunk(fs_info->chunk_root, 3592 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 3593 if (ret) 3594 return ret; 3595 3596 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3597 fs_info->avail_metadata_alloc_bits; 3598 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3599 3600 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3601 &stripe_size, chunk_offset, alloc_profile); 3602 if (ret) 3603 return ret; 3604 3605 sys_chunk_offset = chunk_offset + chunk_size; 3606 3607 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3608 fs_info->avail_system_alloc_bits; 3609 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3610 3611 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3612 &sys_chunk_size, &sys_stripe_size, 3613 sys_chunk_offset, alloc_profile); 3614 if (ret) 3615 goto abort; 3616 3617 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 3618 if (ret) 3619 goto abort; 3620 3621 /* 3622 * Modifying chunk tree needs allocating new blocks from both 3623 * system block group and metadata block group. So we only can 3624 * do operations require modifying the chunk tree after both 3625 * block groups were created. 
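 * Both __btrfs_alloc_chunk() calls above have therefore already created
 * the metadata and system block groups; only now do the
 * __finish_chunk_alloc() calls below insert the corresponding items into
 * the chunk tree.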
3626 */ 3627 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, 3628 chunk_size, stripe_size); 3629 if (ret) 3630 goto abort; 3631 3632 ret = __finish_chunk_alloc(trans, extent_root, sys_map, 3633 sys_chunk_offset, sys_chunk_size, 3634 sys_stripe_size); 3635 if (ret) 3636 goto abort; 3637 3638 return 0; 3639 3640 abort: 3641 btrfs_abort_transaction(trans, root, ret); 3642 return ret; 3643 } 3644 3645 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 3646 { 3647 struct extent_map *em; 3648 struct map_lookup *map; 3649 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 3650 int readonly = 0; 3651 int i; 3652 3653 read_lock(&map_tree->map_tree.lock); 3654 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3655 read_unlock(&map_tree->map_tree.lock); 3656 if (!em) 3657 return 1; 3658 3659 if (btrfs_test_opt(root, DEGRADED)) { 3660 free_extent_map(em); 3661 return 0; 3662 } 3663 3664 map = (struct map_lookup *)em->bdev; 3665 for (i = 0; i < map->num_stripes; i++) { 3666 if (!map->stripes[i].dev->writeable) { 3667 readonly = 1; 3668 break; 3669 } 3670 } 3671 free_extent_map(em); 3672 return readonly; 3673 } 3674 3675 void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 3676 { 3677 extent_map_tree_init(&tree->map_tree); 3678 } 3679 3680 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 3681 { 3682 struct extent_map *em; 3683 3684 while (1) { 3685 write_lock(&tree->map_tree.lock); 3686 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 3687 if (em) 3688 remove_extent_mapping(&tree->map_tree, em); 3689 write_unlock(&tree->map_tree.lock); 3690 if (!em) 3691 break; 3692 kfree(em->bdev); 3693 /* once for us */ 3694 free_extent_map(em); 3695 /* once for the tree */ 3696 free_extent_map(em); 3697 } 3698 } 3699 3700 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) 3701 { 3702 struct extent_map *em; 3703 struct map_lookup *map; 3704 struct extent_map_tree *em_tree = &map_tree->map_tree; 3705 int ret; 3706 3707 read_lock(&em_tree->lock); 3708 em = lookup_extent_mapping(em_tree, logical, len); 3709 read_unlock(&em_tree->lock); 3710 BUG_ON(!em); 3711 3712 BUG_ON(em->start > logical || em->start + em->len < logical); 3713 map = (struct map_lookup *)em->bdev; 3714 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 3715 ret = map->num_stripes; 3716 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 3717 ret = map->sub_stripes; 3718 else 3719 ret = 1; 3720 free_extent_map(em); 3721 return ret; 3722 } 3723 3724 static int find_live_mirror(struct map_lookup *map, int first, int num, 3725 int optimal) 3726 { 3727 int i; 3728 if (map->stripes[optimal].dev->bdev) 3729 return optimal; 3730 for (i = first; i < first + num; i++) { 3731 if (map->stripes[i].dev->bdev) 3732 return i; 3733 } 3734 /* we couldn't find one that doesn't fail. 
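 * (This can happen when none of the devices backing this chunk has an
 * open bdev, e.g. all of its mirrors sit on missing devices.)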
Just return something 3735 * and the io error handling code will clean up eventually 3736 */ 3737 return optimal; 3738 } 3739 3740 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3741 u64 logical, u64 *length, 3742 struct btrfs_bio **bbio_ret, 3743 int mirror_num) 3744 { 3745 struct extent_map *em; 3746 struct map_lookup *map; 3747 struct extent_map_tree *em_tree = &map_tree->map_tree; 3748 u64 offset; 3749 u64 stripe_offset; 3750 u64 stripe_end_offset; 3751 u64 stripe_nr; 3752 u64 stripe_nr_orig; 3753 u64 stripe_nr_end; 3754 int stripe_index; 3755 int i; 3756 int ret = 0; 3757 int num_stripes; 3758 int max_errors = 0; 3759 struct btrfs_bio *bbio = NULL; 3760 3761 read_lock(&em_tree->lock); 3762 em = lookup_extent_mapping(em_tree, logical, *length); 3763 read_unlock(&em_tree->lock); 3764 3765 if (!em) { 3766 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 3767 (unsigned long long)logical, 3768 (unsigned long long)*length); 3769 BUG(); 3770 } 3771 3772 BUG_ON(em->start > logical || em->start + em->len < logical); 3773 map = (struct map_lookup *)em->bdev; 3774 offset = logical - em->start; 3775 3776 if (mirror_num > map->num_stripes) 3777 mirror_num = 0; 3778 3779 stripe_nr = offset; 3780 /* 3781 * stripe_nr counts the total number of stripes we have to stride 3782 * to get to this block 3783 */ 3784 do_div(stripe_nr, map->stripe_len); 3785 3786 stripe_offset = stripe_nr * map->stripe_len; 3787 BUG_ON(offset < stripe_offset); 3788 3789 /* stripe_offset is the offset of this block in its stripe*/ 3790 stripe_offset = offset - stripe_offset; 3791 3792 if (rw & REQ_DISCARD) 3793 *length = min_t(u64, em->len - offset, *length); 3794 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 3795 /* we limit the length of each bio to what fits in a stripe */ 3796 *length = min_t(u64, em->len - offset, 3797 map->stripe_len - stripe_offset); 3798 } else { 3799 *length = em->len - offset; 3800 } 3801 3802 if (!bbio_ret) 3803 goto out; 3804 3805 num_stripes = 1; 3806 stripe_index = 0; 3807 stripe_nr_orig = stripe_nr; 3808 stripe_nr_end = (offset + *length + map->stripe_len - 1) & 3809 (~(map->stripe_len - 1)); 3810 do_div(stripe_nr_end, map->stripe_len); 3811 stripe_end_offset = stripe_nr_end * map->stripe_len - 3812 (offset + *length); 3813 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3814 if (rw & REQ_DISCARD) 3815 num_stripes = min_t(u64, map->num_stripes, 3816 stripe_nr_end - stripe_nr_orig); 3817 stripe_index = do_div(stripe_nr, map->num_stripes); 3818 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3819 if (rw & (REQ_WRITE | REQ_DISCARD)) 3820 num_stripes = map->num_stripes; 3821 else if (mirror_num) 3822 stripe_index = mirror_num - 1; 3823 else { 3824 stripe_index = find_live_mirror(map, 0, 3825 map->num_stripes, 3826 current->pid % map->num_stripes); 3827 mirror_num = stripe_index + 1; 3828 } 3829 3830 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3831 if (rw & (REQ_WRITE | REQ_DISCARD)) { 3832 num_stripes = map->num_stripes; 3833 } else if (mirror_num) { 3834 stripe_index = mirror_num - 1; 3835 } else { 3836 mirror_num = 1; 3837 } 3838 3839 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3840 int factor = map->num_stripes / map->sub_stripes; 3841 3842 stripe_index = do_div(stripe_nr, factor); 3843 stripe_index *= map->sub_stripes; 3844 3845 if (rw & REQ_WRITE) 3846 num_stripes = map->sub_stripes; 3847 else if (rw & REQ_DISCARD) 3848 num_stripes = min_t(u64, map->sub_stripes * 3849 (stripe_nr_end - stripe_nr_orig), 3850 map->num_stripes); 3851 else if 
(mirror_num) 3852 stripe_index += mirror_num - 1; 3853 else { 3854 int old_stripe_index = stripe_index; 3855 stripe_index = find_live_mirror(map, stripe_index, 3856 map->sub_stripes, stripe_index + 3857 current->pid % map->sub_stripes); 3858 mirror_num = stripe_index - old_stripe_index + 1; 3859 } 3860 } else { 3861 /* 3862 * after this do_div call, stripe_nr is the number of stripes 3863 * on this device we have to walk to find the data, and 3864 * stripe_index is the number of our device in the stripe array 3865 */ 3866 stripe_index = do_div(stripe_nr, map->num_stripes); 3867 mirror_num = stripe_index + 1; 3868 } 3869 BUG_ON(stripe_index >= map->num_stripes); 3870 3871 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); 3872 if (!bbio) { 3873 ret = -ENOMEM; 3874 goto out; 3875 } 3876 atomic_set(&bbio->error, 0); 3877 3878 if (rw & REQ_DISCARD) { 3879 int factor = 0; 3880 int sub_stripes = 0; 3881 u64 stripes_per_dev = 0; 3882 u32 remaining_stripes = 0; 3883 u32 last_stripe = 0; 3884 3885 if (map->type & 3886 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 3887 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 3888 sub_stripes = 1; 3889 else 3890 sub_stripes = map->sub_stripes; 3891 3892 factor = map->num_stripes / sub_stripes; 3893 stripes_per_dev = div_u64_rem(stripe_nr_end - 3894 stripe_nr_orig, 3895 factor, 3896 &remaining_stripes); 3897 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 3898 last_stripe *= sub_stripes; 3899 } 3900 3901 for (i = 0; i < num_stripes; i++) { 3902 bbio->stripes[i].physical = 3903 map->stripes[stripe_index].physical + 3904 stripe_offset + stripe_nr * map->stripe_len; 3905 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3906 3907 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3908 BTRFS_BLOCK_GROUP_RAID10)) { 3909 bbio->stripes[i].length = stripes_per_dev * 3910 map->stripe_len; 3911 3912 if (i / sub_stripes < remaining_stripes) 3913 bbio->stripes[i].length += 3914 map->stripe_len; 3915 3916 /* 3917 * Special for the first stripe and 3918 * the last stripe: 3919 * 3920 * |-------|...|-------| 3921 * |----------| 3922 * off end_off 3923 */ 3924 if (i < sub_stripes) 3925 bbio->stripes[i].length -= 3926 stripe_offset; 3927 3928 if (stripe_index >= last_stripe && 3929 stripe_index <= (last_stripe + 3930 sub_stripes - 1)) 3931 bbio->stripes[i].length -= 3932 stripe_end_offset; 3933 3934 if (i == sub_stripes - 1) 3935 stripe_offset = 0; 3936 } else 3937 bbio->stripes[i].length = *length; 3938 3939 stripe_index++; 3940 if (stripe_index == map->num_stripes) { 3941 /* This could only happen for RAID0/10 */ 3942 stripe_index = 0; 3943 stripe_nr++; 3944 } 3945 } 3946 } else { 3947 for (i = 0; i < num_stripes; i++) { 3948 bbio->stripes[i].physical = 3949 map->stripes[stripe_index].physical + 3950 stripe_offset + 3951 stripe_nr * map->stripe_len; 3952 bbio->stripes[i].dev = 3953 map->stripes[stripe_index].dev; 3954 stripe_index++; 3955 } 3956 } 3957 3958 if (rw & REQ_WRITE) { 3959 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 3960 BTRFS_BLOCK_GROUP_RAID10 | 3961 BTRFS_BLOCK_GROUP_DUP)) { 3962 max_errors = 1; 3963 } 3964 } 3965 3966 *bbio_ret = bbio; 3967 bbio->num_stripes = num_stripes; 3968 bbio->max_errors = max_errors; 3969 bbio->mirror_num = mirror_num; 3970 out: 3971 free_extent_map(em); 3972 return ret; 3973 } 3974 3975 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3976 u64 logical, u64 *length, 3977 struct btrfs_bio **bbio_ret, int mirror_num) 3978 { 3979 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, 3980 mirror_num); 
3981 } 3982 3983 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3984 u64 chunk_start, u64 physical, u64 devid, 3985 u64 **logical, int *naddrs, int *stripe_len) 3986 { 3987 struct extent_map_tree *em_tree = &map_tree->map_tree; 3988 struct extent_map *em; 3989 struct map_lookup *map; 3990 u64 *buf; 3991 u64 bytenr; 3992 u64 length; 3993 u64 stripe_nr; 3994 int i, j, nr = 0; 3995 3996 read_lock(&em_tree->lock); 3997 em = lookup_extent_mapping(em_tree, chunk_start, 1); 3998 read_unlock(&em_tree->lock); 3999 4000 BUG_ON(!em || em->start != chunk_start); 4001 map = (struct map_lookup *)em->bdev; 4002 4003 length = em->len; 4004 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4005 do_div(length, map->num_stripes / map->sub_stripes); 4006 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4007 do_div(length, map->num_stripes); 4008 4009 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4010 BUG_ON(!buf); /* -ENOMEM */ 4011 4012 for (i = 0; i < map->num_stripes; i++) { 4013 if (devid && map->stripes[i].dev->devid != devid) 4014 continue; 4015 if (map->stripes[i].physical > physical || 4016 map->stripes[i].physical + length <= physical) 4017 continue; 4018 4019 stripe_nr = physical - map->stripes[i].physical; 4020 do_div(stripe_nr, map->stripe_len); 4021 4022 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 4023 stripe_nr = stripe_nr * map->num_stripes + i; 4024 do_div(stripe_nr, map->sub_stripes); 4025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4026 stripe_nr = stripe_nr * map->num_stripes + i; 4027 } 4028 bytenr = chunk_start + stripe_nr * map->stripe_len; 4029 WARN_ON(nr >= map->num_stripes); 4030 for (j = 0; j < nr; j++) { 4031 if (buf[j] == bytenr) 4032 break; 4033 } 4034 if (j == nr) { 4035 WARN_ON(nr >= map->num_stripes); 4036 buf[nr++] = bytenr; 4037 } 4038 } 4039 4040 *logical = buf; 4041 *naddrs = nr; 4042 *stripe_len = map->stripe_len; 4043 4044 free_extent_map(em); 4045 return 0; 4046 } 4047 4048 static void *merge_stripe_index_into_bio_private(void *bi_private, 4049 unsigned int stripe_index) 4050 { 4051 /* 4052 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is 4053 * at most 1. 4054 * The alternative solution (instead of stealing bits from the 4055 * pointer) would be to allocate an intermediate structure 4056 * that contains the old private pointer plus the stripe_index. 
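 * Stealing the low bits relies on bi_private pointing at a kmalloc'ed
 * btrfs_bio, which is at least 4-byte aligned; that is why the BUG_ON()s
 * below check the low two bits and limit stripe_index to 3.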
4057 */ 4058 BUG_ON((((uintptr_t)bi_private) & 3) != 0); 4059 BUG_ON(stripe_index > 3); 4060 return (void *)(((uintptr_t)bi_private) | stripe_index); 4061 } 4062 4063 static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) 4064 { 4065 return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); 4066 } 4067 4068 static unsigned int extract_stripe_index_from_bio_private(void *bi_private) 4069 { 4070 return (unsigned int)((uintptr_t)bi_private) & 3; 4071 } 4072 4073 static void btrfs_end_bio(struct bio *bio, int err) 4074 { 4075 struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); 4076 int is_orig_bio = 0; 4077 4078 if (err) { 4079 atomic_inc(&bbio->error); 4080 if (err == -EIO || err == -EREMOTEIO) { 4081 unsigned int stripe_index = 4082 extract_stripe_index_from_bio_private( 4083 bio->bi_private); 4084 struct btrfs_device *dev; 4085 4086 BUG_ON(stripe_index >= bbio->num_stripes); 4087 dev = bbio->stripes[stripe_index].dev; 4088 if (dev->bdev) { 4089 if (bio->bi_rw & WRITE) 4090 btrfs_dev_stat_inc(dev, 4091 BTRFS_DEV_STAT_WRITE_ERRS); 4092 else 4093 btrfs_dev_stat_inc(dev, 4094 BTRFS_DEV_STAT_READ_ERRS); 4095 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 4096 btrfs_dev_stat_inc(dev, 4097 BTRFS_DEV_STAT_FLUSH_ERRS); 4098 btrfs_dev_stat_print_on_error(dev); 4099 } 4100 } 4101 } 4102 4103 if (bio == bbio->orig_bio) 4104 is_orig_bio = 1; 4105 4106 if (atomic_dec_and_test(&bbio->stripes_pending)) { 4107 if (!is_orig_bio) { 4108 bio_put(bio); 4109 bio = bbio->orig_bio; 4110 } 4111 bio->bi_private = bbio->private; 4112 bio->bi_end_io = bbio->end_io; 4113 bio->bi_bdev = (struct block_device *) 4114 (unsigned long)bbio->mirror_num; 4115 /* only send an error to the higher layers if it is 4116 * beyond the tolerance of the multi-bio 4117 */ 4118 if (atomic_read(&bbio->error) > bbio->max_errors) { 4119 err = -EIO; 4120 } else { 4121 /* 4122 * this bio is actually up to date, we didn't 4123 * go over the max number of errors 4124 */ 4125 set_bit(BIO_UPTODATE, &bio->bi_flags); 4126 err = 0; 4127 } 4128 kfree(bbio); 4129 4130 bio_endio(bio, err); 4131 } else if (!is_orig_bio) { 4132 bio_put(bio); 4133 } 4134 } 4135 4136 struct async_sched { 4137 struct bio *bio; 4138 int rw; 4139 struct btrfs_fs_info *info; 4140 struct btrfs_work work; 4141 }; 4142 4143 /* 4144 * see run_scheduled_bios for a description of why bios are collected for 4145 * async submit. 4146 * 4147 * This will add one bio to the pending list for a device and make sure 4148 * the work struct is scheduled. 4149 */ 4150 static noinline void schedule_bio(struct btrfs_root *root, 4151 struct btrfs_device *device, 4152 int rw, struct bio *bio) 4153 { 4154 int should_queue = 1; 4155 struct btrfs_pending_bios *pending_bios; 4156 4157 /* don't bother with additional async steps for reads, right now */ 4158 if (!(rw & REQ_WRITE)) { 4159 bio_get(bio); 4160 btrfsic_submit_bio(rw, bio); 4161 bio_put(bio); 4162 return; 4163 } 4164 4165 /* 4166 * nr_async_bios allows us to reliably return congestion to the 4167 * higher layers. 
Otherwise, the async bio makes it appear we have
4168 * made progress against dirty pages when we've really just put it
4169 * on a queue for later
4170 */
4171 atomic_inc(&root->fs_info->nr_async_bios);
4172 WARN_ON(bio->bi_next);
4173 bio->bi_next = NULL;
4174 bio->bi_rw |= rw;
4175
4176 spin_lock(&device->io_lock);
4177 if (bio->bi_rw & REQ_SYNC)
4178 pending_bios = &device->pending_sync_bios;
4179 else
4180 pending_bios = &device->pending_bios;
4181
4182 if (pending_bios->tail)
4183 pending_bios->tail->bi_next = bio;
4184
4185 pending_bios->tail = bio;
4186 if (!pending_bios->head)
4187 pending_bios->head = bio;
4188 if (device->running_pending)
4189 should_queue = 0;
4190
4191 spin_unlock(&device->io_lock);
4192
4193 if (should_queue)
4194 btrfs_queue_worker(&root->fs_info->submit_workers,
4195 &device->work);
4196 }
4197
4198 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4199 int mirror_num, int async_submit)
4200 {
4201 struct btrfs_mapping_tree *map_tree;
4202 struct btrfs_device *dev;
4203 struct bio *first_bio = bio;
4204 u64 logical = (u64)bio->bi_sector << 9;
4205 u64 length = 0;
4206 u64 map_length;
4207 int ret;
4208 int dev_nr = 0;
4209 int total_devs = 1;
4210 struct btrfs_bio *bbio = NULL;
4211
4212 length = bio->bi_size;
4213 map_tree = &root->fs_info->mapping_tree;
4214 map_length = length;
4215
4216 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4217 mirror_num);
4218 if (ret) /* -ENOMEM */
4219 return ret;
4220
4221 total_devs = bbio->num_stripes;
4222 if (map_length < length) {
4223 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4224 "len %llu\n", (unsigned long long)logical,
4225 (unsigned long long)length,
4226 (unsigned long long)map_length);
4227 BUG();
4228 }
4229
4230 bbio->orig_bio = first_bio;
4231 bbio->private = first_bio->bi_private;
4232 bbio->end_io = first_bio->bi_end_io;
4233 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4234
4235 while (dev_nr < total_devs) {
4236 if (dev_nr < total_devs - 1) {
4237 bio = bio_clone(first_bio, GFP_NOFS);
4238 BUG_ON(!bio); /* -ENOMEM */
4239 } else {
4240 bio = first_bio;
4241 }
4242 bio->bi_private = bbio;
4243 bio->bi_private = merge_stripe_index_into_bio_private(
4244 bio->bi_private, (unsigned int)dev_nr);
4245 bio->bi_end_io = btrfs_end_bio;
4246 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4247 dev = bbio->stripes[dev_nr].dev;
4248 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4249 #ifdef DEBUG
4250 struct rcu_string *name;
4251
4252 rcu_read_lock();
4253 name = rcu_dereference(dev->name);
4254 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4255 "(%s id %llu), size=%u\n", rw,
4256 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4257 name->str, dev->devid, bio->bi_size);
4258 rcu_read_unlock();
4259 #endif
4260 bio->bi_bdev = dev->bdev;
4261 if (async_submit)
4262 schedule_bio(root, dev, rw, bio);
4263 else
4264 btrfsic_submit_bio(rw, bio);
4265 } else {
4266 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4267 bio->bi_sector = logical >> 9;
4268 bio_endio(bio, -EIO);
4269 }
4270 dev_nr++;
4271 }
4272 return 0;
4273 }
4274
4275 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4276 u8 *uuid, u8 *fsid)
4277 {
4278 struct btrfs_device *device;
4279 struct btrfs_fs_devices *cur_devices;
4280
4281 cur_devices = root->fs_info->fs_devices;
4282 while (cur_devices) {
4283 if (!fsid ||
4284 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4285 device = __find_device(&cur_devices->devices,
4286
devid, uuid); 4287 if (device) 4288 return device; 4289 } 4290 cur_devices = cur_devices->seed; 4291 } 4292 return NULL; 4293 } 4294 4295 static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 4296 u64 devid, u8 *dev_uuid) 4297 { 4298 struct btrfs_device *device; 4299 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4300 4301 device = kzalloc(sizeof(*device), GFP_NOFS); 4302 if (!device) 4303 return NULL; 4304 list_add(&device->dev_list, 4305 &fs_devices->devices); 4306 device->dev_root = root->fs_info->dev_root; 4307 device->devid = devid; 4308 device->work.func = pending_bios_fn; 4309 device->fs_devices = fs_devices; 4310 device->missing = 1; 4311 fs_devices->num_devices++; 4312 fs_devices->missing_devices++; 4313 spin_lock_init(&device->io_lock); 4314 INIT_LIST_HEAD(&device->dev_alloc_list); 4315 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 4316 return device; 4317 } 4318 4319 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, 4320 struct extent_buffer *leaf, 4321 struct btrfs_chunk *chunk) 4322 { 4323 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4324 struct map_lookup *map; 4325 struct extent_map *em; 4326 u64 logical; 4327 u64 length; 4328 u64 devid; 4329 u8 uuid[BTRFS_UUID_SIZE]; 4330 int num_stripes; 4331 int ret; 4332 int i; 4333 4334 logical = key->offset; 4335 length = btrfs_chunk_length(leaf, chunk); 4336 4337 read_lock(&map_tree->map_tree.lock); 4338 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 4339 read_unlock(&map_tree->map_tree.lock); 4340 4341 /* already mapped? */ 4342 if (em && em->start <= logical && em->start + em->len > logical) { 4343 free_extent_map(em); 4344 return 0; 4345 } else if (em) { 4346 free_extent_map(em); 4347 } 4348 4349 em = alloc_extent_map(); 4350 if (!em) 4351 return -ENOMEM; 4352 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 4353 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 4354 if (!map) { 4355 free_extent_map(em); 4356 return -ENOMEM; 4357 } 4358 4359 em->bdev = (struct block_device *)map; 4360 em->start = logical; 4361 em->len = length; 4362 em->block_start = 0; 4363 em->block_len = em->len; 4364 4365 map->num_stripes = num_stripes; 4366 map->io_width = btrfs_chunk_io_width(leaf, chunk); 4367 map->io_align = btrfs_chunk_io_align(leaf, chunk); 4368 map->sector_size = btrfs_chunk_sector_size(leaf, chunk); 4369 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 4370 map->type = btrfs_chunk_type(leaf, chunk); 4371 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 4372 for (i = 0; i < num_stripes; i++) { 4373 map->stripes[i].physical = 4374 btrfs_stripe_offset_nr(leaf, chunk, i); 4375 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 4376 read_extent_buffer(leaf, uuid, (unsigned long) 4377 btrfs_stripe_dev_uuid_nr(chunk, i), 4378 BTRFS_UUID_SIZE); 4379 map->stripes[i].dev = btrfs_find_device(root, devid, uuid, 4380 NULL); 4381 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { 4382 kfree(map); 4383 free_extent_map(em); 4384 return -EIO; 4385 } 4386 if (!map->stripes[i].dev) { 4387 map->stripes[i].dev = 4388 add_missing_dev(root, devid, uuid); 4389 if (!map->stripes[i].dev) { 4390 kfree(map); 4391 free_extent_map(em); 4392 return -EIO; 4393 } 4394 } 4395 map->stripes[i].dev->in_fs_metadata = 1; 4396 } 4397 4398 write_lock(&map_tree->map_tree.lock); 4399 ret = add_extent_mapping(&map_tree->map_tree, em); 4400 write_unlock(&map_tree->map_tree.lock); 4401 BUG_ON(ret); /* Tree corruption */ 4402 free_extent_map(em); 4403 4404 
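	/*
	 * add_extent_mapping() is expected to take its own reference to em
	 * on insertion, so the reference from alloc_extent_map() was
	 * dropped above before returning.
	 */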
return 0; 4405 } 4406 4407 static void fill_device_from_item(struct extent_buffer *leaf, 4408 struct btrfs_dev_item *dev_item, 4409 struct btrfs_device *device) 4410 { 4411 unsigned long ptr; 4412 4413 device->devid = btrfs_device_id(leaf, dev_item); 4414 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 4415 device->total_bytes = device->disk_total_bytes; 4416 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 4417 device->type = btrfs_device_type(leaf, dev_item); 4418 device->io_align = btrfs_device_io_align(leaf, dev_item); 4419 device->io_width = btrfs_device_io_width(leaf, dev_item); 4420 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 4421 4422 ptr = (unsigned long)btrfs_device_uuid(dev_item); 4423 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 4424 } 4425 4426 static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 4427 { 4428 struct btrfs_fs_devices *fs_devices; 4429 int ret; 4430 4431 BUG_ON(!mutex_is_locked(&uuid_mutex)); 4432 4433 fs_devices = root->fs_info->fs_devices->seed; 4434 while (fs_devices) { 4435 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 4436 ret = 0; 4437 goto out; 4438 } 4439 fs_devices = fs_devices->seed; 4440 } 4441 4442 fs_devices = find_fsid(fsid); 4443 if (!fs_devices) { 4444 ret = -ENOENT; 4445 goto out; 4446 } 4447 4448 fs_devices = clone_fs_devices(fs_devices); 4449 if (IS_ERR(fs_devices)) { 4450 ret = PTR_ERR(fs_devices); 4451 goto out; 4452 } 4453 4454 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 4455 root->fs_info->bdev_holder); 4456 if (ret) { 4457 free_fs_devices(fs_devices); 4458 goto out; 4459 } 4460 4461 if (!fs_devices->seeding) { 4462 __btrfs_close_devices(fs_devices); 4463 free_fs_devices(fs_devices); 4464 ret = -EINVAL; 4465 goto out; 4466 } 4467 4468 fs_devices->seed = root->fs_info->fs_devices->seed; 4469 root->fs_info->fs_devices->seed = fs_devices; 4470 out: 4471 return ret; 4472 } 4473 4474 static int read_one_dev(struct btrfs_root *root, 4475 struct extent_buffer *leaf, 4476 struct btrfs_dev_item *dev_item) 4477 { 4478 struct btrfs_device *device; 4479 u64 devid; 4480 int ret; 4481 u8 fs_uuid[BTRFS_UUID_SIZE]; 4482 u8 dev_uuid[BTRFS_UUID_SIZE]; 4483 4484 devid = btrfs_device_id(leaf, dev_item); 4485 read_extent_buffer(leaf, dev_uuid, 4486 (unsigned long)btrfs_device_uuid(dev_item), 4487 BTRFS_UUID_SIZE); 4488 read_extent_buffer(leaf, fs_uuid, 4489 (unsigned long)btrfs_device_fsid(dev_item), 4490 BTRFS_UUID_SIZE); 4491 4492 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 4493 ret = open_seed_devices(root, fs_uuid); 4494 if (ret && !btrfs_test_opt(root, DEGRADED)) 4495 return ret; 4496 } 4497 4498 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 4499 if (!device || !device->bdev) { 4500 if (!btrfs_test_opt(root, DEGRADED)) 4501 return -EIO; 4502 4503 if (!device) { 4504 printk(KERN_WARNING "warning devid %llu missing\n", 4505 (unsigned long long)devid); 4506 device = add_missing_dev(root, devid, dev_uuid); 4507 if (!device) 4508 return -ENOMEM; 4509 } else if (!device->missing) { 4510 /* 4511 * this happens when a device that was properly setup 4512 * in the device info lists suddenly goes bad. 
4513 * device->bdev is NULL, and so we have to set
4514 * device->missing to one here
4515 */
4516 root->fs_info->fs_devices->missing_devices++;
4517 device->missing = 1;
4518 }
4519 }
4520
4521 if (device->fs_devices != root->fs_info->fs_devices) {
4522 BUG_ON(device->writeable);
4523 if (device->generation !=
4524 btrfs_device_generation(leaf, dev_item))
4525 return -EINVAL;
4526 }
4527
4528 fill_device_from_item(leaf, dev_item, device);
4529 device->dev_root = root->fs_info->dev_root;
4530 device->in_fs_metadata = 1;
4531 if (device->writeable) {
4532 device->fs_devices->total_rw_bytes += device->total_bytes;
4533 spin_lock(&root->fs_info->free_chunk_lock);
4534 root->fs_info->free_chunk_space += device->total_bytes -
4535 device->bytes_used;
4536 spin_unlock(&root->fs_info->free_chunk_lock);
4537 }
4538 ret = 0;
4539 return ret;
4540 }
4541
4542 int btrfs_read_sys_array(struct btrfs_root *root)
4543 {
4544 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4545 struct extent_buffer *sb;
4546 struct btrfs_disk_key *disk_key;
4547 struct btrfs_chunk *chunk;
4548 u8 *ptr;
4549 unsigned long sb_ptr;
4550 int ret = 0;
4551 u32 num_stripes;
4552 u32 array_size;
4553 u32 len = 0;
4554 u32 cur;
4555 struct btrfs_key key;
4556
4557 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4558 BTRFS_SUPER_INFO_SIZE);
4559 if (!sb)
4560 return -ENOMEM;
4561 btrfs_set_buffer_uptodate(sb);
4562 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4563 /*
4564 * The sb extent buffer is artificial and just used to read the system array.
4565 * The btrfs_set_buffer_uptodate() call does not properly mark all its
4566 * pages up-to-date when the page is larger: the extent does not cover the
4567 * whole page and consequently check_page_uptodate does not find all
4568 * the page's extents up-to-date (the hole beyond sb),
4569 * write_extent_buffer then triggers a WARN_ON.
4570 *
4571 * Regular short extents go through the mark_extent_buffer_dirty/writeback cycle,
4572 * but sb spans only this function. Add an explicit SetPageUptodate call
4573 * to silence the warning e.g. on PowerPC 64.
4574 */ 4575 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) 4576 SetPageUptodate(sb->pages[0]); 4577 4578 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 4579 array_size = btrfs_super_sys_array_size(super_copy); 4580 4581 ptr = super_copy->sys_chunk_array; 4582 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 4583 cur = 0; 4584 4585 while (cur < array_size) { 4586 disk_key = (struct btrfs_disk_key *)ptr; 4587 btrfs_disk_key_to_cpu(&key, disk_key); 4588 4589 len = sizeof(*disk_key); ptr += len; 4590 sb_ptr += len; 4591 cur += len; 4592 4593 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 4594 chunk = (struct btrfs_chunk *)sb_ptr; 4595 ret = read_one_chunk(root, &key, sb, chunk); 4596 if (ret) 4597 break; 4598 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 4599 len = btrfs_chunk_item_size(num_stripes); 4600 } else { 4601 ret = -EIO; 4602 break; 4603 } 4604 ptr += len; 4605 sb_ptr += len; 4606 cur += len; 4607 } 4608 free_extent_buffer(sb); 4609 return ret; 4610 } 4611 4612 struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, 4613 u64 logical, int mirror_num) 4614 { 4615 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4616 int ret; 4617 u64 map_length = 0; 4618 struct btrfs_bio *bbio = NULL; 4619 struct btrfs_device *device; 4620 4621 BUG_ON(mirror_num == 0); 4622 ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, 4623 mirror_num); 4624 if (ret) { 4625 BUG_ON(bbio != NULL); 4626 return NULL; 4627 } 4628 BUG_ON(mirror_num != bbio->mirror_num); 4629 device = bbio->stripes[mirror_num - 1].dev; 4630 kfree(bbio); 4631 return device; 4632 } 4633 4634 int btrfs_read_chunk_tree(struct btrfs_root *root) 4635 { 4636 struct btrfs_path *path; 4637 struct extent_buffer *leaf; 4638 struct btrfs_key key; 4639 struct btrfs_key found_key; 4640 int ret; 4641 int slot; 4642 4643 root = root->fs_info->chunk_root; 4644 4645 path = btrfs_alloc_path(); 4646 if (!path) 4647 return -ENOMEM; 4648 4649 mutex_lock(&uuid_mutex); 4650 lock_chunks(root); 4651 4652 /* first we search for all of the device items, and then we 4653 * read in all of the chunk items. 
This way we can create chunk
4654 * mappings that reference all of the devices that are found
4655 */
4656 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4657 key.offset = 0;
4658 key.type = 0;
4659 again:
4660 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4661 if (ret < 0)
4662 goto error;
4663 while (1) {
4664 leaf = path->nodes[0];
4665 slot = path->slots[0];
4666 if (slot >= btrfs_header_nritems(leaf)) {
4667 ret = btrfs_next_leaf(root, path);
4668 if (ret == 0)
4669 continue;
4670 if (ret < 0)
4671 goto error;
4672 break;
4673 }
4674 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4675 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4676 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4677 break;
4678 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4679 struct btrfs_dev_item *dev_item;
4680 dev_item = btrfs_item_ptr(leaf, slot,
4681 struct btrfs_dev_item);
4682 ret = read_one_dev(root, leaf, dev_item);
4683 if (ret)
4684 goto error;
4685 }
4686 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4687 struct btrfs_chunk *chunk;
4688 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4689 ret = read_one_chunk(root, &found_key, leaf, chunk);
4690 if (ret)
4691 goto error;
4692 }
4693 path->slots[0]++;
4694 }
4695 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4696 key.objectid = 0;
4697 btrfs_release_path(path);
4698 goto again;
4699 }
4700 ret = 0;
4701 error:
4702 unlock_chunks(root);
4703 mutex_unlock(&uuid_mutex);
4704
4705 btrfs_free_path(path);
4706 return ret;
4707 }
4708
4709 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4710 {
4711 int i;
4712
4713 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4714 btrfs_dev_stat_reset(dev, i);
4715 }
4716
4717 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4718 {
4719 struct btrfs_key key;
4720 struct btrfs_key found_key;
4721 struct btrfs_root *dev_root = fs_info->dev_root;
4722 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4723 struct extent_buffer *eb;
4724 int slot;
4725 int ret = 0;
4726 struct btrfs_device *device;
4727 struct btrfs_path *path = NULL;
4728 int i;
4729
4730 path = btrfs_alloc_path();
4731 if (!path) {
4732 ret = -ENOMEM;
4733 goto out;
4734 }
4735
4736 mutex_lock(&fs_devices->device_list_mutex);
4737 list_for_each_entry(device, &fs_devices->devices, dev_list) {
4738 int item_size;
4739 struct btrfs_dev_stats_item *ptr;
4740
4741 key.objectid = 0;
4742 key.type = BTRFS_DEV_STATS_KEY;
4743 key.offset = device->devid;
4744 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4745 if (ret) {
4746 __btrfs_reset_dev_stats(device);
4747 device->dev_stats_valid = 1;
4748 btrfs_release_path(path);
4749 continue;
4750 }
4751 slot = path->slots[0];
4752 eb = path->nodes[0];
4753 btrfs_item_key_to_cpu(eb, &found_key, slot);
4754 item_size = btrfs_item_size_nr(eb, slot);
4755
4756 ptr = btrfs_item_ptr(eb, slot,
4757 struct btrfs_dev_stats_item);
4758
4759 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4760 if (item_size >= (1 + i) * sizeof(__le64))
4761 btrfs_dev_stat_set(device, i,
4762 btrfs_dev_stats_value(eb, ptr, i));
4763 else
4764 btrfs_dev_stat_reset(device, i);
4765 }
4766
4767 device->dev_stats_valid = 1;
4768 btrfs_dev_stat_print_on_load(device);
4769 btrfs_release_path(path);
4770 }
4771 mutex_unlock(&fs_devices->device_list_mutex);
4772
4773 out:
4774 btrfs_free_path(path);
4775 return ret < 0 ?
ret : 0; 4776 } 4777 4778 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 4779 struct btrfs_root *dev_root, 4780 struct btrfs_device *device) 4781 { 4782 struct btrfs_path *path; 4783 struct btrfs_key key; 4784 struct extent_buffer *eb; 4785 struct btrfs_dev_stats_item *ptr; 4786 int ret; 4787 int i; 4788 4789 key.objectid = 0; 4790 key.type = BTRFS_DEV_STATS_KEY; 4791 key.offset = device->devid; 4792 4793 path = btrfs_alloc_path(); 4794 BUG_ON(!path); 4795 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 4796 if (ret < 0) { 4797 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", 4798 ret, rcu_str_deref(device->name)); 4799 goto out; 4800 } 4801 4802 if (ret == 0 && 4803 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 4804 /* need to delete old one and insert a new one */ 4805 ret = btrfs_del_item(trans, dev_root, path); 4806 if (ret != 0) { 4807 printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", 4808 rcu_str_deref(device->name), ret); 4809 goto out; 4810 } 4811 ret = 1; 4812 } 4813 4814 if (ret == 1) { 4815 /* need to insert a new item */ 4816 btrfs_release_path(path); 4817 ret = btrfs_insert_empty_item(trans, dev_root, path, 4818 &key, sizeof(*ptr)); 4819 if (ret < 0) { 4820 printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", 4821 rcu_str_deref(device->name), ret); 4822 goto out; 4823 } 4824 } 4825 4826 eb = path->nodes[0]; 4827 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 4828 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4829 btrfs_set_dev_stats_value(eb, ptr, i, 4830 btrfs_dev_stat_read(device, i)); 4831 btrfs_mark_buffer_dirty(eb); 4832 4833 out: 4834 btrfs_free_path(path); 4835 return ret; 4836 } 4837 4838 /* 4839 * called from commit_transaction. Writes all changed device stats to disk. 
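 *
 * Only devices whose stats have been loaded (dev_stats_valid) and have
 * changed since the last commit (dev_stats_dirty) are written out; the
 * dirty flag is cleared once the item update succeeds.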
4840 */ 4841 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 4842 struct btrfs_fs_info *fs_info) 4843 { 4844 struct btrfs_root *dev_root = fs_info->dev_root; 4845 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 4846 struct btrfs_device *device; 4847 int ret = 0; 4848 4849 mutex_lock(&fs_devices->device_list_mutex); 4850 list_for_each_entry(device, &fs_devices->devices, dev_list) { 4851 if (!device->dev_stats_valid || !device->dev_stats_dirty) 4852 continue; 4853 4854 ret = update_dev_stat_item(trans, dev_root, device); 4855 if (!ret) 4856 device->dev_stats_dirty = 0; 4857 } 4858 mutex_unlock(&fs_devices->device_list_mutex); 4859 4860 return ret; 4861 } 4862 4863 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 4864 { 4865 btrfs_dev_stat_inc(dev, index); 4866 btrfs_dev_stat_print_on_error(dev); 4867 } 4868 4869 void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 4870 { 4871 if (!dev->dev_stats_valid) 4872 return; 4873 printk_ratelimited_in_rcu(KERN_ERR 4874 "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4875 rcu_str_deref(dev->name), 4876 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4877 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 4878 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 4879 btrfs_dev_stat_read(dev, 4880 BTRFS_DEV_STAT_CORRUPTION_ERRS), 4881 btrfs_dev_stat_read(dev, 4882 BTRFS_DEV_STAT_GENERATION_ERRS)); 4883 } 4884 4885 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 4886 { 4887 int i; 4888 4889 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4890 if (btrfs_dev_stat_read(dev, i) != 0) 4891 break; 4892 if (i == BTRFS_DEV_STAT_VALUES_MAX) 4893 return; /* all values == 0, suppress message */ 4894 4895 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4896 rcu_str_deref(dev->name), 4897 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4898 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 4899 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 4900 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 4901 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 4902 } 4903 4904 int btrfs_get_dev_stats(struct btrfs_root *root, 4905 struct btrfs_ioctl_get_dev_stats *stats) 4906 { 4907 struct btrfs_device *dev; 4908 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4909 int i; 4910 4911 mutex_lock(&fs_devices->device_list_mutex); 4912 dev = btrfs_find_device(root, stats->devid, NULL, NULL); 4913 mutex_unlock(&fs_devices->device_list_mutex); 4914 4915 if (!dev) { 4916 printk(KERN_WARNING 4917 "btrfs: get dev_stats failed, device not found\n"); 4918 return -ENODEV; 4919 } else if (!dev->dev_stats_valid) { 4920 printk(KERN_WARNING 4921 "btrfs: get dev_stats failed, not yet valid\n"); 4922 return -ENODEV; 4923 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 4924 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4925 if (stats->nr_items > i) 4926 stats->values[i] = 4927 btrfs_dev_stat_read_and_reset(dev, i); 4928 else 4929 btrfs_dev_stat_reset(dev, i); 4930 } 4931 } else { 4932 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4933 if (stats->nr_items > i) 4934 stats->values[i] = btrfs_dev_stat_read(dev, i); 4935 } 4936 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 4937 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 4938 return 0; 4939 } 4940
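/*
 * Illustrative userspace sketch: btrfs_get_dev_stats() above is driven by
 * the device-stats ioctl (assumed here to be BTRFS_IOC_GET_DEV_STATS on a
 * file descriptor of the mounted filesystem).  The struct fields, flags and
 * stat indexes match the ones handled above; error handling is omitted.
 *
 *	struct btrfs_ioctl_get_dev_stats args = { 0 };
 *
 *	args.devid = 1;				// devid to look up
 *	args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 *	// set args.flags = BTRFS_DEV_STATS_RESET to also zero the counters
 *	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args) == 0)
 *		printf("write errs: %llu\n",
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS]);
 */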