// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);

	return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}


static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 *  Search and remove all stale (devices which are not mounted) devices.
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 *  skip_dev:	Optional. Will skip this device when searching for the stale
 *		devices.
 *  Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
			if (fs_devices->num_devices == 0)
				break;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}


static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently the device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases:
	 * 1 - The fs still continues to have different METADATA/FSID uuids.
	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
	 * are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}


	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			struct block_device *path_bdev;

			path_bdev = lookup_bdev(path);
			if (IS_ERR(path_bdev)) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_CAST(path_bdev);
			}

			if (device->bdev != path_bdev) {
				bdput(path_bdev);
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(device->fs_info,
	"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
					disk_super->fsid, devid,
					rcu_str_deref(device->name), path);
				return ERR_PTR(-EEXIST);
			}
			bdput(path_bdev);
			btrfs_info_in_rcu(device->fs_info,
				"device fsid %pU devid %llu moved old:%s new:%s",
				disk_super->fsid, devid,
				rcu_str_deref(device->name), path);
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

/*
 * After we have read the system tree and know devids belonging to
 * this filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;
	struct btrfs_device *latest_dev = NULL;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
			     &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!latest_dev ||
			     device->generation > latest_dev->generation)) {
				latest_dev = device;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
						  &device->dev_state)) {
				continue;
			}
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state))
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
		btrfs_close_one_device(device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* Just open everything we can; ignore failures here */
		if (btrfs_open_one_device(fs_devices, device, flags, holder))
			continue;

		if (!latest_dev ||
		    device->generation > latest_dev->generation)
			latest_dev = device;
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;

	return 0;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like bd_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	disk_super = btrfs_read_disk_super(bdev, bytenr);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	default:
		BUG();
	}
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position is updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	/*
	 * Check before we set max_hole_start, otherwise we could end up
	 * sending back this offset anyway.
	 */
	if (contains_pending_extent(device, hole_start, *hole_size)) {
		if (hole_end >= *hole_start)
			*hole_size = hole_end - *hole_start;
		else
			*hole_size = 0;
		changed = true;
	}

	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/* No extra check */
		break;
	default:
		BUG();
	}

	return changed;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_device *device,
				  u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent,
					BTRFS_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			    struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 */
static void update_dev_time(const char *path_name)
{
	struct file *filp;

	filp = filp_open(path_name, O_RDWR, 0);
	if (IS_ERR(filp))
		return;
	file_update_time(filp);
	filp_close(filp, NULL);
}

static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
1911 */
1912 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1913 u64 num_devices)
1914 {
1915 u64 all_avail;
1916 unsigned seq;
1917 int i;
1918
1919 do {
1920 seq = read_seqbegin(&fs_info->profiles_lock);
1921
1922 all_avail = fs_info->avail_data_alloc_bits |
1923 fs_info->avail_system_alloc_bits |
1924 fs_info->avail_metadata_alloc_bits;
1925 } while (read_seqretry(&fs_info->profiles_lock, seq));
1926
1927 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1928 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1929 continue;
1930
1931 if (num_devices < btrfs_raid_array[i].devs_min) {
1932 int ret = btrfs_raid_array[i].mindev_error;
1933
1934 if (ret)
1935 return ret;
1936 }
1937 }
1938
1939 return 0;
1940 }
1941
1942 static struct btrfs_device * btrfs_find_next_active_device(
1943 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1944 {
1945 struct btrfs_device *next_device;
1946
1947 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1948 if (next_device != device &&
1949 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1950 && next_device->bdev)
1951 return next_device;
1952 }
1953
1954 return NULL;
1955 }
1956
1957 /*
1958 * Helper function to check if the given device is part of s_bdev / latest_bdev
1959 * and replace it with the provided or the next active device. In the context
1960 * where this function is called, there should always be another device (or
1961 * this_dev) which is active.
1962 */
1963 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1964 struct btrfs_device *this_dev)
1965 {
1966 struct btrfs_fs_info *fs_info = device->fs_info;
1967 struct btrfs_device *next_device;
1968
1969 if (this_dev)
1970 next_device = this_dev;
1971 else
1972 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1973 device);
1974 ASSERT(next_device);
1975
1976 if (fs_info->sb->s_bdev &&
1977 (fs_info->sb->s_bdev == device->bdev))
1978 fs_info->sb->s_bdev = next_device->bdev;
1979
1980 if (fs_info->fs_devices->latest_bdev == device->bdev)
1981 fs_info->fs_devices->latest_bdev = next_device->bdev;
1982 }
1983
1984 /*
1985 * Return btrfs_fs_devices::num_devices excluding the device that's being
1986 * currently replaced.
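 *
 * During a replace the target device is also counted in
 * btrfs_fs_devices::num_devices, hence the decrement below while the
 * replace is ongoing. Callers such as btrfs_rm_device() then pass
 * num_devices - 1 to btrfs_check_raid_min_devices() to ask whether the
 * remaining devices would still satisfy every profile in use.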
1987 */ 1988 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 1989 { 1990 u64 num_devices = fs_info->fs_devices->num_devices; 1991 1992 down_read(&fs_info->dev_replace.rwsem); 1993 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 1994 ASSERT(num_devices > 1); 1995 num_devices--; 1996 } 1997 up_read(&fs_info->dev_replace.rwsem); 1998 1999 return num_devices; 2000 } 2001 2002 static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2003 struct block_device *bdev, 2004 const char *device_path) 2005 { 2006 struct btrfs_super_block *disk_super; 2007 int copy_num; 2008 2009 if (!bdev) 2010 return; 2011 2012 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2013 struct page *page; 2014 int ret; 2015 2016 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2017 if (IS_ERR(disk_super)) 2018 continue; 2019 2020 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2021 2022 page = virt_to_page(disk_super); 2023 set_page_dirty(page); 2024 lock_page(page); 2025 /* write_on_page() unlocks the page */ 2026 ret = write_one_page(page); 2027 if (ret) 2028 btrfs_warn(fs_info, 2029 "error clearing superblock number %d (%d)", 2030 copy_num, ret); 2031 btrfs_release_disk_super(disk_super); 2032 2033 } 2034 2035 /* Notify udev that device has changed */ 2036 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2037 2038 /* Update ctime/mtime for device path for libblkid */ 2039 update_dev_time(device_path); 2040 } 2041 2042 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, 2043 u64 devid) 2044 { 2045 struct btrfs_device *device; 2046 struct btrfs_fs_devices *cur_devices; 2047 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2048 u64 num_devices; 2049 int ret = 0; 2050 2051 mutex_lock(&uuid_mutex); 2052 2053 num_devices = btrfs_num_devices(fs_info); 2054 2055 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2056 if (ret) 2057 goto out; 2058 2059 device = btrfs_find_device_by_devspec(fs_info, devid, device_path); 2060 2061 if (IS_ERR(device)) { 2062 if (PTR_ERR(device) == -ENOENT && 2063 strcmp(device_path, "missing") == 0) 2064 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2065 else 2066 ret = PTR_ERR(device); 2067 goto out; 2068 } 2069 2070 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2071 btrfs_warn_in_rcu(fs_info, 2072 "cannot remove device %s (devid %llu) due to active swapfile", 2073 rcu_str_deref(device->name), device->devid); 2074 ret = -ETXTBSY; 2075 goto out; 2076 } 2077 2078 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2079 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2080 goto out; 2081 } 2082 2083 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2084 fs_info->fs_devices->rw_devices == 1) { 2085 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2086 goto out; 2087 } 2088 2089 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2090 mutex_lock(&fs_info->chunk_mutex); 2091 list_del_init(&device->dev_alloc_list); 2092 device->fs_devices->rw_devices--; 2093 mutex_unlock(&fs_info->chunk_mutex); 2094 } 2095 2096 mutex_unlock(&uuid_mutex); 2097 ret = btrfs_shrink_device(device, 0); 2098 mutex_lock(&uuid_mutex); 2099 if (ret) 2100 goto error_undo; 2101 2102 /* 2103 * TODO: the superblock still includes this device in its num_devices 2104 * counter although write_all_supers() is not locked out. This 2105 * could give a filesystem state which requires a degraded mount. 
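 * The window is between btrfs_rm_dev_item() below and the
 * btrfs_set_super_num_devices() update that happens later under
 * device_list_mutex: a super block committed in that window still
 * carries the old num_devices.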
2106 */ 2107 ret = btrfs_rm_dev_item(device); 2108 if (ret) 2109 goto error_undo; 2110 2111 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2112 btrfs_scrub_cancel_dev(device); 2113 2114 /* 2115 * the device list mutex makes sure that we don't change 2116 * the device list while someone else is writing out all 2117 * the device supers. Whoever is writing all supers, should 2118 * lock the device list mutex before getting the number of 2119 * devices in the super block (super_copy). Conversely, 2120 * whoever updates the number of devices in the super block 2121 * (super_copy) should hold the device list mutex. 2122 */ 2123 2124 /* 2125 * In normal cases the cur_devices == fs_devices. But in case 2126 * of deleting a seed device, the cur_devices should point to 2127 * its own fs_devices listed under the fs_devices->seed. 2128 */ 2129 cur_devices = device->fs_devices; 2130 mutex_lock(&fs_devices->device_list_mutex); 2131 list_del_rcu(&device->dev_list); 2132 2133 cur_devices->num_devices--; 2134 cur_devices->total_devices--; 2135 /* Update total_devices of the parent fs_devices if it's seed */ 2136 if (cur_devices != fs_devices) 2137 fs_devices->total_devices--; 2138 2139 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2140 cur_devices->missing_devices--; 2141 2142 btrfs_assign_next_active_device(device, NULL); 2143 2144 if (device->bdev) { 2145 cur_devices->open_devices--; 2146 /* remove sysfs entry */ 2147 btrfs_sysfs_remove_devices_dir(fs_devices, device); 2148 } 2149 2150 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2151 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2152 mutex_unlock(&fs_devices->device_list_mutex); 2153 2154 /* 2155 * at this point, the device is zero sized and detached from 2156 * the devices list. All that's left is to zero out the old 2157 * supers and free the device. 2158 */ 2159 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2160 btrfs_scratch_superblocks(fs_info, device->bdev, 2161 device->name->str); 2162 2163 btrfs_close_bdev(device); 2164 synchronize_rcu(); 2165 btrfs_free_device(device); 2166 2167 if (cur_devices->open_devices == 0) { 2168 while (fs_devices) { 2169 if (fs_devices->seed == cur_devices) { 2170 fs_devices->seed = cur_devices->seed; 2171 break; 2172 } 2173 fs_devices = fs_devices->seed; 2174 } 2175 cur_devices->seed = NULL; 2176 close_fs_devices(cur_devices); 2177 free_fs_devices(cur_devices); 2178 } 2179 2180 out: 2181 mutex_unlock(&uuid_mutex); 2182 return ret; 2183 2184 error_undo: 2185 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2186 mutex_lock(&fs_info->chunk_mutex); 2187 list_add(&device->dev_alloc_list, 2188 &fs_devices->alloc_list); 2189 device->fs_devices->rw_devices++; 2190 mutex_unlock(&fs_info->chunk_mutex); 2191 } 2192 goto out; 2193 } 2194 2195 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2196 { 2197 struct btrfs_fs_devices *fs_devices; 2198 2199 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2200 2201 /* 2202 * in case of fs with no seed, srcdev->fs_devices will point 2203 * to fs_devices of fs_info. However when the dev being replaced is 2204 * a seed dev it will point to the seed's local fs_devices. In short 2205 * srcdev will have its correct fs_devices in both the cases. 
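 * E.g. when a seed device is being replaced, the num_devices,
 * missing_devices and rw_devices counters adjusted below belong to the
 * seed's fs_devices rather than to fs_info->fs_devices.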
2206 */
2207 fs_devices = srcdev->fs_devices;
2208
2209 list_del_rcu(&srcdev->dev_list);
2210 list_del(&srcdev->dev_alloc_list);
2211 fs_devices->num_devices--;
2212 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2213 fs_devices->missing_devices--;
2214
2215 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2216 fs_devices->rw_devices--;
2217
2218 if (srcdev->bdev)
2219 fs_devices->open_devices--;
2220 }
2221
2222 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2223 {
2224 struct btrfs_fs_info *fs_info = srcdev->fs_info;
2225 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2226
2227 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2228 /* zero out the old super if it is writable */
2229 btrfs_scratch_superblocks(fs_info, srcdev->bdev,
2230 srcdev->name->str);
2231 }
2232
2233 btrfs_close_bdev(srcdev);
2234 synchronize_rcu();
2235 btrfs_free_device(srcdev);
2236
2237 /* if there are no devices left we'd rather delete the fs_devices */
2238 if (!fs_devices->num_devices) {
2239 struct btrfs_fs_devices *tmp_fs_devices;
2240
2241 /*
2242 * On a mounted FS, num_devices can't be zero unless it's a
2243 * seed. In case of a seed device being replaced, the replace
2244 * target is added to the sprout FS, so there will be no more
2245 * devices left under the seed FS.
2246 */
2247 ASSERT(fs_devices->seeding);
2248
2249 tmp_fs_devices = fs_info->fs_devices;
2250 while (tmp_fs_devices) {
2251 if (tmp_fs_devices->seed == fs_devices) {
2252 tmp_fs_devices->seed = fs_devices->seed;
2253 break;
2254 }
2255 tmp_fs_devices = tmp_fs_devices->seed;
2256 }
2257 fs_devices->seed = NULL;
2258 close_fs_devices(fs_devices);
2259 free_fs_devices(fs_devices);
2260 }
2261 }
2262
2263 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2264 {
2265 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2266
2267 mutex_lock(&fs_devices->device_list_mutex);
2268
2269 btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
2270
2271 if (tgtdev->bdev)
2272 fs_devices->open_devices--;
2273
2274 fs_devices->num_devices--;
2275
2276 btrfs_assign_next_active_device(tgtdev, NULL);
2277
2278 list_del_rcu(&tgtdev->dev_list);
2279
2280 mutex_unlock(&fs_devices->device_list_mutex);
2281
2282 /*
2283 * The update_dev_time() within btrfs_scratch_superblocks()
2284 * may lead to a call to btrfs_show_devname() which will try
2285 * to hold device_list_mutex. And here this device
2286 * is already out of the device list, so we don't have to hold
2287 * the device_list_mutex lock.
2288 */ 2289 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2290 tgtdev->name->str); 2291 2292 btrfs_close_bdev(tgtdev); 2293 synchronize_rcu(); 2294 btrfs_free_device(tgtdev); 2295 } 2296 2297 static struct btrfs_device *btrfs_find_device_by_path( 2298 struct btrfs_fs_info *fs_info, const char *device_path) 2299 { 2300 int ret = 0; 2301 struct btrfs_super_block *disk_super; 2302 u64 devid; 2303 u8 *dev_uuid; 2304 struct block_device *bdev; 2305 struct btrfs_device *device; 2306 2307 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, 2308 fs_info->bdev_holder, 0, &bdev, &disk_super); 2309 if (ret) 2310 return ERR_PTR(ret); 2311 2312 devid = btrfs_stack_device_id(&disk_super->dev_item); 2313 dev_uuid = disk_super->dev_item.uuid; 2314 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2315 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2316 disk_super->metadata_uuid, true); 2317 else 2318 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2319 disk_super->fsid, true); 2320 2321 btrfs_release_disk_super(disk_super); 2322 if (!device) 2323 device = ERR_PTR(-ENOENT); 2324 blkdev_put(bdev, FMODE_READ); 2325 return device; 2326 } 2327 2328 /* 2329 * Lookup a device given by device id, or the path if the id is 0. 2330 */ 2331 struct btrfs_device *btrfs_find_device_by_devspec( 2332 struct btrfs_fs_info *fs_info, u64 devid, 2333 const char *device_path) 2334 { 2335 struct btrfs_device *device; 2336 2337 if (devid) { 2338 device = btrfs_find_device(fs_info->fs_devices, devid, NULL, 2339 NULL, true); 2340 if (!device) 2341 return ERR_PTR(-ENOENT); 2342 return device; 2343 } 2344 2345 if (!device_path || !device_path[0]) 2346 return ERR_PTR(-EINVAL); 2347 2348 if (strcmp(device_path, "missing") == 0) { 2349 /* Find first missing device */ 2350 list_for_each_entry(device, &fs_info->fs_devices->devices, 2351 dev_list) { 2352 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 2353 &device->dev_state) && !device->bdev) 2354 return device; 2355 } 2356 return ERR_PTR(-ENOENT); 2357 } 2358 2359 return btrfs_find_device_by_path(fs_info, device_path); 2360 } 2361 2362 /* 2363 * does all the dirty work required for changing file system's UUID. 
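 * Roughly: a newly allocated fs_devices takes over the seed role and
 * inherits the currently opened devices, the mounted fs_devices gets a
 * freshly generated fsid and is linked to the seed via fs_devices->seed,
 * and the SEEDING flag is cleared from the superblock so the sprout
 * becomes writable.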
2364 */ 2365 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) 2366 { 2367 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2368 struct btrfs_fs_devices *old_devices; 2369 struct btrfs_fs_devices *seed_devices; 2370 struct btrfs_super_block *disk_super = fs_info->super_copy; 2371 struct btrfs_device *device; 2372 u64 super_flags; 2373 2374 lockdep_assert_held(&uuid_mutex); 2375 if (!fs_devices->seeding) 2376 return -EINVAL; 2377 2378 seed_devices = alloc_fs_devices(NULL, NULL); 2379 if (IS_ERR(seed_devices)) 2380 return PTR_ERR(seed_devices); 2381 2382 old_devices = clone_fs_devices(fs_devices); 2383 if (IS_ERR(old_devices)) { 2384 kfree(seed_devices); 2385 return PTR_ERR(old_devices); 2386 } 2387 2388 list_add(&old_devices->fs_list, &fs_uuids); 2389 2390 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2391 seed_devices->opened = 1; 2392 INIT_LIST_HEAD(&seed_devices->devices); 2393 INIT_LIST_HEAD(&seed_devices->alloc_list); 2394 mutex_init(&seed_devices->device_list_mutex); 2395 2396 mutex_lock(&fs_devices->device_list_mutex); 2397 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2398 synchronize_rcu); 2399 list_for_each_entry(device, &seed_devices->devices, dev_list) 2400 device->fs_devices = seed_devices; 2401 2402 mutex_lock(&fs_info->chunk_mutex); 2403 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 2404 mutex_unlock(&fs_info->chunk_mutex); 2405 2406 fs_devices->seeding = false; 2407 fs_devices->num_devices = 0; 2408 fs_devices->open_devices = 0; 2409 fs_devices->missing_devices = 0; 2410 fs_devices->rotating = false; 2411 fs_devices->seed = seed_devices; 2412 2413 generate_random_uuid(fs_devices->fsid); 2414 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2415 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2416 mutex_unlock(&fs_devices->device_list_mutex); 2417 2418 super_flags = btrfs_super_flags(disk_super) & 2419 ~BTRFS_SUPER_FLAG_SEEDING; 2420 btrfs_set_super_flags(disk_super, super_flags); 2421 2422 return 0; 2423 } 2424 2425 /* 2426 * Store the expected generation for seed devices in device items. 
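 * This walks every (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, devid)
 * item in the chunk root and, for each device that still belongs to a
 * seeding fs_devices, writes device->generation into the item; that is
 * the expected generation mentioned above.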
2427 */ 2428 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2429 { 2430 struct btrfs_fs_info *fs_info = trans->fs_info; 2431 struct btrfs_root *root = fs_info->chunk_root; 2432 struct btrfs_path *path; 2433 struct extent_buffer *leaf; 2434 struct btrfs_dev_item *dev_item; 2435 struct btrfs_device *device; 2436 struct btrfs_key key; 2437 u8 fs_uuid[BTRFS_FSID_SIZE]; 2438 u8 dev_uuid[BTRFS_UUID_SIZE]; 2439 u64 devid; 2440 int ret; 2441 2442 path = btrfs_alloc_path(); 2443 if (!path) 2444 return -ENOMEM; 2445 2446 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2447 key.offset = 0; 2448 key.type = BTRFS_DEV_ITEM_KEY; 2449 2450 while (1) { 2451 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2452 if (ret < 0) 2453 goto error; 2454 2455 leaf = path->nodes[0]; 2456 next_slot: 2457 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2458 ret = btrfs_next_leaf(root, path); 2459 if (ret > 0) 2460 break; 2461 if (ret < 0) 2462 goto error; 2463 leaf = path->nodes[0]; 2464 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2465 btrfs_release_path(path); 2466 continue; 2467 } 2468 2469 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2470 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2471 key.type != BTRFS_DEV_ITEM_KEY) 2472 break; 2473 2474 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2475 struct btrfs_dev_item); 2476 devid = btrfs_device_id(leaf, dev_item); 2477 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2478 BTRFS_UUID_SIZE); 2479 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2480 BTRFS_FSID_SIZE); 2481 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 2482 fs_uuid, true); 2483 BUG_ON(!device); /* Logic error */ 2484 2485 if (device->fs_devices->seeding) { 2486 btrfs_set_device_generation(leaf, dev_item, 2487 device->generation); 2488 btrfs_mark_buffer_dirty(leaf); 2489 } 2490 2491 path->slots[0]++; 2492 goto next_slot; 2493 } 2494 ret = 0; 2495 error: 2496 btrfs_free_path(path); 2497 return ret; 2498 } 2499 2500 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2501 { 2502 struct btrfs_root *root = fs_info->dev_root; 2503 struct request_queue *q; 2504 struct btrfs_trans_handle *trans; 2505 struct btrfs_device *device; 2506 struct block_device *bdev; 2507 struct super_block *sb = fs_info->sb; 2508 struct rcu_string *name; 2509 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2510 u64 orig_super_total_bytes; 2511 u64 orig_super_num_devices; 2512 int seeding_dev = 0; 2513 int ret = 0; 2514 bool unlocked = false; 2515 2516 if (sb_rdonly(sb) && !fs_devices->seeding) 2517 return -EROFS; 2518 2519 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2520 fs_info->bdev_holder); 2521 if (IS_ERR(bdev)) 2522 return PTR_ERR(bdev); 2523 2524 if (fs_devices->seeding) { 2525 seeding_dev = 1; 2526 down_write(&sb->s_umount); 2527 mutex_lock(&uuid_mutex); 2528 } 2529 2530 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2531 2532 mutex_lock(&fs_devices->device_list_mutex); 2533 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2534 if (device->bdev == bdev) { 2535 ret = -EEXIST; 2536 mutex_unlock( 2537 &fs_devices->device_list_mutex); 2538 goto error; 2539 } 2540 } 2541 mutex_unlock(&fs_devices->device_list_mutex); 2542 2543 device = btrfs_alloc_device(fs_info, NULL, NULL); 2544 if (IS_ERR(device)) { 2545 /* we can safely leave the fs_devices entry around */ 2546 ret = PTR_ERR(device); 2547 goto error; 2548 } 2549 2550 name = rcu_string_strdup(device_path, GFP_KERNEL); 
2551 if (!name) { 2552 ret = -ENOMEM; 2553 goto error_free_device; 2554 } 2555 rcu_assign_pointer(device->name, name); 2556 2557 trans = btrfs_start_transaction(root, 0); 2558 if (IS_ERR(trans)) { 2559 ret = PTR_ERR(trans); 2560 goto error_free_device; 2561 } 2562 2563 q = bdev_get_queue(bdev); 2564 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2565 device->generation = trans->transid; 2566 device->io_width = fs_info->sectorsize; 2567 device->io_align = fs_info->sectorsize; 2568 device->sector_size = fs_info->sectorsize; 2569 device->total_bytes = round_down(i_size_read(bdev->bd_inode), 2570 fs_info->sectorsize); 2571 device->disk_total_bytes = device->total_bytes; 2572 device->commit_total_bytes = device->total_bytes; 2573 device->fs_info = fs_info; 2574 device->bdev = bdev; 2575 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2576 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2577 device->mode = FMODE_EXCL; 2578 device->dev_stats_valid = 1; 2579 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2580 2581 if (seeding_dev) { 2582 sb->s_flags &= ~SB_RDONLY; 2583 ret = btrfs_prepare_sprout(fs_info); 2584 if (ret) { 2585 btrfs_abort_transaction(trans, ret); 2586 goto error_trans; 2587 } 2588 } 2589 2590 device->fs_devices = fs_devices; 2591 2592 mutex_lock(&fs_devices->device_list_mutex); 2593 mutex_lock(&fs_info->chunk_mutex); 2594 list_add_rcu(&device->dev_list, &fs_devices->devices); 2595 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2596 fs_devices->num_devices++; 2597 fs_devices->open_devices++; 2598 fs_devices->rw_devices++; 2599 fs_devices->total_devices++; 2600 fs_devices->total_rw_bytes += device->total_bytes; 2601 2602 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2603 2604 if (!blk_queue_nonrot(q)) 2605 fs_devices->rotating = true; 2606 2607 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2608 btrfs_set_super_total_bytes(fs_info->super_copy, 2609 round_down(orig_super_total_bytes + device->total_bytes, 2610 fs_info->sectorsize)); 2611 2612 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2613 btrfs_set_super_num_devices(fs_info->super_copy, 2614 orig_super_num_devices + 1); 2615 2616 /* add sysfs device entry */ 2617 btrfs_sysfs_add_devices_dir(fs_devices, device); 2618 2619 /* 2620 * we've got more storage, clear any full flags on the space 2621 * infos 2622 */ 2623 btrfs_clear_space_info_full(fs_info); 2624 2625 mutex_unlock(&fs_info->chunk_mutex); 2626 mutex_unlock(&fs_devices->device_list_mutex); 2627 2628 if (seeding_dev) { 2629 mutex_lock(&fs_info->chunk_mutex); 2630 ret = init_first_rw_device(trans); 2631 mutex_unlock(&fs_info->chunk_mutex); 2632 if (ret) { 2633 btrfs_abort_transaction(trans, ret); 2634 goto error_sysfs; 2635 } 2636 } 2637 2638 ret = btrfs_add_dev_item(trans, device); 2639 if (ret) { 2640 btrfs_abort_transaction(trans, ret); 2641 goto error_sysfs; 2642 } 2643 2644 if (seeding_dev) { 2645 ret = btrfs_finish_sprout(trans); 2646 if (ret) { 2647 btrfs_abort_transaction(trans, ret); 2648 goto error_sysfs; 2649 } 2650 2651 btrfs_sysfs_update_sprout_fsid(fs_devices, 2652 fs_info->fs_devices->fsid); 2653 } 2654 2655 ret = btrfs_commit_transaction(trans); 2656 2657 if (seeding_dev) { 2658 mutex_unlock(&uuid_mutex); 2659 up_write(&sb->s_umount); 2660 unlocked = true; 2661 2662 if (ret) /* transaction commit */ 2663 return ret; 2664 2665 ret = btrfs_relocate_sys_chunks(fs_info); 2666 if (ret < 0) 2667 btrfs_handle_fs_error(fs_info, ret, 2668 "Failed to 
relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2669 trans = btrfs_attach_transaction(root); 2670 if (IS_ERR(trans)) { 2671 if (PTR_ERR(trans) == -ENOENT) 2672 return 0; 2673 ret = PTR_ERR(trans); 2674 trans = NULL; 2675 goto error_sysfs; 2676 } 2677 ret = btrfs_commit_transaction(trans); 2678 } 2679 2680 /* 2681 * Now that we have written a new super block to this device, check all 2682 * other fs_devices list if device_path alienates any other scanned 2683 * device. 2684 * We can ignore the return value as it typically returns -EINVAL and 2685 * only succeeds if the device was an alien. 2686 */ 2687 btrfs_forget_devices(device_path); 2688 2689 /* Update ctime/mtime for blkid or udev */ 2690 update_dev_time(device_path); 2691 2692 return ret; 2693 2694 error_sysfs: 2695 btrfs_sysfs_remove_devices_dir(fs_devices, device); 2696 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2697 mutex_lock(&fs_info->chunk_mutex); 2698 list_del_rcu(&device->dev_list); 2699 list_del(&device->dev_alloc_list); 2700 fs_info->fs_devices->num_devices--; 2701 fs_info->fs_devices->open_devices--; 2702 fs_info->fs_devices->rw_devices--; 2703 fs_info->fs_devices->total_devices--; 2704 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2705 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2706 btrfs_set_super_total_bytes(fs_info->super_copy, 2707 orig_super_total_bytes); 2708 btrfs_set_super_num_devices(fs_info->super_copy, 2709 orig_super_num_devices); 2710 mutex_unlock(&fs_info->chunk_mutex); 2711 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2712 error_trans: 2713 if (seeding_dev) 2714 sb->s_flags |= SB_RDONLY; 2715 if (trans) 2716 btrfs_end_transaction(trans); 2717 error_free_device: 2718 btrfs_free_device(device); 2719 error: 2720 blkdev_put(bdev, FMODE_EXCL); 2721 if (seeding_dev && !unlocked) { 2722 mutex_unlock(&uuid_mutex); 2723 up_write(&sb->s_umount); 2724 } 2725 return ret; 2726 } 2727 2728 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2729 struct btrfs_device *device) 2730 { 2731 int ret; 2732 struct btrfs_path *path; 2733 struct btrfs_root *root = device->fs_info->chunk_root; 2734 struct btrfs_dev_item *dev_item; 2735 struct extent_buffer *leaf; 2736 struct btrfs_key key; 2737 2738 path = btrfs_alloc_path(); 2739 if (!path) 2740 return -ENOMEM; 2741 2742 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2743 key.type = BTRFS_DEV_ITEM_KEY; 2744 key.offset = device->devid; 2745 2746 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2747 if (ret < 0) 2748 goto out; 2749 2750 if (ret > 0) { 2751 ret = -ENOENT; 2752 goto out; 2753 } 2754 2755 leaf = path->nodes[0]; 2756 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2757 2758 btrfs_set_device_id(leaf, dev_item, device->devid); 2759 btrfs_set_device_type(leaf, dev_item, device->type); 2760 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2761 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2762 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2763 btrfs_set_device_total_bytes(leaf, dev_item, 2764 btrfs_device_get_disk_total_bytes(device)); 2765 btrfs_set_device_bytes_used(leaf, dev_item, 2766 btrfs_device_get_bytes_used(device)); 2767 btrfs_mark_buffer_dirty(leaf); 2768 2769 out: 2770 btrfs_free_path(path); 2771 return ret; 2772 } 2773 2774 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2775 struct btrfs_device *device, u64 new_size) 2776 { 2777 struct btrfs_fs_info 
*fs_info = device->fs_info; 2778 struct btrfs_super_block *super_copy = fs_info->super_copy; 2779 u64 old_total; 2780 u64 diff; 2781 2782 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2783 return -EACCES; 2784 2785 new_size = round_down(new_size, fs_info->sectorsize); 2786 2787 mutex_lock(&fs_info->chunk_mutex); 2788 old_total = btrfs_super_total_bytes(super_copy); 2789 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2790 2791 if (new_size <= device->total_bytes || 2792 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2793 mutex_unlock(&fs_info->chunk_mutex); 2794 return -EINVAL; 2795 } 2796 2797 btrfs_set_super_total_bytes(super_copy, 2798 round_down(old_total + diff, fs_info->sectorsize)); 2799 device->fs_devices->total_rw_bytes += diff; 2800 2801 btrfs_device_set_total_bytes(device, new_size); 2802 btrfs_device_set_disk_total_bytes(device, new_size); 2803 btrfs_clear_space_info_full(device->fs_info); 2804 if (list_empty(&device->post_commit_list)) 2805 list_add_tail(&device->post_commit_list, 2806 &trans->transaction->dev_update_list); 2807 mutex_unlock(&fs_info->chunk_mutex); 2808 2809 return btrfs_update_device(trans, device); 2810 } 2811 2812 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2813 { 2814 struct btrfs_fs_info *fs_info = trans->fs_info; 2815 struct btrfs_root *root = fs_info->chunk_root; 2816 int ret; 2817 struct btrfs_path *path; 2818 struct btrfs_key key; 2819 2820 path = btrfs_alloc_path(); 2821 if (!path) 2822 return -ENOMEM; 2823 2824 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2825 key.offset = chunk_offset; 2826 key.type = BTRFS_CHUNK_ITEM_KEY; 2827 2828 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2829 if (ret < 0) 2830 goto out; 2831 else if (ret > 0) { /* Logic error or corruption */ 2832 btrfs_handle_fs_error(fs_info, -ENOENT, 2833 "Failed lookup while freeing chunk."); 2834 ret = -ENOENT; 2835 goto out; 2836 } 2837 2838 ret = btrfs_del_item(trans, root, path); 2839 if (ret < 0) 2840 btrfs_handle_fs_error(fs_info, ret, 2841 "Failed to delete chunk item."); 2842 out: 2843 btrfs_free_path(path); 2844 return ret; 2845 } 2846 2847 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2848 { 2849 struct btrfs_super_block *super_copy = fs_info->super_copy; 2850 struct btrfs_disk_key *disk_key; 2851 struct btrfs_chunk *chunk; 2852 u8 *ptr; 2853 int ret = 0; 2854 u32 num_stripes; 2855 u32 array_size; 2856 u32 len = 0; 2857 u32 cur; 2858 struct btrfs_key key; 2859 2860 mutex_lock(&fs_info->chunk_mutex); 2861 array_size = btrfs_super_sys_array_size(super_copy); 2862 2863 ptr = super_copy->sys_chunk_array; 2864 cur = 0; 2865 2866 while (cur < array_size) { 2867 disk_key = (struct btrfs_disk_key *)ptr; 2868 btrfs_disk_key_to_cpu(&key, disk_key); 2869 2870 len = sizeof(*disk_key); 2871 2872 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2873 chunk = (struct btrfs_chunk *)(ptr + len); 2874 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2875 len += btrfs_chunk_item_size(num_stripes); 2876 } else { 2877 ret = -EIO; 2878 break; 2879 } 2880 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2881 key.offset == chunk_offset) { 2882 memmove(ptr, ptr + len, array_size - (cur + len)); 2883 array_size -= len; 2884 btrfs_set_super_sys_array_size(super_copy, array_size); 2885 } else { 2886 ptr += len; 2887 cur += len; 2888 } 2889 } 2890 mutex_unlock(&fs_info->chunk_mutex); 2891 return ret; 2892 } 2893 2894 /* 2895 * btrfs_get_chunk_map() - Find the mapping 
containing the given logical extent. 2896 * @logical: Logical block offset in bytes. 2897 * @length: Length of extent in bytes. 2898 * 2899 * Return: Chunk mapping or ERR_PTR. 2900 */ 2901 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 2902 u64 logical, u64 length) 2903 { 2904 struct extent_map_tree *em_tree; 2905 struct extent_map *em; 2906 2907 em_tree = &fs_info->mapping_tree; 2908 read_lock(&em_tree->lock); 2909 em = lookup_extent_mapping(em_tree, logical, length); 2910 read_unlock(&em_tree->lock); 2911 2912 if (!em) { 2913 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 2914 logical, length); 2915 return ERR_PTR(-EINVAL); 2916 } 2917 2918 if (em->start > logical || em->start + em->len < logical) { 2919 btrfs_crit(fs_info, 2920 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 2921 logical, length, em->start, em->start + em->len); 2922 free_extent_map(em); 2923 return ERR_PTR(-EINVAL); 2924 } 2925 2926 /* callers are responsible for dropping em's ref. */ 2927 return em; 2928 } 2929 2930 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2931 { 2932 struct btrfs_fs_info *fs_info = trans->fs_info; 2933 struct extent_map *em; 2934 struct map_lookup *map; 2935 u64 dev_extent_len = 0; 2936 int i, ret = 0; 2937 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2938 2939 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 2940 if (IS_ERR(em)) { 2941 /* 2942 * This is a logic error, but we don't want to just rely on the 2943 * user having built with ASSERT enabled, so if ASSERT doesn't 2944 * do anything we still error out. 2945 */ 2946 ASSERT(0); 2947 return PTR_ERR(em); 2948 } 2949 map = em->map_lookup; 2950 mutex_lock(&fs_info->chunk_mutex); 2951 check_system_chunk(trans, map->type); 2952 mutex_unlock(&fs_info->chunk_mutex); 2953 2954 /* 2955 * Take the device list mutex to prevent races with the final phase of 2956 * a device replace operation that replaces the device object associated 2957 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). 
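 * Without it the replace-finish code could swap the btrfs_device that
 * map->stripes[i].dev points to while we are still using it below for
 * btrfs_free_dev_extent() and btrfs_update_device().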
2958 */ 2959 mutex_lock(&fs_devices->device_list_mutex); 2960 for (i = 0; i < map->num_stripes; i++) { 2961 struct btrfs_device *device = map->stripes[i].dev; 2962 ret = btrfs_free_dev_extent(trans, device, 2963 map->stripes[i].physical, 2964 &dev_extent_len); 2965 if (ret) { 2966 mutex_unlock(&fs_devices->device_list_mutex); 2967 btrfs_abort_transaction(trans, ret); 2968 goto out; 2969 } 2970 2971 if (device->bytes_used > 0) { 2972 mutex_lock(&fs_info->chunk_mutex); 2973 btrfs_device_set_bytes_used(device, 2974 device->bytes_used - dev_extent_len); 2975 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 2976 btrfs_clear_space_info_full(fs_info); 2977 mutex_unlock(&fs_info->chunk_mutex); 2978 } 2979 2980 ret = btrfs_update_device(trans, device); 2981 if (ret) { 2982 mutex_unlock(&fs_devices->device_list_mutex); 2983 btrfs_abort_transaction(trans, ret); 2984 goto out; 2985 } 2986 } 2987 mutex_unlock(&fs_devices->device_list_mutex); 2988 2989 ret = btrfs_free_chunk(trans, chunk_offset); 2990 if (ret) { 2991 btrfs_abort_transaction(trans, ret); 2992 goto out; 2993 } 2994 2995 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 2996 2997 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2998 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 2999 if (ret) { 3000 btrfs_abort_transaction(trans, ret); 3001 goto out; 3002 } 3003 } 3004 3005 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3006 if (ret) { 3007 btrfs_abort_transaction(trans, ret); 3008 goto out; 3009 } 3010 3011 out: 3012 /* once for us */ 3013 free_extent_map(em); 3014 return ret; 3015 } 3016 3017 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3018 { 3019 struct btrfs_root *root = fs_info->chunk_root; 3020 struct btrfs_trans_handle *trans; 3021 struct btrfs_block_group *block_group; 3022 int ret; 3023 3024 /* 3025 * Prevent races with automatic removal of unused block groups. 3026 * After we relocate and before we remove the chunk with offset 3027 * chunk_offset, automatic removal of the block group can kick in, 3028 * resulting in a failure when calling btrfs_remove_chunk() below. 3029 * 3030 * Make sure to acquire this mutex before doing a tree search (dev 3031 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3032 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3033 * we release the path used to search the chunk/dev tree and before 3034 * the current task acquires this mutex and calls us. 
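 * The expected pattern, as used by btrfs_relocate_sys_chunks() and
 * __btrfs_balance() below, is therefore:
 *
 *	mutex_lock(&fs_info->delete_unused_bgs_mutex);
 *	btrfs_search_slot(...);          find the next chunk item
 *	btrfs_relocate_chunk(fs_info, chunk_offset);
 *	mutex_unlock(&fs_info->delete_unused_bgs_mutex);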
3035 */ 3036 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); 3037 3038 /* step one, relocate all the extents inside this chunk */ 3039 btrfs_scrub_pause(fs_info); 3040 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3041 btrfs_scrub_continue(fs_info); 3042 if (ret) 3043 return ret; 3044 3045 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3046 if (!block_group) 3047 return -ENOENT; 3048 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3049 btrfs_put_block_group(block_group); 3050 3051 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3052 chunk_offset); 3053 if (IS_ERR(trans)) { 3054 ret = PTR_ERR(trans); 3055 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3056 return ret; 3057 } 3058 3059 /* 3060 * step two, delete the device extents and the 3061 * chunk tree entries 3062 */ 3063 ret = btrfs_remove_chunk(trans, chunk_offset); 3064 btrfs_end_transaction(trans); 3065 return ret; 3066 } 3067 3068 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3069 { 3070 struct btrfs_root *chunk_root = fs_info->chunk_root; 3071 struct btrfs_path *path; 3072 struct extent_buffer *leaf; 3073 struct btrfs_chunk *chunk; 3074 struct btrfs_key key; 3075 struct btrfs_key found_key; 3076 u64 chunk_type; 3077 bool retried = false; 3078 int failed = 0; 3079 int ret; 3080 3081 path = btrfs_alloc_path(); 3082 if (!path) 3083 return -ENOMEM; 3084 3085 again: 3086 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3087 key.offset = (u64)-1; 3088 key.type = BTRFS_CHUNK_ITEM_KEY; 3089 3090 while (1) { 3091 mutex_lock(&fs_info->delete_unused_bgs_mutex); 3092 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3093 if (ret < 0) { 3094 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3095 goto error; 3096 } 3097 BUG_ON(ret == 0); /* Corruption */ 3098 3099 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3100 key.type); 3101 if (ret) 3102 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3103 if (ret < 0) 3104 goto error; 3105 if (ret > 0) 3106 break; 3107 3108 leaf = path->nodes[0]; 3109 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3110 3111 chunk = btrfs_item_ptr(leaf, path->slots[0], 3112 struct btrfs_chunk); 3113 chunk_type = btrfs_chunk_type(leaf, chunk); 3114 btrfs_release_path(path); 3115 3116 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3117 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3118 if (ret == -ENOSPC) 3119 failed++; 3120 else 3121 BUG_ON(ret); 3122 } 3123 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3124 3125 if (found_key.offset == 0) 3126 break; 3127 key.offset = found_key.offset - 1; 3128 } 3129 ret = 0; 3130 if (failed && !retried) { 3131 failed = 0; 3132 retried = true; 3133 goto again; 3134 } else if (WARN_ON(failed && retried)) { 3135 ret = -ENOSPC; 3136 } 3137 error: 3138 btrfs_free_path(path); 3139 return ret; 3140 } 3141 3142 /* 3143 * return 1 : allocate a data chunk successfully, 3144 * return <0: errors during allocating a data chunk, 3145 * return 0 : no need to allocate a data chunk. 
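 * This is called before relocating a chunk: if the chunk is a data chunk
 * and no data bytes are in use at all, force-allocate one empty DATA chunk
 * first so the filesystem does not lose the data raid profile when the
 * last (empty) data chunk is relocated and removed.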
3146 */ 3147 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3148 u64 chunk_offset) 3149 { 3150 struct btrfs_block_group *cache; 3151 u64 bytes_used; 3152 u64 chunk_type; 3153 3154 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3155 ASSERT(cache); 3156 chunk_type = cache->flags; 3157 btrfs_put_block_group(cache); 3158 3159 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3160 return 0; 3161 3162 spin_lock(&fs_info->data_sinfo->lock); 3163 bytes_used = fs_info->data_sinfo->bytes_used; 3164 spin_unlock(&fs_info->data_sinfo->lock); 3165 3166 if (!bytes_used) { 3167 struct btrfs_trans_handle *trans; 3168 int ret; 3169 3170 trans = btrfs_join_transaction(fs_info->tree_root); 3171 if (IS_ERR(trans)) 3172 return PTR_ERR(trans); 3173 3174 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3175 btrfs_end_transaction(trans); 3176 if (ret < 0) 3177 return ret; 3178 return 1; 3179 } 3180 3181 return 0; 3182 } 3183 3184 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3185 struct btrfs_balance_control *bctl) 3186 { 3187 struct btrfs_root *root = fs_info->tree_root; 3188 struct btrfs_trans_handle *trans; 3189 struct btrfs_balance_item *item; 3190 struct btrfs_disk_balance_args disk_bargs; 3191 struct btrfs_path *path; 3192 struct extent_buffer *leaf; 3193 struct btrfs_key key; 3194 int ret, err; 3195 3196 path = btrfs_alloc_path(); 3197 if (!path) 3198 return -ENOMEM; 3199 3200 trans = btrfs_start_transaction(root, 0); 3201 if (IS_ERR(trans)) { 3202 btrfs_free_path(path); 3203 return PTR_ERR(trans); 3204 } 3205 3206 key.objectid = BTRFS_BALANCE_OBJECTID; 3207 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3208 key.offset = 0; 3209 3210 ret = btrfs_insert_empty_item(trans, root, path, &key, 3211 sizeof(*item)); 3212 if (ret) 3213 goto out; 3214 3215 leaf = path->nodes[0]; 3216 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3217 3218 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3219 3220 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3221 btrfs_set_balance_data(leaf, item, &disk_bargs); 3222 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3223 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3224 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3225 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3226 3227 btrfs_set_balance_flags(leaf, item, bctl->flags); 3228 3229 btrfs_mark_buffer_dirty(leaf); 3230 out: 3231 btrfs_free_path(path); 3232 err = btrfs_commit_transaction(trans); 3233 if (err && !ret) 3234 ret = err; 3235 return ret; 3236 } 3237 3238 static int del_balance_item(struct btrfs_fs_info *fs_info) 3239 { 3240 struct btrfs_root *root = fs_info->tree_root; 3241 struct btrfs_trans_handle *trans; 3242 struct btrfs_path *path; 3243 struct btrfs_key key; 3244 int ret, err; 3245 3246 path = btrfs_alloc_path(); 3247 if (!path) 3248 return -ENOMEM; 3249 3250 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3251 if (IS_ERR(trans)) { 3252 btrfs_free_path(path); 3253 return PTR_ERR(trans); 3254 } 3255 3256 key.objectid = BTRFS_BALANCE_OBJECTID; 3257 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3258 key.offset = 0; 3259 3260 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3261 if (ret < 0) 3262 goto out; 3263 if (ret > 0) { 3264 ret = -ENOENT; 3265 goto out; 3266 } 3267 3268 ret = btrfs_del_item(trans, root, path); 3269 out: 3270 btrfs_free_path(path); 3271 err = btrfs_commit_transaction(trans); 3272 if (err && !ret) 3273 ret = err; 3274 return ret; 3275 } 3276 3277 /* 3278 * This is a 
heuristic used to reduce the number of chunks balanced on 3279 * resume after balance was interrupted. 3280 */ 3281 static void update_balance_args(struct btrfs_balance_control *bctl) 3282 { 3283 /* 3284 * Turn on soft mode for chunk types that were being converted. 3285 */ 3286 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3287 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3288 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3289 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3290 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3291 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3292 3293 /* 3294 * Turn on usage filter if is not already used. The idea is 3295 * that chunks that we have already balanced should be 3296 * reasonably full. Don't do it for chunks that are being 3297 * converted - that will keep us from relocating unconverted 3298 * (albeit full) chunks. 3299 */ 3300 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3301 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3302 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3303 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3304 bctl->data.usage = 90; 3305 } 3306 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3307 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3308 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3309 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3310 bctl->sys.usage = 90; 3311 } 3312 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3313 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3314 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3315 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3316 bctl->meta.usage = 90; 3317 } 3318 } 3319 3320 /* 3321 * Clear the balance status in fs_info and delete the balance item from disk. 3322 */ 3323 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3324 { 3325 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3326 int ret; 3327 3328 BUG_ON(!fs_info->balance_ctl); 3329 3330 spin_lock(&fs_info->balance_lock); 3331 fs_info->balance_ctl = NULL; 3332 spin_unlock(&fs_info->balance_lock); 3333 3334 kfree(bctl); 3335 ret = del_balance_item(fs_info); 3336 if (ret) 3337 btrfs_handle_fs_error(fs_info, ret, NULL); 3338 } 3339 3340 /* 3341 * Balance filters. Return 1 if chunk should be filtered out 3342 * (should not be balanced). 
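 * Convention for all the chunk_*_filter() helpers below: 0 means the chunk
 * passes the filter (it may be balanced), 1 means it is filtered out.
 * should_balance_chunk() combines them, so e.g. with both the usage and
 * devid filters enabled a chunk is balanced only if both helpers return 0.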
3343 */ 3344 static int chunk_profiles_filter(u64 chunk_type, 3345 struct btrfs_balance_args *bargs) 3346 { 3347 chunk_type = chunk_to_extended(chunk_type) & 3348 BTRFS_EXTENDED_PROFILE_MASK; 3349 3350 if (bargs->profiles & chunk_type) 3351 return 0; 3352 3353 return 1; 3354 } 3355 3356 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3357 struct btrfs_balance_args *bargs) 3358 { 3359 struct btrfs_block_group *cache; 3360 u64 chunk_used; 3361 u64 user_thresh_min; 3362 u64 user_thresh_max; 3363 int ret = 1; 3364 3365 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3366 chunk_used = cache->used; 3367 3368 if (bargs->usage_min == 0) 3369 user_thresh_min = 0; 3370 else 3371 user_thresh_min = div_factor_fine(cache->length, 3372 bargs->usage_min); 3373 3374 if (bargs->usage_max == 0) 3375 user_thresh_max = 1; 3376 else if (bargs->usage_max > 100) 3377 user_thresh_max = cache->length; 3378 else 3379 user_thresh_max = div_factor_fine(cache->length, 3380 bargs->usage_max); 3381 3382 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3383 ret = 0; 3384 3385 btrfs_put_block_group(cache); 3386 return ret; 3387 } 3388 3389 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3390 u64 chunk_offset, struct btrfs_balance_args *bargs) 3391 { 3392 struct btrfs_block_group *cache; 3393 u64 chunk_used, user_thresh; 3394 int ret = 1; 3395 3396 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3397 chunk_used = cache->used; 3398 3399 if (bargs->usage_min == 0) 3400 user_thresh = 1; 3401 else if (bargs->usage > 100) 3402 user_thresh = cache->length; 3403 else 3404 user_thresh = div_factor_fine(cache->length, bargs->usage); 3405 3406 if (chunk_used < user_thresh) 3407 ret = 0; 3408 3409 btrfs_put_block_group(cache); 3410 return ret; 3411 } 3412 3413 static int chunk_devid_filter(struct extent_buffer *leaf, 3414 struct btrfs_chunk *chunk, 3415 struct btrfs_balance_args *bargs) 3416 { 3417 struct btrfs_stripe *stripe; 3418 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3419 int i; 3420 3421 for (i = 0; i < num_stripes; i++) { 3422 stripe = btrfs_stripe_nr(chunk, i); 3423 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3424 return 0; 3425 } 3426 3427 return 1; 3428 } 3429 3430 static u64 calc_data_stripes(u64 type, int num_stripes) 3431 { 3432 const int index = btrfs_bg_flags_to_raid_index(type); 3433 const int ncopies = btrfs_raid_array[index].ncopies; 3434 const int nparity = btrfs_raid_array[index].nparity; 3435 3436 if (nparity) 3437 return num_stripes - nparity; 3438 else 3439 return num_stripes / ncopies; 3440 } 3441 3442 /* [pstart, pend) */ 3443 static int chunk_drange_filter(struct extent_buffer *leaf, 3444 struct btrfs_chunk *chunk, 3445 struct btrfs_balance_args *bargs) 3446 { 3447 struct btrfs_stripe *stripe; 3448 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3449 u64 stripe_offset; 3450 u64 stripe_length; 3451 u64 type; 3452 int factor; 3453 int i; 3454 3455 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3456 return 0; 3457 3458 type = btrfs_chunk_type(leaf, chunk); 3459 factor = calc_data_stripes(type, num_stripes); 3460 3461 for (i = 0; i < num_stripes; i++) { 3462 stripe = btrfs_stripe_nr(chunk, i); 3463 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3464 continue; 3465 3466 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3467 stripe_length = btrfs_chunk_length(leaf, chunk); 3468 stripe_length = div_u64(stripe_length, factor); 3469 3470 if (stripe_offset < bargs->pend && 3471 stripe_offset 
+ stripe_length > bargs->pstart) 3472 return 0; 3473 } 3474 3475 return 1; 3476 } 3477 3478 /* [vstart, vend) */ 3479 static int chunk_vrange_filter(struct extent_buffer *leaf, 3480 struct btrfs_chunk *chunk, 3481 u64 chunk_offset, 3482 struct btrfs_balance_args *bargs) 3483 { 3484 if (chunk_offset < bargs->vend && 3485 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3486 /* at least part of the chunk is inside this vrange */ 3487 return 0; 3488 3489 return 1; 3490 } 3491 3492 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3493 struct btrfs_chunk *chunk, 3494 struct btrfs_balance_args *bargs) 3495 { 3496 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3497 3498 if (bargs->stripes_min <= num_stripes 3499 && num_stripes <= bargs->stripes_max) 3500 return 0; 3501 3502 return 1; 3503 } 3504 3505 static int chunk_soft_convert_filter(u64 chunk_type, 3506 struct btrfs_balance_args *bargs) 3507 { 3508 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3509 return 0; 3510 3511 chunk_type = chunk_to_extended(chunk_type) & 3512 BTRFS_EXTENDED_PROFILE_MASK; 3513 3514 if (bargs->target == chunk_type) 3515 return 1; 3516 3517 return 0; 3518 } 3519 3520 static int should_balance_chunk(struct extent_buffer *leaf, 3521 struct btrfs_chunk *chunk, u64 chunk_offset) 3522 { 3523 struct btrfs_fs_info *fs_info = leaf->fs_info; 3524 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3525 struct btrfs_balance_args *bargs = NULL; 3526 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3527 3528 /* type filter */ 3529 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3530 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3531 return 0; 3532 } 3533 3534 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3535 bargs = &bctl->data; 3536 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3537 bargs = &bctl->sys; 3538 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3539 bargs = &bctl->meta; 3540 3541 /* profiles filter */ 3542 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3543 chunk_profiles_filter(chunk_type, bargs)) { 3544 return 0; 3545 } 3546 3547 /* usage filter */ 3548 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3549 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3550 return 0; 3551 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3552 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3553 return 0; 3554 } 3555 3556 /* devid filter */ 3557 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3558 chunk_devid_filter(leaf, chunk, bargs)) { 3559 return 0; 3560 } 3561 3562 /* drange filter, makes sense only with devid filter */ 3563 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3564 chunk_drange_filter(leaf, chunk, bargs)) { 3565 return 0; 3566 } 3567 3568 /* vrange filter */ 3569 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3570 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3571 return 0; 3572 } 3573 3574 /* stripes filter */ 3575 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3576 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3577 return 0; 3578 } 3579 3580 /* soft profile changing mode */ 3581 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3582 chunk_soft_convert_filter(chunk_type, bargs)) { 3583 return 0; 3584 } 3585 3586 /* 3587 * limited by count, must be the last filter 3588 */ 3589 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3590 if (bargs->limit == 0) 3591 return 0; 3592 else 3593 bargs->limit--; 3594 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3595 /* 3596 * Same logic as the 'limit' 
filter; the minimum cannot be
3597 * determined here because we do not have the global information
3598 * about the count of all chunks that satisfy the filters.
3599 */
3600 if (bargs->limit_max == 0)
3601 return 0;
3602 else
3603 bargs->limit_max--;
3604 }
3605
3606 return 1;
3607 }
3608
3609 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3610 {
3611 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3612 struct btrfs_root *chunk_root = fs_info->chunk_root;
3613 u64 chunk_type;
3614 struct btrfs_chunk *chunk;
3615 struct btrfs_path *path = NULL;
3616 struct btrfs_key key;
3617 struct btrfs_key found_key;
3618 struct extent_buffer *leaf;
3619 int slot;
3620 int ret;
3621 int enospc_errors = 0;
3622 bool counting = true;
3623 /* The single value limit and min/max limits use the same bytes in the args (a union), save them before the counting pass decrements them */
3624 u64 limit_data = bctl->data.limit;
3625 u64 limit_meta = bctl->meta.limit;
3626 u64 limit_sys = bctl->sys.limit;
3627 u32 count_data = 0;
3628 u32 count_meta = 0;
3629 u32 count_sys = 0;
3630 int chunk_reserved = 0;
3631
3632 path = btrfs_alloc_path();
3633 if (!path) {
3634 ret = -ENOMEM;
3635 goto error;
3636 }
3637
3638 /* zero out stat counters */
3639 spin_lock(&fs_info->balance_lock);
3640 memset(&bctl->stat, 0, sizeof(bctl->stat));
3641 spin_unlock(&fs_info->balance_lock);
3642 again:
3643 if (!counting) {
3644 /*
3645 * The single value limit and min/max limits use the same bytes
3646 * in the args, so restore the values saved above for the real pass.
3647 */
3648 bctl->data.limit = limit_data;
3649 bctl->meta.limit = limit_meta;
3650 bctl->sys.limit = limit_sys;
3651 }
3652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3653 key.offset = (u64)-1;
3654 key.type = BTRFS_CHUNK_ITEM_KEY;
3655
3656 while (1) {
3657 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3658 atomic_read(&fs_info->balance_cancel_req)) {
3659 ret = -ECANCELED;
3660 goto error;
3661 }
3662
3663 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3664 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3665 if (ret < 0) {
3666 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3667 goto error;
3668 }
3669
3670 /*
3671 * this shouldn't happen, it means the last relocate
3672 * failed
3673 */
3674 if (ret == 0)
3675 BUG(); /* FIXME break ?
*/ 3676 3677 ret = btrfs_previous_item(chunk_root, path, 0, 3678 BTRFS_CHUNK_ITEM_KEY); 3679 if (ret) { 3680 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3681 ret = 0; 3682 break; 3683 } 3684 3685 leaf = path->nodes[0]; 3686 slot = path->slots[0]; 3687 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3688 3689 if (found_key.objectid != key.objectid) { 3690 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3691 break; 3692 } 3693 3694 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3695 chunk_type = btrfs_chunk_type(leaf, chunk); 3696 3697 if (!counting) { 3698 spin_lock(&fs_info->balance_lock); 3699 bctl->stat.considered++; 3700 spin_unlock(&fs_info->balance_lock); 3701 } 3702 3703 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3704 3705 btrfs_release_path(path); 3706 if (!ret) { 3707 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3708 goto loop; 3709 } 3710 3711 if (counting) { 3712 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3713 spin_lock(&fs_info->balance_lock); 3714 bctl->stat.expected++; 3715 spin_unlock(&fs_info->balance_lock); 3716 3717 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3718 count_data++; 3719 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3720 count_sys++; 3721 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3722 count_meta++; 3723 3724 goto loop; 3725 } 3726 3727 /* 3728 * Apply limit_min filter, no need to check if the LIMITS 3729 * filter is used, limit_min is 0 by default 3730 */ 3731 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3732 count_data < bctl->data.limit_min) 3733 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3734 count_meta < bctl->meta.limit_min) 3735 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3736 count_sys < bctl->sys.limit_min)) { 3737 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3738 goto loop; 3739 } 3740 3741 if (!chunk_reserved) { 3742 /* 3743 * We may be relocating the only data chunk we have, 3744 * which could potentially end up with losing data's 3745 * raid profile, so lets allocate an empty one in 3746 * advance. 3747 */ 3748 ret = btrfs_may_alloc_data_chunk(fs_info, 3749 found_key.offset); 3750 if (ret < 0) { 3751 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3752 goto error; 3753 } else if (ret == 1) { 3754 chunk_reserved = 1; 3755 } 3756 } 3757 3758 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3759 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3760 if (ret == -ENOSPC) { 3761 enospc_errors++; 3762 } else if (ret == -ETXTBSY) { 3763 btrfs_info(fs_info, 3764 "skipping relocation of block group %llu due to active swapfile", 3765 found_key.offset); 3766 ret = 0; 3767 } else if (ret) { 3768 goto error; 3769 } else { 3770 spin_lock(&fs_info->balance_lock); 3771 bctl->stat.completed++; 3772 spin_unlock(&fs_info->balance_lock); 3773 } 3774 loop: 3775 if (found_key.offset == 0) 3776 break; 3777 key.offset = found_key.offset - 1; 3778 } 3779 3780 if (counting) { 3781 btrfs_release_path(path); 3782 counting = false; 3783 goto again; 3784 } 3785 error: 3786 btrfs_free_path(path); 3787 if (enospc_errors) { 3788 btrfs_info(fs_info, "%d enospc errors during balance", 3789 enospc_errors); 3790 if (!ret) 3791 ret = -ENOSPC; 3792 } 3793 3794 return ret; 3795 } 3796 3797 /** 3798 * alloc_profile_is_valid - see if a given profile is valid and reduced 3799 * @flags: profile to validate 3800 * @extended: if true @flags is treated as an extended profile 3801 */ 3802 static int alloc_profile_is_valid(u64 flags, int extended) 3803 { 3804 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 3805 BTRFS_BLOCK_GROUP_PROFILE_MASK); 3806 3807 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 3808 3809 /* 1) check that all other bits are zeroed */ 3810 if (flags & ~mask) 3811 return 0; 3812 3813 /* 2) see if profile is reduced */ 3814 if (flags == 0) 3815 return !extended; /* "0" is valid for usual profiles */ 3816 3817 return has_single_bit_set(flags); 3818 } 3819 3820 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 3821 { 3822 /* cancel requested || normal exit path */ 3823 return atomic_read(&fs_info->balance_cancel_req) || 3824 (atomic_read(&fs_info->balance_pause_req) == 0 && 3825 atomic_read(&fs_info->balance_cancel_req) == 0); 3826 } 3827 3828 /* 3829 * Validate target profile against allowed profiles and return true if it's OK. 3830 * Otherwise print the error message and return false. 3831 */ 3832 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 3833 const struct btrfs_balance_args *bargs, 3834 u64 allowed, const char *type) 3835 { 3836 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3837 return true; 3838 3839 /* Profile is valid and does not have bits outside of the allowed set */ 3840 if (alloc_profile_is_valid(bargs->target, 1) && 3841 (bargs->target & ~allowed) == 0) 3842 return true; 3843 3844 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 3845 type, btrfs_bg_type_to_raid_name(bargs->target)); 3846 return false; 3847 } 3848 3849 /* 3850 * Fill @buf with textual description of balance filter flags @bargs, up to 3851 * @size_buf including the terminating null. The output may be trimmed if it 3852 * does not fit into the provided buffer. 3853 */ 3854 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 3855 u32 size_buf) 3856 { 3857 int ret; 3858 u32 size_bp = size_buf; 3859 char *bp = buf; 3860 u64 flags = bargs->flags; 3861 char tmp_buf[128] = {'\0'}; 3862 3863 if (!flags) 3864 return; 3865 3866 #define CHECK_APPEND_NOARG(a) \ 3867 do { \ 3868 ret = snprintf(bp, size_bp, (a)); \ 3869 if (ret < 0 || ret >= size_bp) \ 3870 goto out_overflow; \ 3871 size_bp -= ret; \ 3872 bp += ret; \ 3873 } while (0) 3874 3875 #define CHECK_APPEND_1ARG(a, v1) \ 3876 do { \ 3877 ret = snprintf(bp, size_bp, (a), (v1)); \ 3878 if (ret < 0 || ret >= size_bp) \ 3879 goto out_overflow; \ 3880 size_bp -= ret; \ 3881 bp += ret; \ 3882 } while (0) 3883 3884 #define CHECK_APPEND_2ARG(a, v1, v2) \ 3885 do { \ 3886 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 3887 if (ret < 0 || ret >= size_bp) \ 3888 goto out_overflow; \ 3889 size_bp -= ret; \ 3890 bp += ret; \ 3891 } while (0) 3892 3893 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 3894 CHECK_APPEND_1ARG("convert=%s,", 3895 btrfs_bg_type_to_raid_name(bargs->target)); 3896 3897 if (flags & BTRFS_BALANCE_ARGS_SOFT) 3898 CHECK_APPEND_NOARG("soft,"); 3899 3900 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 3901 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 3902 sizeof(tmp_buf)); 3903 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 3904 } 3905 3906 if (flags & BTRFS_BALANCE_ARGS_USAGE) 3907 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 3908 3909 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 3910 CHECK_APPEND_2ARG("usage=%u..%u,", 3911 bargs->usage_min, bargs->usage_max); 3912 3913 if (flags & BTRFS_BALANCE_ARGS_DEVID) 3914 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 3915 3916 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 3917 CHECK_APPEND_2ARG("drange=%llu..%llu,", 3918 bargs->pstart, bargs->pend); 3919 3920 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 3921 
CHECK_APPEND_2ARG("vrange=%llu..%llu,", 3922 bargs->vstart, bargs->vend); 3923 3924 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 3925 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 3926 3927 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 3928 CHECK_APPEND_2ARG("limit=%u..%u,", 3929 bargs->limit_min, bargs->limit_max); 3930 3931 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 3932 CHECK_APPEND_2ARG("stripes=%u..%u,", 3933 bargs->stripes_min, bargs->stripes_max); 3934 3935 #undef CHECK_APPEND_2ARG 3936 #undef CHECK_APPEND_1ARG 3937 #undef CHECK_APPEND_NOARG 3938 3939 out_overflow: 3940 3941 if (size_bp < size_buf) 3942 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 3943 else 3944 buf[0] = '\0'; 3945 } 3946 3947 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 3948 { 3949 u32 size_buf = 1024; 3950 char tmp_buf[192] = {'\0'}; 3951 char *buf; 3952 char *bp; 3953 u32 size_bp = size_buf; 3954 int ret; 3955 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3956 3957 buf = kzalloc(size_buf, GFP_KERNEL); 3958 if (!buf) 3959 return; 3960 3961 bp = buf; 3962 3963 #define CHECK_APPEND_1ARG(a, v1) \ 3964 do { \ 3965 ret = snprintf(bp, size_bp, (a), (v1)); \ 3966 if (ret < 0 || ret >= size_bp) \ 3967 goto out_overflow; \ 3968 size_bp -= ret; \ 3969 bp += ret; \ 3970 } while (0) 3971 3972 if (bctl->flags & BTRFS_BALANCE_FORCE) 3973 CHECK_APPEND_1ARG("%s", "-f "); 3974 3975 if (bctl->flags & BTRFS_BALANCE_DATA) { 3976 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 3977 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 3978 } 3979 3980 if (bctl->flags & BTRFS_BALANCE_METADATA) { 3981 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 3982 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 3983 } 3984 3985 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 3986 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 3987 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 3988 } 3989 3990 #undef CHECK_APPEND_1ARG 3991 3992 out_overflow: 3993 3994 if (size_bp < size_buf) 3995 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 3996 btrfs_info(fs_info, "balance: %s %s", 3997 (bctl->flags & BTRFS_BALANCE_RESUME) ? 3998 "resume" : "start", buf); 3999 4000 kfree(buf); 4001 } 4002 4003 /* 4004 * Should be called with balance mutexe held 4005 */ 4006 int btrfs_balance(struct btrfs_fs_info *fs_info, 4007 struct btrfs_balance_control *bctl, 4008 struct btrfs_ioctl_balance_args *bargs) 4009 { 4010 u64 meta_target, data_target; 4011 u64 allowed; 4012 int mixed = 0; 4013 int ret; 4014 u64 num_devices; 4015 unsigned seq; 4016 bool reducing_redundancy; 4017 int i; 4018 4019 if (btrfs_fs_closing(fs_info) || 4020 atomic_read(&fs_info->balance_pause_req) || 4021 btrfs_should_cancel_balance(fs_info)) { 4022 ret = -EINVAL; 4023 goto out; 4024 } 4025 4026 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4027 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4028 mixed = 1; 4029 4030 /* 4031 * In case of mixed groups both data and meta should be picked, 4032 * and identical options should be given for both of them. 
4033 */ 4034 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4035 if (mixed && (bctl->flags & allowed)) { 4036 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4037 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4038 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4039 btrfs_err(fs_info, 4040 "balance: mixed groups data and metadata options must be the same"); 4041 ret = -EINVAL; 4042 goto out; 4043 } 4044 } 4045 4046 /* 4047 * rw_devices will not change at the moment, device add/delete/replace 4048 * are excluded by EXCL_OP 4049 */ 4050 num_devices = fs_info->fs_devices->rw_devices; 4051 4052 /* 4053 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4054 * special bit for it, to make it easier to distinguish. Thus we need 4055 * to set it manually, or balance would refuse the profile. 4056 */ 4057 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4058 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4059 if (num_devices >= btrfs_raid_array[i].devs_min) 4060 allowed |= btrfs_raid_array[i].bg_flag; 4061 4062 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4063 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4064 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4065 ret = -EINVAL; 4066 goto out; 4067 } 4068 4069 /* 4070 * Allow to reduce metadata or system integrity only if force set for 4071 * profiles with redundancy (copies, parity) 4072 */ 4073 allowed = 0; 4074 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4075 if (btrfs_raid_array[i].ncopies >= 2 || 4076 btrfs_raid_array[i].tolerated_failures >= 1) 4077 allowed |= btrfs_raid_array[i].bg_flag; 4078 } 4079 do { 4080 seq = read_seqbegin(&fs_info->profiles_lock); 4081 4082 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4083 (fs_info->avail_system_alloc_bits & allowed) && 4084 !(bctl->sys.target & allowed)) || 4085 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4086 (fs_info->avail_metadata_alloc_bits & allowed) && 4087 !(bctl->meta.target & allowed))) 4088 reducing_redundancy = true; 4089 else 4090 reducing_redundancy = false; 4091 4092 /* if we're not converting, the target field is uninitialized */ 4093 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4094 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4095 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
4096 bctl->data.target : fs_info->avail_data_alloc_bits; 4097 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4098 4099 if (reducing_redundancy) { 4100 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4101 btrfs_info(fs_info, 4102 "balance: force reducing metadata redundancy"); 4103 } else { 4104 btrfs_err(fs_info, 4105 "balance: reduces metadata redundancy, use --force if you want this"); 4106 ret = -EINVAL; 4107 goto out; 4108 } 4109 } 4110 4111 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4112 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4113 btrfs_warn(fs_info, 4114 "balance: metadata profile %s has lower redundancy than data profile %s", 4115 btrfs_bg_type_to_raid_name(meta_target), 4116 btrfs_bg_type_to_raid_name(data_target)); 4117 } 4118 4119 if (fs_info->send_in_progress) { 4120 btrfs_warn_rl(fs_info, 4121 "cannot run balance while send operations are in progress (%d in progress)", 4122 fs_info->send_in_progress); 4123 ret = -EAGAIN; 4124 goto out; 4125 } 4126 4127 ret = insert_balance_item(fs_info, bctl); 4128 if (ret && ret != -EEXIST) 4129 goto out; 4130 4131 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4132 BUG_ON(ret == -EEXIST); 4133 BUG_ON(fs_info->balance_ctl); 4134 spin_lock(&fs_info->balance_lock); 4135 fs_info->balance_ctl = bctl; 4136 spin_unlock(&fs_info->balance_lock); 4137 } else { 4138 BUG_ON(ret != -EEXIST); 4139 spin_lock(&fs_info->balance_lock); 4140 update_balance_args(bctl); 4141 spin_unlock(&fs_info->balance_lock); 4142 } 4143 4144 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4145 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4146 describe_balance_start_or_resume(fs_info); 4147 mutex_unlock(&fs_info->balance_mutex); 4148 4149 ret = __btrfs_balance(fs_info); 4150 4151 mutex_lock(&fs_info->balance_mutex); 4152 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) 4153 btrfs_info(fs_info, "balance: paused"); 4154 /* 4155 * Balance can be canceled by: 4156 * 4157 * - Regular cancel request 4158 * Then ret == -ECANCELED and balance_cancel_req > 0 4159 * 4160 * - Fatal signal to "btrfs" process 4161 * Either the signal caught by wait_reserve_ticket() and callers 4162 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4163 * got -ECANCELED. 4164 * Either way, in this case balance_cancel_req = 0, and 4165 * ret == -EINTR or ret == -ECANCELED. 4166 * 4167 * So here we only check the return value to catch canceled balance. 
4168 */ 4169 else if (ret == -ECANCELED || ret == -EINTR) 4170 btrfs_info(fs_info, "balance: canceled"); 4171 else 4172 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4173 4174 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4175 4176 if (bargs) { 4177 memset(bargs, 0, sizeof(*bargs)); 4178 btrfs_update_ioctl_balance_args(fs_info, bargs); 4179 } 4180 4181 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4182 balance_need_close(fs_info)) { 4183 reset_balance_state(fs_info); 4184 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4185 } 4186 4187 wake_up(&fs_info->balance_wait_q); 4188 4189 return ret; 4190 out: 4191 if (bctl->flags & BTRFS_BALANCE_RESUME) 4192 reset_balance_state(fs_info); 4193 else 4194 kfree(bctl); 4195 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4196 4197 return ret; 4198 } 4199 4200 static int balance_kthread(void *data) 4201 { 4202 struct btrfs_fs_info *fs_info = data; 4203 int ret = 0; 4204 4205 mutex_lock(&fs_info->balance_mutex); 4206 if (fs_info->balance_ctl) 4207 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4208 mutex_unlock(&fs_info->balance_mutex); 4209 4210 return ret; 4211 } 4212 4213 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4214 { 4215 struct task_struct *tsk; 4216 4217 mutex_lock(&fs_info->balance_mutex); 4218 if (!fs_info->balance_ctl) { 4219 mutex_unlock(&fs_info->balance_mutex); 4220 return 0; 4221 } 4222 mutex_unlock(&fs_info->balance_mutex); 4223 4224 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4225 btrfs_info(fs_info, "balance: resume skipped"); 4226 return 0; 4227 } 4228 4229 /* 4230 * A ro->rw remount sequence should continue with the paused balance 4231 * regardless of who pauses it, system or the user as of now, so set 4232 * the resume flag. 4233 */ 4234 spin_lock(&fs_info->balance_lock); 4235 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4236 spin_unlock(&fs_info->balance_lock); 4237 4238 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4239 return PTR_ERR_OR_ZERO(tsk); 4240 } 4241 4242 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4243 { 4244 struct btrfs_balance_control *bctl; 4245 struct btrfs_balance_item *item; 4246 struct btrfs_disk_balance_args disk_bargs; 4247 struct btrfs_path *path; 4248 struct extent_buffer *leaf; 4249 struct btrfs_key key; 4250 int ret; 4251 4252 path = btrfs_alloc_path(); 4253 if (!path) 4254 return -ENOMEM; 4255 4256 key.objectid = BTRFS_BALANCE_OBJECTID; 4257 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4258 key.offset = 0; 4259 4260 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4261 if (ret < 0) 4262 goto out; 4263 if (ret > 0) { /* ret = -ENOENT; */ 4264 ret = 0; 4265 goto out; 4266 } 4267 4268 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4269 if (!bctl) { 4270 ret = -ENOMEM; 4271 goto out; 4272 } 4273 4274 leaf = path->nodes[0]; 4275 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4276 4277 bctl->flags = btrfs_balance_flags(leaf, item); 4278 bctl->flags |= BTRFS_BALANCE_RESUME; 4279 4280 btrfs_balance_data(leaf, item, &disk_bargs); 4281 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4282 btrfs_balance_meta(leaf, item, &disk_bargs); 4283 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4284 btrfs_balance_sys(leaf, item, &disk_bargs); 4285 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4286 4287 /* 4288 * This should never happen, as the paused balance state is recovered 4289 * during mount without any chance of other exclusive ops to collide. 
4290 * 4291 * This gives the exclusive op status to balance and keeps in paused 4292 * state until user intervention (cancel or umount). If the ownership 4293 * cannot be assigned, show a message but do not fail. The balance 4294 * is in a paused state and must have fs_info::balance_ctl properly 4295 * set up. 4296 */ 4297 if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) 4298 btrfs_warn(fs_info, 4299 "balance: cannot set exclusive op status, resume manually"); 4300 4301 mutex_lock(&fs_info->balance_mutex); 4302 BUG_ON(fs_info->balance_ctl); 4303 spin_lock(&fs_info->balance_lock); 4304 fs_info->balance_ctl = bctl; 4305 spin_unlock(&fs_info->balance_lock); 4306 mutex_unlock(&fs_info->balance_mutex); 4307 out: 4308 btrfs_free_path(path); 4309 return ret; 4310 } 4311 4312 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4313 { 4314 int ret = 0; 4315 4316 mutex_lock(&fs_info->balance_mutex); 4317 if (!fs_info->balance_ctl) { 4318 mutex_unlock(&fs_info->balance_mutex); 4319 return -ENOTCONN; 4320 } 4321 4322 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4323 atomic_inc(&fs_info->balance_pause_req); 4324 mutex_unlock(&fs_info->balance_mutex); 4325 4326 wait_event(fs_info->balance_wait_q, 4327 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4328 4329 mutex_lock(&fs_info->balance_mutex); 4330 /* we are good with balance_ctl ripped off from under us */ 4331 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4332 atomic_dec(&fs_info->balance_pause_req); 4333 } else { 4334 ret = -ENOTCONN; 4335 } 4336 4337 mutex_unlock(&fs_info->balance_mutex); 4338 return ret; 4339 } 4340 4341 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4342 { 4343 mutex_lock(&fs_info->balance_mutex); 4344 if (!fs_info->balance_ctl) { 4345 mutex_unlock(&fs_info->balance_mutex); 4346 return -ENOTCONN; 4347 } 4348 4349 /* 4350 * A paused balance with the item stored on disk can be resumed at 4351 * mount time if the mount is read-write. Otherwise it's still paused 4352 * and we must not allow cancelling as it deletes the item. 4353 */ 4354 if (sb_rdonly(fs_info->sb)) { 4355 mutex_unlock(&fs_info->balance_mutex); 4356 return -EROFS; 4357 } 4358 4359 atomic_inc(&fs_info->balance_cancel_req); 4360 /* 4361 * if we are running just wait and return, balance item is 4362 * deleted in btrfs_balance in this case 4363 */ 4364 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4365 mutex_unlock(&fs_info->balance_mutex); 4366 wait_event(fs_info->balance_wait_q, 4367 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4368 mutex_lock(&fs_info->balance_mutex); 4369 } else { 4370 mutex_unlock(&fs_info->balance_mutex); 4371 /* 4372 * Lock released to allow other waiters to continue, we'll 4373 * reexamine the status again. 
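     * If the balance item and fs_info::balance_ctl were already torn
     * down by the time the mutex is retaken, the check below finds
     * balance_ctl NULL and there is nothing left to reset; the function
     * then just falls through to dropping balance_cancel_req.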
4374 */ 4375 mutex_lock(&fs_info->balance_mutex); 4376 4377 if (fs_info->balance_ctl) { 4378 reset_balance_state(fs_info); 4379 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); 4380 btrfs_info(fs_info, "balance: canceled"); 4381 } 4382 } 4383 4384 BUG_ON(fs_info->balance_ctl || 4385 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4386 atomic_dec(&fs_info->balance_cancel_req); 4387 mutex_unlock(&fs_info->balance_mutex); 4388 return 0; 4389 } 4390 4391 int btrfs_uuid_scan_kthread(void *data) 4392 { 4393 struct btrfs_fs_info *fs_info = data; 4394 struct btrfs_root *root = fs_info->tree_root; 4395 struct btrfs_key key; 4396 struct btrfs_path *path = NULL; 4397 int ret = 0; 4398 struct extent_buffer *eb; 4399 int slot; 4400 struct btrfs_root_item root_item; 4401 u32 item_size; 4402 struct btrfs_trans_handle *trans = NULL; 4403 bool closing = false; 4404 4405 path = btrfs_alloc_path(); 4406 if (!path) { 4407 ret = -ENOMEM; 4408 goto out; 4409 } 4410 4411 key.objectid = 0; 4412 key.type = BTRFS_ROOT_ITEM_KEY; 4413 key.offset = 0; 4414 4415 while (1) { 4416 if (btrfs_fs_closing(fs_info)) { 4417 closing = true; 4418 break; 4419 } 4420 ret = btrfs_search_forward(root, &key, path, 4421 BTRFS_OLDEST_GENERATION); 4422 if (ret) { 4423 if (ret > 0) 4424 ret = 0; 4425 break; 4426 } 4427 4428 if (key.type != BTRFS_ROOT_ITEM_KEY || 4429 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4430 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4431 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4432 goto skip; 4433 4434 eb = path->nodes[0]; 4435 slot = path->slots[0]; 4436 item_size = btrfs_item_size_nr(eb, slot); 4437 if (item_size < sizeof(root_item)) 4438 goto skip; 4439 4440 read_extent_buffer(eb, &root_item, 4441 btrfs_item_ptr_offset(eb, slot), 4442 (int)sizeof(root_item)); 4443 if (btrfs_root_refs(&root_item) == 0) 4444 goto skip; 4445 4446 if (!btrfs_is_empty_uuid(root_item.uuid) || 4447 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4448 if (trans) 4449 goto update_tree; 4450 4451 btrfs_release_path(path); 4452 /* 4453 * 1 - subvol uuid item 4454 * 1 - received_subvol uuid item 4455 */ 4456 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4457 if (IS_ERR(trans)) { 4458 ret = PTR_ERR(trans); 4459 break; 4460 } 4461 continue; 4462 } else { 4463 goto skip; 4464 } 4465 update_tree: 4466 btrfs_release_path(path); 4467 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4468 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4469 BTRFS_UUID_KEY_SUBVOL, 4470 key.objectid); 4471 if (ret < 0) { 4472 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4473 ret); 4474 break; 4475 } 4476 } 4477 4478 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4479 ret = btrfs_uuid_tree_add(trans, 4480 root_item.received_uuid, 4481 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4482 key.objectid); 4483 if (ret < 0) { 4484 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4485 ret); 4486 break; 4487 } 4488 } 4489 4490 skip: 4491 btrfs_release_path(path); 4492 if (trans) { 4493 ret = btrfs_end_transaction(trans); 4494 trans = NULL; 4495 if (ret) 4496 break; 4497 } 4498 4499 if (key.offset < (u64)-1) { 4500 key.offset++; 4501 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4502 key.offset = 0; 4503 key.type = BTRFS_ROOT_ITEM_KEY; 4504 } else if (key.objectid < (u64)-1) { 4505 key.offset = 0; 4506 key.type = BTRFS_ROOT_ITEM_KEY; 4507 key.objectid++; 4508 } else { 4509 break; 4510 } 4511 cond_resched(); 4512 } 4513 4514 out: 4515 btrfs_free_path(path); 4516 if (trans && !IS_ERR(trans)) 4517 btrfs_end_transaction(trans); 4518 if (ret) 4519 btrfs_warn(fs_info, 
"btrfs_uuid_scan_kthread failed %d", ret); 4520 else if (!closing) 4521 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4522 up(&fs_info->uuid_tree_rescan_sem); 4523 return 0; 4524 } 4525 4526 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4527 { 4528 struct btrfs_trans_handle *trans; 4529 struct btrfs_root *tree_root = fs_info->tree_root; 4530 struct btrfs_root *uuid_root; 4531 struct task_struct *task; 4532 int ret; 4533 4534 /* 4535 * 1 - root node 4536 * 1 - root item 4537 */ 4538 trans = btrfs_start_transaction(tree_root, 2); 4539 if (IS_ERR(trans)) 4540 return PTR_ERR(trans); 4541 4542 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4543 if (IS_ERR(uuid_root)) { 4544 ret = PTR_ERR(uuid_root); 4545 btrfs_abort_transaction(trans, ret); 4546 btrfs_end_transaction(trans); 4547 return ret; 4548 } 4549 4550 fs_info->uuid_root = uuid_root; 4551 4552 ret = btrfs_commit_transaction(trans); 4553 if (ret) 4554 return ret; 4555 4556 down(&fs_info->uuid_tree_rescan_sem); 4557 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4558 if (IS_ERR(task)) { 4559 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4560 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4561 up(&fs_info->uuid_tree_rescan_sem); 4562 return PTR_ERR(task); 4563 } 4564 4565 return 0; 4566 } 4567 4568 /* 4569 * shrinking a device means finding all of the device extents past 4570 * the new size, and then following the back refs to the chunks. 4571 * The chunk relocation code actually frees the device extent 4572 */ 4573 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4574 { 4575 struct btrfs_fs_info *fs_info = device->fs_info; 4576 struct btrfs_root *root = fs_info->dev_root; 4577 struct btrfs_trans_handle *trans; 4578 struct btrfs_dev_extent *dev_extent = NULL; 4579 struct btrfs_path *path; 4580 u64 length; 4581 u64 chunk_offset; 4582 int ret; 4583 int slot; 4584 int failed = 0; 4585 bool retried = false; 4586 struct extent_buffer *l; 4587 struct btrfs_key key; 4588 struct btrfs_super_block *super_copy = fs_info->super_copy; 4589 u64 old_total = btrfs_super_total_bytes(super_copy); 4590 u64 old_size = btrfs_device_get_total_bytes(device); 4591 u64 diff; 4592 u64 start; 4593 4594 new_size = round_down(new_size, fs_info->sectorsize); 4595 start = new_size; 4596 diff = round_down(old_size - new_size, fs_info->sectorsize); 4597 4598 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4599 return -EINVAL; 4600 4601 path = btrfs_alloc_path(); 4602 if (!path) 4603 return -ENOMEM; 4604 4605 path->reada = READA_BACK; 4606 4607 trans = btrfs_start_transaction(root, 0); 4608 if (IS_ERR(trans)) { 4609 btrfs_free_path(path); 4610 return PTR_ERR(trans); 4611 } 4612 4613 mutex_lock(&fs_info->chunk_mutex); 4614 4615 btrfs_device_set_total_bytes(device, new_size); 4616 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4617 device->fs_devices->total_rw_bytes -= diff; 4618 atomic64_sub(diff, &fs_info->free_chunk_space); 4619 } 4620 4621 /* 4622 * Once the device's size has been set to the new size, ensure all 4623 * in-memory chunks are synced to disk so that the loop below sees them 4624 * and relocates them accordingly. 
4625 */ 4626 if (contains_pending_extent(device, &start, diff)) { 4627 mutex_unlock(&fs_info->chunk_mutex); 4628 ret = btrfs_commit_transaction(trans); 4629 if (ret) 4630 goto done; 4631 } else { 4632 mutex_unlock(&fs_info->chunk_mutex); 4633 btrfs_end_transaction(trans); 4634 } 4635 4636 again: 4637 key.objectid = device->devid; 4638 key.offset = (u64)-1; 4639 key.type = BTRFS_DEV_EXTENT_KEY; 4640 4641 do { 4642 mutex_lock(&fs_info->delete_unused_bgs_mutex); 4643 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4644 if (ret < 0) { 4645 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4646 goto done; 4647 } 4648 4649 ret = btrfs_previous_item(root, path, 0, key.type); 4650 if (ret) 4651 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4652 if (ret < 0) 4653 goto done; 4654 if (ret) { 4655 ret = 0; 4656 btrfs_release_path(path); 4657 break; 4658 } 4659 4660 l = path->nodes[0]; 4661 slot = path->slots[0]; 4662 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4663 4664 if (key.objectid != device->devid) { 4665 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4666 btrfs_release_path(path); 4667 break; 4668 } 4669 4670 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4671 length = btrfs_dev_extent_length(l, dev_extent); 4672 4673 if (key.offset + length <= new_size) { 4674 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4675 btrfs_release_path(path); 4676 break; 4677 } 4678 4679 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4680 btrfs_release_path(path); 4681 4682 /* 4683 * We may be relocating the only data chunk we have, 4684 * which could potentially end up with losing data's 4685 * raid profile, so lets allocate an empty one in 4686 * advance. 4687 */ 4688 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4689 if (ret < 0) { 4690 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4691 goto done; 4692 } 4693 4694 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4695 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 4696 if (ret == -ENOSPC) { 4697 failed++; 4698 } else if (ret) { 4699 if (ret == -ETXTBSY) { 4700 btrfs_warn(fs_info, 4701 "could not shrink block group %llu due to active swapfile", 4702 chunk_offset); 4703 } 4704 goto done; 4705 } 4706 } while (key.offset-- > 0); 4707 4708 if (failed && !retried) { 4709 failed = 0; 4710 retried = true; 4711 goto again; 4712 } else if (failed && retried) { 4713 ret = -ENOSPC; 4714 goto done; 4715 } 4716 4717 /* Shrinking succeeded, else we would be at "done". */ 4718 trans = btrfs_start_transaction(root, 0); 4719 if (IS_ERR(trans)) { 4720 ret = PTR_ERR(trans); 4721 goto done; 4722 } 4723 4724 mutex_lock(&fs_info->chunk_mutex); 4725 /* Clear all state bits beyond the shrunk device size */ 4726 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4727 CHUNK_STATE_MASK); 4728 4729 btrfs_device_set_disk_total_bytes(device, new_size); 4730 if (list_empty(&device->post_commit_list)) 4731 list_add_tail(&device->post_commit_list, 4732 &trans->transaction->dev_update_list); 4733 4734 WARN_ON(diff > old_total); 4735 btrfs_set_super_total_bytes(super_copy, 4736 round_down(old_total - diff, fs_info->sectorsize)); 4737 mutex_unlock(&fs_info->chunk_mutex); 4738 4739 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4740 ret = btrfs_update_device(trans, device); 4741 if (ret < 0) { 4742 btrfs_abort_transaction(trans, ret); 4743 btrfs_end_transaction(trans); 4744 } else { 4745 ret = btrfs_commit_transaction(trans); 4746 } 4747 done: 4748 btrfs_free_path(path); 4749 if (ret) { 4750 mutex_lock(&fs_info->chunk_mutex); 4751 btrfs_device_set_total_bytes(device, old_size); 4752 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4753 device->fs_devices->total_rw_bytes += diff; 4754 atomic64_add(diff, &fs_info->free_chunk_space); 4755 mutex_unlock(&fs_info->chunk_mutex); 4756 } 4757 return ret; 4758 } 4759 4760 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4761 struct btrfs_key *key, 4762 struct btrfs_chunk *chunk, int item_size) 4763 { 4764 struct btrfs_super_block *super_copy = fs_info->super_copy; 4765 struct btrfs_disk_key disk_key; 4766 u32 array_size; 4767 u8 *ptr; 4768 4769 mutex_lock(&fs_info->chunk_mutex); 4770 array_size = btrfs_super_sys_array_size(super_copy); 4771 if (array_size + item_size + sizeof(disk_key) 4772 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 4773 mutex_unlock(&fs_info->chunk_mutex); 4774 return -EFBIG; 4775 } 4776 4777 ptr = super_copy->sys_chunk_array + array_size; 4778 btrfs_cpu_key_to_disk(&disk_key, key); 4779 memcpy(ptr, &disk_key, sizeof(disk_key)); 4780 ptr += sizeof(disk_key); 4781 memcpy(ptr, chunk, item_size); 4782 item_size += sizeof(disk_key); 4783 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4784 mutex_unlock(&fs_info->chunk_mutex); 4785 4786 return 0; 4787 } 4788 4789 /* 4790 * sort the devices in descending order by max_avail, total_avail 4791 */ 4792 static int btrfs_cmp_device_info(const void *a, const void *b) 4793 { 4794 const struct btrfs_device_info *di_a = a; 4795 const struct btrfs_device_info *di_b = b; 4796 4797 if (di_a->max_avail > di_b->max_avail) 4798 return -1; 4799 if (di_a->max_avail < di_b->max_avail) 4800 return 1; 4801 if (di_a->total_avail > di_b->total_avail) 4802 return -1; 4803 if (di_a->total_avail < di_b->total_avail) 4804 return 1; 4805 return 0; 4806 } 4807 4808 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4809 { 4810 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 4811 return; 4812 4813 btrfs_set_fs_incompat(info, RAID56); 4814 } 4815 4816 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 4817 { 4818 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 4819 return; 4820 4821 btrfs_set_fs_incompat(info, RAID1C34); 4822 } 4823 4824 /* 4825 * Structure used internally for __btrfs_alloc_chunk() function. 4826 * Wraps needed parameters. 
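 * Roughly, chunk allocation proceeds in three steps driven by this
 * structure: init_alloc_chunk_ctl() seeds the limits from btrfs_raid_array
 * and the chunk allocation policy, gather_device_info() collects the usable
 * hole on each writable device, and decide_stripe_size() plus create_chunk()
 * turn that into stripe_size, chunk_size and the final chunk mapping.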
4827 */ 4828 struct alloc_chunk_ctl { 4829 u64 start; 4830 u64 type; 4831 /* Total number of stripes to allocate */ 4832 int num_stripes; 4833 /* sub_stripes info for map */ 4834 int sub_stripes; 4835 /* Stripes per device */ 4836 int dev_stripes; 4837 /* Maximum number of devices to use */ 4838 int devs_max; 4839 /* Minimum number of devices to use */ 4840 int devs_min; 4841 /* ndevs has to be a multiple of this */ 4842 int devs_increment; 4843 /* Number of copies */ 4844 int ncopies; 4845 /* Number of stripes worth of bytes to store parity information */ 4846 int nparity; 4847 u64 max_stripe_size; 4848 u64 max_chunk_size; 4849 u64 dev_extent_min; 4850 u64 stripe_size; 4851 u64 chunk_size; 4852 int ndevs; 4853 }; 4854 4855 static void init_alloc_chunk_ctl_policy_regular( 4856 struct btrfs_fs_devices *fs_devices, 4857 struct alloc_chunk_ctl *ctl) 4858 { 4859 u64 type = ctl->type; 4860 4861 if (type & BTRFS_BLOCK_GROUP_DATA) { 4862 ctl->max_stripe_size = SZ_1G; 4863 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4864 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4865 /* For larger filesystems, use larger metadata chunks */ 4866 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4867 ctl->max_stripe_size = SZ_1G; 4868 else 4869 ctl->max_stripe_size = SZ_256M; 4870 ctl->max_chunk_size = ctl->max_stripe_size; 4871 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4872 ctl->max_stripe_size = SZ_32M; 4873 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 4874 ctl->devs_max = min_t(int, ctl->devs_max, 4875 BTRFS_MAX_DEVS_SYS_CHUNK); 4876 } else { 4877 BUG(); 4878 } 4879 4880 /* We don't want a chunk larger than 10% of writable space */ 4881 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 4882 ctl->max_chunk_size); 4883 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 4884 } 4885 4886 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 4887 struct alloc_chunk_ctl *ctl) 4888 { 4889 int index = btrfs_bg_flags_to_raid_index(ctl->type); 4890 4891 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 4892 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 4893 ctl->devs_max = btrfs_raid_array[index].devs_max; 4894 if (!ctl->devs_max) 4895 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 4896 ctl->devs_min = btrfs_raid_array[index].devs_min; 4897 ctl->devs_increment = btrfs_raid_array[index].devs_increment; 4898 ctl->ncopies = btrfs_raid_array[index].ncopies; 4899 ctl->nparity = btrfs_raid_array[index].nparity; 4900 ctl->ndevs = 0; 4901 4902 switch (fs_devices->chunk_alloc_policy) { 4903 case BTRFS_CHUNK_ALLOC_REGULAR: 4904 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 4905 break; 4906 default: 4907 BUG(); 4908 } 4909 } 4910 4911 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 4912 struct alloc_chunk_ctl *ctl, 4913 struct btrfs_device_info *devices_info) 4914 { 4915 struct btrfs_fs_info *info = fs_devices->fs_info; 4916 struct btrfs_device *device; 4917 u64 total_avail; 4918 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 4919 int ret; 4920 int ndevs = 0; 4921 u64 max_avail; 4922 u64 dev_offset; 4923 4924 /* 4925 * in the first pass through the devices list, we gather information 4926 * about the available holes on each device. 
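     * Devices are skipped here if they are not writable, not yet recorded
     * in the filesystem metadata, currently act as a dev-replace target,
     * or have less than ctl->dev_extent_min of usable space; the remaining
     * devices are recorded in devices_info[] and sorted afterwards by the
     * size of their largest hole.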
4927 */ 4928 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 4929 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4930 WARN(1, KERN_ERR 4931 "BTRFS: read-only device in alloc_list\n"); 4932 continue; 4933 } 4934 4935 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 4936 &device->dev_state) || 4937 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4938 continue; 4939 4940 if (device->total_bytes > device->bytes_used) 4941 total_avail = device->total_bytes - device->bytes_used; 4942 else 4943 total_avail = 0; 4944 4945 /* If there is no space on this device, skip it. */ 4946 if (total_avail < ctl->dev_extent_min) 4947 continue; 4948 4949 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 4950 &max_avail); 4951 if (ret && ret != -ENOSPC) 4952 return ret; 4953 4954 if (ret == 0) 4955 max_avail = dev_extent_want; 4956 4957 if (max_avail < ctl->dev_extent_min) { 4958 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 4959 btrfs_debug(info, 4960 "%s: devid %llu has no free space, have=%llu want=%llu", 4961 __func__, device->devid, max_avail, 4962 ctl->dev_extent_min); 4963 continue; 4964 } 4965 4966 if (ndevs == fs_devices->rw_devices) { 4967 WARN(1, "%s: found more than %llu devices\n", 4968 __func__, fs_devices->rw_devices); 4969 break; 4970 } 4971 devices_info[ndevs].dev_offset = dev_offset; 4972 devices_info[ndevs].max_avail = max_avail; 4973 devices_info[ndevs].total_avail = total_avail; 4974 devices_info[ndevs].dev = device; 4975 ++ndevs; 4976 } 4977 ctl->ndevs = ndevs; 4978 4979 /* 4980 * now sort the devices by hole size / available space 4981 */ 4982 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 4983 btrfs_cmp_device_info, NULL); 4984 4985 return 0; 4986 } 4987 4988 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 4989 struct btrfs_device_info *devices_info) 4990 { 4991 /* Number of stripes that count for block group size */ 4992 int data_stripes; 4993 4994 /* 4995 * The primary goal is to maximize the number of stripes, so use as 4996 * many devices as possible, even if the stripes are not maximum sized. 4997 * 4998 * The DUP profile stores more than one stripe per device, the 4999 * max_avail is the total size so we have to adjust. 5000 */ 5001 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5002 ctl->dev_stripes); 5003 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5004 5005 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5006 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5007 5008 /* 5009 * Use the number of data stripes to figure out how big this chunk is 5010 * really going to be in terms of logical address space, and compare 5011 * that answer with the max chunk size. If it's higher, we try to 5012 * reduce stripe_size. 5013 */ 5014 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5015 /* 5016 * Reduce stripe_size, round it up to a 16MB boundary again and 5017 * then use it, unless it ends up being even bigger than the 5018 * previous value we had already. 
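         * A worked example with hypothetical numbers (and assuming the
         * filesystem is large enough that the 10% writable-space cap above
         * does not apply): a raid0 data chunk over 12 devices that each
         * expose a 1 GiB hole starts with stripe_size = 1 GiB and
         * data_stripes = 12, i.e. 12 GiB of logical space, which exceeds
         * the 10 GiB data chunk limit; 10 GiB / 12 rounded up to 16 MiB is
         * 864 MiB, so stripe_size is reduced to 864 MiB and the chunk ends
         * up as 12 * 864 MiB.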
5019 */ 5020 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5021 data_stripes), SZ_16M), 5022 ctl->stripe_size); 5023 } 5024 5025 /* Align to BTRFS_STRIPE_LEN */ 5026 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5027 ctl->chunk_size = ctl->stripe_size * data_stripes; 5028 5029 return 0; 5030 } 5031 5032 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5033 struct alloc_chunk_ctl *ctl, 5034 struct btrfs_device_info *devices_info) 5035 { 5036 struct btrfs_fs_info *info = fs_devices->fs_info; 5037 5038 /* 5039 * Round down to number of usable stripes, devs_increment can be any 5040 * number so we can't use round_down() that requires power of 2, while 5041 * rounddown is safe. 5042 */ 5043 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5044 5045 if (ctl->ndevs < ctl->devs_min) { 5046 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5047 btrfs_debug(info, 5048 "%s: not enough devices with free space: have=%d minimum required=%d", 5049 __func__, ctl->ndevs, ctl->devs_min); 5050 } 5051 return -ENOSPC; 5052 } 5053 5054 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5055 5056 switch (fs_devices->chunk_alloc_policy) { 5057 case BTRFS_CHUNK_ALLOC_REGULAR: 5058 return decide_stripe_size_regular(ctl, devices_info); 5059 default: 5060 BUG(); 5061 } 5062 } 5063 5064 static int create_chunk(struct btrfs_trans_handle *trans, 5065 struct alloc_chunk_ctl *ctl, 5066 struct btrfs_device_info *devices_info) 5067 { 5068 struct btrfs_fs_info *info = trans->fs_info; 5069 struct map_lookup *map = NULL; 5070 struct extent_map_tree *em_tree; 5071 struct extent_map *em; 5072 u64 start = ctl->start; 5073 u64 type = ctl->type; 5074 int ret; 5075 int i; 5076 int j; 5077 5078 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5079 if (!map) 5080 return -ENOMEM; 5081 map->num_stripes = ctl->num_stripes; 5082 5083 for (i = 0; i < ctl->ndevs; ++i) { 5084 for (j = 0; j < ctl->dev_stripes; ++j) { 5085 int s = i * ctl->dev_stripes + j; 5086 map->stripes[s].dev = devices_info[i].dev; 5087 map->stripes[s].physical = devices_info[i].dev_offset + 5088 j * ctl->stripe_size; 5089 } 5090 } 5091 map->stripe_len = BTRFS_STRIPE_LEN; 5092 map->io_align = BTRFS_STRIPE_LEN; 5093 map->io_width = BTRFS_STRIPE_LEN; 5094 map->type = type; 5095 map->sub_stripes = ctl->sub_stripes; 5096 5097 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5098 5099 em = alloc_extent_map(); 5100 if (!em) { 5101 kfree(map); 5102 return -ENOMEM; 5103 } 5104 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5105 em->map_lookup = map; 5106 em->start = start; 5107 em->len = ctl->chunk_size; 5108 em->block_start = 0; 5109 em->block_len = em->len; 5110 em->orig_block_len = ctl->stripe_size; 5111 5112 em_tree = &info->mapping_tree; 5113 write_lock(&em_tree->lock); 5114 ret = add_extent_mapping(em_tree, em, 0); 5115 if (ret) { 5116 write_unlock(&em_tree->lock); 5117 free_extent_map(em); 5118 return ret; 5119 } 5120 write_unlock(&em_tree->lock); 5121 5122 ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5123 if (ret) 5124 goto error_del_extent; 5125 5126 for (i = 0; i < map->num_stripes; i++) { 5127 struct btrfs_device *dev = map->stripes[i].dev; 5128 5129 btrfs_device_set_bytes_used(dev, 5130 dev->bytes_used + ctl->stripe_size); 5131 if (list_empty(&dev->post_commit_list)) 5132 list_add_tail(&dev->post_commit_list, 5133 &trans->transaction->dev_update_list); 5134 } 5135 5136 atomic64_sub(ctl->stripe_size * map->num_stripes, 5137 &info->free_chunk_space); 5138 5139 
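    /*
     * Drop the reference taken by alloc_extent_map() above; the mapping
     * tree keeps its own reference from add_extent_mapping(), which is why
     * the error path further down frees the em twice.
     */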
free_extent_map(em); 5140 check_raid56_incompat_flag(info, type); 5141 check_raid1c34_incompat_flag(info, type); 5142 5143 return 0; 5144 5145 error_del_extent: 5146 write_lock(&em_tree->lock); 5147 remove_extent_mapping(em_tree, em); 5148 write_unlock(&em_tree->lock); 5149 5150 /* One for our allocation */ 5151 free_extent_map(em); 5152 /* One for the tree reference */ 5153 free_extent_map(em); 5154 5155 return ret; 5156 } 5157 5158 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) 5159 { 5160 struct btrfs_fs_info *info = trans->fs_info; 5161 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5162 struct btrfs_device_info *devices_info = NULL; 5163 struct alloc_chunk_ctl ctl; 5164 int ret; 5165 5166 lockdep_assert_held(&info->chunk_mutex); 5167 5168 if (!alloc_profile_is_valid(type, 0)) { 5169 ASSERT(0); 5170 return -EINVAL; 5171 } 5172 5173 if (list_empty(&fs_devices->alloc_list)) { 5174 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5175 btrfs_debug(info, "%s: no writable device", __func__); 5176 return -ENOSPC; 5177 } 5178 5179 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5180 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5181 ASSERT(0); 5182 return -EINVAL; 5183 } 5184 5185 ctl.start = find_next_chunk(info); 5186 ctl.type = type; 5187 init_alloc_chunk_ctl(fs_devices, &ctl); 5188 5189 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5190 GFP_NOFS); 5191 if (!devices_info) 5192 return -ENOMEM; 5193 5194 ret = gather_device_info(fs_devices, &ctl, devices_info); 5195 if (ret < 0) 5196 goto out; 5197 5198 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5199 if (ret < 0) 5200 goto out; 5201 5202 ret = create_chunk(trans, &ctl, devices_info); 5203 5204 out: 5205 kfree(devices_info); 5206 return ret; 5207 } 5208 5209 /* 5210 * Chunk allocation falls into two parts. The first part does work 5211 * that makes the new allocated chunk usable, but does not do any operation 5212 * that modifies the chunk tree. The second part does the work that 5213 * requires modifying the chunk tree. This division is important for the 5214 * bootstrap process of adding storage to a seed btrfs. 5215 */ 5216 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 5217 u64 chunk_offset, u64 chunk_size) 5218 { 5219 struct btrfs_fs_info *fs_info = trans->fs_info; 5220 struct btrfs_root *extent_root = fs_info->extent_root; 5221 struct btrfs_root *chunk_root = fs_info->chunk_root; 5222 struct btrfs_key key; 5223 struct btrfs_device *device; 5224 struct btrfs_chunk *chunk; 5225 struct btrfs_stripe *stripe; 5226 struct extent_map *em; 5227 struct map_lookup *map; 5228 size_t item_size; 5229 u64 dev_offset; 5230 u64 stripe_size; 5231 int i = 0; 5232 int ret = 0; 5233 5234 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 5235 if (IS_ERR(em)) 5236 return PTR_ERR(em); 5237 5238 map = em->map_lookup; 5239 item_size = btrfs_chunk_item_size(map->num_stripes); 5240 stripe_size = em->orig_block_len; 5241 5242 chunk = kzalloc(item_size, GFP_NOFS); 5243 if (!chunk) { 5244 ret = -ENOMEM; 5245 goto out; 5246 } 5247 5248 /* 5249 * Take the device list mutex to prevent races with the final phase of 5250 * a device replace operation that replaces the device object associated 5251 * with the map's stripes, because the device object's id can change 5252 * at any time during that final phase of the device replace operation 5253 * (dev-replace.c:btrfs_dev_replace_finishing()). 
5254 */ 5255 mutex_lock(&fs_info->fs_devices->device_list_mutex); 5256 for (i = 0; i < map->num_stripes; i++) { 5257 device = map->stripes[i].dev; 5258 dev_offset = map->stripes[i].physical; 5259 5260 ret = btrfs_update_device(trans, device); 5261 if (ret) 5262 break; 5263 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, 5264 dev_offset, stripe_size); 5265 if (ret) 5266 break; 5267 } 5268 if (ret) { 5269 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5270 goto out; 5271 } 5272 5273 stripe = &chunk->stripe; 5274 for (i = 0; i < map->num_stripes; i++) { 5275 device = map->stripes[i].dev; 5276 dev_offset = map->stripes[i].physical; 5277 5278 btrfs_set_stack_stripe_devid(stripe, device->devid); 5279 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5280 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5281 stripe++; 5282 } 5283 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 5284 5285 btrfs_set_stack_chunk_length(chunk, chunk_size); 5286 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); 5287 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5288 btrfs_set_stack_chunk_type(chunk, map->type); 5289 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5290 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5291 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5292 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5293 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5294 5295 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5296 key.type = BTRFS_CHUNK_ITEM_KEY; 5297 key.offset = chunk_offset; 5298 5299 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5300 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5301 /* 5302 * TODO: Cleanup of inserted chunk root in case of 5303 * failure. 5304 */ 5305 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5306 } 5307 5308 out: 5309 kfree(chunk); 5310 free_extent_map(em); 5311 return ret; 5312 } 5313 5314 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5315 { 5316 struct btrfs_fs_info *fs_info = trans->fs_info; 5317 u64 alloc_profile; 5318 int ret; 5319 5320 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5321 ret = btrfs_alloc_chunk(trans, alloc_profile); 5322 if (ret) 5323 return ret; 5324 5325 alloc_profile = btrfs_system_alloc_profile(fs_info); 5326 ret = btrfs_alloc_chunk(trans, alloc_profile); 5327 return ret; 5328 } 5329 5330 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5331 { 5332 const int index = btrfs_bg_flags_to_raid_index(map->type); 5333 5334 return btrfs_raid_array[index].tolerated_failures; 5335 } 5336 5337 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5338 { 5339 struct extent_map *em; 5340 struct map_lookup *map; 5341 int readonly = 0; 5342 int miss_ndevs = 0; 5343 int i; 5344 5345 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5346 if (IS_ERR(em)) 5347 return 1; 5348 5349 map = em->map_lookup; 5350 for (i = 0; i < map->num_stripes; i++) { 5351 if (test_bit(BTRFS_DEV_STATE_MISSING, 5352 &map->stripes[i].dev->dev_state)) { 5353 miss_ndevs++; 5354 continue; 5355 } 5356 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5357 &map->stripes[i].dev->dev_state)) { 5358 readonly = 1; 5359 goto end; 5360 } 5361 } 5362 5363 /* 5364 * If the number of missing devices is larger than max errors, 5365 * we can not write the data into that chunk successfully, so 5366 * set it readonly. 
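     * btrfs_chunk_max_errors() maps the chunk profile to its tolerated
     * failure count, so for example a raid1 chunk with one stripe on a
     * missing device stays writable, while two missing stripes make it
     * read-only.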
5367 */ 5368 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5369 readonly = 1; 5370 end: 5371 free_extent_map(em); 5372 return readonly; 5373 } 5374 5375 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5376 { 5377 struct extent_map *em; 5378 5379 while (1) { 5380 write_lock(&tree->lock); 5381 em = lookup_extent_mapping(tree, 0, (u64)-1); 5382 if (em) 5383 remove_extent_mapping(tree, em); 5384 write_unlock(&tree->lock); 5385 if (!em) 5386 break; 5387 /* once for us */ 5388 free_extent_map(em); 5389 /* once for the tree */ 5390 free_extent_map(em); 5391 } 5392 } 5393 5394 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5395 { 5396 struct extent_map *em; 5397 struct map_lookup *map; 5398 int ret; 5399 5400 em = btrfs_get_chunk_map(fs_info, logical, len); 5401 if (IS_ERR(em)) 5402 /* 5403 * We could return errors for these cases, but that could get 5404 * ugly and we'd probably do the same thing which is just not do 5405 * anything else and exit, so return 1 so the callers don't try 5406 * to use other copies. 5407 */ 5408 return 1; 5409 5410 map = em->map_lookup; 5411 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5412 ret = map->num_stripes; 5413 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5414 ret = map->sub_stripes; 5415 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5416 ret = 2; 5417 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5418 /* 5419 * There could be two corrupted data stripes, we need 5420 * to loop retry in order to rebuild the correct data. 5421 * 5422 * Fail a stripe at a time on every retry except the 5423 * stripe under reconstruction. 5424 */ 5425 ret = map->num_stripes; 5426 else 5427 ret = 1; 5428 free_extent_map(em); 5429 5430 down_read(&fs_info->dev_replace.rwsem); 5431 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5432 fs_info->dev_replace.tgtdev) 5433 ret++; 5434 up_read(&fs_info->dev_replace.rwsem); 5435 5436 return ret; 5437 } 5438 5439 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5440 u64 logical) 5441 { 5442 struct extent_map *em; 5443 struct map_lookup *map; 5444 unsigned long len = fs_info->sectorsize; 5445 5446 em = btrfs_get_chunk_map(fs_info, logical, len); 5447 5448 if (!WARN_ON(IS_ERR(em))) { 5449 map = em->map_lookup; 5450 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5451 len = map->stripe_len * nr_data_stripes(map); 5452 free_extent_map(em); 5453 } 5454 return len; 5455 } 5456 5457 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5458 { 5459 struct extent_map *em; 5460 struct map_lookup *map; 5461 int ret = 0; 5462 5463 em = btrfs_get_chunk_map(fs_info, logical, len); 5464 5465 if(!WARN_ON(IS_ERR(em))) { 5466 map = em->map_lookup; 5467 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5468 ret = 1; 5469 free_extent_map(em); 5470 } 5471 return ret; 5472 } 5473 5474 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5475 struct map_lookup *map, int first, 5476 int dev_replace_is_ongoing) 5477 { 5478 int i; 5479 int num_stripes; 5480 int preferred_mirror; 5481 int tolerance; 5482 struct btrfs_device *srcdev; 5483 5484 ASSERT((map->type & 5485 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5486 5487 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5488 num_stripes = map->sub_stripes; 5489 else 5490 num_stripes = map->num_stripes; 5491 5492 preferred_mirror = first + current->pid % num_stripes; 5493 5494 if (dev_replace_is_ongoing && 5495 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5496 
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5497 srcdev = fs_info->dev_replace.srcdev; 5498 else 5499 srcdev = NULL; 5500 5501 /* 5502 * try to avoid the drive that is the source drive for a 5503 * dev-replace procedure, only choose it if no other non-missing 5504 * mirror is available 5505 */ 5506 for (tolerance = 0; tolerance < 2; tolerance++) { 5507 if (map->stripes[preferred_mirror].dev->bdev && 5508 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5509 return preferred_mirror; 5510 for (i = first; i < first + num_stripes; i++) { 5511 if (map->stripes[i].dev->bdev && 5512 (tolerance || map->stripes[i].dev != srcdev)) 5513 return i; 5514 } 5515 } 5516 5517 /* we couldn't find one that doesn't fail. Just return something 5518 * and the io error handling code will clean up eventually 5519 */ 5520 return preferred_mirror; 5521 } 5522 5523 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5524 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) 5525 { 5526 int i; 5527 int again = 1; 5528 5529 while (again) { 5530 again = 0; 5531 for (i = 0; i < num_stripes - 1; i++) { 5532 /* Swap if parity is on a smaller index */ 5533 if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { 5534 swap(bbio->stripes[i], bbio->stripes[i + 1]); 5535 swap(bbio->raid_map[i], bbio->raid_map[i + 1]); 5536 again = 1; 5537 } 5538 } 5539 } 5540 } 5541 5542 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) 5543 { 5544 struct btrfs_bio *bbio = kzalloc( 5545 /* the size of the btrfs_bio */ 5546 sizeof(struct btrfs_bio) + 5547 /* plus the variable array for the stripes */ 5548 sizeof(struct btrfs_bio_stripe) * (total_stripes) + 5549 /* plus the variable array for the tgt dev */ 5550 sizeof(int) * (real_stripes) + 5551 /* 5552 * plus the raid_map, which includes both the tgt dev 5553 * and the stripes 5554 */ 5555 sizeof(u64) * (total_stripes), 5556 GFP_NOFS|__GFP_NOFAIL); 5557 5558 atomic_set(&bbio->error, 0); 5559 refcount_set(&bbio->refs, 1); 5560 5561 bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); 5562 bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); 5563 5564 return bbio; 5565 } 5566 5567 void btrfs_get_bbio(struct btrfs_bio *bbio) 5568 { 5569 WARN_ON(!refcount_read(&bbio->refs)); 5570 refcount_inc(&bbio->refs); 5571 } 5572 5573 void btrfs_put_bbio(struct btrfs_bio *bbio) 5574 { 5575 if (!bbio) 5576 return; 5577 if (refcount_dec_and_test(&bbio->refs)) 5578 kfree(bbio); 5579 } 5580 5581 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ 5582 /* 5583 * Please note that, discard won't be sent to target device of device 5584 * replace. 
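 * As a hypothetical illustration of the stripe math below: discarding 256K
 * that starts exactly on a stripe boundary of a 2-device raid0 chunk with a
 * 64K stripe_len covers stripe_cnt = 4 stripes, so the returned bbio carries
 * two stripes of 128K, one per device.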
5585 */ 5586 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, 5587 u64 logical, u64 *length_ret, 5588 struct btrfs_bio **bbio_ret) 5589 { 5590 struct extent_map *em; 5591 struct map_lookup *map; 5592 struct btrfs_bio *bbio; 5593 u64 length = *length_ret; 5594 u64 offset; 5595 u64 stripe_nr; 5596 u64 stripe_nr_end; 5597 u64 stripe_end_offset; 5598 u64 stripe_cnt; 5599 u64 stripe_len; 5600 u64 stripe_offset; 5601 u64 num_stripes; 5602 u32 stripe_index; 5603 u32 factor = 0; 5604 u32 sub_stripes = 0; 5605 u64 stripes_per_dev = 0; 5606 u32 remaining_stripes = 0; 5607 u32 last_stripe = 0; 5608 int ret = 0; 5609 int i; 5610 5611 /* discard always return a bbio */ 5612 ASSERT(bbio_ret); 5613 5614 em = btrfs_get_chunk_map(fs_info, logical, length); 5615 if (IS_ERR(em)) 5616 return PTR_ERR(em); 5617 5618 map = em->map_lookup; 5619 /* we don't discard raid56 yet */ 5620 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5621 ret = -EOPNOTSUPP; 5622 goto out; 5623 } 5624 5625 offset = logical - em->start; 5626 length = min_t(u64, em->start + em->len - logical, length); 5627 *length_ret = length; 5628 5629 stripe_len = map->stripe_len; 5630 /* 5631 * stripe_nr counts the total number of stripes we have to stride 5632 * to get to this block 5633 */ 5634 stripe_nr = div64_u64(offset, stripe_len); 5635 5636 /* stripe_offset is the offset of this block in its stripe */ 5637 stripe_offset = offset - stripe_nr * stripe_len; 5638 5639 stripe_nr_end = round_up(offset + length, map->stripe_len); 5640 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5641 stripe_cnt = stripe_nr_end - stripe_nr; 5642 stripe_end_offset = stripe_nr_end * map->stripe_len - 5643 (offset + length); 5644 /* 5645 * after this, stripe_nr is the number of stripes on this 5646 * device we have to walk to find the data, and stripe_index is 5647 * the number of our device in the stripe array 5648 */ 5649 num_stripes = 1; 5650 stripe_index = 0; 5651 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5652 BTRFS_BLOCK_GROUP_RAID10)) { 5653 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5654 sub_stripes = 1; 5655 else 5656 sub_stripes = map->sub_stripes; 5657 5658 factor = map->num_stripes / sub_stripes; 5659 num_stripes = min_t(u64, map->num_stripes, 5660 sub_stripes * stripe_cnt); 5661 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 5662 stripe_index *= sub_stripes; 5663 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 5664 &remaining_stripes); 5665 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5666 last_stripe *= sub_stripes; 5667 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 5668 BTRFS_BLOCK_GROUP_DUP)) { 5669 num_stripes = map->num_stripes; 5670 } else { 5671 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 5672 &stripe_index); 5673 } 5674 5675 bbio = alloc_btrfs_bio(num_stripes, 0); 5676 if (!bbio) { 5677 ret = -ENOMEM; 5678 goto out; 5679 } 5680 5681 for (i = 0; i < num_stripes; i++) { 5682 bbio->stripes[i].physical = 5683 map->stripes[stripe_index].physical + 5684 stripe_offset + stripe_nr * map->stripe_len; 5685 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 5686 5687 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5688 BTRFS_BLOCK_GROUP_RAID10)) { 5689 bbio->stripes[i].length = stripes_per_dev * 5690 map->stripe_len; 5691 5692 if (i / sub_stripes < remaining_stripes) 5693 bbio->stripes[i].length += 5694 map->stripe_len; 5695 5696 /* 5697 * Special for the first stripe and 5698 * the last stripe: 5699 * 5700 * |-------|...|-------| 5701 * |----------| 5702 * off end_off 5703 */ 5704 if 
(i < sub_stripes) 5705 bbio->stripes[i].length -= 5706 stripe_offset; 5707 5708 if (stripe_index >= last_stripe && 5709 stripe_index <= (last_stripe + 5710 sub_stripes - 1)) 5711 bbio->stripes[i].length -= 5712 stripe_end_offset; 5713 5714 if (i == sub_stripes - 1) 5715 stripe_offset = 0; 5716 } else { 5717 bbio->stripes[i].length = length; 5718 } 5719 5720 stripe_index++; 5721 if (stripe_index == map->num_stripes) { 5722 stripe_index = 0; 5723 stripe_nr++; 5724 } 5725 } 5726 5727 *bbio_ret = bbio; 5728 bbio->map_type = map->type; 5729 bbio->num_stripes = num_stripes; 5730 out: 5731 free_extent_map(em); 5732 return ret; 5733 } 5734 5735 /* 5736 * In dev-replace case, for repair case (that's the only case where the mirror 5737 * is selected explicitly when calling btrfs_map_block), blocks left of the 5738 * left cursor can also be read from the target drive. 5739 * 5740 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 5741 * array of stripes. 5742 * For READ, it also needs to be supported using the same mirror number. 5743 * 5744 * If the requested block is not left of the left cursor, EIO is returned. This 5745 * can happen because btrfs_num_copies() returns one more in the dev-replace 5746 * case. 5747 */ 5748 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 5749 u64 logical, u64 length, 5750 u64 srcdev_devid, int *mirror_num, 5751 u64 *physical) 5752 { 5753 struct btrfs_bio *bbio = NULL; 5754 int num_stripes; 5755 int index_srcdev = 0; 5756 int found = 0; 5757 u64 physical_of_found = 0; 5758 int i; 5759 int ret = 0; 5760 5761 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 5762 logical, &length, &bbio, 0, 0); 5763 if (ret) { 5764 ASSERT(bbio == NULL); 5765 return ret; 5766 } 5767 5768 num_stripes = bbio->num_stripes; 5769 if (*mirror_num > num_stripes) { 5770 /* 5771 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 5772 * that means that the requested area is not left of the left 5773 * cursor 5774 */ 5775 btrfs_put_bbio(bbio); 5776 return -EIO; 5777 } 5778 5779 /* 5780 * process the rest of the function using the mirror_num of the source 5781 * drive. Therefore look it up first. At the end, patch the device 5782 * pointer to the one of the target drive. 5783 */ 5784 for (i = 0; i < num_stripes; i++) { 5785 if (bbio->stripes[i].dev->devid != srcdev_devid) 5786 continue; 5787 5788 /* 5789 * In case of DUP, in order to keep it simple, only add the 5790 * mirror with the lowest physical address 5791 */ 5792 if (found && 5793 physical_of_found <= bbio->stripes[i].physical) 5794 continue; 5795 5796 index_srcdev = i; 5797 found = 1; 5798 physical_of_found = bbio->stripes[i].physical; 5799 } 5800 5801 btrfs_put_bbio(bbio); 5802 5803 ASSERT(found); 5804 if (!found) 5805 return -EIO; 5806 5807 *mirror_num = index_srcdev + 1; 5808 *physical = physical_of_found; 5809 return ret; 5810 } 5811 5812 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 5813 struct btrfs_bio **bbio_ret, 5814 struct btrfs_dev_replace *dev_replace, 5815 int *num_stripes_ret, int *max_errors_ret) 5816 { 5817 struct btrfs_bio *bbio = *bbio_ret; 5818 u64 srcdev_devid = dev_replace->srcdev->devid; 5819 int tgtdev_indexes = 0; 5820 int num_stripes = *num_stripes_ret; 5821 int max_errors = *max_errors_ret; 5822 int i; 5823 5824 if (op == BTRFS_MAP_WRITE) { 5825 int index_where_to_add; 5826 5827 /* 5828 * duplicate the write operations while the dev replace 5829 * procedure is running. 
Since the copying of the old disk to 5830 * the new disk takes place at run time while the filesystem is 5831 * mounted writable, the regular write operations to the old 5832 * disk have to be duplicated to go to the new disk as well. 5833 * 5834 * Note that device->missing is handled by the caller, and that 5835 * the write to the old disk is already set up in the stripes 5836 * array. 5837 */ 5838 index_where_to_add = num_stripes; 5839 for (i = 0; i < num_stripes; i++) { 5840 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5841 /* write to new disk, too */ 5842 struct btrfs_bio_stripe *new = 5843 bbio->stripes + index_where_to_add; 5844 struct btrfs_bio_stripe *old = 5845 bbio->stripes + i; 5846 5847 new->physical = old->physical; 5848 new->length = old->length; 5849 new->dev = dev_replace->tgtdev; 5850 bbio->tgtdev_map[i] = index_where_to_add; 5851 index_where_to_add++; 5852 max_errors++; 5853 tgtdev_indexes++; 5854 } 5855 } 5856 num_stripes = index_where_to_add; 5857 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 5858 int index_srcdev = 0; 5859 int found = 0; 5860 u64 physical_of_found = 0; 5861 5862 /* 5863 * During the dev-replace procedure, the target drive can also 5864 * be used to read data in case it is needed to repair a corrupt 5865 * block elsewhere. This is possible if the requested area is 5866 * left of the left cursor. In this area, the target drive is a 5867 * full copy of the source drive. 5868 */ 5869 for (i = 0; i < num_stripes; i++) { 5870 if (bbio->stripes[i].dev->devid == srcdev_devid) { 5871 /* 5872 * In case of DUP, in order to keep it simple, 5873 * only add the mirror with the lowest physical 5874 * address 5875 */ 5876 if (found && 5877 physical_of_found <= 5878 bbio->stripes[i].physical) 5879 continue; 5880 index_srcdev = i; 5881 found = 1; 5882 physical_of_found = bbio->stripes[i].physical; 5883 } 5884 } 5885 if (found) { 5886 struct btrfs_bio_stripe *tgtdev_stripe = 5887 bbio->stripes + num_stripes; 5888 5889 tgtdev_stripe->physical = physical_of_found; 5890 tgtdev_stripe->length = 5891 bbio->stripes[index_srcdev].length; 5892 tgtdev_stripe->dev = dev_replace->tgtdev; 5893 bbio->tgtdev_map[index_srcdev] = num_stripes; 5894 5895 tgtdev_indexes++; 5896 num_stripes++; 5897 } 5898 } 5899 5900 *num_stripes_ret = num_stripes; 5901 *max_errors_ret = max_errors; 5902 bbio->num_tgtdevs = tgtdev_indexes; 5903 *bbio_ret = bbio; 5904 } 5905 5906 static bool need_full_stripe(enum btrfs_map_op op) 5907 { 5908 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5909 } 5910 5911 /* 5912 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len) 5913 * tuple. This information is used to calculate how big a 5914 * particular bio can get before it straddles a stripe. 5915 * 5916 * @fs_info - the filesystem 5917 * @logical - address that we want to figure out the geometry of 5918 * @len - the length of IO we are going to perform, starting at @logical 5919 * @op - type of operation - write or read 5920 * @io_geom - pointer used to return values 5921 * 5922 * Returns < 0 in case a chunk for the given logical address cannot be found, 5923 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
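 *
 * Illustrative example (hypothetical numbers, assuming a 64K stripe_len): for
 * a logical address 200K past the chunk start, stripe_nr = 200K / 64K = 3 and
 * stripe_offset = 200K - 3 * 64K = 8K, so for a striped profile the returned
 * len is capped at stripe_len - stripe_offset = 56K before the IO would
 * straddle the next stripe.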
5924 */ 5925 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 5926 u64 logical, u64 len, struct btrfs_io_geometry *io_geom) 5927 { 5928 struct extent_map *em; 5929 struct map_lookup *map; 5930 u64 offset; 5931 u64 stripe_offset; 5932 u64 stripe_nr; 5933 u64 stripe_len; 5934 u64 raid56_full_stripe_start = (u64)-1; 5935 int data_stripes; 5936 int ret = 0; 5937 5938 ASSERT(op != BTRFS_MAP_DISCARD); 5939 5940 em = btrfs_get_chunk_map(fs_info, logical, len); 5941 if (IS_ERR(em)) 5942 return PTR_ERR(em); 5943 5944 map = em->map_lookup; 5945 /* Offset of this logical address in the chunk */ 5946 offset = logical - em->start; 5947 /* Len of a stripe in a chunk */ 5948 stripe_len = map->stripe_len; 5949 /* Stripe where this block falls in */ 5950 stripe_nr = div64_u64(offset, stripe_len); 5951 /* Offset of stripe in the chunk */ 5952 stripe_offset = stripe_nr * stripe_len; 5953 if (offset < stripe_offset) { 5954 btrfs_crit(fs_info, 5955 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 5956 stripe_offset, offset, em->start, logical, stripe_len); 5957 ret = -EINVAL; 5958 goto out; 5959 } 5960 5961 /* stripe_offset is the offset of this block in its stripe */ 5962 stripe_offset = offset - stripe_offset; 5963 data_stripes = nr_data_stripes(map); 5964 5965 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 5966 u64 max_len = stripe_len - stripe_offset; 5967 5968 /* 5969 * In case of raid56, we need to know the stripe aligned start 5970 */ 5971 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5972 unsigned long full_stripe_len = stripe_len * data_stripes; 5973 raid56_full_stripe_start = offset; 5974 5975 /* 5976 * Allow a write of a full stripe, but make sure we 5977 * don't allow straddling of stripes 5978 */ 5979 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 5980 full_stripe_len); 5981 raid56_full_stripe_start *= full_stripe_len; 5982 5983 /* 5984 * For writes to RAID[56], allow a full stripeset across 5985 * all disks. For other RAID types and for RAID[56] 5986 * reads, just allow a single stripe (on a single disk).
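 *
 * Illustrative example (hypothetical numbers): a RAID5 chunk on 4 devices
 * has data_stripes = 3, so with a 64K stripe_len the full stripe covers
 * 192K. A write starting 80K past raid56_full_stripe_start can therefore
 * grow to at most 192K - 80K = 112K here.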
5987 */ 5988 if (op == BTRFS_MAP_WRITE) { 5989 max_len = stripe_len * data_stripes - 5990 (offset - raid56_full_stripe_start); 5991 } 5992 } 5993 len = min_t(u64, em->len - offset, max_len); 5994 } else { 5995 len = em->len - offset; 5996 } 5997 5998 io_geom->len = len; 5999 io_geom->offset = offset; 6000 io_geom->stripe_len = stripe_len; 6001 io_geom->stripe_nr = stripe_nr; 6002 io_geom->stripe_offset = stripe_offset; 6003 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6004 6005 out: 6006 /* once for us */ 6007 free_extent_map(em); 6008 return ret; 6009 } 6010 6011 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6012 enum btrfs_map_op op, 6013 u64 logical, u64 *length, 6014 struct btrfs_bio **bbio_ret, 6015 int mirror_num, int need_raid_map) 6016 { 6017 struct extent_map *em; 6018 struct map_lookup *map; 6019 u64 stripe_offset; 6020 u64 stripe_nr; 6021 u64 stripe_len; 6022 u32 stripe_index; 6023 int data_stripes; 6024 int i; 6025 int ret = 0; 6026 int num_stripes; 6027 int max_errors = 0; 6028 int tgtdev_indexes = 0; 6029 struct btrfs_bio *bbio = NULL; 6030 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6031 int dev_replace_is_ongoing = 0; 6032 int num_alloc_stripes; 6033 int patch_the_first_stripe_for_dev_replace = 0; 6034 u64 physical_to_patch_in_first_stripe = 0; 6035 u64 raid56_full_stripe_start = (u64)-1; 6036 struct btrfs_io_geometry geom; 6037 6038 ASSERT(bbio_ret); 6039 ASSERT(op != BTRFS_MAP_DISCARD); 6040 6041 ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); 6042 if (ret < 0) 6043 return ret; 6044 6045 em = btrfs_get_chunk_map(fs_info, logical, *length); 6046 ASSERT(!IS_ERR(em)); 6047 map = em->map_lookup; 6048 6049 *length = geom.len; 6050 stripe_len = geom.stripe_len; 6051 stripe_nr = geom.stripe_nr; 6052 stripe_offset = geom.stripe_offset; 6053 raid56_full_stripe_start = geom.raid56_stripe_offset; 6054 data_stripes = nr_data_stripes(map); 6055 6056 down_read(&dev_replace->rwsem); 6057 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6058 /* 6059 * Hold the semaphore for read during the whole operation, write is 6060 * requested at commit time but must wait. 
6061 */ 6062 if (!dev_replace_is_ongoing) 6063 up_read(&dev_replace->rwsem); 6064 6065 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6066 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6067 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6068 dev_replace->srcdev->devid, 6069 &mirror_num, 6070 &physical_to_patch_in_first_stripe); 6071 if (ret) 6072 goto out; 6073 else 6074 patch_the_first_stripe_for_dev_replace = 1; 6075 } else if (mirror_num > map->num_stripes) { 6076 mirror_num = 0; 6077 } 6078 6079 num_stripes = 1; 6080 stripe_index = 0; 6081 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6082 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6083 &stripe_index); 6084 if (!need_full_stripe(op)) 6085 mirror_num = 1; 6086 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6087 if (need_full_stripe(op)) 6088 num_stripes = map->num_stripes; 6089 else if (mirror_num) 6090 stripe_index = mirror_num - 1; 6091 else { 6092 stripe_index = find_live_mirror(fs_info, map, 0, 6093 dev_replace_is_ongoing); 6094 mirror_num = stripe_index + 1; 6095 } 6096 6097 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6098 if (need_full_stripe(op)) { 6099 num_stripes = map->num_stripes; 6100 } else if (mirror_num) { 6101 stripe_index = mirror_num - 1; 6102 } else { 6103 mirror_num = 1; 6104 } 6105 6106 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6107 u32 factor = map->num_stripes / map->sub_stripes; 6108 6109 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6110 stripe_index *= map->sub_stripes; 6111 6112 if (need_full_stripe(op)) 6113 num_stripes = map->sub_stripes; 6114 else if (mirror_num) 6115 stripe_index += mirror_num - 1; 6116 else { 6117 int old_stripe_index = stripe_index; 6118 stripe_index = find_live_mirror(fs_info, map, 6119 stripe_index, 6120 dev_replace_is_ongoing); 6121 mirror_num = stripe_index - old_stripe_index + 1; 6122 } 6123 6124 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6125 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6126 /* push stripe_nr back to the start of the full stripe */ 6127 stripe_nr = div64_u64(raid56_full_stripe_start, 6128 stripe_len * data_stripes); 6129 6130 /* RAID[56] write or recovery. Return all stripes */ 6131 num_stripes = map->num_stripes; 6132 max_errors = nr_parity_stripes(map); 6133 6134 *length = map->stripe_len; 6135 stripe_index = 0; 6136 stripe_offset = 0; 6137 } else { 6138 /* 6139 * Mirror #0 or #1 means the original data block. 6140 * Mirror #2 is RAID5 parity block. 6141 * Mirror #3 is RAID6 Q block. 
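 *
 * Illustrative example (hypothetical numbers): on a 3-device RAID5
 * chunk (data_stripes = 2), a repair read with mirror_num = 2 picks
 * stripe_index = data_stripes + mirror_num - 2 = 2, i.e. the parity
 * stripe, and the rotation below then maps that onto whichever device
 * holds parity for this particular stripe_nr.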
6142 */ 6143 stripe_nr = div_u64_rem(stripe_nr, 6144 data_stripes, &stripe_index); 6145 if (mirror_num > 1) 6146 stripe_index = data_stripes + mirror_num - 2; 6147 6148 /* We distribute the parity blocks across stripes */ 6149 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6150 &stripe_index); 6151 if (!need_full_stripe(op) && mirror_num <= 1) 6152 mirror_num = 1; 6153 } 6154 } else { 6155 /* 6156 * after this, stripe_nr is the number of stripes on this 6157 * device we have to walk to find the data, and stripe_index is 6158 * the number of our device in the stripe array 6159 */ 6160 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6161 &stripe_index); 6162 mirror_num = stripe_index + 1; 6163 } 6164 if (stripe_index >= map->num_stripes) { 6165 btrfs_crit(fs_info, 6166 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6167 stripe_index, map->num_stripes); 6168 ret = -EINVAL; 6169 goto out; 6170 } 6171 6172 num_alloc_stripes = num_stripes; 6173 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6174 if (op == BTRFS_MAP_WRITE) 6175 num_alloc_stripes <<= 1; 6176 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6177 num_alloc_stripes++; 6178 tgtdev_indexes = num_stripes; 6179 } 6180 6181 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); 6182 if (!bbio) { 6183 ret = -ENOMEM; 6184 goto out; 6185 } 6186 6187 for (i = 0; i < num_stripes; i++) { 6188 bbio->stripes[i].physical = map->stripes[stripe_index].physical + 6189 stripe_offset + stripe_nr * map->stripe_len; 6190 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 6191 stripe_index++; 6192 } 6193 6194 /* build raid_map */ 6195 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6196 (need_full_stripe(op) || mirror_num > 1)) { 6197 u64 tmp; 6198 unsigned rot; 6199 6200 /* Work out the disk rotation on this stripe-set */ 6201 div_u64_rem(stripe_nr, num_stripes, &rot); 6202 6203 /* Fill in the logical address of each stripe */ 6204 tmp = stripe_nr * data_stripes; 6205 for (i = 0; i < data_stripes; i++) 6206 bbio->raid_map[(i+rot) % num_stripes] = 6207 em->start + (tmp + i) * map->stripe_len; 6208 6209 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 6210 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6211 bbio->raid_map[(i+rot+1) % num_stripes] = 6212 RAID6_Q_STRIPE; 6213 6214 sort_parity_stripes(bbio, num_stripes); 6215 } 6216 6217 if (need_full_stripe(op)) 6218 max_errors = btrfs_chunk_max_errors(map); 6219 6220 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6221 need_full_stripe(op)) { 6222 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, 6223 &max_errors); 6224 } 6225 6226 *bbio_ret = bbio; 6227 bbio->map_type = map->type; 6228 bbio->num_stripes = num_stripes; 6229 bbio->max_errors = max_errors; 6230 bbio->mirror_num = mirror_num; 6231 6232 /* 6233 * this is the case that REQ_READ && dev_replace_is_ongoing && 6234 * mirror_num == num_stripes + 1 && dev_replace target drive is 6235 * available as a mirror 6236 */ 6237 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6238 WARN_ON(num_stripes > 1); 6239 bbio->stripes[0].dev = dev_replace->tgtdev; 6240 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 6241 bbio->mirror_num = map->num_stripes + 1; 6242 } 6243 out: 6244 if (dev_replace_is_ongoing) { 6245 lockdep_assert_held(&dev_replace->rwsem); 6246 /* Unlock and let waiting writers proceed */ 6247 up_read(&dev_replace->rwsem); 6248 } 6249 free_extent_map(em); 6250 return ret; 6251 } 6252 6253 int btrfs_map_block(struct 
btrfs_fs_info *fs_info, enum btrfs_map_op op, 6254 u64 logical, u64 *length, 6255 struct btrfs_bio **bbio_ret, int mirror_num) 6256 { 6257 if (op == BTRFS_MAP_DISCARD) 6258 return __btrfs_map_block_for_discard(fs_info, logical, 6259 length, bbio_ret); 6260 6261 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 6262 mirror_num, 0); 6263 } 6264 6265 /* For Scrub/replace */ 6266 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6267 u64 logical, u64 *length, 6268 struct btrfs_bio **bbio_ret) 6269 { 6270 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); 6271 } 6272 6273 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) 6274 { 6275 bio->bi_private = bbio->private; 6276 bio->bi_end_io = bbio->end_io; 6277 bio_endio(bio); 6278 6279 btrfs_put_bbio(bbio); 6280 } 6281 6282 static void btrfs_end_bio(struct bio *bio) 6283 { 6284 struct btrfs_bio *bbio = bio->bi_private; 6285 int is_orig_bio = 0; 6286 6287 if (bio->bi_status) { 6288 atomic_inc(&bbio->error); 6289 if (bio->bi_status == BLK_STS_IOERR || 6290 bio->bi_status == BLK_STS_TARGET) { 6291 struct btrfs_device *dev = btrfs_io_bio(bio)->device; 6292 6293 ASSERT(dev->bdev); 6294 if (bio_op(bio) == REQ_OP_WRITE) 6295 btrfs_dev_stat_inc_and_print(dev, 6296 BTRFS_DEV_STAT_WRITE_ERRS); 6297 else if (!(bio->bi_opf & REQ_RAHEAD)) 6298 btrfs_dev_stat_inc_and_print(dev, 6299 BTRFS_DEV_STAT_READ_ERRS); 6300 if (bio->bi_opf & REQ_PREFLUSH) 6301 btrfs_dev_stat_inc_and_print(dev, 6302 BTRFS_DEV_STAT_FLUSH_ERRS); 6303 } 6304 } 6305 6306 if (bio == bbio->orig_bio) 6307 is_orig_bio = 1; 6308 6309 btrfs_bio_counter_dec(bbio->fs_info); 6310 6311 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6312 if (!is_orig_bio) { 6313 bio_put(bio); 6314 bio = bbio->orig_bio; 6315 } 6316 6317 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6318 /* only send an error to the higher layers if it is 6319 * beyond the tolerance of the btrfs bio 6320 */ 6321 if (atomic_read(&bbio->error) > bbio->max_errors) { 6322 bio->bi_status = BLK_STS_IOERR; 6323 } else { 6324 /* 6325 * this bio is actually up to date, we didn't 6326 * go over the max number of errors 6327 */ 6328 bio->bi_status = BLK_STS_OK; 6329 } 6330 6331 btrfs_end_bbio(bbio, bio); 6332 } else if (!is_orig_bio) { 6333 bio_put(bio); 6334 } 6335 } 6336 6337 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, 6338 u64 physical, struct btrfs_device *dev) 6339 { 6340 struct btrfs_fs_info *fs_info = bbio->fs_info; 6341 6342 bio->bi_private = bbio; 6343 btrfs_io_bio(bio)->device = dev; 6344 bio->bi_end_io = btrfs_end_bio; 6345 bio->bi_iter.bi_sector = physical >> 9; 6346 btrfs_debug_in_rcu(fs_info, 6347 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6348 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, 6349 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6350 dev->devid, bio->bi_iter.bi_size); 6351 bio_set_dev(bio, dev->bdev); 6352 6353 btrfs_bio_counter_inc_noblocked(fs_info); 6354 6355 btrfsic_submit_bio(bio); 6356 } 6357 6358 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) 6359 { 6360 atomic_inc(&bbio->error); 6361 if (atomic_dec_and_test(&bbio->stripes_pending)) { 6362 /* Should be the original bio. 
*/ 6363 WARN_ON(bio != bbio->orig_bio); 6364 6365 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 6366 bio->bi_iter.bi_sector = logical >> 9; 6367 if (atomic_read(&bbio->error) > bbio->max_errors) 6368 bio->bi_status = BLK_STS_IOERR; 6369 else 6370 bio->bi_status = BLK_STS_OK; 6371 btrfs_end_bbio(bbio, bio); 6372 } 6373 } 6374 6375 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6376 int mirror_num) 6377 { 6378 struct btrfs_device *dev; 6379 struct bio *first_bio = bio; 6380 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 6381 u64 length = 0; 6382 u64 map_length; 6383 int ret; 6384 int dev_nr; 6385 int total_devs; 6386 struct btrfs_bio *bbio = NULL; 6387 6388 length = bio->bi_iter.bi_size; 6389 map_length = length; 6390 6391 btrfs_bio_counter_inc_blocked(fs_info); 6392 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6393 &map_length, &bbio, mirror_num, 1); 6394 if (ret) { 6395 btrfs_bio_counter_dec(fs_info); 6396 return errno_to_blk_status(ret); 6397 } 6398 6399 total_devs = bbio->num_stripes; 6400 bbio->orig_bio = first_bio; 6401 bbio->private = first_bio->bi_private; 6402 bbio->end_io = first_bio->bi_end_io; 6403 bbio->fs_info = fs_info; 6404 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 6405 6406 if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6407 ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { 6408 /* In this case, map_length has been set to the length of 6409 a single stripe; not the whole write */ 6410 if (bio_op(bio) == REQ_OP_WRITE) { 6411 ret = raid56_parity_write(fs_info, bio, bbio, 6412 map_length); 6413 } else { 6414 ret = raid56_parity_recover(fs_info, bio, bbio, 6415 map_length, mirror_num, 1); 6416 } 6417 6418 btrfs_bio_counter_dec(fs_info); 6419 return errno_to_blk_status(ret); 6420 } 6421 6422 if (map_length < length) { 6423 btrfs_crit(fs_info, 6424 "mapping failed logical %llu bio len %llu len %llu", 6425 logical, length, map_length); 6426 BUG(); 6427 } 6428 6429 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6430 dev = bbio->stripes[dev_nr].dev; 6431 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6432 &dev->dev_state) || 6433 (bio_op(first_bio) == REQ_OP_WRITE && 6434 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6435 bbio_error(bbio, first_bio, logical); 6436 continue; 6437 } 6438 6439 if (dev_nr < total_devs - 1) 6440 bio = btrfs_bio_clone(first_bio); 6441 else 6442 bio = first_bio; 6443 6444 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); 6445 } 6446 btrfs_bio_counter_dec(fs_info); 6447 return BLK_STS_OK; 6448 } 6449 6450 /* 6451 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6452 * return NULL. 6453 * 6454 * If devid and uuid are both specified, the match must be exact, otherwise 6455 * only devid is used. 6456 * 6457 * If @seed is true, traverse through the seed devices. 
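 *
 * Illustrative call (mirroring read_one_chunk() below):
 * btrfs_find_device(fs_info->fs_devices, devid, uuid, NULL, true) matches on
 * devid plus uuid across the main and seed device lists, while passing
 * uuid == NULL would match on devid alone.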
6458 */ 6459 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6460 u64 devid, u8 *uuid, u8 *fsid, 6461 bool seed) 6462 { 6463 struct btrfs_device *device; 6464 6465 while (fs_devices) { 6466 if (!fsid || 6467 !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { 6468 list_for_each_entry(device, &fs_devices->devices, 6469 dev_list) { 6470 if (device->devid == devid && 6471 (!uuid || memcmp(device->uuid, uuid, 6472 BTRFS_UUID_SIZE) == 0)) 6473 return device; 6474 } 6475 } 6476 if (seed) 6477 fs_devices = fs_devices->seed; 6478 else 6479 return NULL; 6480 } 6481 return NULL; 6482 } 6483 6484 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6485 u64 devid, u8 *dev_uuid) 6486 { 6487 struct btrfs_device *device; 6488 unsigned int nofs_flag; 6489 6490 /* 6491 * We call this under the chunk_mutex, so we want to use NOFS for this 6492 * allocation, however we don't want to change btrfs_alloc_device() to 6493 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6494 * places. 6495 */ 6496 nofs_flag = memalloc_nofs_save(); 6497 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6498 memalloc_nofs_restore(nofs_flag); 6499 if (IS_ERR(device)) 6500 return device; 6501 6502 list_add(&device->dev_list, &fs_devices->devices); 6503 device->fs_devices = fs_devices; 6504 fs_devices->num_devices++; 6505 6506 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6507 fs_devices->missing_devices++; 6508 6509 return device; 6510 } 6511 6512 /** 6513 * btrfs_alloc_device - allocate struct btrfs_device 6514 * @fs_info: used only for generating a new devid, can be NULL if 6515 * devid is provided (i.e. @devid != NULL). 6516 * @devid: a pointer to devid for this device. If NULL a new devid 6517 * is generated. 6518 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6519 * is generated. 6520 * 6521 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6522 * on error. Returned struct is not linked onto any lists and must be 6523 * destroyed with btrfs_free_device. 
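 *
 * Minimal usage sketch (illustrative, following add_missing_dev() below):
 *
 *	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *	if (IS_ERR(device))
 *		return device;
 *	// on success, link the device into an fs_devices list, or release it
 *	// again with btrfs_free_device(device)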
6524 */ 6525 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6526 const u64 *devid, 6527 const u8 *uuid) 6528 { 6529 struct btrfs_device *dev; 6530 u64 tmp; 6531 6532 if (WARN_ON(!devid && !fs_info)) 6533 return ERR_PTR(-EINVAL); 6534 6535 dev = __alloc_device(); 6536 if (IS_ERR(dev)) 6537 return dev; 6538 6539 if (devid) 6540 tmp = *devid; 6541 else { 6542 int ret; 6543 6544 ret = find_next_devid(fs_info, &tmp); 6545 if (ret) { 6546 btrfs_free_device(dev); 6547 return ERR_PTR(ret); 6548 } 6549 } 6550 dev->devid = tmp; 6551 6552 if (uuid) 6553 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6554 else 6555 generate_random_uuid(dev->uuid); 6556 6557 return dev; 6558 } 6559 6560 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6561 u64 devid, u8 *uuid, bool error) 6562 { 6563 if (error) 6564 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6565 devid, uuid); 6566 else 6567 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6568 devid, uuid); 6569 } 6570 6571 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) 6572 { 6573 int index = btrfs_bg_flags_to_raid_index(type); 6574 int ncopies = btrfs_raid_array[index].ncopies; 6575 const int nparity = btrfs_raid_array[index].nparity; 6576 int data_stripes; 6577 6578 if (nparity) 6579 data_stripes = num_stripes - nparity; 6580 else 6581 data_stripes = num_stripes / ncopies; 6582 6583 return div_u64(chunk_len, data_stripes); 6584 } 6585 6586 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6587 struct btrfs_chunk *chunk) 6588 { 6589 struct btrfs_fs_info *fs_info = leaf->fs_info; 6590 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6591 struct map_lookup *map; 6592 struct extent_map *em; 6593 u64 logical; 6594 u64 length; 6595 u64 devid; 6596 u8 uuid[BTRFS_UUID_SIZE]; 6597 int num_stripes; 6598 int ret; 6599 int i; 6600 6601 logical = key->offset; 6602 length = btrfs_chunk_length(leaf, chunk); 6603 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6604 6605 /* 6606 * Only need to verify chunk item if we're reading from sys chunk array, 6607 * as chunk item in tree block is already verified by tree-checker. 6608 */ 6609 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6610 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6611 if (ret) 6612 return ret; 6613 } 6614 6615 read_lock(&map_tree->lock); 6616 em = lookup_extent_mapping(map_tree, logical, 1); 6617 read_unlock(&map_tree->lock); 6618 6619 /* already mapped? 
*/ 6620 if (em && em->start <= logical && em->start + em->len > logical) { 6621 free_extent_map(em); 6622 return 0; 6623 } else if (em) { 6624 free_extent_map(em); 6625 } 6626 6627 em = alloc_extent_map(); 6628 if (!em) 6629 return -ENOMEM; 6630 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6631 if (!map) { 6632 free_extent_map(em); 6633 return -ENOMEM; 6634 } 6635 6636 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6637 em->map_lookup = map; 6638 em->start = logical; 6639 em->len = length; 6640 em->orig_start = 0; 6641 em->block_start = 0; 6642 em->block_len = em->len; 6643 6644 map->num_stripes = num_stripes; 6645 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6646 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6647 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6648 map->type = btrfs_chunk_type(leaf, chunk); 6649 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 6650 map->verified_stripes = 0; 6651 em->orig_block_len = calc_stripe_length(map->type, em->len, 6652 map->num_stripes); 6653 for (i = 0; i < num_stripes; i++) { 6654 map->stripes[i].physical = 6655 btrfs_stripe_offset_nr(leaf, chunk, i); 6656 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6657 read_extent_buffer(leaf, uuid, (unsigned long) 6658 btrfs_stripe_dev_uuid_nr(chunk, i), 6659 BTRFS_UUID_SIZE); 6660 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, 6661 devid, uuid, NULL, true); 6662 if (!map->stripes[i].dev && 6663 !btrfs_test_opt(fs_info, DEGRADED)) { 6664 free_extent_map(em); 6665 btrfs_report_missing_device(fs_info, devid, uuid, true); 6666 return -ENOENT; 6667 } 6668 if (!map->stripes[i].dev) { 6669 map->stripes[i].dev = 6670 add_missing_dev(fs_info->fs_devices, devid, 6671 uuid); 6672 if (IS_ERR(map->stripes[i].dev)) { 6673 free_extent_map(em); 6674 btrfs_err(fs_info, 6675 "failed to init missing dev %llu: %ld", 6676 devid, PTR_ERR(map->stripes[i].dev)); 6677 return PTR_ERR(map->stripes[i].dev); 6678 } 6679 btrfs_report_missing_device(fs_info, devid, uuid, false); 6680 } 6681 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6682 &(map->stripes[i].dev->dev_state)); 6683 6684 } 6685 6686 write_lock(&map_tree->lock); 6687 ret = add_extent_mapping(map_tree, em, 0); 6688 write_unlock(&map_tree->lock); 6689 if (ret < 0) { 6690 btrfs_err(fs_info, 6691 "failed to add chunk map, start=%llu len=%llu: %d", 6692 em->start, em->len, ret); 6693 } 6694 free_extent_map(em); 6695 6696 return ret; 6697 } 6698 6699 static void fill_device_from_item(struct extent_buffer *leaf, 6700 struct btrfs_dev_item *dev_item, 6701 struct btrfs_device *device) 6702 { 6703 unsigned long ptr; 6704 6705 device->devid = btrfs_device_id(leaf, dev_item); 6706 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6707 device->total_bytes = device->disk_total_bytes; 6708 device->commit_total_bytes = device->disk_total_bytes; 6709 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6710 device->commit_bytes_used = device->bytes_used; 6711 device->type = btrfs_device_type(leaf, dev_item); 6712 device->io_align = btrfs_device_io_align(leaf, dev_item); 6713 device->io_width = btrfs_device_io_width(leaf, dev_item); 6714 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6715 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6716 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6717 6718 ptr = btrfs_device_uuid(dev_item); 6719 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6720 } 6721 6722 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info 
*fs_info, 6723 u8 *fsid) 6724 { 6725 struct btrfs_fs_devices *fs_devices; 6726 int ret; 6727 6728 lockdep_assert_held(&uuid_mutex); 6729 ASSERT(fsid); 6730 6731 fs_devices = fs_info->fs_devices->seed; 6732 while (fs_devices) { 6733 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 6734 return fs_devices; 6735 6736 fs_devices = fs_devices->seed; 6737 } 6738 6739 fs_devices = find_fsid(fsid, NULL); 6740 if (!fs_devices) { 6741 if (!btrfs_test_opt(fs_info, DEGRADED)) 6742 return ERR_PTR(-ENOENT); 6743 6744 fs_devices = alloc_fs_devices(fsid, NULL); 6745 if (IS_ERR(fs_devices)) 6746 return fs_devices; 6747 6748 fs_devices->seeding = true; 6749 fs_devices->opened = 1; 6750 return fs_devices; 6751 } 6752 6753 fs_devices = clone_fs_devices(fs_devices); 6754 if (IS_ERR(fs_devices)) 6755 return fs_devices; 6756 6757 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 6758 if (ret) { 6759 free_fs_devices(fs_devices); 6760 fs_devices = ERR_PTR(ret); 6761 goto out; 6762 } 6763 6764 if (!fs_devices->seeding) { 6765 close_fs_devices(fs_devices); 6766 free_fs_devices(fs_devices); 6767 fs_devices = ERR_PTR(-EINVAL); 6768 goto out; 6769 } 6770 6771 fs_devices->seed = fs_info->fs_devices->seed; 6772 fs_info->fs_devices->seed = fs_devices; 6773 out: 6774 return fs_devices; 6775 } 6776 6777 static int read_one_dev(struct extent_buffer *leaf, 6778 struct btrfs_dev_item *dev_item) 6779 { 6780 struct btrfs_fs_info *fs_info = leaf->fs_info; 6781 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6782 struct btrfs_device *device; 6783 u64 devid; 6784 int ret; 6785 u8 fs_uuid[BTRFS_FSID_SIZE]; 6786 u8 dev_uuid[BTRFS_UUID_SIZE]; 6787 6788 devid = btrfs_device_id(leaf, dev_item); 6789 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 6790 BTRFS_UUID_SIZE); 6791 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 6792 BTRFS_FSID_SIZE); 6793 6794 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 6795 fs_devices = open_seed_devices(fs_info, fs_uuid); 6796 if (IS_ERR(fs_devices)) 6797 return PTR_ERR(fs_devices); 6798 } 6799 6800 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, 6801 fs_uuid, true); 6802 if (!device) { 6803 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6804 btrfs_report_missing_device(fs_info, devid, 6805 dev_uuid, true); 6806 return -ENOENT; 6807 } 6808 6809 device = add_missing_dev(fs_devices, devid, dev_uuid); 6810 if (IS_ERR(device)) { 6811 btrfs_err(fs_info, 6812 "failed to add missing dev %llu: %ld", 6813 devid, PTR_ERR(device)); 6814 return PTR_ERR(device); 6815 } 6816 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 6817 } else { 6818 if (!device->bdev) { 6819 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6820 btrfs_report_missing_device(fs_info, 6821 devid, dev_uuid, true); 6822 return -ENOENT; 6823 } 6824 btrfs_report_missing_device(fs_info, devid, 6825 dev_uuid, false); 6826 } 6827 6828 if (!device->bdev && 6829 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 6830 /* 6831 * this happens when a device that was properly setup 6832 * in the device info lists suddenly goes bad. 
6833 * device->bdev is NULL, and so we have to set 6834 * device->missing to one here 6835 */ 6836 device->fs_devices->missing_devices++; 6837 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6838 } 6839 6840 /* Move the device to its own fs_devices */ 6841 if (device->fs_devices != fs_devices) { 6842 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 6843 &device->dev_state)); 6844 6845 list_move(&device->dev_list, &fs_devices->devices); 6846 device->fs_devices->num_devices--; 6847 fs_devices->num_devices++; 6848 6849 device->fs_devices->missing_devices--; 6850 fs_devices->missing_devices++; 6851 6852 device->fs_devices = fs_devices; 6853 } 6854 } 6855 6856 if (device->fs_devices != fs_info->fs_devices) { 6857 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 6858 if (device->generation != 6859 btrfs_device_generation(leaf, dev_item)) 6860 return -EINVAL; 6861 } 6862 6863 fill_device_from_item(leaf, dev_item, device); 6864 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 6865 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 6866 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 6867 device->fs_devices->total_rw_bytes += device->total_bytes; 6868 atomic64_add(device->total_bytes - device->bytes_used, 6869 &fs_info->free_chunk_space); 6870 } 6871 ret = 0; 6872 return ret; 6873 } 6874 6875 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 6876 { 6877 struct btrfs_root *root = fs_info->tree_root; 6878 struct btrfs_super_block *super_copy = fs_info->super_copy; 6879 struct extent_buffer *sb; 6880 struct btrfs_disk_key *disk_key; 6881 struct btrfs_chunk *chunk; 6882 u8 *array_ptr; 6883 unsigned long sb_array_offset; 6884 int ret = 0; 6885 u32 num_stripes; 6886 u32 array_size; 6887 u32 len = 0; 6888 u32 cur_offset; 6889 u64 type; 6890 struct btrfs_key key; 6891 6892 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 6893 /* 6894 * This will create extent buffer of nodesize, superblock size is 6895 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 6896 * overallocate but we can keep it as-is, only the first page is used. 6897 */ 6898 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); 6899 if (IS_ERR(sb)) 6900 return PTR_ERR(sb); 6901 set_extent_buffer_uptodate(sb); 6902 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); 6903 /* 6904 * The sb extent buffer is artificial and just used to read the system array. 6905 * set_extent_buffer_uptodate() call does not properly mark all it's 6906 * pages up-to-date when the page is larger: extent does not cover the 6907 * whole page and consequently check_page_uptodate does not find all 6908 * the page's extents up-to-date (the hole beyond sb), 6909 * write_extent_buffer then triggers a WARN_ON. 6910 * 6911 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 6912 * but sb spans only this function. Add an explicit SetPageUptodate call 6913 * to silence the warning eg. on PowerPC 64. 
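 *
 * For reference, the loop below walks sys_chunk_array as a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk) pairs: each iteration first
 * consumes sizeof(*disk_key) bytes, then btrfs_chunk_item_size(num_stripes)
 * bytes for the chunk item itself, bailing out with -EIO on anything that is
 * not a SYSTEM chunk with at least one stripe.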
6914 */ 6915 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 6916 SetPageUptodate(sb->pages[0]); 6917 6918 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6919 array_size = btrfs_super_sys_array_size(super_copy); 6920 6921 array_ptr = super_copy->sys_chunk_array; 6922 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 6923 cur_offset = 0; 6924 6925 while (cur_offset < array_size) { 6926 disk_key = (struct btrfs_disk_key *)array_ptr; 6927 len = sizeof(*disk_key); 6928 if (cur_offset + len > array_size) 6929 goto out_short_read; 6930 6931 btrfs_disk_key_to_cpu(&key, disk_key); 6932 6933 array_ptr += len; 6934 sb_array_offset += len; 6935 cur_offset += len; 6936 6937 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 6938 btrfs_err(fs_info, 6939 "unexpected item type %u in sys_array at offset %u", 6940 (u32)key.type, cur_offset); 6941 ret = -EIO; 6942 break; 6943 } 6944 6945 chunk = (struct btrfs_chunk *)sb_array_offset; 6946 /* 6947 * At least one btrfs_chunk with one stripe must be present, 6948 * exact stripe count check comes afterwards 6949 */ 6950 len = btrfs_chunk_item_size(1); 6951 if (cur_offset + len > array_size) 6952 goto out_short_read; 6953 6954 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 6955 if (!num_stripes) { 6956 btrfs_err(fs_info, 6957 "invalid number of stripes %u in sys_array at offset %u", 6958 num_stripes, cur_offset); 6959 ret = -EIO; 6960 break; 6961 } 6962 6963 type = btrfs_chunk_type(sb, chunk); 6964 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 6965 btrfs_err(fs_info, 6966 "invalid chunk type %llu in sys_array at offset %u", 6967 type, cur_offset); 6968 ret = -EIO; 6969 break; 6970 } 6971 6972 len = btrfs_chunk_item_size(num_stripes); 6973 if (cur_offset + len > array_size) 6974 goto out_short_read; 6975 6976 ret = read_one_chunk(&key, sb, chunk); 6977 if (ret) 6978 break; 6979 6980 array_ptr += len; 6981 sb_array_offset += len; 6982 cur_offset += len; 6983 } 6984 clear_extent_buffer_uptodate(sb); 6985 free_extent_buffer_stale(sb); 6986 return ret; 6987 6988 out_short_read: 6989 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 6990 len, cur_offset); 6991 clear_extent_buffer_uptodate(sb); 6992 free_extent_buffer_stale(sb); 6993 return -EIO; 6994 } 6995 6996 /* 6997 * Check if all chunks in the fs are OK for read-write degraded mount 6998 * 6999 * If the @failing_dev is specified, it's accounted as missing. 7000 * 7001 * Return true if all chunks meet the minimal RW mount requirements. 7002 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7003 */ 7004 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7005 struct btrfs_device *failing_dev) 7006 { 7007 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7008 struct extent_map *em; 7009 u64 next_start = 0; 7010 bool ret = true; 7011 7012 read_lock(&map_tree->lock); 7013 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7014 read_unlock(&map_tree->lock); 7015 /* No chunk at all? 
Return false anyway */ 7016 if (!em) { 7017 ret = false; 7018 goto out; 7019 } 7020 while (em) { 7021 struct map_lookup *map; 7022 int missing = 0; 7023 int max_tolerated; 7024 int i; 7025 7026 map = em->map_lookup; 7027 max_tolerated = 7028 btrfs_get_num_tolerated_disk_barrier_failures( 7029 map->type); 7030 for (i = 0; i < map->num_stripes; i++) { 7031 struct btrfs_device *dev = map->stripes[i].dev; 7032 7033 if (!dev || !dev->bdev || 7034 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7035 dev->last_flush_error) 7036 missing++; 7037 else if (failing_dev && failing_dev == dev) 7038 missing++; 7039 } 7040 if (missing > max_tolerated) { 7041 if (!failing_dev) 7042 btrfs_warn(fs_info, 7043 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7044 em->start, missing, max_tolerated); 7045 free_extent_map(em); 7046 ret = false; 7047 goto out; 7048 } 7049 next_start = extent_map_end(em); 7050 free_extent_map(em); 7051 7052 read_lock(&map_tree->lock); 7053 em = lookup_extent_mapping(map_tree, next_start, 7054 (u64)(-1) - next_start); 7055 read_unlock(&map_tree->lock); 7056 } 7057 out: 7058 return ret; 7059 } 7060 7061 static void readahead_tree_node_children(struct extent_buffer *node) 7062 { 7063 int i; 7064 const int nr_items = btrfs_header_nritems(node); 7065 7066 for (i = 0; i < nr_items; i++) { 7067 u64 start; 7068 7069 start = btrfs_node_blockptr(node, i); 7070 readahead_tree_block(node->fs_info, start); 7071 } 7072 } 7073 7074 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7075 { 7076 struct btrfs_root *root = fs_info->chunk_root; 7077 struct btrfs_path *path; 7078 struct extent_buffer *leaf; 7079 struct btrfs_key key; 7080 struct btrfs_key found_key; 7081 int ret; 7082 int slot; 7083 u64 total_dev = 0; 7084 u64 last_ra_node = 0; 7085 7086 path = btrfs_alloc_path(); 7087 if (!path) 7088 return -ENOMEM; 7089 7090 /* 7091 * uuid_mutex is needed only if we are mounting a sprout FS 7092 * otherwise we don't need it. 7093 */ 7094 mutex_lock(&uuid_mutex); 7095 7096 /* 7097 * It is possible for mount and umount to race in such a way that 7098 * we execute this code path, but open_fs_devices failed to clear 7099 * total_rw_bytes. We certainly want it cleared before reading the 7100 * device items, so clear it here. 7101 */ 7102 fs_info->fs_devices->total_rw_bytes = 0; 7103 7104 /* 7105 * Read all device items, and then all the chunk items. All 7106 * device items are found before any chunk item (their object id 7107 * is smaller than the lowest possible object id for a chunk 7108 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
7109 */ 7110 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7111 key.offset = 0; 7112 key.type = 0; 7113 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7114 if (ret < 0) 7115 goto error; 7116 while (1) { 7117 struct extent_buffer *node; 7118 7119 leaf = path->nodes[0]; 7120 slot = path->slots[0]; 7121 if (slot >= btrfs_header_nritems(leaf)) { 7122 ret = btrfs_next_leaf(root, path); 7123 if (ret == 0) 7124 continue; 7125 if (ret < 0) 7126 goto error; 7127 break; 7128 } 7129 /* 7130 * The nodes on level 1 are not locked but we don't need to do 7131 * that during mount time as nothing else can access the tree 7132 */ 7133 node = path->nodes[1]; 7134 if (node) { 7135 if (last_ra_node != node->start) { 7136 readahead_tree_node_children(node); 7137 last_ra_node = node->start; 7138 } 7139 } 7140 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7141 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7142 struct btrfs_dev_item *dev_item; 7143 dev_item = btrfs_item_ptr(leaf, slot, 7144 struct btrfs_dev_item); 7145 ret = read_one_dev(leaf, dev_item); 7146 if (ret) 7147 goto error; 7148 total_dev++; 7149 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7150 struct btrfs_chunk *chunk; 7151 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7152 mutex_lock(&fs_info->chunk_mutex); 7153 ret = read_one_chunk(&found_key, leaf, chunk); 7154 mutex_unlock(&fs_info->chunk_mutex); 7155 if (ret) 7156 goto error; 7157 } 7158 path->slots[0]++; 7159 } 7160 7161 /* 7162 * After loading chunk tree, we've got all device information, 7163 * do another round of validation checks. 7164 */ 7165 if (total_dev != fs_info->fs_devices->total_devices) { 7166 btrfs_err(fs_info, 7167 "super_num_devices %llu mismatch with num_devices %llu found here", 7168 btrfs_super_num_devices(fs_info->super_copy), 7169 total_dev); 7170 ret = -EINVAL; 7171 goto error; 7172 } 7173 if (btrfs_super_total_bytes(fs_info->super_copy) < 7174 fs_info->fs_devices->total_rw_bytes) { 7175 btrfs_err(fs_info, 7176 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7177 btrfs_super_total_bytes(fs_info->super_copy), 7178 fs_info->fs_devices->total_rw_bytes); 7179 ret = -EINVAL; 7180 goto error; 7181 } 7182 ret = 0; 7183 error: 7184 mutex_unlock(&uuid_mutex); 7185 7186 btrfs_free_path(path); 7187 return ret; 7188 } 7189 7190 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7191 { 7192 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7193 struct btrfs_device *device; 7194 7195 while (fs_devices) { 7196 mutex_lock(&fs_devices->device_list_mutex); 7197 list_for_each_entry(device, &fs_devices->devices, dev_list) 7198 device->fs_info = fs_info; 7199 mutex_unlock(&fs_devices->device_list_mutex); 7200 7201 fs_devices = fs_devices->seed; 7202 } 7203 } 7204 7205 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7206 const struct btrfs_dev_stats_item *ptr, 7207 int index) 7208 { 7209 u64 val; 7210 7211 read_extent_buffer(eb, &val, 7212 offsetof(struct btrfs_dev_stats_item, values) + 7213 ((unsigned long)ptr) + (index * sizeof(u64)), 7214 sizeof(val)); 7215 return val; 7216 } 7217 7218 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7219 struct btrfs_dev_stats_item *ptr, 7220 int index, u64 val) 7221 { 7222 write_extent_buffer(eb, &val, 7223 offsetof(struct btrfs_dev_stats_item, values) + 7224 ((unsigned long)ptr) + (index * sizeof(u64)), 7225 sizeof(val)); 7226 } 7227 7228 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7229 { 7230 struct btrfs_key key; 7231 struct btrfs_root *dev_root = 
fs_info->dev_root; 7232 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7233 struct extent_buffer *eb; 7234 int slot; 7235 int ret = 0; 7236 struct btrfs_device *device; 7237 struct btrfs_path *path = NULL; 7238 int i; 7239 7240 path = btrfs_alloc_path(); 7241 if (!path) 7242 return -ENOMEM; 7243 7244 mutex_lock(&fs_devices->device_list_mutex); 7245 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7246 int item_size; 7247 struct btrfs_dev_stats_item *ptr; 7248 7249 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7250 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7251 key.offset = device->devid; 7252 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 7253 if (ret) { 7254 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7255 btrfs_dev_stat_set(device, i, 0); 7256 device->dev_stats_valid = 1; 7257 btrfs_release_path(path); 7258 continue; 7259 } 7260 slot = path->slots[0]; 7261 eb = path->nodes[0]; 7262 item_size = btrfs_item_size_nr(eb, slot); 7263 7264 ptr = btrfs_item_ptr(eb, slot, 7265 struct btrfs_dev_stats_item); 7266 7267 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7268 if (item_size >= (1 + i) * sizeof(__le64)) 7269 btrfs_dev_stat_set(device, i, 7270 btrfs_dev_stats_value(eb, ptr, i)); 7271 else 7272 btrfs_dev_stat_set(device, i, 0); 7273 } 7274 7275 device->dev_stats_valid = 1; 7276 btrfs_dev_stat_print_on_load(device); 7277 btrfs_release_path(path); 7278 } 7279 mutex_unlock(&fs_devices->device_list_mutex); 7280 7281 btrfs_free_path(path); 7282 return ret < 0 ? ret : 0; 7283 } 7284 7285 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7286 struct btrfs_device *device) 7287 { 7288 struct btrfs_fs_info *fs_info = trans->fs_info; 7289 struct btrfs_root *dev_root = fs_info->dev_root; 7290 struct btrfs_path *path; 7291 struct btrfs_key key; 7292 struct extent_buffer *eb; 7293 struct btrfs_dev_stats_item *ptr; 7294 int ret; 7295 int i; 7296 7297 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7298 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7299 key.offset = device->devid; 7300 7301 path = btrfs_alloc_path(); 7302 if (!path) 7303 return -ENOMEM; 7304 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7305 if (ret < 0) { 7306 btrfs_warn_in_rcu(fs_info, 7307 "error %d while searching for dev_stats item for device %s", 7308 ret, rcu_str_deref(device->name)); 7309 goto out; 7310 } 7311 7312 if (ret == 0 && 7313 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7314 /* need to delete old one and insert a new one */ 7315 ret = btrfs_del_item(trans, dev_root, path); 7316 if (ret != 0) { 7317 btrfs_warn_in_rcu(fs_info, 7318 "delete too small dev_stats item for device %s failed %d", 7319 rcu_str_deref(device->name), ret); 7320 goto out; 7321 } 7322 ret = 1; 7323 } 7324 7325 if (ret == 1) { 7326 /* need to insert a new item */ 7327 btrfs_release_path(path); 7328 ret = btrfs_insert_empty_item(trans, dev_root, path, 7329 &key, sizeof(*ptr)); 7330 if (ret < 0) { 7331 btrfs_warn_in_rcu(fs_info, 7332 "insert dev_stats item for device %s failed %d", 7333 rcu_str_deref(device->name), ret); 7334 goto out; 7335 } 7336 } 7337 7338 eb = path->nodes[0]; 7339 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7340 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7341 btrfs_set_dev_stats_value(eb, ptr, i, 7342 btrfs_dev_stat_read(device, i)); 7343 btrfs_mark_buffer_dirty(eb); 7344 7345 out: 7346 btrfs_free_path(path); 7347 return ret; 7348 } 7349 7350 /* 7351 * called from commit_transaction. Writes all changed device stats to disk. 
7352 */ 7353 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 7354 { 7355 struct btrfs_fs_info *fs_info = trans->fs_info; 7356 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7357 struct btrfs_device *device; 7358 int stats_cnt; 7359 int ret = 0; 7360 7361 mutex_lock(&fs_devices->device_list_mutex); 7362 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7363 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7364 if (!device->dev_stats_valid || stats_cnt == 0) 7365 continue; 7366 7367 7368 /* 7369 * There is a LOAD-LOAD control dependency between the value of 7370 * dev_stats_ccnt and updating the on-disk values which requires 7371 * reading the in-memory counters. Such control dependencies 7372 * require explicit read memory barriers. 7373 * 7374 * This memory barriers pairs with smp_mb__before_atomic in 7375 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7376 * barrier implied by atomic_xchg in 7377 * btrfs_dev_stats_read_and_reset 7378 */ 7379 smp_rmb(); 7380 7381 ret = update_dev_stat_item(trans, device); 7382 if (!ret) 7383 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7384 } 7385 mutex_unlock(&fs_devices->device_list_mutex); 7386 7387 return ret; 7388 } 7389 7390 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7391 { 7392 btrfs_dev_stat_inc(dev, index); 7393 btrfs_dev_stat_print_on_error(dev); 7394 } 7395 7396 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 7397 { 7398 if (!dev->dev_stats_valid) 7399 return; 7400 btrfs_err_rl_in_rcu(dev->fs_info, 7401 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7402 rcu_str_deref(dev->name), 7403 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7404 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7405 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7406 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7407 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7408 } 7409 7410 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7411 { 7412 int i; 7413 7414 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7415 if (btrfs_dev_stat_read(dev, i) != 0) 7416 break; 7417 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7418 return; /* all values == 0, suppress message */ 7419 7420 btrfs_info_in_rcu(dev->fs_info, 7421 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7422 rcu_str_deref(dev->name), 7423 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7424 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7425 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7426 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7427 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7428 } 7429 7430 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7431 struct btrfs_ioctl_get_dev_stats *stats) 7432 { 7433 struct btrfs_device *dev; 7434 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7435 int i; 7436 7437 mutex_lock(&fs_devices->device_list_mutex); 7438 dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, 7439 true); 7440 mutex_unlock(&fs_devices->device_list_mutex); 7441 7442 if (!dev) { 7443 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7444 return -ENODEV; 7445 } else if (!dev->dev_stats_valid) { 7446 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7447 return -ENODEV; 7448 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7449 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7450 if (stats->nr_items > i) 7451 stats->values[i] = 7452 
btrfs_dev_stat_read_and_reset(dev, i); 7453 else 7454 btrfs_dev_stat_set(dev, i, 0); 7455 } 7456 btrfs_info(fs_info, "device stats zeroed by %s (%d)", 7457 current->comm, task_pid_nr(current)); 7458 } else { 7459 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7460 if (stats->nr_items > i) 7461 stats->values[i] = btrfs_dev_stat_read(dev, i); 7462 } 7463 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7464 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7465 return 0; 7466 } 7467 7468 /* 7469 * Update the size and bytes used for each device where it changed. This is 7470 * delayed since we would otherwise get errors while writing out the 7471 * superblocks. 7472 * 7473 * Must be invoked during transaction commit. 7474 */ 7475 void btrfs_commit_device_sizes(struct btrfs_transaction *trans) 7476 { 7477 struct btrfs_device *curr, *next; 7478 7479 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 7480 7481 if (list_empty(&trans->dev_update_list)) 7482 return; 7483 7484 /* 7485 * We don't need the device_list_mutex here. This list is owned by the 7486 * transaction and the transaction must complete before the device is 7487 * released. 7488 */ 7489 mutex_lock(&trans->fs_info->chunk_mutex); 7490 list_for_each_entry_safe(curr, next, &trans->dev_update_list, 7491 post_commit_list) { 7492 list_del_init(&curr->post_commit_list); 7493 curr->commit_total_bytes = curr->disk_total_bytes; 7494 curr->commit_bytes_used = curr->bytes_used; 7495 } 7496 mutex_unlock(&trans->fs_info->chunk_mutex); 7497 } 7498 7499 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) 7500 { 7501 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7502 while (fs_devices) { 7503 fs_devices->fs_info = fs_info; 7504 fs_devices = fs_devices->seed; 7505 } 7506 } 7507 7508 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) 7509 { 7510 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7511 while (fs_devices) { 7512 fs_devices->fs_info = NULL; 7513 fs_devices = fs_devices->seed; 7514 } 7515 } 7516 7517 /* 7518 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
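 *
 * Illustrative use: for these mirrored profiles the raw bytes consumed on
 * disk are roughly logical_bytes * btrfs_bg_type_to_factor(flags); this
 * shortcut does not apply to RAID5/6, which add parity stripes instead of
 * full copies (see calc_stripe_length() above).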
7519 */ 7520 int btrfs_bg_type_to_factor(u64 flags) 7521 { 7522 const int index = btrfs_bg_flags_to_raid_index(flags); 7523 7524 return btrfs_raid_array[index].ncopies; 7525 } 7526 7527 7528 7529 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7530 u64 chunk_offset, u64 devid, 7531 u64 physical_offset, u64 physical_len) 7532 { 7533 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7534 struct extent_map *em; 7535 struct map_lookup *map; 7536 struct btrfs_device *dev; 7537 u64 stripe_len; 7538 bool found = false; 7539 int ret = 0; 7540 int i; 7541 7542 read_lock(&em_tree->lock); 7543 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7544 read_unlock(&em_tree->lock); 7545 7546 if (!em) { 7547 btrfs_err(fs_info, 7548 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7549 physical_offset, devid); 7550 ret = -EUCLEAN; 7551 goto out; 7552 } 7553 7554 map = em->map_lookup; 7555 stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); 7556 if (physical_len != stripe_len) { 7557 btrfs_err(fs_info, 7558 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7559 physical_offset, devid, em->start, physical_len, 7560 stripe_len); 7561 ret = -EUCLEAN; 7562 goto out; 7563 } 7564 7565 for (i = 0; i < map->num_stripes; i++) { 7566 if (map->stripes[i].dev->devid == devid && 7567 map->stripes[i].physical == physical_offset) { 7568 found = true; 7569 if (map->verified_stripes >= map->num_stripes) { 7570 btrfs_err(fs_info, 7571 "too many dev extents for chunk %llu found", 7572 em->start); 7573 ret = -EUCLEAN; 7574 goto out; 7575 } 7576 map->verified_stripes++; 7577 break; 7578 } 7579 } 7580 if (!found) { 7581 btrfs_err(fs_info, 7582 "dev extent physical offset %llu devid %llu has no corresponding chunk", 7583 physical_offset, devid); 7584 ret = -EUCLEAN; 7585 } 7586 7587 /* Make sure no dev extent is beyond device bondary */ 7588 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); 7589 if (!dev) { 7590 btrfs_err(fs_info, "failed to find devid %llu", devid); 7591 ret = -EUCLEAN; 7592 goto out; 7593 } 7594 7595 /* It's possible this device is a dummy for seed device */ 7596 if (dev->disk_total_bytes == 0) { 7597 dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, 7598 NULL, false); 7599 if (!dev) { 7600 btrfs_err(fs_info, "failed to find seed devid %llu", 7601 devid); 7602 ret = -EUCLEAN; 7603 goto out; 7604 } 7605 } 7606 7607 if (physical_offset + physical_len > dev->disk_total_bytes) { 7608 btrfs_err(fs_info, 7609 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 7610 devid, physical_offset, physical_len, 7611 dev->disk_total_bytes); 7612 ret = -EUCLEAN; 7613 goto out; 7614 } 7615 out: 7616 free_extent_map(em); 7617 return ret; 7618 } 7619 7620 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7621 { 7622 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7623 struct extent_map *em; 7624 struct rb_node *node; 7625 int ret = 0; 7626 7627 read_lock(&em_tree->lock); 7628 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 7629 em = rb_entry(node, struct extent_map, rb_node); 7630 if (em->map_lookup->num_stripes != 7631 em->map_lookup->verified_stripes) { 7632 btrfs_err(fs_info, 7633 "chunk %llu has missing dev extent, have %d expect %d", 7634 em->start, em->map_lookup->verified_stripes, 7635 em->map_lookup->num_stripes); 7636 ret = -EUCLEAN; 7637 goto out; 7638 } 7639 } 
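	/*
	 * Reaching this point means every chunk had map->verified_stripes
	 * pushed up to num_stripes by verify_one_dev_extent(). Illustrative
	 * failure (hypothetical): a 2-stripe chunk with only one matching
	 * DEV_EXTENT item would be left with verified_stripes == 1 and trip
	 * the check above with -EUCLEAN.
	 */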
7640 out: 7641 read_unlock(&em_tree->lock); 7642 return ret; 7643 } 7644 7645 /* 7646 * Ensure that all dev extents are mapped to correct chunk, otherwise 7647 * later chunk allocation/free would cause unexpected behavior. 7648 * 7649 * NOTE: This will iterate through the whole device tree, which should be of 7650 * the same size level as the chunk tree. This slightly increases mount time. 7651 */ 7652 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7653 { 7654 struct btrfs_path *path; 7655 struct btrfs_root *root = fs_info->dev_root; 7656 struct btrfs_key key; 7657 u64 prev_devid = 0; 7658 u64 prev_dev_ext_end = 0; 7659 int ret = 0; 7660 7661 key.objectid = 1; 7662 key.type = BTRFS_DEV_EXTENT_KEY; 7663 key.offset = 0; 7664 7665 path = btrfs_alloc_path(); 7666 if (!path) 7667 return -ENOMEM; 7668 7669 path->reada = READA_FORWARD; 7670 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7671 if (ret < 0) 7672 goto out; 7673 7674 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7675 ret = btrfs_next_item(root, path); 7676 if (ret < 0) 7677 goto out; 7678 /* No dev extents at all? Not good */ 7679 if (ret > 0) { 7680 ret = -EUCLEAN; 7681 goto out; 7682 } 7683 } 7684 while (1) { 7685 struct extent_buffer *leaf = path->nodes[0]; 7686 struct btrfs_dev_extent *dext; 7687 int slot = path->slots[0]; 7688 u64 chunk_offset; 7689 u64 physical_offset; 7690 u64 physical_len; 7691 u64 devid; 7692 7693 btrfs_item_key_to_cpu(leaf, &key, slot); 7694 if (key.type != BTRFS_DEV_EXTENT_KEY) 7695 break; 7696 devid = key.objectid; 7697 physical_offset = key.offset; 7698 7699 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 7700 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 7701 physical_len = btrfs_dev_extent_length(leaf, dext); 7702 7703 /* Check if this dev extent overlaps with the previous one */ 7704 if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 7705 btrfs_err(fs_info, 7706 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 7707 devid, physical_offset, prev_dev_ext_end); 7708 ret = -EUCLEAN; 7709 goto out; 7710 } 7711 7712 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 7713 physical_offset, physical_len); 7714 if (ret < 0) 7715 goto out; 7716 prev_devid = devid; 7717 prev_dev_ext_end = physical_offset + physical_len; 7718 7719 ret = btrfs_next_item(root, path); 7720 if (ret < 0) 7721 goto out; 7722 if (ret > 0) { 7723 ret = 0; 7724 break; 7725 } 7726 } 7727 7728 /* Ensure all chunks have corresponding dev extents */ 7729 ret = verify_chunk_dev_extent_mapping(fs_info); 7730 out: 7731 btrfs_free_path(path); 7732 return ret; 7733 } 7734 7735 /* 7736 * Check whether the given block group or device is pinned by any inode being 7737 * used as a swapfile. 7738 */ 7739 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 7740 { 7741 struct btrfs_swapfile_pin *sp; 7742 struct rb_node *node; 7743 7744 spin_lock(&fs_info->swapfile_pins_lock); 7745 node = fs_info->swapfile_pins.rb_node; 7746 while (node) { 7747 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 7748 if (ptr < sp->ptr) 7749 node = node->rb_left; 7750 else if (ptr > sp->ptr) 7751 node = node->rb_right; 7752 else 7753 break; 7754 } 7755 spin_unlock(&fs_info->swapfile_pins_lock); 7756 return node != NULL; 7757 } 7758