1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/slab.h> 9 #include <linux/ratelimit.h> 10 #include <linux/kthread.h> 11 #include <linux/semaphore.h> 12 #include <linux/uuid.h> 13 #include <linux/list_sort.h> 14 #include <linux/namei.h> 15 #include "misc.h" 16 #include "ctree.h" 17 #include "disk-io.h" 18 #include "transaction.h" 19 #include "volumes.h" 20 #include "raid56.h" 21 #include "rcu-string.h" 22 #include "dev-replace.h" 23 #include "sysfs.h" 24 #include "tree-checker.h" 25 #include "space-info.h" 26 #include "block-group.h" 27 #include "discard.h" 28 #include "zoned.h" 29 #include "fs.h" 30 #include "accessors.h" 31 #include "uuid-tree.h" 32 #include "ioctl.h" 33 #include "relocation.h" 34 #include "scrub.h" 35 #include "super.h" 36 #include "raid-stripe-tree.h" 37 38 #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 39 BTRFS_BLOCK_GROUP_RAID10 | \ 40 BTRFS_BLOCK_GROUP_RAID56_MASK) 41 42 struct btrfs_io_geometry { 43 u32 stripe_index; 44 u32 stripe_nr; 45 int mirror_num; 46 int num_stripes; 47 u64 stripe_offset; 48 u64 raid56_full_stripe_start; 49 int max_errors; 50 enum btrfs_map_op op; 51 }; 52 53 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 54 [BTRFS_RAID_RAID10] = { 55 .sub_stripes = 2, 56 .dev_stripes = 1, 57 .devs_max = 0, /* 0 == as many as possible */ 58 .devs_min = 2, 59 .tolerated_failures = 1, 60 .devs_increment = 2, 61 .ncopies = 2, 62 .nparity = 0, 63 .raid_name = "raid10", 64 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 65 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 66 }, 67 [BTRFS_RAID_RAID1] = { 68 .sub_stripes = 1, 69 .dev_stripes = 1, 70 .devs_max = 2, 71 .devs_min = 2, 72 .tolerated_failures = 1, 73 .devs_increment = 2, 74 .ncopies = 2, 75 .nparity = 0, 76 .raid_name = "raid1", 77 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 78 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 79 }, 80 [BTRFS_RAID_RAID1C3] = { 81 .sub_stripes = 1, 82 .dev_stripes = 1, 83 .devs_max = 3, 84 .devs_min = 3, 85 .tolerated_failures = 2, 86 .devs_increment = 3, 87 .ncopies = 3, 88 .nparity = 0, 89 .raid_name = "raid1c3", 90 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 91 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 92 }, 93 [BTRFS_RAID_RAID1C4] = { 94 .sub_stripes = 1, 95 .dev_stripes = 1, 96 .devs_max = 4, 97 .devs_min = 4, 98 .tolerated_failures = 3, 99 .devs_increment = 4, 100 .ncopies = 4, 101 .nparity = 0, 102 .raid_name = "raid1c4", 103 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 104 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 105 }, 106 [BTRFS_RAID_DUP] = { 107 .sub_stripes = 1, 108 .dev_stripes = 2, 109 .devs_max = 1, 110 .devs_min = 1, 111 .tolerated_failures = 0, 112 .devs_increment = 1, 113 .ncopies = 2, 114 .nparity = 0, 115 .raid_name = "dup", 116 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 117 .mindev_error = 0, 118 }, 119 [BTRFS_RAID_RAID0] = { 120 .sub_stripes = 1, 121 .dev_stripes = 1, 122 .devs_max = 0, 123 .devs_min = 1, 124 .tolerated_failures = 0, 125 .devs_increment = 1, 126 .ncopies = 1, 127 .nparity = 0, 128 .raid_name = "raid0", 129 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 130 .mindev_error = 0, 131 }, 132 [BTRFS_RAID_SINGLE] = { 133 .sub_stripes = 1, 134 .dev_stripes = 1, 135 .devs_max = 1, 136 .devs_min = 1, 137 .tolerated_failures = 0, 138 .devs_increment = 1, 139 .ncopies = 1, 140 .nparity = 0, 141 .raid_name = "single", 142 .bg_flag = 0, 143 .mindev_error = 0, 144 }, 145 [BTRFS_RAID_RAID5] = { 146 
.sub_stripes = 1, 147 .dev_stripes = 1, 148 .devs_max = 0, 149 .devs_min = 2, 150 .tolerated_failures = 1, 151 .devs_increment = 1, 152 .ncopies = 1, 153 .nparity = 1, 154 .raid_name = "raid5", 155 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 156 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 157 }, 158 [BTRFS_RAID_RAID6] = { 159 .sub_stripes = 1, 160 .dev_stripes = 1, 161 .devs_max = 0, 162 .devs_min = 3, 163 .tolerated_failures = 2, 164 .devs_increment = 1, 165 .ncopies = 1, 166 .nparity = 2, 167 .raid_name = "raid6", 168 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 169 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 170 }, 171 }; 172 173 /* 174 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 175 * can be used as index to access btrfs_raid_array[]. 176 */ 177 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 178 { 179 const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 180 181 if (!profile) 182 return BTRFS_RAID_SINGLE; 183 184 return BTRFS_BG_FLAG_TO_INDEX(profile); 185 } 186 187 const char *btrfs_bg_type_to_raid_name(u64 flags) 188 { 189 const int index = btrfs_bg_flags_to_raid_index(flags); 190 191 if (index >= BTRFS_NR_RAID_TYPES) 192 return NULL; 193 194 return btrfs_raid_array[index].raid_name; 195 } 196 197 int btrfs_nr_parity_stripes(u64 type) 198 { 199 enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); 200 201 return btrfs_raid_array[index].nparity; 202 } 203 204 /* 205 * Fill @buf with textual description of @bg_flags, no more than @size_buf 206 * bytes including terminating null byte. 207 */ 208 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 209 { 210 int i; 211 int ret; 212 char *bp = buf; 213 u64 flags = bg_flags; 214 u32 size_bp = size_buf; 215 216 if (!flags) { 217 strcpy(bp, "NONE"); 218 return; 219 } 220 221 #define DESCRIBE_FLAG(flag, desc) \ 222 do { \ 223 if (flags & (flag)) { \ 224 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 225 if (ret < 0 || ret >= size_bp) \ 226 goto out_overflow; \ 227 size_bp -= ret; \ 228 bp += ret; \ 229 flags &= ~(flag); \ 230 } \ 231 } while (0) 232 233 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 234 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 235 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 236 237 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 238 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 239 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 240 btrfs_raid_array[i].raid_name); 241 #undef DESCRIBE_FLAG 242 243 if (flags) { 244 ret = snprintf(bp, size_bp, "0x%llx|", flags); 245 size_bp -= ret; 246 } 247 248 if (size_bp < size_buf) 249 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 250 251 /* 252 * The text is trimmed, it's up to the caller to provide sufficiently 253 * large buffer 254 */ 255 out_overflow:; 256 } 257 258 static int init_first_rw_device(struct btrfs_trans_handle *trans); 259 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 260 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 261 262 /* 263 * Device locking 264 * ============== 265 * 266 * There are several mutexes that protect manipulation of devices and low-level 267 * structures like chunks but not block groups, extents or files 268 * 269 * uuid_mutex (global lock) 270 * ------------------------ 271 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 272 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 273 * device) or requested by the device= mount 
option 274 * 275 * the mutex can be very coarse and can cover long-running operations 276 * 277 * protects: updates to fs_devices counters like missing devices, rw devices, 278 * seeding, structure cloning, opening/closing devices at mount/umount time 279 * 280 * global::fs_devs - add, remove, updates to the global list 281 * 282 * does not protect: manipulation of the fs_devices::devices list in general 283 * but in mount context it could be used to exclude list modifications by eg. 284 * scan ioctl 285 * 286 * btrfs_device::name - renames (write side), read is RCU 287 * 288 * fs_devices::device_list_mutex (per-fs, with RCU) 289 * ------------------------------------------------ 290 * protects updates to fs_devices::devices, ie. adding and deleting 291 * 292 * simple list traversal with read-only actions can be done with RCU protection 293 * 294 * may be used to exclude some operations from running concurrently without any 295 * modifications to the list (see write_all_supers) 296 * 297 * Is not required at mount and close times, because our device list is 298 * protected by the uuid_mutex at that point. 299 * 300 * balance_mutex 301 * ------------- 302 * protects balance structures (status, state) and context accessed from 303 * several places (internally, ioctl) 304 * 305 * chunk_mutex 306 * ----------- 307 * protects chunks, adding or removing during allocation, trim or when a new 308 * device is added/removed. Additionally it also protects post_commit_list of 309 * individual devices, since they can be added to the transaction's 310 * post_commit_list only with chunk_mutex held. 311 * 312 * cleaner_mutex 313 * ------------- 314 * a big lock that is held by the cleaner thread and prevents running subvolume 315 * cleaning together with relocation or delayed iputs 316 * 317 * 318 * Lock nesting 319 * ============ 320 * 321 * uuid_mutex 322 * device_list_mutex 323 * chunk_mutex 324 * balance_mutex 325 * 326 * 327 * Exclusive operations 328 * ==================== 329 * 330 * Maintains the exclusivity of the following operations that apply to the 331 * whole filesystem and cannot run in parallel. 332 * 333 * - Balance (*) 334 * - Device add 335 * - Device remove 336 * - Device replace (*) 337 * - Resize 338 * 339 * The device operations (as above) can be in one of the following states: 340 * 341 * - Running state 342 * - Paused state 343 * - Completed state 344 * 345 * Only device operations marked with (*) can go into the Paused state for the 346 * following reasons: 347 * 348 * - ioctl (only Balance can be Paused through ioctl) 349 * - filesystem remounted as read-only 350 * - filesystem unmounted and mounted as read-only 351 * - system power-cycle and filesystem mounted as read-only 352 * - filesystem or device errors leading to forced read-only 353 * 354 * The status of exclusive operation is set and cleared atomically. 355 * During the course of Paused state, fs_info::exclusive_operation remains set. 356 * A device operation in Paused or Running state can be canceled or resumed 357 * either by ioctl (Balance only) or when remounted as read-write. 358 * The exclusive status is cleared when the device operation is canceled or 359 * completed. 360 */ 361 362 DEFINE_MUTEX(uuid_mutex); 363 static LIST_HEAD(fs_uuids); 364 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 365 { 366 return &fs_uuids; 367 } 368 369 /* 370 * Allocate new btrfs_fs_devices structure identified by a fsid. 
371 * 372 * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to 373 * fs_devices::metadata_fsid 374 * 375 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 376 * The returned struct is not linked onto any lists and can be destroyed with 377 * kfree() right away. 378 */ 379 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) 380 { 381 struct btrfs_fs_devices *fs_devs; 382 383 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 384 if (!fs_devs) 385 return ERR_PTR(-ENOMEM); 386 387 mutex_init(&fs_devs->device_list_mutex); 388 389 INIT_LIST_HEAD(&fs_devs->devices); 390 INIT_LIST_HEAD(&fs_devs->alloc_list); 391 INIT_LIST_HEAD(&fs_devs->fs_list); 392 INIT_LIST_HEAD(&fs_devs->seed_list); 393 394 if (fsid) { 395 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 396 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 397 } 398 399 return fs_devs; 400 } 401 402 static void btrfs_free_device(struct btrfs_device *device) 403 { 404 WARN_ON(!list_empty(&device->post_commit_list)); 405 rcu_string_free(device->name); 406 extent_io_tree_release(&device->alloc_state); 407 btrfs_destroy_dev_zone_info(device); 408 kfree(device); 409 } 410 411 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 412 { 413 struct btrfs_device *device; 414 415 WARN_ON(fs_devices->opened); 416 while (!list_empty(&fs_devices->devices)) { 417 device = list_entry(fs_devices->devices.next, 418 struct btrfs_device, dev_list); 419 list_del(&device->dev_list); 420 btrfs_free_device(device); 421 } 422 kfree(fs_devices); 423 } 424 425 void __exit btrfs_cleanup_fs_uuids(void) 426 { 427 struct btrfs_fs_devices *fs_devices; 428 429 while (!list_empty(&fs_uuids)) { 430 fs_devices = list_entry(fs_uuids.next, 431 struct btrfs_fs_devices, fs_list); 432 list_del(&fs_devices->fs_list); 433 free_fs_devices(fs_devices); 434 } 435 } 436 437 static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, 438 const u8 *fsid, const u8 *metadata_fsid) 439 { 440 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) 441 return false; 442 443 if (!metadata_fsid) 444 return true; 445 446 if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) 447 return false; 448 449 return true; 450 } 451 452 static noinline struct btrfs_fs_devices *find_fsid( 453 const u8 *fsid, const u8 *metadata_fsid) 454 { 455 struct btrfs_fs_devices *fs_devices; 456 457 ASSERT(fsid); 458 459 /* Handle non-split brain cases */ 460 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 461 if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) 462 return fs_devices; 463 } 464 return NULL; 465 } 466 467 static int 468 btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, 469 int flush, struct file **bdev_file, 470 struct btrfs_super_block **disk_super) 471 { 472 struct block_device *bdev; 473 int ret; 474 475 *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); 476 477 if (IS_ERR(*bdev_file)) { 478 ret = PTR_ERR(*bdev_file); 479 goto error; 480 } 481 bdev = file_bdev(*bdev_file); 482 483 if (flush) 484 sync_blockdev(bdev); 485 if (holder) { 486 ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); 487 if (ret) { 488 fput(*bdev_file); 489 goto error; 490 } 491 } 492 invalidate_bdev(bdev); 493 *disk_super = btrfs_read_dev_super(bdev); 494 if (IS_ERR(*disk_super)) { 495 ret = PTR_ERR(*disk_super); 496 fput(*bdev_file); 497 goto error; 498 } 499 500 return 0; 501 502 error: 503 *disk_super = NULL; 504 *bdev_file = NULL; 505 return ret; 506 } 507 508 
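/*
 * Illustrative sketch (not part of the original file): the expected calling
 * pattern for btrfs_get_bdev_and_sb() above. On success the caller owns both
 * the block device file reference and the super block page and must release
 * them with fput() and btrfs_release_disk_super(), mirroring the cleanup done
 * by btrfs_open_one_device() below. The helper name is hypothetical.
 */
static int __maybe_unused example_peek_super(const char *path, blk_mode_t flags,
					     void *holder)
{
	struct file *bdev_file;
	struct btrfs_super_block *disk_super;
	int ret;

	ret = btrfs_get_bdev_and_sb(path, flags, holder, 1, &bdev_file,
				    &disk_super);
	if (ret)
		return ret;

	/* ... inspect disk_super, e.g. btrfs_super_generation(disk_super) ... */

	btrfs_release_disk_super(disk_super);
	fput(bdev_file);
	return 0;
}
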
/*
 * Search and remove all stale devices (which are not mounted). When both
 * inputs are NULL, it will search and release all stale devices.
 *
 * @devt:		Optional. When provided, it will release only unmounted
 *			devices matching this devt.
 * @skip_device:	Optional. Will skip this device when searching for the
 *			stale devices.
 *
 * Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret;
	bool freed = false;

	lockdep_assert_held(&uuid_mutex);

	/* Return good status if there is no instance of devt. */
	ret = 0;
	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				if (devt)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			freed = true;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	/* If there is at least one freed device return 0. */
	if (freed)
		return 0;

	return ret;
}

static struct btrfs_fs_devices *find_fsid_by_device(
					struct btrfs_super_block *disk_super,
					dev_t devt, bool *same_fsid_diff_dev)
{
	struct btrfs_fs_devices *fsid_fs_devices;
	struct btrfs_fs_devices *devt_fs_devices;
	const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
					BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool found_by_devt = false;

	/* Find the fs_device by the usual method, if found use it. */
	fsid_fs_devices = find_fsid(disk_super->fsid,
			has_metadata_uuid ? disk_super->metadata_uuid : NULL);

	/* The temp_fsid feature is supported only with single device filesystem. */
	if (btrfs_super_num_devices(disk_super) != 1)
		return fsid_fs_devices;

	/*
	 * A seed device is an integral component of the sprout device, which
	 * functions as a multi-device filesystem. So, the temp-fsid feature is
	 * not supported.
	 */
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
		return fsid_fs_devices;

	/* Try to find a fs_devices by matching devt. */
	list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
		struct btrfs_device *device;

		list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
			if (device->devt == devt) {
				found_by_devt = true;
				break;
			}
		}
		if (found_by_devt)
			break;
	}

	if (found_by_devt) {
		/* Existing device. */
		if (fsid_fs_devices == NULL) {
			if (devt_fs_devices->opened == 0) {
				/* Stale device. */
				return NULL;
			} else {
				/* temp_fsid is mounting a subvol. */
				return devt_fs_devices;
			}
		} else {
			/* Regular or temp_fsid device mounting a subvol. */
			return devt_fs_devices;
		}
	} else {
		/* New device.
*/ 626 if (fsid_fs_devices == NULL) { 627 return NULL; 628 } else { 629 /* sb::fsid is already used create a new temp_fsid. */ 630 *same_fsid_diff_dev = true; 631 return NULL; 632 } 633 } 634 635 /* Not reached. */ 636 } 637 638 /* 639 * This is only used on mount, and we are protected from competing things 640 * messing with our fs_devices by the uuid_mutex, thus we do not need the 641 * fs_devices->device_list_mutex here. 642 */ 643 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 644 struct btrfs_device *device, blk_mode_t flags, 645 void *holder) 646 { 647 struct file *bdev_file; 648 struct btrfs_super_block *disk_super; 649 u64 devid; 650 int ret; 651 652 if (device->bdev) 653 return -EINVAL; 654 if (!device->name) 655 return -EINVAL; 656 657 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 658 &bdev_file, &disk_super); 659 if (ret) 660 return ret; 661 662 devid = btrfs_stack_device_id(&disk_super->dev_item); 663 if (devid != device->devid) 664 goto error_free_page; 665 666 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 667 goto error_free_page; 668 669 device->generation = btrfs_super_generation(disk_super); 670 671 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 672 if (btrfs_super_incompat_flags(disk_super) & 673 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 674 pr_err( 675 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 676 goto error_free_page; 677 } 678 679 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 680 fs_devices->seeding = true; 681 } else { 682 if (bdev_read_only(file_bdev(bdev_file))) 683 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 684 else 685 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 686 } 687 688 if (!bdev_nonrot(file_bdev(bdev_file))) 689 fs_devices->rotating = true; 690 691 if (bdev_max_discard_sectors(file_bdev(bdev_file))) 692 fs_devices->discardable = true; 693 694 device->bdev_file = bdev_file; 695 device->bdev = file_bdev(bdev_file); 696 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 697 698 if (device->devt != device->bdev->bd_dev) { 699 btrfs_warn(NULL, 700 "device %s maj:min changed from %d:%d to %d:%d", 701 device->name->str, MAJOR(device->devt), 702 MINOR(device->devt), MAJOR(device->bdev->bd_dev), 703 MINOR(device->bdev->bd_dev)); 704 705 device->devt = device->bdev->bd_dev; 706 } 707 708 fs_devices->open_devices++; 709 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 710 device->devid != BTRFS_DEV_REPLACE_DEVID) { 711 fs_devices->rw_devices++; 712 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 713 } 714 btrfs_release_disk_super(disk_super); 715 716 return 0; 717 718 error_free_page: 719 btrfs_release_disk_super(disk_super); 720 fput(bdev_file); 721 722 return -EINVAL; 723 } 724 725 const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) 726 { 727 bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & 728 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 729 730 return has_metadata_uuid ? 
sb->metadata_uuid : sb->fsid; 731 } 732 733 /* 734 * Add new device to list of registered devices 735 * 736 * Returns: 737 * device pointer which was just added or updated when successful 738 * error pointer when failed 739 */ 740 static noinline struct btrfs_device *device_list_add(const char *path, 741 struct btrfs_super_block *disk_super, 742 bool *new_device_added) 743 { 744 struct btrfs_device *device; 745 struct btrfs_fs_devices *fs_devices = NULL; 746 struct rcu_string *name; 747 u64 found_transid = btrfs_super_generation(disk_super); 748 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 749 dev_t path_devt; 750 int error; 751 bool same_fsid_diff_dev = false; 752 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 753 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 754 755 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { 756 btrfs_err(NULL, 757 "device %s has incomplete metadata_uuid change, please use btrfstune to complete", 758 path); 759 return ERR_PTR(-EAGAIN); 760 } 761 762 error = lookup_bdev(path, &path_devt); 763 if (error) { 764 btrfs_err(NULL, "failed to lookup block device for path %s: %d", 765 path, error); 766 return ERR_PTR(error); 767 } 768 769 fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); 770 771 if (!fs_devices) { 772 fs_devices = alloc_fs_devices(disk_super->fsid); 773 if (IS_ERR(fs_devices)) 774 return ERR_CAST(fs_devices); 775 776 if (has_metadata_uuid) 777 memcpy(fs_devices->metadata_uuid, 778 disk_super->metadata_uuid, BTRFS_FSID_SIZE); 779 780 if (same_fsid_diff_dev) { 781 generate_random_uuid(fs_devices->fsid); 782 fs_devices->temp_fsid = true; 783 pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", 784 path, MAJOR(path_devt), MINOR(path_devt), 785 fs_devices->fsid); 786 } 787 788 mutex_lock(&fs_devices->device_list_mutex); 789 list_add(&fs_devices->fs_list, &fs_uuids); 790 791 device = NULL; 792 } else { 793 struct btrfs_dev_lookup_args args = { 794 .devid = devid, 795 .uuid = disk_super->dev_item.uuid, 796 }; 797 798 mutex_lock(&fs_devices->device_list_mutex); 799 device = btrfs_find_device(fs_devices, &args); 800 801 if (found_transid > fs_devices->latest_generation) { 802 memcpy(fs_devices->fsid, disk_super->fsid, 803 BTRFS_FSID_SIZE); 804 memcpy(fs_devices->metadata_uuid, 805 btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); 806 } 807 } 808 809 if (!device) { 810 unsigned int nofs_flag; 811 812 if (fs_devices->opened) { 813 btrfs_err(NULL, 814 "device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", 815 path, MAJOR(path_devt), MINOR(path_devt), 816 fs_devices->fsid, current->comm, 817 task_pid_nr(current)); 818 mutex_unlock(&fs_devices->device_list_mutex); 819 return ERR_PTR(-EBUSY); 820 } 821 822 nofs_flag = memalloc_nofs_save(); 823 device = btrfs_alloc_device(NULL, &devid, 824 disk_super->dev_item.uuid, path); 825 memalloc_nofs_restore(nofs_flag); 826 if (IS_ERR(device)) { 827 mutex_unlock(&fs_devices->device_list_mutex); 828 /* we can safely leave the fs_devices entry around */ 829 return device; 830 } 831 832 device->devt = path_devt; 833 834 list_add_rcu(&device->dev_list, &fs_devices->devices); 835 fs_devices->num_devices++; 836 837 device->fs_devices = fs_devices; 838 *new_device_added = true; 839 840 if (disk_super->label[0]) 841 pr_info( 842 "BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n", 843 disk_super->label, devid, found_transid, path, 844 MAJOR(path_devt), MINOR(path_devt), 845 current->comm, 
				task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				MAJOR(path_devt), MINOR(path_devt),
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When the FS is already mounted.
		 * 1. If you are here and the device->name is NULL, that
		 *    means this device was missing at the time of the FS
		 *    mount.
		 * 2. If you are here and the device->name is different
		 *    from 'path', that means either
		 *      a. the same device disappeared and reappeared with
		 *         a different name, or
		 *      b. the missing-disk-which-was-replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above, but 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in cases 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away,
		 * and in case 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after the FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and you are
			 * here, there is more than one disk with the same
			 * uuid and devid. We keep the one with the larger
			 * generation number, or the last-in if the
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
				  path, found_transid, device->generation);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted.
		 *
		 * NOTE: the device->fs_info may not be reliable here, so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, btrfs_dev_name(device),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero the
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with the largest generation
	 * (as above).
940 */ 941 if (!fs_devices->opened) { 942 device->generation = found_transid; 943 fs_devices->latest_generation = max_t(u64, found_transid, 944 fs_devices->latest_generation); 945 } 946 947 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 948 949 mutex_unlock(&fs_devices->device_list_mutex); 950 return device; 951 } 952 953 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 954 { 955 struct btrfs_fs_devices *fs_devices; 956 struct btrfs_device *device; 957 struct btrfs_device *orig_dev; 958 int ret = 0; 959 960 lockdep_assert_held(&uuid_mutex); 961 962 fs_devices = alloc_fs_devices(orig->fsid); 963 if (IS_ERR(fs_devices)) 964 return fs_devices; 965 966 fs_devices->total_devices = orig->total_devices; 967 968 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 969 const char *dev_path = NULL; 970 971 /* 972 * This is ok to do without RCU read locked because we hold the 973 * uuid mutex so nothing we touch in here is going to disappear. 974 */ 975 if (orig_dev->name) 976 dev_path = orig_dev->name->str; 977 978 device = btrfs_alloc_device(NULL, &orig_dev->devid, 979 orig_dev->uuid, dev_path); 980 if (IS_ERR(device)) { 981 ret = PTR_ERR(device); 982 goto error; 983 } 984 985 if (orig_dev->zone_info) { 986 struct btrfs_zoned_device_info *zone_info; 987 988 zone_info = btrfs_clone_dev_zone_info(orig_dev); 989 if (!zone_info) { 990 btrfs_free_device(device); 991 ret = -ENOMEM; 992 goto error; 993 } 994 device->zone_info = zone_info; 995 } 996 997 list_add(&device->dev_list, &fs_devices->devices); 998 device->fs_devices = fs_devices; 999 fs_devices->num_devices++; 1000 } 1001 return fs_devices; 1002 error: 1003 free_fs_devices(fs_devices); 1004 return ERR_PTR(ret); 1005 } 1006 1007 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1008 struct btrfs_device **latest_dev) 1009 { 1010 struct btrfs_device *device, *next; 1011 1012 /* This is the initialized path, it is safe to release the devices. */ 1013 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 1014 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1015 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1016 &device->dev_state) && 1017 !test_bit(BTRFS_DEV_STATE_MISSING, 1018 &device->dev_state) && 1019 (!*latest_dev || 1020 device->generation > (*latest_dev)->generation)) { 1021 *latest_dev = device; 1022 } 1023 continue; 1024 } 1025 1026 /* 1027 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1028 * in btrfs_init_dev_replace() so just continue. 1029 */ 1030 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1031 continue; 1032 1033 if (device->bdev_file) { 1034 fput(device->bdev_file); 1035 device->bdev = NULL; 1036 device->bdev_file = NULL; 1037 fs_devices->open_devices--; 1038 } 1039 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1040 list_del_init(&device->dev_alloc_list); 1041 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1042 fs_devices->rw_devices--; 1043 } 1044 list_del_init(&device->dev_list); 1045 fs_devices->num_devices--; 1046 btrfs_free_device(device); 1047 } 1048 1049 } 1050 1051 /* 1052 * After we have read the system tree and know devids belonging to this 1053 * filesystem, remove the device which does not belong there. 
1054 */ 1055 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 1056 { 1057 struct btrfs_device *latest_dev = NULL; 1058 struct btrfs_fs_devices *seed_dev; 1059 1060 mutex_lock(&uuid_mutex); 1061 __btrfs_free_extra_devids(fs_devices, &latest_dev); 1062 1063 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1064 __btrfs_free_extra_devids(seed_dev, &latest_dev); 1065 1066 fs_devices->latest_dev = latest_dev; 1067 1068 mutex_unlock(&uuid_mutex); 1069 } 1070 1071 static void btrfs_close_bdev(struct btrfs_device *device) 1072 { 1073 if (!device->bdev) 1074 return; 1075 1076 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1077 sync_blockdev(device->bdev); 1078 invalidate_bdev(device->bdev); 1079 } 1080 1081 fput(device->bdev_file); 1082 } 1083 1084 static void btrfs_close_one_device(struct btrfs_device *device) 1085 { 1086 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1087 1088 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1089 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1090 list_del_init(&device->dev_alloc_list); 1091 fs_devices->rw_devices--; 1092 } 1093 1094 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1095 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 1096 1097 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 1098 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 1099 fs_devices->missing_devices--; 1100 } 1101 1102 btrfs_close_bdev(device); 1103 if (device->bdev) { 1104 fs_devices->open_devices--; 1105 device->bdev = NULL; 1106 } 1107 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1108 btrfs_destroy_dev_zone_info(device); 1109 1110 device->fs_info = NULL; 1111 atomic_set(&device->dev_stats_ccnt, 0); 1112 extent_io_tree_release(&device->alloc_state); 1113 1114 /* 1115 * Reset the flush error record. We might have a transient flush error 1116 * in this mount, and if so we aborted the current transaction and set 1117 * the fs to an error state, guaranteeing no super blocks can be further 1118 * committed. However that error might be transient and if we unmount the 1119 * filesystem and mount it again, we should allow the mount to succeed 1120 * (btrfs_check_rw_degradable() should not fail) - if after mounting the 1121 * filesystem again we still get flush errors, then we will again abort 1122 * any transaction and set the error state, guaranteeing no commits of 1123 * unsafe super blocks. 
1124 */ 1125 device->last_flush_error = 0; 1126 1127 /* Verify the device is back in a pristine state */ 1128 WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1129 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1130 WARN_ON(!list_empty(&device->dev_alloc_list)); 1131 WARN_ON(!list_empty(&device->post_commit_list)); 1132 } 1133 1134 static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 1135 { 1136 struct btrfs_device *device, *tmp; 1137 1138 lockdep_assert_held(&uuid_mutex); 1139 1140 if (--fs_devices->opened > 0) 1141 return; 1142 1143 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1144 btrfs_close_one_device(device); 1145 1146 WARN_ON(fs_devices->open_devices); 1147 WARN_ON(fs_devices->rw_devices); 1148 fs_devices->opened = 0; 1149 fs_devices->seeding = false; 1150 fs_devices->fs_info = NULL; 1151 } 1152 1153 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1154 { 1155 LIST_HEAD(list); 1156 struct btrfs_fs_devices *tmp; 1157 1158 mutex_lock(&uuid_mutex); 1159 close_fs_devices(fs_devices); 1160 if (!fs_devices->opened) { 1161 list_splice_init(&fs_devices->seed_list, &list); 1162 1163 /* 1164 * If the struct btrfs_fs_devices is not assembled with any 1165 * other device, it can be re-initialized during the next mount 1166 * without the needing device-scan step. Therefore, it can be 1167 * fully freed. 1168 */ 1169 if (fs_devices->num_devices == 1) { 1170 list_del(&fs_devices->fs_list); 1171 free_fs_devices(fs_devices); 1172 } 1173 } 1174 1175 1176 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 1177 close_fs_devices(fs_devices); 1178 list_del(&fs_devices->seed_list); 1179 free_fs_devices(fs_devices); 1180 } 1181 mutex_unlock(&uuid_mutex); 1182 } 1183 1184 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1185 blk_mode_t flags, void *holder) 1186 { 1187 struct btrfs_device *device; 1188 struct btrfs_device *latest_dev = NULL; 1189 struct btrfs_device *tmp_device; 1190 int ret = 0; 1191 1192 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 1193 dev_list) { 1194 int ret2; 1195 1196 ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); 1197 if (ret2 == 0 && 1198 (!latest_dev || device->generation > latest_dev->generation)) { 1199 latest_dev = device; 1200 } else if (ret2 == -ENODATA) { 1201 fs_devices->num_devices--; 1202 list_del(&device->dev_list); 1203 btrfs_free_device(device); 1204 } 1205 if (ret == 0 && ret2 != 0) 1206 ret = ret2; 1207 } 1208 1209 if (fs_devices->open_devices == 0) { 1210 if (ret) 1211 return ret; 1212 return -EINVAL; 1213 } 1214 1215 fs_devices->opened = 1; 1216 fs_devices->latest_dev = latest_dev; 1217 fs_devices->total_rw_bytes = 0; 1218 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 1219 fs_devices->read_policy = BTRFS_READ_POLICY_PID; 1220 1221 return 0; 1222 } 1223 1224 static int devid_cmp(void *priv, const struct list_head *a, 1225 const struct list_head *b) 1226 { 1227 const struct btrfs_device *dev1, *dev2; 1228 1229 dev1 = list_entry(a, struct btrfs_device, dev_list); 1230 dev2 = list_entry(b, struct btrfs_device, dev_list); 1231 1232 if (dev1->devid < dev2->devid) 1233 return -1; 1234 else if (dev1->devid > dev2->devid) 1235 return 1; 1236 return 0; 1237 } 1238 1239 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1240 blk_mode_t flags, void *holder) 1241 { 1242 int ret; 1243 1244 lockdep_assert_held(&uuid_mutex); 1245 /* 1246 * The device_list_mutex cannot be taken here in case opening the 1247 
* underlying device takes further locks like open_mutex. 1248 * 1249 * We also don't need the lock here as this is called during mount and 1250 * exclusion is provided by uuid_mutex 1251 */ 1252 1253 if (fs_devices->opened) { 1254 fs_devices->opened++; 1255 ret = 0; 1256 } else { 1257 list_sort(NULL, &fs_devices->devices, devid_cmp); 1258 ret = open_fs_devices(fs_devices, flags, holder); 1259 } 1260 1261 return ret; 1262 } 1263 1264 void btrfs_release_disk_super(struct btrfs_super_block *super) 1265 { 1266 struct page *page = virt_to_page(super); 1267 1268 put_page(page); 1269 } 1270 1271 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1272 u64 bytenr, u64 bytenr_orig) 1273 { 1274 struct btrfs_super_block *disk_super; 1275 struct page *page; 1276 void *p; 1277 pgoff_t index; 1278 1279 /* make sure our super fits in the device */ 1280 if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) 1281 return ERR_PTR(-EINVAL); 1282 1283 /* make sure our super fits in the page */ 1284 if (sizeof(*disk_super) > PAGE_SIZE) 1285 return ERR_PTR(-EINVAL); 1286 1287 /* make sure our super doesn't straddle pages on disk */ 1288 index = bytenr >> PAGE_SHIFT; 1289 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1290 return ERR_PTR(-EINVAL); 1291 1292 /* pull in the page with our super */ 1293 page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); 1294 1295 if (IS_ERR(page)) 1296 return ERR_CAST(page); 1297 1298 p = page_address(page); 1299 1300 /* align our pointer to the offset of the super block */ 1301 disk_super = p + offset_in_page(bytenr); 1302 1303 if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1304 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1305 btrfs_release_disk_super(p); 1306 return ERR_PTR(-EINVAL); 1307 } 1308 1309 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1310 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 1311 1312 return disk_super; 1313 } 1314 1315 int btrfs_forget_devices(dev_t devt) 1316 { 1317 int ret; 1318 1319 mutex_lock(&uuid_mutex); 1320 ret = btrfs_free_stale_devices(devt, NULL); 1321 mutex_unlock(&uuid_mutex); 1322 1323 return ret; 1324 } 1325 1326 static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, 1327 const char *path, dev_t devt, 1328 bool mount_arg_dev) 1329 { 1330 struct btrfs_fs_devices *fs_devices; 1331 1332 /* 1333 * Do not skip device registration for mounted devices with matching 1334 * maj:min but different paths. Booting without initrd relies on 1335 * /dev/root initially, later replaced with the actual root device. 1336 * A successful scan ensures grub2-probe selects the correct device. 1337 */ 1338 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 1339 struct btrfs_device *device; 1340 1341 mutex_lock(&fs_devices->device_list_mutex); 1342 1343 if (!fs_devices->opened) { 1344 mutex_unlock(&fs_devices->device_list_mutex); 1345 continue; 1346 } 1347 1348 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1349 if (device->bdev && (device->bdev->bd_dev == devt) && 1350 strcmp(device->name->str, path) != 0) { 1351 mutex_unlock(&fs_devices->device_list_mutex); 1352 1353 /* Do not skip registration. */ 1354 return false; 1355 } 1356 } 1357 mutex_unlock(&fs_devices->device_list_mutex); 1358 } 1359 1360 if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && 1361 !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) 1362 return true; 1363 1364 return false; 1365 } 1366 1367 /* 1368 * Look for a btrfs signature on a device. 
This may be called out of the mount path 1369 * and we are not allowed to call set_blocksize during the scan. The superblock 1370 * is read via pagecache. 1371 * 1372 * With @mount_arg_dev it's a scan during mount time that will always register 1373 * the device or return an error. Multi-device and seeding devices are registered 1374 * in both cases. 1375 */ 1376 struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, 1377 bool mount_arg_dev) 1378 { 1379 struct btrfs_super_block *disk_super; 1380 bool new_device_added = false; 1381 struct btrfs_device *device = NULL; 1382 struct file *bdev_file; 1383 u64 bytenr; 1384 dev_t devt; 1385 int ret; 1386 1387 lockdep_assert_held(&uuid_mutex); 1388 1389 /* 1390 * Avoid an exclusive open here, as the systemd-udev may initiate the 1391 * device scan which may race with the user's mount or mkfs command, 1392 * resulting in failure. 1393 * Since the device scan is solely for reading purposes, there is no 1394 * need for an exclusive open. Additionally, the devices are read again 1395 * during the mount process. It is ok to get some inconsistent 1396 * values temporarily, as the device paths of the fsid are the only 1397 * required information for assembling the volume. 1398 */ 1399 bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); 1400 if (IS_ERR(bdev_file)) 1401 return ERR_CAST(bdev_file); 1402 1403 /* 1404 * We would like to check all the super blocks, but doing so would 1405 * allow a mount to succeed after a mkfs from a different filesystem. 1406 * Currently, recovery from a bad primary btrfs superblock is done 1407 * using the userspace command 'btrfs check --super'. 1408 */ 1409 ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); 1410 if (ret) { 1411 device = ERR_PTR(ret); 1412 goto error_bdev_put; 1413 } 1414 1415 disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, 1416 btrfs_sb_offset(0)); 1417 if (IS_ERR(disk_super)) { 1418 device = ERR_CAST(disk_super); 1419 goto error_bdev_put; 1420 } 1421 1422 devt = file_bdev(bdev_file)->bd_dev; 1423 if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { 1424 pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", 1425 path, MAJOR(devt), MINOR(devt)); 1426 1427 btrfs_free_stale_devices(devt, NULL); 1428 1429 device = NULL; 1430 goto free_disk_super; 1431 } 1432 1433 device = device_list_add(path, disk_super, &new_device_added); 1434 if (!IS_ERR(device) && new_device_added) 1435 btrfs_free_stale_devices(device->devt, device); 1436 1437 free_disk_super: 1438 btrfs_release_disk_super(disk_super); 1439 1440 error_bdev_put: 1441 fput(bdev_file); 1442 1443 return device; 1444 } 1445 1446 /* 1447 * Try to find a chunk that intersects [start, start + len] range and when one 1448 * such is found, record the end of it in *start 1449 */ 1450 static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1451 u64 len) 1452 { 1453 u64 physical_start, physical_end; 1454 1455 lockdep_assert_held(&device->fs_info->chunk_mutex); 1456 1457 if (find_first_extent_bit(&device->alloc_state, *start, 1458 &physical_start, &physical_end, 1459 CHUNK_ALLOCATED, NULL)) { 1460 1461 if (in_range(physical_start, *start, len) || 1462 in_range(*start, physical_start, 1463 physical_end + 1 - physical_start)) { 1464 *start = physical_end + 1; 1465 return true; 1466 } 1467 } 1468 return false; 1469 } 1470 1471 static u64 dev_extent_search_start(struct btrfs_device *device) 1472 { 1473 switch 
(device->fs_devices->chunk_alloc_policy) { 1474 case BTRFS_CHUNK_ALLOC_REGULAR: 1475 return BTRFS_DEVICE_RANGE_RESERVED; 1476 case BTRFS_CHUNK_ALLOC_ZONED: 1477 /* 1478 * We don't care about the starting region like regular 1479 * allocator, because we anyway use/reserve the first two zones 1480 * for superblock logging. 1481 */ 1482 return 0; 1483 default: 1484 BUG(); 1485 } 1486 } 1487 1488 static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 1489 u64 *hole_start, u64 *hole_size, 1490 u64 num_bytes) 1491 { 1492 u64 zone_size = device->zone_info->zone_size; 1493 u64 pos; 1494 int ret; 1495 bool changed = false; 1496 1497 ASSERT(IS_ALIGNED(*hole_start, zone_size)); 1498 1499 while (*hole_size > 0) { 1500 pos = btrfs_find_allocatable_zones(device, *hole_start, 1501 *hole_start + *hole_size, 1502 num_bytes); 1503 if (pos != *hole_start) { 1504 *hole_size = *hole_start + *hole_size - pos; 1505 *hole_start = pos; 1506 changed = true; 1507 if (*hole_size < num_bytes) 1508 break; 1509 } 1510 1511 ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 1512 1513 /* Range is ensured to be empty */ 1514 if (!ret) 1515 return changed; 1516 1517 /* Given hole range was invalid (outside of device) */ 1518 if (ret == -ERANGE) { 1519 *hole_start += *hole_size; 1520 *hole_size = 0; 1521 return true; 1522 } 1523 1524 *hole_start += zone_size; 1525 *hole_size -= zone_size; 1526 changed = true; 1527 } 1528 1529 return changed; 1530 } 1531 1532 /* 1533 * Check if specified hole is suitable for allocation. 1534 * 1535 * @device: the device which we have the hole 1536 * @hole_start: starting position of the hole 1537 * @hole_size: the size of the hole 1538 * @num_bytes: the size of the free space that we need 1539 * 1540 * This function may modify @hole_start and @hole_size to reflect the suitable 1541 * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1542 */ 1543 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1544 u64 *hole_size, u64 num_bytes) 1545 { 1546 bool changed = false; 1547 u64 hole_end = *hole_start + *hole_size; 1548 1549 for (;;) { 1550 /* 1551 * Check before we set max_hole_start, otherwise we could end up 1552 * sending back this offset anyway. 1553 */ 1554 if (contains_pending_extent(device, hole_start, *hole_size)) { 1555 if (hole_end >= *hole_start) 1556 *hole_size = hole_end - *hole_start; 1557 else 1558 *hole_size = 0; 1559 changed = true; 1560 } 1561 1562 switch (device->fs_devices->chunk_alloc_policy) { 1563 case BTRFS_CHUNK_ALLOC_REGULAR: 1564 /* No extra check */ 1565 break; 1566 case BTRFS_CHUNK_ALLOC_ZONED: 1567 if (dev_extent_hole_check_zoned(device, hole_start, 1568 hole_size, num_bytes)) { 1569 changed = true; 1570 /* 1571 * The changed hole can contain pending extent. 1572 * Loop again to check that. 1573 */ 1574 continue; 1575 } 1576 break; 1577 default: 1578 BUG(); 1579 } 1580 1581 break; 1582 } 1583 1584 return changed; 1585 } 1586 1587 /* 1588 * Find free space in the specified device. 1589 * 1590 * @device: the device which we search the free space in 1591 * @num_bytes: the size of the free space that we need 1592 * @search_start: the position from which to begin the search 1593 * @start: store the start of the free space. 1594 * @len: the size of the free space. 
that we find, or the size 1595 * of the max free space if we don't find suitable free space 1596 * 1597 * This does a pretty simple search, the expectation is that it is called very 1598 * infrequently and that a given device has a small number of extents. 1599 * 1600 * @start is used to store the start of the free space if we find. But if we 1601 * don't find suitable free space, it will be used to store the start position 1602 * of the max free space. 1603 * 1604 * @len is used to store the size of the free space that we find. 1605 * But if we don't find suitable free space, it is used to store the size of 1606 * the max free space. 1607 * 1608 * NOTE: This function will search *commit* root of device tree, and does extra 1609 * check to ensure dev extents are not double allocated. 1610 * This makes the function safe to allocate dev extents but may not report 1611 * correct usable device space, as device extent freed in current transaction 1612 * is not reported as available. 1613 */ 1614 static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1615 u64 *start, u64 *len) 1616 { 1617 struct btrfs_fs_info *fs_info = device->fs_info; 1618 struct btrfs_root *root = fs_info->dev_root; 1619 struct btrfs_key key; 1620 struct btrfs_dev_extent *dev_extent; 1621 struct btrfs_path *path; 1622 u64 search_start; 1623 u64 hole_size; 1624 u64 max_hole_start; 1625 u64 max_hole_size = 0; 1626 u64 extent_end; 1627 u64 search_end = device->total_bytes; 1628 int ret; 1629 int slot; 1630 struct extent_buffer *l; 1631 1632 search_start = dev_extent_search_start(device); 1633 max_hole_start = search_start; 1634 1635 WARN_ON(device->zone_info && 1636 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1637 1638 path = btrfs_alloc_path(); 1639 if (!path) { 1640 ret = -ENOMEM; 1641 goto out; 1642 } 1643 again: 1644 if (search_start >= search_end || 1645 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1646 ret = -ENOSPC; 1647 goto out; 1648 } 1649 1650 path->reada = READA_FORWARD; 1651 path->search_commit_root = 1; 1652 path->skip_locking = 1; 1653 1654 key.objectid = device->devid; 1655 key.offset = search_start; 1656 key.type = BTRFS_DEV_EXTENT_KEY; 1657 1658 ret = btrfs_search_backwards(root, &key, path); 1659 if (ret < 0) 1660 goto out; 1661 1662 while (search_start < search_end) { 1663 l = path->nodes[0]; 1664 slot = path->slots[0]; 1665 if (slot >= btrfs_header_nritems(l)) { 1666 ret = btrfs_next_leaf(root, path); 1667 if (ret == 0) 1668 continue; 1669 if (ret < 0) 1670 goto out; 1671 1672 break; 1673 } 1674 btrfs_item_key_to_cpu(l, &key, slot); 1675 1676 if (key.objectid < device->devid) 1677 goto next; 1678 1679 if (key.objectid > device->devid) 1680 break; 1681 1682 if (key.type != BTRFS_DEV_EXTENT_KEY) 1683 goto next; 1684 1685 if (key.offset > search_end) 1686 break; 1687 1688 if (key.offset > search_start) { 1689 hole_size = key.offset - search_start; 1690 dev_extent_hole_check(device, &search_start, &hole_size, 1691 num_bytes); 1692 1693 if (hole_size > max_hole_size) { 1694 max_hole_start = search_start; 1695 max_hole_size = hole_size; 1696 } 1697 1698 /* 1699 * If this free space is greater than which we need, 1700 * it must be the max free space that we have found 1701 * until now, so max_hole_start must point to the start 1702 * of this free space and the length of this free space 1703 * is stored in max_hole_size. Thus, we return 1704 * max_hole_start and max_hole_size and go back to the 1705 * caller. 
1706 */ 1707 if (hole_size >= num_bytes) { 1708 ret = 0; 1709 goto out; 1710 } 1711 } 1712 1713 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1714 extent_end = key.offset + btrfs_dev_extent_length(l, 1715 dev_extent); 1716 if (extent_end > search_start) 1717 search_start = extent_end; 1718 next: 1719 path->slots[0]++; 1720 cond_resched(); 1721 } 1722 1723 /* 1724 * At this point, search_start should be the end of 1725 * allocated dev extents, and when shrinking the device, 1726 * search_end may be smaller than search_start. 1727 */ 1728 if (search_end > search_start) { 1729 hole_size = search_end - search_start; 1730 if (dev_extent_hole_check(device, &search_start, &hole_size, 1731 num_bytes)) { 1732 btrfs_release_path(path); 1733 goto again; 1734 } 1735 1736 if (hole_size > max_hole_size) { 1737 max_hole_start = search_start; 1738 max_hole_size = hole_size; 1739 } 1740 } 1741 1742 /* See above. */ 1743 if (max_hole_size < num_bytes) 1744 ret = -ENOSPC; 1745 else 1746 ret = 0; 1747 1748 ASSERT(max_hole_start + max_hole_size <= search_end); 1749 out: 1750 btrfs_free_path(path); 1751 *start = max_hole_start; 1752 if (len) 1753 *len = max_hole_size; 1754 return ret; 1755 } 1756 1757 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1758 struct btrfs_device *device, 1759 u64 start, u64 *dev_extent_len) 1760 { 1761 struct btrfs_fs_info *fs_info = device->fs_info; 1762 struct btrfs_root *root = fs_info->dev_root; 1763 int ret; 1764 struct btrfs_path *path; 1765 struct btrfs_key key; 1766 struct btrfs_key found_key; 1767 struct extent_buffer *leaf = NULL; 1768 struct btrfs_dev_extent *extent = NULL; 1769 1770 path = btrfs_alloc_path(); 1771 if (!path) 1772 return -ENOMEM; 1773 1774 key.objectid = device->devid; 1775 key.offset = start; 1776 key.type = BTRFS_DEV_EXTENT_KEY; 1777 again: 1778 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1779 if (ret > 0) { 1780 ret = btrfs_previous_item(root, path, key.objectid, 1781 BTRFS_DEV_EXTENT_KEY); 1782 if (ret) 1783 goto out; 1784 leaf = path->nodes[0]; 1785 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1786 extent = btrfs_item_ptr(leaf, path->slots[0], 1787 struct btrfs_dev_extent); 1788 BUG_ON(found_key.offset > start || found_key.offset + 1789 btrfs_dev_extent_length(leaf, extent) < start); 1790 key = found_key; 1791 btrfs_release_path(path); 1792 goto again; 1793 } else if (ret == 0) { 1794 leaf = path->nodes[0]; 1795 extent = btrfs_item_ptr(leaf, path->slots[0], 1796 struct btrfs_dev_extent); 1797 } else { 1798 goto out; 1799 } 1800 1801 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1802 1803 ret = btrfs_del_item(trans, root, path); 1804 if (ret == 0) 1805 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1806 out: 1807 btrfs_free_path(path); 1808 return ret; 1809 } 1810 1811 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1812 { 1813 struct rb_node *n; 1814 u64 ret = 0; 1815 1816 read_lock(&fs_info->mapping_tree_lock); 1817 n = rb_last(&fs_info->mapping_tree.rb_root); 1818 if (n) { 1819 struct btrfs_chunk_map *map; 1820 1821 map = rb_entry(n, struct btrfs_chunk_map, rb_node); 1822 ret = map->start + map->chunk_len; 1823 } 1824 read_unlock(&fs_info->mapping_tree_lock); 1825 1826 return ret; 1827 } 1828 1829 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1830 u64 *devid_ret) 1831 { 1832 int ret; 1833 struct btrfs_key key; 1834 struct btrfs_key found_key; 1835 struct btrfs_path *path; 1836 1837 path = btrfs_alloc_path(); 1838 if (!path) 1839 
return -ENOMEM; 1840 1841 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1842 key.type = BTRFS_DEV_ITEM_KEY; 1843 key.offset = (u64)-1; 1844 1845 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1846 if (ret < 0) 1847 goto error; 1848 1849 if (ret == 0) { 1850 /* Corruption */ 1851 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1852 ret = -EUCLEAN; 1853 goto error; 1854 } 1855 1856 ret = btrfs_previous_item(fs_info->chunk_root, path, 1857 BTRFS_DEV_ITEMS_OBJECTID, 1858 BTRFS_DEV_ITEM_KEY); 1859 if (ret) { 1860 *devid_ret = 1; 1861 } else { 1862 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1863 path->slots[0]); 1864 *devid_ret = found_key.offset + 1; 1865 } 1866 ret = 0; 1867 error: 1868 btrfs_free_path(path); 1869 return ret; 1870 } 1871 1872 /* 1873 * the device information is stored in the chunk root 1874 * the btrfs_device struct should be fully filled in 1875 */ 1876 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1877 struct btrfs_device *device) 1878 { 1879 int ret; 1880 struct btrfs_path *path; 1881 struct btrfs_dev_item *dev_item; 1882 struct extent_buffer *leaf; 1883 struct btrfs_key key; 1884 unsigned long ptr; 1885 1886 path = btrfs_alloc_path(); 1887 if (!path) 1888 return -ENOMEM; 1889 1890 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1891 key.type = BTRFS_DEV_ITEM_KEY; 1892 key.offset = device->devid; 1893 1894 btrfs_reserve_chunk_metadata(trans, true); 1895 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1896 &key, sizeof(*dev_item)); 1897 btrfs_trans_release_chunk_metadata(trans); 1898 if (ret) 1899 goto out; 1900 1901 leaf = path->nodes[0]; 1902 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1903 1904 btrfs_set_device_id(leaf, dev_item, device->devid); 1905 btrfs_set_device_generation(leaf, dev_item, 0); 1906 btrfs_set_device_type(leaf, dev_item, device->type); 1907 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1908 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1909 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1910 btrfs_set_device_total_bytes(leaf, dev_item, 1911 btrfs_device_get_disk_total_bytes(device)); 1912 btrfs_set_device_bytes_used(leaf, dev_item, 1913 btrfs_device_get_bytes_used(device)); 1914 btrfs_set_device_group(leaf, dev_item, 0); 1915 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1916 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1917 btrfs_set_device_start_offset(leaf, dev_item, 0); 1918 1919 ptr = btrfs_device_uuid(dev_item); 1920 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1921 ptr = btrfs_device_fsid(dev_item); 1922 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1923 ptr, BTRFS_FSID_SIZE); 1924 btrfs_mark_buffer_dirty(trans, leaf); 1925 1926 ret = 0; 1927 out: 1928 btrfs_free_path(path); 1929 return ret; 1930 } 1931 1932 /* 1933 * Function to update ctime/mtime for a given device path. 1934 * Mainly used for ctime/mtime based probe like libblkid. 1935 * 1936 * We don't care about errors here, this is just to be kind to userspace. 
1937 */ 1938 static void update_dev_time(const char *device_path) 1939 { 1940 struct path path; 1941 int ret; 1942 1943 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1944 if (ret) 1945 return; 1946 1947 inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); 1948 path_put(&path); 1949 } 1950 1951 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, 1952 struct btrfs_device *device) 1953 { 1954 struct btrfs_root *root = device->fs_info->chunk_root; 1955 int ret; 1956 struct btrfs_path *path; 1957 struct btrfs_key key; 1958 1959 path = btrfs_alloc_path(); 1960 if (!path) 1961 return -ENOMEM; 1962 1963 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1964 key.type = BTRFS_DEV_ITEM_KEY; 1965 key.offset = device->devid; 1966 1967 btrfs_reserve_chunk_metadata(trans, false); 1968 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1969 btrfs_trans_release_chunk_metadata(trans); 1970 if (ret) { 1971 if (ret > 0) 1972 ret = -ENOENT; 1973 goto out; 1974 } 1975 1976 ret = btrfs_del_item(trans, root, path); 1977 out: 1978 btrfs_free_path(path); 1979 return ret; 1980 } 1981 1982 /* 1983 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1984 * filesystem. It's up to the caller to adjust that number regarding eg. device 1985 * replace. 1986 */ 1987 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1988 u64 num_devices) 1989 { 1990 u64 all_avail; 1991 unsigned seq; 1992 int i; 1993 1994 do { 1995 seq = read_seqbegin(&fs_info->profiles_lock); 1996 1997 all_avail = fs_info->avail_data_alloc_bits | 1998 fs_info->avail_system_alloc_bits | 1999 fs_info->avail_metadata_alloc_bits; 2000 } while (read_seqretry(&fs_info->profiles_lock, seq)); 2001 2002 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2003 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 2004 continue; 2005 2006 if (num_devices < btrfs_raid_array[i].devs_min) 2007 return btrfs_raid_array[i].mindev_error; 2008 } 2009 2010 return 0; 2011 } 2012 2013 static struct btrfs_device * btrfs_find_next_active_device( 2014 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 2015 { 2016 struct btrfs_device *next_device; 2017 2018 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 2019 if (next_device != device && 2020 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 2021 && next_device->bdev) 2022 return next_device; 2023 } 2024 2025 return NULL; 2026 } 2027 2028 /* 2029 * Helper function to check if the given device is part of s_bdev / latest_dev 2030 * and replace it with the provided or the next active device, in the context 2031 * where this function called, there should be always be another device (or 2032 * this_dev) which is active. 2033 */ 2034 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 2035 struct btrfs_device *next_device) 2036 { 2037 struct btrfs_fs_info *fs_info = device->fs_info; 2038 2039 if (!next_device) 2040 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2041 device); 2042 ASSERT(next_device); 2043 2044 if (fs_info->sb->s_bdev && 2045 (fs_info->sb->s_bdev == device->bdev)) 2046 fs_info->sb->s_bdev = next_device->bdev; 2047 2048 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2049 fs_info->fs_devices->latest_dev = next_device; 2050 } 2051 2052 /* 2053 * Return btrfs_fs_devices::num_devices excluding the device that's being 2054 * currently replaced. 
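 *
 * The dev_replace rwsem is taken for reading below only to check whether a
 * replace is currently running.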
2055 */ 2056 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2057 { 2058 u64 num_devices = fs_info->fs_devices->num_devices; 2059 2060 down_read(&fs_info->dev_replace.rwsem); 2061 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2062 ASSERT(num_devices > 1); 2063 num_devices--; 2064 } 2065 up_read(&fs_info->dev_replace.rwsem); 2066 2067 return num_devices; 2068 } 2069 2070 static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, 2071 struct block_device *bdev, int copy_num) 2072 { 2073 struct btrfs_super_block *disk_super; 2074 const size_t len = sizeof(disk_super->magic); 2075 const u64 bytenr = btrfs_sb_offset(copy_num); 2076 int ret; 2077 2078 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); 2079 if (IS_ERR(disk_super)) 2080 return; 2081 2082 memset(&disk_super->magic, 0, len); 2083 folio_mark_dirty(virt_to_folio(disk_super)); 2084 btrfs_release_disk_super(disk_super); 2085 2086 ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); 2087 if (ret) 2088 btrfs_warn(fs_info, "error clearing superblock number %d (%d)", 2089 copy_num, ret); 2090 } 2091 2092 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device) 2093 { 2094 int copy_num; 2095 struct block_device *bdev = device->bdev; 2096 2097 if (!bdev) 2098 return; 2099 2100 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2101 if (bdev_is_zoned(bdev)) 2102 btrfs_reset_sb_log_zones(bdev, copy_num); 2103 else 2104 btrfs_scratch_superblock(fs_info, bdev, copy_num); 2105 } 2106 2107 /* Notify udev that device has changed */ 2108 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2109 2110 /* Update ctime/mtime for device path for libblkid */ 2111 update_dev_time(device->name->str); 2112 } 2113 2114 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2115 struct btrfs_dev_lookup_args *args, 2116 struct file **bdev_file) 2117 { 2118 struct btrfs_trans_handle *trans; 2119 struct btrfs_device *device; 2120 struct btrfs_fs_devices *cur_devices; 2121 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2122 u64 num_devices; 2123 int ret = 0; 2124 2125 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2126 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); 2127 return -EINVAL; 2128 } 2129 2130 /* 2131 * The device list in fs_devices is accessed without locks (neither 2132 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2133 * filesystem and another device rm cannot run. 
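 *
 * The RAID constraint check below is passed num_devices - 1, as the device
 * being removed must not be counted among the remaining devices.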
2134 */ 2135 num_devices = btrfs_num_devices(fs_info); 2136 2137 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2138 if (ret) 2139 return ret; 2140 2141 device = btrfs_find_device(fs_info->fs_devices, args); 2142 if (!device) { 2143 if (args->missing) 2144 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2145 else 2146 ret = -ENOENT; 2147 return ret; 2148 } 2149 2150 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2151 btrfs_warn_in_rcu(fs_info, 2152 "cannot remove device %s (devid %llu) due to active swapfile", 2153 btrfs_dev_name(device), device->devid); 2154 return -ETXTBSY; 2155 } 2156 2157 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 2158 return BTRFS_ERROR_DEV_TGT_REPLACE; 2159 2160 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2161 fs_info->fs_devices->rw_devices == 1) 2162 return BTRFS_ERROR_DEV_ONLY_WRITABLE; 2163 2164 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2165 mutex_lock(&fs_info->chunk_mutex); 2166 list_del_init(&device->dev_alloc_list); 2167 device->fs_devices->rw_devices--; 2168 mutex_unlock(&fs_info->chunk_mutex); 2169 } 2170 2171 ret = btrfs_shrink_device(device, 0); 2172 if (ret) 2173 goto error_undo; 2174 2175 trans = btrfs_start_transaction(fs_info->chunk_root, 0); 2176 if (IS_ERR(trans)) { 2177 ret = PTR_ERR(trans); 2178 goto error_undo; 2179 } 2180 2181 ret = btrfs_rm_dev_item(trans, device); 2182 if (ret) { 2183 /* Any error in dev item removal is critical */ 2184 btrfs_crit(fs_info, 2185 "failed to remove device item for devid %llu: %d", 2186 device->devid, ret); 2187 btrfs_abort_transaction(trans, ret); 2188 btrfs_end_transaction(trans); 2189 return ret; 2190 } 2191 2192 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2193 btrfs_scrub_cancel_dev(device); 2194 2195 /* 2196 * the device list mutex makes sure that we don't change 2197 * the device list while someone else is writing out all 2198 * the device supers. Whoever is writing all supers, should 2199 * lock the device list mutex before getting the number of 2200 * devices in the super block (super_copy). Conversely, 2201 * whoever updates the number of devices in the super block 2202 * (super_copy) should hold the device list mutex. 2203 */ 2204 2205 /* 2206 * In normal cases the cur_devices == fs_devices. But in case 2207 * of deleting a seed device, the cur_devices should point to 2208 * its own fs_devices listed under the fs_devices->seed_list. 2209 */ 2210 cur_devices = device->fs_devices; 2211 mutex_lock(&fs_devices->device_list_mutex); 2212 list_del_rcu(&device->dev_list); 2213 2214 cur_devices->num_devices--; 2215 cur_devices->total_devices--; 2216 /* Update total_devices of the parent fs_devices if it's seed */ 2217 if (cur_devices != fs_devices) 2218 fs_devices->total_devices--; 2219 2220 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2221 cur_devices->missing_devices--; 2222 2223 btrfs_assign_next_active_device(device, NULL); 2224 2225 if (device->bdev_file) { 2226 cur_devices->open_devices--; 2227 /* remove sysfs entry */ 2228 btrfs_sysfs_remove_device(device); 2229 } 2230 2231 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2232 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2233 mutex_unlock(&fs_devices->device_list_mutex); 2234 2235 /* 2236 * At this point, the device is zero sized and detached from the 2237 * devices list. All that's left is to zero out the old supers and 2238 * free the device. 
2239 * 2240 * We cannot call btrfs_close_bdev() here because we're holding the sb 2241 * write lock, and fput() on the block device will pull in the 2242 * ->open_mutex on the block device and it's dependencies. Instead 2243 * just flush the device and let the caller do the final bdev_release. 2244 */ 2245 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2246 btrfs_scratch_superblocks(fs_info, device); 2247 if (device->bdev) { 2248 sync_blockdev(device->bdev); 2249 invalidate_bdev(device->bdev); 2250 } 2251 } 2252 2253 *bdev_file = device->bdev_file; 2254 synchronize_rcu(); 2255 btrfs_free_device(device); 2256 2257 /* 2258 * This can happen if cur_devices is the private seed devices list. We 2259 * cannot call close_fs_devices() here because it expects the uuid_mutex 2260 * to be held, but in fact we don't need that for the private 2261 * seed_devices, we can simply decrement cur_devices->opened and then 2262 * remove it from our list and free the fs_devices. 2263 */ 2264 if (cur_devices->num_devices == 0) { 2265 list_del_init(&cur_devices->seed_list); 2266 ASSERT(cur_devices->opened == 1); 2267 cur_devices->opened--; 2268 free_fs_devices(cur_devices); 2269 } 2270 2271 ret = btrfs_commit_transaction(trans); 2272 2273 return ret; 2274 2275 error_undo: 2276 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2277 mutex_lock(&fs_info->chunk_mutex); 2278 list_add(&device->dev_alloc_list, 2279 &fs_devices->alloc_list); 2280 device->fs_devices->rw_devices++; 2281 mutex_unlock(&fs_info->chunk_mutex); 2282 } 2283 return ret; 2284 } 2285 2286 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2287 { 2288 struct btrfs_fs_devices *fs_devices; 2289 2290 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2291 2292 /* 2293 * in case of fs with no seed, srcdev->fs_devices will point 2294 * to fs_devices of fs_info. However when the dev being replaced is 2295 * a seed dev it will point to the seed's local fs_devices. In short 2296 * srcdev will have its correct fs_devices in both the cases. 2297 */ 2298 fs_devices = srcdev->fs_devices; 2299 2300 list_del_rcu(&srcdev->dev_list); 2301 list_del(&srcdev->dev_alloc_list); 2302 fs_devices->num_devices--; 2303 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2304 fs_devices->missing_devices--; 2305 2306 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2307 fs_devices->rw_devices--; 2308 2309 if (srcdev->bdev) 2310 fs_devices->open_devices--; 2311 } 2312 2313 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2314 { 2315 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2316 2317 mutex_lock(&uuid_mutex); 2318 2319 btrfs_close_bdev(srcdev); 2320 synchronize_rcu(); 2321 btrfs_free_device(srcdev); 2322 2323 /* if this is no devs we rather delete the fs_devices */ 2324 if (!fs_devices->num_devices) { 2325 /* 2326 * On a mounted FS, num_devices can't be zero unless it's a 2327 * seed. In case of a seed device being replaced, the replace 2328 * target added to the sprout FS, so there will be no more 2329 * device left under the seed FS. 
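 *
 * The ASSERT below checks that expectation; the now empty seed fs_devices
 * is then unlinked from the seed list, closed and freed.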
2330 */ 2331 ASSERT(fs_devices->seeding); 2332 2333 list_del_init(&fs_devices->seed_list); 2334 close_fs_devices(fs_devices); 2335 free_fs_devices(fs_devices); 2336 } 2337 mutex_unlock(&uuid_mutex); 2338 } 2339 2340 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2341 { 2342 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2343 2344 mutex_lock(&fs_devices->device_list_mutex); 2345 2346 btrfs_sysfs_remove_device(tgtdev); 2347 2348 if (tgtdev->bdev) 2349 fs_devices->open_devices--; 2350 2351 fs_devices->num_devices--; 2352 2353 btrfs_assign_next_active_device(tgtdev, NULL); 2354 2355 list_del_rcu(&tgtdev->dev_list); 2356 2357 mutex_unlock(&fs_devices->device_list_mutex); 2358 2359 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev); 2360 2361 btrfs_close_bdev(tgtdev); 2362 synchronize_rcu(); 2363 btrfs_free_device(tgtdev); 2364 } 2365 2366 /* 2367 * Populate args from device at path. 2368 * 2369 * @fs_info: the filesystem 2370 * @args: the args to populate 2371 * @path: the path to the device 2372 * 2373 * This will read the super block of the device at @path and populate @args with 2374 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2375 * lookup a device to operate on, but need to do it before we take any locks. 2376 * This properly handles the special case of "missing" that a user may pass in, 2377 * and does some basic sanity checks. The caller must make sure that @path is 2378 * properly NUL terminated before calling in, and must call 2379 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2380 * uuid buffers. 2381 * 2382 * Return: 0 for success, -errno for failure 2383 */ 2384 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2385 struct btrfs_dev_lookup_args *args, 2386 const char *path) 2387 { 2388 struct btrfs_super_block *disk_super; 2389 struct file *bdev_file; 2390 int ret; 2391 2392 if (!path || !path[0]) 2393 return -EINVAL; 2394 if (!strcmp(path, "missing")) { 2395 args->missing = true; 2396 return 0; 2397 } 2398 2399 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2400 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2401 if (!args->uuid || !args->fsid) { 2402 btrfs_put_dev_args_from_path(args); 2403 return -ENOMEM; 2404 } 2405 2406 ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, 2407 &bdev_file, &disk_super); 2408 if (ret) { 2409 btrfs_put_dev_args_from_path(args); 2410 return ret; 2411 } 2412 2413 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2414 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2415 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2416 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2417 else 2418 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2419 btrfs_release_disk_super(disk_super); 2420 fput(bdev_file); 2421 return 0; 2422 } 2423 2424 /* 2425 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2426 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2427 * that don't need to be freed. 
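 *
 * Typical usage, mirroring btrfs_find_device_by_devspec() below:
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
 *	if (ret)
 *		return ERR_PTR(ret);
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *	btrfs_put_dev_args_from_path(&args);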
2428 */ 2429 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2430 { 2431 kfree(args->uuid); 2432 kfree(args->fsid); 2433 args->uuid = NULL; 2434 args->fsid = NULL; 2435 } 2436 2437 struct btrfs_device *btrfs_find_device_by_devspec( 2438 struct btrfs_fs_info *fs_info, u64 devid, 2439 const char *device_path) 2440 { 2441 BTRFS_DEV_LOOKUP_ARGS(args); 2442 struct btrfs_device *device; 2443 int ret; 2444 2445 if (devid) { 2446 args.devid = devid; 2447 device = btrfs_find_device(fs_info->fs_devices, &args); 2448 if (!device) 2449 return ERR_PTR(-ENOENT); 2450 return device; 2451 } 2452 2453 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2454 if (ret) 2455 return ERR_PTR(ret); 2456 device = btrfs_find_device(fs_info->fs_devices, &args); 2457 btrfs_put_dev_args_from_path(&args); 2458 if (!device) 2459 return ERR_PTR(-ENOENT); 2460 return device; 2461 } 2462 2463 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2464 { 2465 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2466 struct btrfs_fs_devices *old_devices; 2467 struct btrfs_fs_devices *seed_devices; 2468 2469 lockdep_assert_held(&uuid_mutex); 2470 if (!fs_devices->seeding) 2471 return ERR_PTR(-EINVAL); 2472 2473 /* 2474 * Private copy of the seed devices, anchored at 2475 * fs_info->fs_devices->seed_list 2476 */ 2477 seed_devices = alloc_fs_devices(NULL); 2478 if (IS_ERR(seed_devices)) 2479 return seed_devices; 2480 2481 /* 2482 * It's necessary to retain a copy of the original seed fs_devices in 2483 * fs_uuids so that filesystems which have been seeded can successfully 2484 * reference the seed device from open_seed_devices. This also supports 2485 * multiple fs seed. 2486 */ 2487 old_devices = clone_fs_devices(fs_devices); 2488 if (IS_ERR(old_devices)) { 2489 kfree(seed_devices); 2490 return old_devices; 2491 } 2492 2493 list_add(&old_devices->fs_list, &fs_uuids); 2494 2495 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2496 seed_devices->opened = 1; 2497 INIT_LIST_HEAD(&seed_devices->devices); 2498 INIT_LIST_HEAD(&seed_devices->alloc_list); 2499 mutex_init(&seed_devices->device_list_mutex); 2500 2501 return seed_devices; 2502 } 2503 2504 /* 2505 * Splice seed devices into the sprout fs_devices. 2506 * Generate a new fsid for the sprouted read-write filesystem. 2507 */ 2508 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2509 struct btrfs_fs_devices *seed_devices) 2510 { 2511 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2512 struct btrfs_super_block *disk_super = fs_info->super_copy; 2513 struct btrfs_device *device; 2514 u64 super_flags; 2515 2516 /* 2517 * We are updating the fsid, the thread leading to device_list_add() 2518 * could race, so uuid_mutex is needed. 2519 */ 2520 lockdep_assert_held(&uuid_mutex); 2521 2522 /* 2523 * The threads listed below may traverse dev_list but can do that without 2524 * device_list_mutex: 2525 * - All device ops and balance - as we are in btrfs_exclop_start. 2526 * - Various dev_list readers - are using RCU. 2527 * - btrfs_ioctl_fitrim() - is using RCU. 
2528 * 2529 * For-read threads as below are using device_list_mutex: 2530 * - Readonly scrub btrfs_scrub_dev() 2531 * - Readonly scrub btrfs_scrub_progress() 2532 * - btrfs_get_dev_stats() 2533 */ 2534 lockdep_assert_held(&fs_devices->device_list_mutex); 2535 2536 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2537 synchronize_rcu); 2538 list_for_each_entry(device, &seed_devices->devices, dev_list) 2539 device->fs_devices = seed_devices; 2540 2541 fs_devices->seeding = false; 2542 fs_devices->num_devices = 0; 2543 fs_devices->open_devices = 0; 2544 fs_devices->missing_devices = 0; 2545 fs_devices->rotating = false; 2546 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2547 2548 generate_random_uuid(fs_devices->fsid); 2549 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2550 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2551 2552 super_flags = btrfs_super_flags(disk_super) & 2553 ~BTRFS_SUPER_FLAG_SEEDING; 2554 btrfs_set_super_flags(disk_super, super_flags); 2555 } 2556 2557 /* 2558 * Store the expected generation for seed devices in device items. 2559 */ 2560 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2561 { 2562 BTRFS_DEV_LOOKUP_ARGS(args); 2563 struct btrfs_fs_info *fs_info = trans->fs_info; 2564 struct btrfs_root *root = fs_info->chunk_root; 2565 struct btrfs_path *path; 2566 struct extent_buffer *leaf; 2567 struct btrfs_dev_item *dev_item; 2568 struct btrfs_device *device; 2569 struct btrfs_key key; 2570 u8 fs_uuid[BTRFS_FSID_SIZE]; 2571 u8 dev_uuid[BTRFS_UUID_SIZE]; 2572 int ret; 2573 2574 path = btrfs_alloc_path(); 2575 if (!path) 2576 return -ENOMEM; 2577 2578 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2579 key.offset = 0; 2580 key.type = BTRFS_DEV_ITEM_KEY; 2581 2582 while (1) { 2583 btrfs_reserve_chunk_metadata(trans, false); 2584 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2585 btrfs_trans_release_chunk_metadata(trans); 2586 if (ret < 0) 2587 goto error; 2588 2589 leaf = path->nodes[0]; 2590 next_slot: 2591 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2592 ret = btrfs_next_leaf(root, path); 2593 if (ret > 0) 2594 break; 2595 if (ret < 0) 2596 goto error; 2597 leaf = path->nodes[0]; 2598 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2599 btrfs_release_path(path); 2600 continue; 2601 } 2602 2603 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2604 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2605 key.type != BTRFS_DEV_ITEM_KEY) 2606 break; 2607 2608 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2609 struct btrfs_dev_item); 2610 args.devid = btrfs_device_id(leaf, dev_item); 2611 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2612 BTRFS_UUID_SIZE); 2613 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2614 BTRFS_FSID_SIZE); 2615 args.uuid = dev_uuid; 2616 args.fsid = fs_uuid; 2617 device = btrfs_find_device(fs_info->fs_devices, &args); 2618 BUG_ON(!device); /* Logic error */ 2619 2620 if (device->fs_devices->seeding) { 2621 btrfs_set_device_generation(leaf, dev_item, 2622 device->generation); 2623 btrfs_mark_buffer_dirty(trans, leaf); 2624 } 2625 2626 path->slots[0]++; 2627 goto next_slot; 2628 } 2629 ret = 0; 2630 error: 2631 btrfs_free_path(path); 2632 return ret; 2633 } 2634 2635 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2636 { 2637 struct btrfs_root *root = fs_info->dev_root; 2638 struct btrfs_trans_handle *trans; 2639 struct btrfs_device *device; 2640 struct file *bdev_file; 2641 struct 
super_block *sb = fs_info->sb; 2642 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2643 struct btrfs_fs_devices *seed_devices = NULL; 2644 u64 orig_super_total_bytes; 2645 u64 orig_super_num_devices; 2646 int ret = 0; 2647 bool seeding_dev = false; 2648 bool locked = false; 2649 2650 if (sb_rdonly(sb) && !fs_devices->seeding) 2651 return -EROFS; 2652 2653 bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, 2654 fs_info->bdev_holder, NULL); 2655 if (IS_ERR(bdev_file)) 2656 return PTR_ERR(bdev_file); 2657 2658 if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) { 2659 ret = -EINVAL; 2660 goto error; 2661 } 2662 2663 if (fs_devices->seeding) { 2664 seeding_dev = true; 2665 down_write(&sb->s_umount); 2666 mutex_lock(&uuid_mutex); 2667 locked = true; 2668 } 2669 2670 sync_blockdev(file_bdev(bdev_file)); 2671 2672 rcu_read_lock(); 2673 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2674 if (device->bdev == file_bdev(bdev_file)) { 2675 ret = -EEXIST; 2676 rcu_read_unlock(); 2677 goto error; 2678 } 2679 } 2680 rcu_read_unlock(); 2681 2682 device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); 2683 if (IS_ERR(device)) { 2684 /* we can safely leave the fs_devices entry around */ 2685 ret = PTR_ERR(device); 2686 goto error; 2687 } 2688 2689 device->fs_info = fs_info; 2690 device->bdev_file = bdev_file; 2691 device->bdev = file_bdev(bdev_file); 2692 ret = lookup_bdev(device_path, &device->devt); 2693 if (ret) 2694 goto error_free_device; 2695 2696 ret = btrfs_get_dev_zone_info(device, false); 2697 if (ret) 2698 goto error_free_device; 2699 2700 trans = btrfs_start_transaction(root, 0); 2701 if (IS_ERR(trans)) { 2702 ret = PTR_ERR(trans); 2703 goto error_free_zone; 2704 } 2705 2706 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2707 device->generation = trans->transid; 2708 device->io_width = fs_info->sectorsize; 2709 device->io_align = fs_info->sectorsize; 2710 device->sector_size = fs_info->sectorsize; 2711 device->total_bytes = 2712 round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); 2713 device->disk_total_bytes = device->total_bytes; 2714 device->commit_total_bytes = device->total_bytes; 2715 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2716 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2717 device->dev_stats_valid = 1; 2718 set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); 2719 2720 if (seeding_dev) { 2721 btrfs_clear_sb_rdonly(sb); 2722 2723 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2724 seed_devices = btrfs_init_sprout(fs_info); 2725 if (IS_ERR(seed_devices)) { 2726 ret = PTR_ERR(seed_devices); 2727 btrfs_abort_transaction(trans, ret); 2728 goto error_trans; 2729 } 2730 } 2731 2732 mutex_lock(&fs_devices->device_list_mutex); 2733 if (seeding_dev) { 2734 btrfs_setup_sprout(fs_info, seed_devices); 2735 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2736 device); 2737 } 2738 2739 device->fs_devices = fs_devices; 2740 2741 mutex_lock(&fs_info->chunk_mutex); 2742 list_add_rcu(&device->dev_list, &fs_devices->devices); 2743 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2744 fs_devices->num_devices++; 2745 fs_devices->open_devices++; 2746 fs_devices->rw_devices++; 2747 fs_devices->total_devices++; 2748 fs_devices->total_rw_bytes += device->total_bytes; 2749 2750 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2751 2752 if (!bdev_nonrot(device->bdev)) 2753 fs_devices->rotating = true; 2754 2755 orig_super_total_bytes 
= btrfs_super_total_bytes(fs_info->super_copy); 2756 btrfs_set_super_total_bytes(fs_info->super_copy, 2757 round_down(orig_super_total_bytes + device->total_bytes, 2758 fs_info->sectorsize)); 2759 2760 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2761 btrfs_set_super_num_devices(fs_info->super_copy, 2762 orig_super_num_devices + 1); 2763 2764 /* 2765 * we've got more storage, clear any full flags on the space 2766 * infos 2767 */ 2768 btrfs_clear_space_info_full(fs_info); 2769 2770 mutex_unlock(&fs_info->chunk_mutex); 2771 2772 /* Add sysfs device entry */ 2773 btrfs_sysfs_add_device(device); 2774 2775 mutex_unlock(&fs_devices->device_list_mutex); 2776 2777 if (seeding_dev) { 2778 mutex_lock(&fs_info->chunk_mutex); 2779 ret = init_first_rw_device(trans); 2780 mutex_unlock(&fs_info->chunk_mutex); 2781 if (ret) { 2782 btrfs_abort_transaction(trans, ret); 2783 goto error_sysfs; 2784 } 2785 } 2786 2787 ret = btrfs_add_dev_item(trans, device); 2788 if (ret) { 2789 btrfs_abort_transaction(trans, ret); 2790 goto error_sysfs; 2791 } 2792 2793 if (seeding_dev) { 2794 ret = btrfs_finish_sprout(trans); 2795 if (ret) { 2796 btrfs_abort_transaction(trans, ret); 2797 goto error_sysfs; 2798 } 2799 2800 /* 2801 * fs_devices now represents the newly sprouted filesystem and 2802 * its fsid has been changed by btrfs_sprout_splice(). 2803 */ 2804 btrfs_sysfs_update_sprout_fsid(fs_devices); 2805 } 2806 2807 ret = btrfs_commit_transaction(trans); 2808 2809 if (seeding_dev) { 2810 mutex_unlock(&uuid_mutex); 2811 up_write(&sb->s_umount); 2812 locked = false; 2813 2814 if (ret) /* transaction commit */ 2815 return ret; 2816 2817 ret = btrfs_relocate_sys_chunks(fs_info); 2818 if (ret < 0) 2819 btrfs_handle_fs_error(fs_info, ret, 2820 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2821 trans = btrfs_attach_transaction(root); 2822 if (IS_ERR(trans)) { 2823 if (PTR_ERR(trans) == -ENOENT) 2824 return 0; 2825 ret = PTR_ERR(trans); 2826 trans = NULL; 2827 goto error_sysfs; 2828 } 2829 ret = btrfs_commit_transaction(trans); 2830 } 2831 2832 /* 2833 * Now that we have written a new super block to this device, check all 2834 * other fs_devices list if device_path alienates any other scanned 2835 * device. 2836 * We can ignore the return value as it typically returns -EINVAL and 2837 * only succeeds if the device was an alien. 
2838 */ 2839 btrfs_forget_devices(device->devt); 2840 2841 /* Update ctime/mtime for blkid or udev */ 2842 update_dev_time(device_path); 2843 2844 return ret; 2845 2846 error_sysfs: 2847 btrfs_sysfs_remove_device(device); 2848 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2849 mutex_lock(&fs_info->chunk_mutex); 2850 list_del_rcu(&device->dev_list); 2851 list_del(&device->dev_alloc_list); 2852 fs_info->fs_devices->num_devices--; 2853 fs_info->fs_devices->open_devices--; 2854 fs_info->fs_devices->rw_devices--; 2855 fs_info->fs_devices->total_devices--; 2856 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2857 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2858 btrfs_set_super_total_bytes(fs_info->super_copy, 2859 orig_super_total_bytes); 2860 btrfs_set_super_num_devices(fs_info->super_copy, 2861 orig_super_num_devices); 2862 mutex_unlock(&fs_info->chunk_mutex); 2863 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2864 error_trans: 2865 if (seeding_dev) 2866 btrfs_set_sb_rdonly(sb); 2867 if (trans) 2868 btrfs_end_transaction(trans); 2869 error_free_zone: 2870 btrfs_destroy_dev_zone_info(device); 2871 error_free_device: 2872 btrfs_free_device(device); 2873 error: 2874 fput(bdev_file); 2875 if (locked) { 2876 mutex_unlock(&uuid_mutex); 2877 up_write(&sb->s_umount); 2878 } 2879 return ret; 2880 } 2881 2882 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2883 struct btrfs_device *device) 2884 { 2885 int ret; 2886 struct btrfs_path *path; 2887 struct btrfs_root *root = device->fs_info->chunk_root; 2888 struct btrfs_dev_item *dev_item; 2889 struct extent_buffer *leaf; 2890 struct btrfs_key key; 2891 2892 path = btrfs_alloc_path(); 2893 if (!path) 2894 return -ENOMEM; 2895 2896 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2897 key.type = BTRFS_DEV_ITEM_KEY; 2898 key.offset = device->devid; 2899 2900 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2901 if (ret < 0) 2902 goto out; 2903 2904 if (ret > 0) { 2905 ret = -ENOENT; 2906 goto out; 2907 } 2908 2909 leaf = path->nodes[0]; 2910 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2911 2912 btrfs_set_device_id(leaf, dev_item, device->devid); 2913 btrfs_set_device_type(leaf, dev_item, device->type); 2914 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2915 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2916 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2917 btrfs_set_device_total_bytes(leaf, dev_item, 2918 btrfs_device_get_disk_total_bytes(device)); 2919 btrfs_set_device_bytes_used(leaf, dev_item, 2920 btrfs_device_get_bytes_used(device)); 2921 btrfs_mark_buffer_dirty(trans, leaf); 2922 2923 out: 2924 btrfs_free_path(path); 2925 return ret; 2926 } 2927 2928 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2929 struct btrfs_device *device, u64 new_size) 2930 { 2931 struct btrfs_fs_info *fs_info = device->fs_info; 2932 struct btrfs_super_block *super_copy = fs_info->super_copy; 2933 u64 old_total; 2934 u64 diff; 2935 int ret; 2936 2937 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2938 return -EACCES; 2939 2940 new_size = round_down(new_size, fs_info->sectorsize); 2941 2942 mutex_lock(&fs_info->chunk_mutex); 2943 old_total = btrfs_super_total_bytes(super_copy); 2944 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2945 2946 if (new_size <= device->total_bytes || 2947 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2948 mutex_unlock(&fs_info->chunk_mutex); 
2949 return -EINVAL; 2950 } 2951 2952 btrfs_set_super_total_bytes(super_copy, 2953 round_down(old_total + diff, fs_info->sectorsize)); 2954 device->fs_devices->total_rw_bytes += diff; 2955 atomic64_add(diff, &fs_info->free_chunk_space); 2956 2957 btrfs_device_set_total_bytes(device, new_size); 2958 btrfs_device_set_disk_total_bytes(device, new_size); 2959 btrfs_clear_space_info_full(device->fs_info); 2960 if (list_empty(&device->post_commit_list)) 2961 list_add_tail(&device->post_commit_list, 2962 &trans->transaction->dev_update_list); 2963 mutex_unlock(&fs_info->chunk_mutex); 2964 2965 btrfs_reserve_chunk_metadata(trans, false); 2966 ret = btrfs_update_device(trans, device); 2967 btrfs_trans_release_chunk_metadata(trans); 2968 2969 return ret; 2970 } 2971 2972 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2973 { 2974 struct btrfs_fs_info *fs_info = trans->fs_info; 2975 struct btrfs_root *root = fs_info->chunk_root; 2976 int ret; 2977 struct btrfs_path *path; 2978 struct btrfs_key key; 2979 2980 path = btrfs_alloc_path(); 2981 if (!path) 2982 return -ENOMEM; 2983 2984 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2985 key.offset = chunk_offset; 2986 key.type = BTRFS_CHUNK_ITEM_KEY; 2987 2988 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2989 if (ret < 0) 2990 goto out; 2991 else if (ret > 0) { /* Logic error or corruption */ 2992 btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", 2993 chunk_offset); 2994 btrfs_abort_transaction(trans, -ENOENT); 2995 ret = -EUCLEAN; 2996 goto out; 2997 } 2998 2999 ret = btrfs_del_item(trans, root, path); 3000 if (ret < 0) { 3001 btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); 3002 btrfs_abort_transaction(trans, ret); 3003 goto out; 3004 } 3005 out: 3006 btrfs_free_path(path); 3007 return ret; 3008 } 3009 3010 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3011 { 3012 struct btrfs_super_block *super_copy = fs_info->super_copy; 3013 struct btrfs_disk_key *disk_key; 3014 struct btrfs_chunk *chunk; 3015 u8 *ptr; 3016 int ret = 0; 3017 u32 num_stripes; 3018 u32 array_size; 3019 u32 len = 0; 3020 u32 cur; 3021 struct btrfs_key key; 3022 3023 lockdep_assert_held(&fs_info->chunk_mutex); 3024 array_size = btrfs_super_sys_array_size(super_copy); 3025 3026 ptr = super_copy->sys_chunk_array; 3027 cur = 0; 3028 3029 while (cur < array_size) { 3030 disk_key = (struct btrfs_disk_key *)ptr; 3031 btrfs_disk_key_to_cpu(&key, disk_key); 3032 3033 len = sizeof(*disk_key); 3034 3035 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3036 chunk = (struct btrfs_chunk *)(ptr + len); 3037 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 3038 len += btrfs_chunk_item_size(num_stripes); 3039 } else { 3040 ret = -EIO; 3041 break; 3042 } 3043 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3044 key.offset == chunk_offset) { 3045 memmove(ptr, ptr + len, array_size - (cur + len)); 3046 array_size -= len; 3047 btrfs_set_super_sys_array_size(super_copy, array_size); 3048 } else { 3049 ptr += len; 3050 cur += len; 3051 } 3052 } 3053 return ret; 3054 } 3055 3056 struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, 3057 u64 logical, u64 length) 3058 { 3059 struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; 3060 struct rb_node *prev = NULL; 3061 struct rb_node *orig_prev; 3062 struct btrfs_chunk_map *map; 3063 struct btrfs_chunk_map *prev_map = NULL; 3064 3065 while (node) { 3066 map = rb_entry(node, struct btrfs_chunk_map, rb_node); 3067 prev 
= node; 3068 prev_map = map; 3069 3070 if (logical < map->start) { 3071 node = node->rb_left; 3072 } else if (logical >= map->start + map->chunk_len) { 3073 node = node->rb_right; 3074 } else { 3075 refcount_inc(&map->refs); 3076 return map; 3077 } 3078 } 3079 3080 if (!prev) 3081 return NULL; 3082 3083 orig_prev = prev; 3084 while (prev && logical >= prev_map->start + prev_map->chunk_len) { 3085 prev = rb_next(prev); 3086 prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); 3087 } 3088 3089 if (!prev) { 3090 prev = orig_prev; 3091 prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); 3092 while (prev && logical < prev_map->start) { 3093 prev = rb_prev(prev); 3094 prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); 3095 } 3096 } 3097 3098 if (prev) { 3099 u64 end = logical + length; 3100 3101 /* 3102 * Caller can pass a U64_MAX length when it wants to get any 3103 * chunk starting at an offset of 'logical' or higher, so deal 3104 * with underflow by resetting the end offset to U64_MAX. 3105 */ 3106 if (end < logical) 3107 end = U64_MAX; 3108 3109 if (end > prev_map->start && 3110 logical < prev_map->start + prev_map->chunk_len) { 3111 refcount_inc(&prev_map->refs); 3112 return prev_map; 3113 } 3114 } 3115 3116 return NULL; 3117 } 3118 3119 struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, 3120 u64 logical, u64 length) 3121 { 3122 struct btrfs_chunk_map *map; 3123 3124 read_lock(&fs_info->mapping_tree_lock); 3125 map = btrfs_find_chunk_map_nolock(fs_info, logical, length); 3126 read_unlock(&fs_info->mapping_tree_lock); 3127 3128 return map; 3129 } 3130 3131 /* 3132 * Find the mapping containing the given logical extent. 3133 * 3134 * @logical: Logical block offset in bytes. 3135 * @length: Length of extent in bytes. 3136 * 3137 * Return: Chunk mapping or ERR_PTR. 3138 */ 3139 struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3140 u64 logical, u64 length) 3141 { 3142 struct btrfs_chunk_map *map; 3143 3144 map = btrfs_find_chunk_map(fs_info, logical, length); 3145 3146 if (unlikely(!map)) { 3147 btrfs_crit(fs_info, 3148 "unable to find chunk map for logical %llu length %llu", 3149 logical, length); 3150 return ERR_PTR(-EINVAL); 3151 } 3152 3153 if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { 3154 btrfs_crit(fs_info, 3155 "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", 3156 logical, logical + length, map->start, 3157 map->start + map->chunk_len); 3158 btrfs_free_chunk_map(map); 3159 return ERR_PTR(-EINVAL); 3160 } 3161 3162 /* Callers are responsible for dropping the reference. */ 3163 return map; 3164 } 3165 3166 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3167 struct btrfs_chunk_map *map, u64 chunk_offset) 3168 { 3169 int i; 3170 3171 /* 3172 * Removing chunk items and updating the device items in the chunks btree 3173 * requires holding the chunk_mutex. 3174 * See the comment at btrfs_chunk_alloc() for the details. 
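 *
 * Each stripe's device item is updated first, and only then is the chunk
 * item itself deleted via btrfs_free_chunk().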
3175 */ 3176 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3177 3178 for (i = 0; i < map->num_stripes; i++) { 3179 int ret; 3180 3181 ret = btrfs_update_device(trans, map->stripes[i].dev); 3182 if (ret) 3183 return ret; 3184 } 3185 3186 return btrfs_free_chunk(trans, chunk_offset); 3187 } 3188 3189 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3190 { 3191 struct btrfs_fs_info *fs_info = trans->fs_info; 3192 struct btrfs_chunk_map *map; 3193 u64 dev_extent_len = 0; 3194 int i, ret = 0; 3195 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3196 3197 map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3198 if (IS_ERR(map)) { 3199 /* 3200 * This is a logic error, but we don't want to just rely on the 3201 * user having built with ASSERT enabled, so if ASSERT doesn't 3202 * do anything we still error out. 3203 */ 3204 ASSERT(0); 3205 return PTR_ERR(map); 3206 } 3207 3208 /* 3209 * First delete the device extent items from the devices btree. 3210 * We take the device_list_mutex to avoid racing with the finishing phase 3211 * of a device replace operation. See the comment below before acquiring 3212 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3213 * because that can result in a deadlock when deleting the device extent 3214 * items from the devices btree - COWing an extent buffer from the btree 3215 * may result in allocating a new metadata chunk, which would attempt to 3216 * lock again fs_info->chunk_mutex. 3217 */ 3218 mutex_lock(&fs_devices->device_list_mutex); 3219 for (i = 0; i < map->num_stripes; i++) { 3220 struct btrfs_device *device = map->stripes[i].dev; 3221 ret = btrfs_free_dev_extent(trans, device, 3222 map->stripes[i].physical, 3223 &dev_extent_len); 3224 if (ret) { 3225 mutex_unlock(&fs_devices->device_list_mutex); 3226 btrfs_abort_transaction(trans, ret); 3227 goto out; 3228 } 3229 3230 if (device->bytes_used > 0) { 3231 mutex_lock(&fs_info->chunk_mutex); 3232 btrfs_device_set_bytes_used(device, 3233 device->bytes_used - dev_extent_len); 3234 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3235 btrfs_clear_space_info_full(fs_info); 3236 mutex_unlock(&fs_info->chunk_mutex); 3237 } 3238 } 3239 mutex_unlock(&fs_devices->device_list_mutex); 3240 3241 /* 3242 * We acquire fs_info->chunk_mutex for 2 reasons: 3243 * 3244 * 1) Just like with the first phase of the chunk allocation, we must 3245 * reserve system space, do all chunk btree updates and deletions, and 3246 * update the system chunk array in the superblock while holding this 3247 * mutex. This is for similar reasons as explained on the comment at 3248 * the top of btrfs_chunk_alloc(); 3249 * 3250 * 2) Prevent races with the final phase of a device replace operation 3251 * that replaces the device object associated with the map's stripes, 3252 * because the device object's id can change at any time during that 3253 * final phase of the device replace operation 3254 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3255 * replaced device and then see it with an ID of 3256 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3257 * the device item, which does not exists on the chunk btree. 3258 * The finishing phase of device replace acquires both the 3259 * device_list_mutex and the chunk_mutex, in that order, so we are 3260 * safe by just acquiring the chunk_mutex. 
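 *
 * check_system_chunk() below reserves the system space needed for the
 * chunk btree updates; that reservation is released again through
 * btrfs_trans_release_chunk_metadata() once the updates and deletions
 * are done.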
3261 */ 3262 trans->removing_chunk = true; 3263 mutex_lock(&fs_info->chunk_mutex); 3264 3265 check_system_chunk(trans, map->type); 3266 3267 ret = remove_chunk_item(trans, map, chunk_offset); 3268 /* 3269 * Normally we should not get -ENOSPC since we reserved space before 3270 * through the call to check_system_chunk(). 3271 * 3272 * Despite our system space_info having enough free space, we may not 3273 * be able to allocate extents from its block groups, because all have 3274 * an incompatible profile, which will force us to allocate a new system 3275 * block group with the right profile, or right after we called 3276 * check_system_space() above, a scrub turned the only system block group 3277 * with enough free space into RO mode. 3278 * This is explained with more detail at do_chunk_alloc(). 3279 * 3280 * So if we get -ENOSPC, allocate a new system chunk and retry once. 3281 */ 3282 if (ret == -ENOSPC) { 3283 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3284 struct btrfs_block_group *sys_bg; 3285 3286 sys_bg = btrfs_create_chunk(trans, sys_flags); 3287 if (IS_ERR(sys_bg)) { 3288 ret = PTR_ERR(sys_bg); 3289 btrfs_abort_transaction(trans, ret); 3290 goto out; 3291 } 3292 3293 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3294 if (ret) { 3295 btrfs_abort_transaction(trans, ret); 3296 goto out; 3297 } 3298 3299 ret = remove_chunk_item(trans, map, chunk_offset); 3300 if (ret) { 3301 btrfs_abort_transaction(trans, ret); 3302 goto out; 3303 } 3304 } else if (ret) { 3305 btrfs_abort_transaction(trans, ret); 3306 goto out; 3307 } 3308 3309 trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); 3310 3311 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3312 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3313 if (ret) { 3314 btrfs_abort_transaction(trans, ret); 3315 goto out; 3316 } 3317 } 3318 3319 mutex_unlock(&fs_info->chunk_mutex); 3320 trans->removing_chunk = false; 3321 3322 /* 3323 * We are done with chunk btree updates and deletions, so release the 3324 * system space we previously reserved (with check_system_chunk()). 3325 */ 3326 btrfs_trans_release_chunk_metadata(trans); 3327 3328 ret = btrfs_remove_block_group(trans, map); 3329 if (ret) { 3330 btrfs_abort_transaction(trans, ret); 3331 goto out; 3332 } 3333 3334 out: 3335 if (trans->removing_chunk) { 3336 mutex_unlock(&fs_info->chunk_mutex); 3337 trans->removing_chunk = false; 3338 } 3339 /* once for us */ 3340 btrfs_free_chunk_map(map); 3341 return ret; 3342 } 3343 3344 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3345 { 3346 struct btrfs_root *root = fs_info->chunk_root; 3347 struct btrfs_trans_handle *trans; 3348 struct btrfs_block_group *block_group; 3349 u64 length; 3350 int ret; 3351 3352 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3353 btrfs_err(fs_info, 3354 "relocate: not supported on extent tree v2 yet"); 3355 return -EINVAL; 3356 } 3357 3358 /* 3359 * Prevent races with automatic removal of unused block groups. 3360 * After we relocate and before we remove the chunk with offset 3361 * chunk_offset, automatic removal of the block group can kick in, 3362 * resulting in a failure when calling btrfs_remove_chunk() below. 3363 * 3364 * Make sure to acquire this mutex before doing a tree search (dev 3365 * or chunk trees) to find chunks. 
Otherwise the cleaner kthread might 3366 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3367 * we release the path used to search the chunk/dev tree and before 3368 * the current task acquires this mutex and calls us. 3369 */ 3370 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3371 3372 /* step one, relocate all the extents inside this chunk */ 3373 btrfs_scrub_pause(fs_info); 3374 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3375 btrfs_scrub_continue(fs_info); 3376 if (ret) { 3377 /* 3378 * If we had a transaction abort, stop all running scrubs. 3379 * See transaction.c:cleanup_transaction() why we do it here. 3380 */ 3381 if (BTRFS_FS_ERROR(fs_info)) 3382 btrfs_scrub_cancel(fs_info); 3383 return ret; 3384 } 3385 3386 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3387 if (!block_group) 3388 return -ENOENT; 3389 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3390 length = block_group->length; 3391 btrfs_put_block_group(block_group); 3392 3393 /* 3394 * On a zoned file system, discard the whole block group, this will 3395 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3396 * resetting the zone fails, don't treat it as a fatal problem from the 3397 * filesystem's point of view. 3398 */ 3399 if (btrfs_is_zoned(fs_info)) { 3400 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3401 if (ret) 3402 btrfs_info(fs_info, 3403 "failed to reset zone %llu after relocation", 3404 chunk_offset); 3405 } 3406 3407 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3408 chunk_offset); 3409 if (IS_ERR(trans)) { 3410 ret = PTR_ERR(trans); 3411 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3412 return ret; 3413 } 3414 3415 /* 3416 * step two, delete the device extents and the 3417 * chunk tree entries 3418 */ 3419 ret = btrfs_remove_chunk(trans, chunk_offset); 3420 btrfs_end_transaction(trans); 3421 return ret; 3422 } 3423 3424 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3425 { 3426 struct btrfs_root *chunk_root = fs_info->chunk_root; 3427 struct btrfs_path *path; 3428 struct extent_buffer *leaf; 3429 struct btrfs_chunk *chunk; 3430 struct btrfs_key key; 3431 struct btrfs_key found_key; 3432 u64 chunk_type; 3433 bool retried = false; 3434 int failed = 0; 3435 int ret; 3436 3437 path = btrfs_alloc_path(); 3438 if (!path) 3439 return -ENOMEM; 3440 3441 again: 3442 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3443 key.offset = (u64)-1; 3444 key.type = BTRFS_CHUNK_ITEM_KEY; 3445 3446 while (1) { 3447 mutex_lock(&fs_info->reclaim_bgs_lock); 3448 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3449 if (ret < 0) { 3450 mutex_unlock(&fs_info->reclaim_bgs_lock); 3451 goto error; 3452 } 3453 if (ret == 0) { 3454 /* 3455 * On the first search we would find chunk tree with 3456 * offset -1, which is not possible. On subsequent 3457 * loops this would find an existing item on an invalid 3458 * offset (one less than the previous one, wrong 3459 * alignment and size). 
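 *
 * Either way it indicates a corrupted chunk tree, so fail with -EUCLEAN.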
3460 */ 3461 ret = -EUCLEAN; 3462 mutex_unlock(&fs_info->reclaim_bgs_lock); 3463 goto error; 3464 } 3465 3466 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3467 key.type); 3468 if (ret) 3469 mutex_unlock(&fs_info->reclaim_bgs_lock); 3470 if (ret < 0) 3471 goto error; 3472 if (ret > 0) 3473 break; 3474 3475 leaf = path->nodes[0]; 3476 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3477 3478 chunk = btrfs_item_ptr(leaf, path->slots[0], 3479 struct btrfs_chunk); 3480 chunk_type = btrfs_chunk_type(leaf, chunk); 3481 btrfs_release_path(path); 3482 3483 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3484 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3485 if (ret == -ENOSPC) 3486 failed++; 3487 else 3488 BUG_ON(ret); 3489 } 3490 mutex_unlock(&fs_info->reclaim_bgs_lock); 3491 3492 if (found_key.offset == 0) 3493 break; 3494 key.offset = found_key.offset - 1; 3495 } 3496 ret = 0; 3497 if (failed && !retried) { 3498 failed = 0; 3499 retried = true; 3500 goto again; 3501 } else if (WARN_ON(failed && retried)) { 3502 ret = -ENOSPC; 3503 } 3504 error: 3505 btrfs_free_path(path); 3506 return ret; 3507 } 3508 3509 /* 3510 * return 1 : allocate a data chunk successfully, 3511 * return <0: errors during allocating a data chunk, 3512 * return 0 : no need to allocate a data chunk. 3513 */ 3514 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3515 u64 chunk_offset) 3516 { 3517 struct btrfs_block_group *cache; 3518 u64 bytes_used; 3519 u64 chunk_type; 3520 3521 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3522 ASSERT(cache); 3523 chunk_type = cache->flags; 3524 btrfs_put_block_group(cache); 3525 3526 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3527 return 0; 3528 3529 spin_lock(&fs_info->data_sinfo->lock); 3530 bytes_used = fs_info->data_sinfo->bytes_used; 3531 spin_unlock(&fs_info->data_sinfo->lock); 3532 3533 if (!bytes_used) { 3534 struct btrfs_trans_handle *trans; 3535 int ret; 3536 3537 trans = btrfs_join_transaction(fs_info->tree_root); 3538 if (IS_ERR(trans)) 3539 return PTR_ERR(trans); 3540 3541 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3542 btrfs_end_transaction(trans); 3543 if (ret < 0) 3544 return ret; 3545 return 1; 3546 } 3547 3548 return 0; 3549 } 3550 3551 static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, 3552 const struct btrfs_disk_balance_args *disk) 3553 { 3554 memset(cpu, 0, sizeof(*cpu)); 3555 3556 cpu->profiles = le64_to_cpu(disk->profiles); 3557 cpu->usage = le64_to_cpu(disk->usage); 3558 cpu->devid = le64_to_cpu(disk->devid); 3559 cpu->pstart = le64_to_cpu(disk->pstart); 3560 cpu->pend = le64_to_cpu(disk->pend); 3561 cpu->vstart = le64_to_cpu(disk->vstart); 3562 cpu->vend = le64_to_cpu(disk->vend); 3563 cpu->target = le64_to_cpu(disk->target); 3564 cpu->flags = le64_to_cpu(disk->flags); 3565 cpu->limit = le64_to_cpu(disk->limit); 3566 cpu->stripes_min = le32_to_cpu(disk->stripes_min); 3567 cpu->stripes_max = le32_to_cpu(disk->stripes_max); 3568 } 3569 3570 static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, 3571 const struct btrfs_balance_args *cpu) 3572 { 3573 memset(disk, 0, sizeof(*disk)); 3574 3575 disk->profiles = cpu_to_le64(cpu->profiles); 3576 disk->usage = cpu_to_le64(cpu->usage); 3577 disk->devid = cpu_to_le64(cpu->devid); 3578 disk->pstart = cpu_to_le64(cpu->pstart); 3579 disk->pend = cpu_to_le64(cpu->pend); 3580 disk->vstart = cpu_to_le64(cpu->vstart); 3581 disk->vend = cpu_to_le64(cpu->vend); 3582 disk->target = cpu_to_le64(cpu->target); 
3583 disk->flags = cpu_to_le64(cpu->flags); 3584 disk->limit = cpu_to_le64(cpu->limit); 3585 disk->stripes_min = cpu_to_le32(cpu->stripes_min); 3586 disk->stripes_max = cpu_to_le32(cpu->stripes_max); 3587 } 3588 3589 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3590 struct btrfs_balance_control *bctl) 3591 { 3592 struct btrfs_root *root = fs_info->tree_root; 3593 struct btrfs_trans_handle *trans; 3594 struct btrfs_balance_item *item; 3595 struct btrfs_disk_balance_args disk_bargs; 3596 struct btrfs_path *path; 3597 struct extent_buffer *leaf; 3598 struct btrfs_key key; 3599 int ret, err; 3600 3601 path = btrfs_alloc_path(); 3602 if (!path) 3603 return -ENOMEM; 3604 3605 trans = btrfs_start_transaction(root, 0); 3606 if (IS_ERR(trans)) { 3607 btrfs_free_path(path); 3608 return PTR_ERR(trans); 3609 } 3610 3611 key.objectid = BTRFS_BALANCE_OBJECTID; 3612 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3613 key.offset = 0; 3614 3615 ret = btrfs_insert_empty_item(trans, root, path, &key, 3616 sizeof(*item)); 3617 if (ret) 3618 goto out; 3619 3620 leaf = path->nodes[0]; 3621 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3622 3623 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3624 3625 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3626 btrfs_set_balance_data(leaf, item, &disk_bargs); 3627 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3628 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3629 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3630 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3631 3632 btrfs_set_balance_flags(leaf, item, bctl->flags); 3633 3634 btrfs_mark_buffer_dirty(trans, leaf); 3635 out: 3636 btrfs_free_path(path); 3637 err = btrfs_commit_transaction(trans); 3638 if (err && !ret) 3639 ret = err; 3640 return ret; 3641 } 3642 3643 static int del_balance_item(struct btrfs_fs_info *fs_info) 3644 { 3645 struct btrfs_root *root = fs_info->tree_root; 3646 struct btrfs_trans_handle *trans; 3647 struct btrfs_path *path; 3648 struct btrfs_key key; 3649 int ret, err; 3650 3651 path = btrfs_alloc_path(); 3652 if (!path) 3653 return -ENOMEM; 3654 3655 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3656 if (IS_ERR(trans)) { 3657 btrfs_free_path(path); 3658 return PTR_ERR(trans); 3659 } 3660 3661 key.objectid = BTRFS_BALANCE_OBJECTID; 3662 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3663 key.offset = 0; 3664 3665 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3666 if (ret < 0) 3667 goto out; 3668 if (ret > 0) { 3669 ret = -ENOENT; 3670 goto out; 3671 } 3672 3673 ret = btrfs_del_item(trans, root, path); 3674 out: 3675 btrfs_free_path(path); 3676 err = btrfs_commit_transaction(trans); 3677 if (err && !ret) 3678 ret = err; 3679 return ret; 3680 } 3681 3682 /* 3683 * This is a heuristic used to reduce the number of chunks balanced on 3684 * resume after balance was interrupted. 3685 */ 3686 static void update_balance_args(struct btrfs_balance_control *bctl) 3687 { 3688 /* 3689 * Turn on soft mode for chunk types that were being converted. 3690 */ 3691 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3692 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3693 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3694 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3695 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3696 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3697 3698 /* 3699 * Turn on usage filter if is not already used. 
The idea is 3700 * that chunks that we have already balanced should be 3701 * reasonably full. Don't do it for chunks that are being 3702 * converted - that will keep us from relocating unconverted 3703 * (albeit full) chunks. 3704 */ 3705 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3706 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3707 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3708 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3709 bctl->data.usage = 90; 3710 } 3711 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3712 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3713 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3714 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3715 bctl->sys.usage = 90; 3716 } 3717 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3718 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3719 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3720 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3721 bctl->meta.usage = 90; 3722 } 3723 } 3724 3725 /* 3726 * Clear the balance status in fs_info and delete the balance item from disk. 3727 */ 3728 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3729 { 3730 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3731 int ret; 3732 3733 ASSERT(fs_info->balance_ctl); 3734 3735 spin_lock(&fs_info->balance_lock); 3736 fs_info->balance_ctl = NULL; 3737 spin_unlock(&fs_info->balance_lock); 3738 3739 kfree(bctl); 3740 ret = del_balance_item(fs_info); 3741 if (ret) 3742 btrfs_handle_fs_error(fs_info, ret, NULL); 3743 } 3744 3745 /* 3746 * Balance filters. Return 1 if chunk should be filtered out 3747 * (should not be balanced). 3748 */ 3749 static int chunk_profiles_filter(u64 chunk_type, 3750 struct btrfs_balance_args *bargs) 3751 { 3752 chunk_type = chunk_to_extended(chunk_type) & 3753 BTRFS_EXTENDED_PROFILE_MASK; 3754 3755 if (bargs->profiles & chunk_type) 3756 return 0; 3757 3758 return 1; 3759 } 3760 3761 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3762 struct btrfs_balance_args *bargs) 3763 { 3764 struct btrfs_block_group *cache; 3765 u64 chunk_used; 3766 u64 user_thresh_min; 3767 u64 user_thresh_max; 3768 int ret = 1; 3769 3770 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3771 chunk_used = cache->used; 3772 3773 if (bargs->usage_min == 0) 3774 user_thresh_min = 0; 3775 else 3776 user_thresh_min = mult_perc(cache->length, bargs->usage_min); 3777 3778 if (bargs->usage_max == 0) 3779 user_thresh_max = 1; 3780 else if (bargs->usage_max > 100) 3781 user_thresh_max = cache->length; 3782 else 3783 user_thresh_max = mult_perc(cache->length, bargs->usage_max); 3784 3785 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3786 ret = 0; 3787 3788 btrfs_put_block_group(cache); 3789 return ret; 3790 } 3791 3792 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3793 u64 chunk_offset, struct btrfs_balance_args *bargs) 3794 { 3795 struct btrfs_block_group *cache; 3796 u64 chunk_used, user_thresh; 3797 int ret = 1; 3798 3799 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3800 chunk_used = cache->used; 3801 3802 if (bargs->usage_min == 0) 3803 user_thresh = 1; 3804 else if (bargs->usage > 100) 3805 user_thresh = cache->length; 3806 else 3807 user_thresh = mult_perc(cache->length, bargs->usage); 3808 3809 if (chunk_used < user_thresh) 3810 ret = 0; 3811 3812 btrfs_put_block_group(cache); 3813 return ret; 3814 } 3815 3816 static int chunk_devid_filter(struct extent_buffer *leaf, 3817 
struct btrfs_chunk *chunk, 3818 struct btrfs_balance_args *bargs) 3819 { 3820 struct btrfs_stripe *stripe; 3821 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3822 int i; 3823 3824 for (i = 0; i < num_stripes; i++) { 3825 stripe = btrfs_stripe_nr(chunk, i); 3826 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3827 return 0; 3828 } 3829 3830 return 1; 3831 } 3832 3833 static u64 calc_data_stripes(u64 type, int num_stripes) 3834 { 3835 const int index = btrfs_bg_flags_to_raid_index(type); 3836 const int ncopies = btrfs_raid_array[index].ncopies; 3837 const int nparity = btrfs_raid_array[index].nparity; 3838 3839 return (num_stripes - nparity) / ncopies; 3840 } 3841 3842 /* [pstart, pend) */ 3843 static int chunk_drange_filter(struct extent_buffer *leaf, 3844 struct btrfs_chunk *chunk, 3845 struct btrfs_balance_args *bargs) 3846 { 3847 struct btrfs_stripe *stripe; 3848 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3849 u64 stripe_offset; 3850 u64 stripe_length; 3851 u64 type; 3852 int factor; 3853 int i; 3854 3855 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3856 return 0; 3857 3858 type = btrfs_chunk_type(leaf, chunk); 3859 factor = calc_data_stripes(type, num_stripes); 3860 3861 for (i = 0; i < num_stripes; i++) { 3862 stripe = btrfs_stripe_nr(chunk, i); 3863 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3864 continue; 3865 3866 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3867 stripe_length = btrfs_chunk_length(leaf, chunk); 3868 stripe_length = div_u64(stripe_length, factor); 3869 3870 if (stripe_offset < bargs->pend && 3871 stripe_offset + stripe_length > bargs->pstart) 3872 return 0; 3873 } 3874 3875 return 1; 3876 } 3877 3878 /* [vstart, vend) */ 3879 static int chunk_vrange_filter(struct extent_buffer *leaf, 3880 struct btrfs_chunk *chunk, 3881 u64 chunk_offset, 3882 struct btrfs_balance_args *bargs) 3883 { 3884 if (chunk_offset < bargs->vend && 3885 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3886 /* at least part of the chunk is inside this vrange */ 3887 return 0; 3888 3889 return 1; 3890 } 3891 3892 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3893 struct btrfs_chunk *chunk, 3894 struct btrfs_balance_args *bargs) 3895 { 3896 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3897 3898 if (bargs->stripes_min <= num_stripes 3899 && num_stripes <= bargs->stripes_max) 3900 return 0; 3901 3902 return 1; 3903 } 3904 3905 static int chunk_soft_convert_filter(u64 chunk_type, 3906 struct btrfs_balance_args *bargs) 3907 { 3908 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3909 return 0; 3910 3911 chunk_type = chunk_to_extended(chunk_type) & 3912 BTRFS_EXTENDED_PROFILE_MASK; 3913 3914 if (bargs->target == chunk_type) 3915 return 1; 3916 3917 return 0; 3918 } 3919 3920 static int should_balance_chunk(struct extent_buffer *leaf, 3921 struct btrfs_chunk *chunk, u64 chunk_offset) 3922 { 3923 struct btrfs_fs_info *fs_info = leaf->fs_info; 3924 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3925 struct btrfs_balance_args *bargs = NULL; 3926 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3927 3928 /* type filter */ 3929 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3930 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3931 return 0; 3932 } 3933 3934 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3935 bargs = &bctl->data; 3936 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3937 bargs = &bctl->sys; 3938 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3939 bargs = &bctl->meta; 3940 3941 /* profiles 
filter */ 3942 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3943 chunk_profiles_filter(chunk_type, bargs)) { 3944 return 0; 3945 } 3946 3947 /* usage filter */ 3948 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3949 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3950 return 0; 3951 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3952 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3953 return 0; 3954 } 3955 3956 /* devid filter */ 3957 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3958 chunk_devid_filter(leaf, chunk, bargs)) { 3959 return 0; 3960 } 3961 3962 /* drange filter, makes sense only with devid filter */ 3963 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3964 chunk_drange_filter(leaf, chunk, bargs)) { 3965 return 0; 3966 } 3967 3968 /* vrange filter */ 3969 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3970 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3971 return 0; 3972 } 3973 3974 /* stripes filter */ 3975 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3976 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3977 return 0; 3978 } 3979 3980 /* soft profile changing mode */ 3981 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3982 chunk_soft_convert_filter(chunk_type, bargs)) { 3983 return 0; 3984 } 3985 3986 /* 3987 * limited by count, must be the last filter 3988 */ 3989 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3990 if (bargs->limit == 0) 3991 return 0; 3992 else 3993 bargs->limit--; 3994 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3995 /* 3996 * Same logic as the 'limit' filter; the minimum cannot be 3997 * determined here because we do not have the global information 3998 * about the count of all chunks that satisfy the filters. 3999 */ 4000 if (bargs->limit_max == 0) 4001 return 0; 4002 else 4003 bargs->limit_max--; 4004 } 4005 4006 return 1; 4007 } 4008 4009 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 4010 { 4011 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4012 struct btrfs_root *chunk_root = fs_info->chunk_root; 4013 u64 chunk_type; 4014 struct btrfs_chunk *chunk; 4015 struct btrfs_path *path = NULL; 4016 struct btrfs_key key; 4017 struct btrfs_key found_key; 4018 struct extent_buffer *leaf; 4019 int slot; 4020 int ret; 4021 int enospc_errors = 0; 4022 bool counting = true; 4023 /* The single value limit and min/max limits use the same bytes in the */ 4024 u64 limit_data = bctl->data.limit; 4025 u64 limit_meta = bctl->meta.limit; 4026 u64 limit_sys = bctl->sys.limit; 4027 u32 count_data = 0; 4028 u32 count_meta = 0; 4029 u32 count_sys = 0; 4030 int chunk_reserved = 0; 4031 4032 path = btrfs_alloc_path(); 4033 if (!path) { 4034 ret = -ENOMEM; 4035 goto error; 4036 } 4037 4038 /* zero out stat counters */ 4039 spin_lock(&fs_info->balance_lock); 4040 memset(&bctl->stat, 0, sizeof(bctl->stat)); 4041 spin_unlock(&fs_info->balance_lock); 4042 again: 4043 if (!counting) { 4044 /* 4045 * The single value limit and min/max limits use the same bytes 4046 * in the 4047 */ 4048 bctl->data.limit = limit_data; 4049 bctl->meta.limit = limit_meta; 4050 bctl->sys.limit = limit_sys; 4051 } 4052 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 4053 key.offset = (u64)-1; 4054 key.type = BTRFS_CHUNK_ITEM_KEY; 4055 4056 while (1) { 4057 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 4058 atomic_read(&fs_info->balance_cancel_req)) { 4059 ret = -ECANCELED; 4060 goto error; 4061 } 4062 4063 mutex_lock(&fs_info->reclaim_bgs_lock); 4064 ret = 
btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 4065 if (ret < 0) { 4066 mutex_unlock(&fs_info->reclaim_bgs_lock); 4067 goto error; 4068 } 4069 4070 /* 4071 * this shouldn't happen, it means the last relocate 4072 * failed 4073 */ 4074 if (ret == 0) 4075 BUG(); /* FIXME break ? */ 4076 4077 ret = btrfs_previous_item(chunk_root, path, 0, 4078 BTRFS_CHUNK_ITEM_KEY); 4079 if (ret) { 4080 mutex_unlock(&fs_info->reclaim_bgs_lock); 4081 ret = 0; 4082 break; 4083 } 4084 4085 leaf = path->nodes[0]; 4086 slot = path->slots[0]; 4087 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4088 4089 if (found_key.objectid != key.objectid) { 4090 mutex_unlock(&fs_info->reclaim_bgs_lock); 4091 break; 4092 } 4093 4094 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 4095 chunk_type = btrfs_chunk_type(leaf, chunk); 4096 4097 if (!counting) { 4098 spin_lock(&fs_info->balance_lock); 4099 bctl->stat.considered++; 4100 spin_unlock(&fs_info->balance_lock); 4101 } 4102 4103 ret = should_balance_chunk(leaf, chunk, found_key.offset); 4104 4105 btrfs_release_path(path); 4106 if (!ret) { 4107 mutex_unlock(&fs_info->reclaim_bgs_lock); 4108 goto loop; 4109 } 4110 4111 if (counting) { 4112 mutex_unlock(&fs_info->reclaim_bgs_lock); 4113 spin_lock(&fs_info->balance_lock); 4114 bctl->stat.expected++; 4115 spin_unlock(&fs_info->balance_lock); 4116 4117 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 4118 count_data++; 4119 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 4120 count_sys++; 4121 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 4122 count_meta++; 4123 4124 goto loop; 4125 } 4126 4127 /* 4128 * Apply limit_min filter, no need to check if the LIMITS 4129 * filter is used, limit_min is 0 by default 4130 */ 4131 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 4132 count_data < bctl->data.limit_min) 4133 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 4134 count_meta < bctl->meta.limit_min) 4135 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 4136 count_sys < bctl->sys.limit_min)) { 4137 mutex_unlock(&fs_info->reclaim_bgs_lock); 4138 goto loop; 4139 } 4140 4141 if (!chunk_reserved) { 4142 /* 4143 * We may be relocating the only data chunk we have, 4144 * which could potentially end up with losing data's 4145 * raid profile, so lets allocate an empty one in 4146 * advance. 4147 */ 4148 ret = btrfs_may_alloc_data_chunk(fs_info, 4149 found_key.offset); 4150 if (ret < 0) { 4151 mutex_unlock(&fs_info->reclaim_bgs_lock); 4152 goto error; 4153 } else if (ret == 1) { 4154 chunk_reserved = 1; 4155 } 4156 } 4157 4158 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 4159 mutex_unlock(&fs_info->reclaim_bgs_lock); 4160 if (ret == -ENOSPC) { 4161 enospc_errors++; 4162 } else if (ret == -ETXTBSY) { 4163 btrfs_info(fs_info, 4164 "skipping relocation of block group %llu due to active swapfile", 4165 found_key.offset); 4166 ret = 0; 4167 } else if (ret) { 4168 goto error; 4169 } else { 4170 spin_lock(&fs_info->balance_lock); 4171 bctl->stat.completed++; 4172 spin_unlock(&fs_info->balance_lock); 4173 } 4174 loop: 4175 if (found_key.offset == 0) 4176 break; 4177 key.offset = found_key.offset - 1; 4178 } 4179 4180 if (counting) { 4181 btrfs_release_path(path); 4182 counting = false; 4183 goto again; 4184 } 4185 error: 4186 btrfs_free_path(path); 4187 if (enospc_errors) { 4188 btrfs_info(fs_info, "%d enospc errors during balance", 4189 enospc_errors); 4190 if (!ret) 4191 ret = -ENOSPC; 4192 } 4193 4194 return ret; 4195 } 4196 4197 /* 4198 * See if a given profile is valid and reduced. 
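 * A reduced profile has at most one profile bit set (e.g. RAID1 alone, not RAID1|RAID10 left over from an unfinished conversion).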
4199 * 4200 * @flags: profile to validate 4201 * @extended: if true @flags is treated as an extended profile 4202 */ 4203 static int alloc_profile_is_valid(u64 flags, int extended) 4204 { 4205 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 4206 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4207 4208 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4209 4210 /* 1) check that all other bits are zeroed */ 4211 if (flags & ~mask) 4212 return 0; 4213 4214 /* 2) see if profile is reduced */ 4215 if (flags == 0) 4216 return !extended; /* "0" is valid for usual profiles */ 4217 4218 return has_single_bit_set(flags); 4219 } 4220 4221 /* 4222 * Validate target profile against allowed profiles and return true if it's OK. 4223 * Otherwise print the error message and return false. 4224 */ 4225 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4226 const struct btrfs_balance_args *bargs, 4227 u64 allowed, const char *type) 4228 { 4229 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4230 return true; 4231 4232 /* Profile is valid and does not have bits outside of the allowed set */ 4233 if (alloc_profile_is_valid(bargs->target, 1) && 4234 (bargs->target & ~allowed) == 0) 4235 return true; 4236 4237 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4238 type, btrfs_bg_type_to_raid_name(bargs->target)); 4239 return false; 4240 } 4241 4242 /* 4243 * Fill @buf with textual description of balance filter flags @bargs, up to 4244 * @size_buf including the terminating null. The output may be trimmed if it 4245 * does not fit into the provided buffer. 4246 */ 4247 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4248 u32 size_buf) 4249 { 4250 int ret; 4251 u32 size_bp = size_buf; 4252 char *bp = buf; 4253 u64 flags = bargs->flags; 4254 char tmp_buf[128] = {'\0'}; 4255 4256 if (!flags) 4257 return; 4258 4259 #define CHECK_APPEND_NOARG(a) \ 4260 do { \ 4261 ret = snprintf(bp, size_bp, (a)); \ 4262 if (ret < 0 || ret >= size_bp) \ 4263 goto out_overflow; \ 4264 size_bp -= ret; \ 4265 bp += ret; \ 4266 } while (0) 4267 4268 #define CHECK_APPEND_1ARG(a, v1) \ 4269 do { \ 4270 ret = snprintf(bp, size_bp, (a), (v1)); \ 4271 if (ret < 0 || ret >= size_bp) \ 4272 goto out_overflow; \ 4273 size_bp -= ret; \ 4274 bp += ret; \ 4275 } while (0) 4276 4277 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4278 do { \ 4279 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4280 if (ret < 0 || ret >= size_bp) \ 4281 goto out_overflow; \ 4282 size_bp -= ret; \ 4283 bp += ret; \ 4284 } while (0) 4285 4286 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4287 CHECK_APPEND_1ARG("convert=%s,", 4288 btrfs_bg_type_to_raid_name(bargs->target)); 4289 4290 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4291 CHECK_APPEND_NOARG("soft,"); 4292 4293 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4294 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4295 sizeof(tmp_buf)); 4296 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4297 } 4298 4299 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4300 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4301 4302 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4303 CHECK_APPEND_2ARG("usage=%u..%u,", 4304 bargs->usage_min, bargs->usage_max); 4305 4306 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4307 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4308 4309 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4310 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4311 bargs->pstart, bargs->pend); 4312 4313 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4314 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4315 bargs->vstart, bargs->vend); 4316 4317 if 
(flags & BTRFS_BALANCE_ARGS_LIMIT) 4318 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4319 4320 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4321 CHECK_APPEND_2ARG("limit=%u..%u,", 4322 bargs->limit_min, bargs->limit_max); 4323 4324 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4325 CHECK_APPEND_2ARG("stripes=%u..%u,", 4326 bargs->stripes_min, bargs->stripes_max); 4327 4328 #undef CHECK_APPEND_2ARG 4329 #undef CHECK_APPEND_1ARG 4330 #undef CHECK_APPEND_NOARG 4331 4332 out_overflow: 4333 4334 if (size_bp < size_buf) 4335 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4336 else 4337 buf[0] = '\0'; 4338 } 4339 4340 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4341 { 4342 u32 size_buf = 1024; 4343 char tmp_buf[192] = {'\0'}; 4344 char *buf; 4345 char *bp; 4346 u32 size_bp = size_buf; 4347 int ret; 4348 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4349 4350 buf = kzalloc(size_buf, GFP_KERNEL); 4351 if (!buf) 4352 return; 4353 4354 bp = buf; 4355 4356 #define CHECK_APPEND_1ARG(a, v1) \ 4357 do { \ 4358 ret = snprintf(bp, size_bp, (a), (v1)); \ 4359 if (ret < 0 || ret >= size_bp) \ 4360 goto out_overflow; \ 4361 size_bp -= ret; \ 4362 bp += ret; \ 4363 } while (0) 4364 4365 if (bctl->flags & BTRFS_BALANCE_FORCE) 4366 CHECK_APPEND_1ARG("%s", "-f "); 4367 4368 if (bctl->flags & BTRFS_BALANCE_DATA) { 4369 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4370 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4371 } 4372 4373 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4374 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 4375 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4376 } 4377 4378 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4379 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4380 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4381 } 4382 4383 #undef CHECK_APPEND_1ARG 4384 4385 out_overflow: 4386 4387 if (size_bp < size_buf) 4388 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4389 btrfs_info(fs_info, "balance: %s %s", 4390 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4391 "resume" : "start", buf); 4392 4393 kfree(buf); 4394 } 4395 4396 /* 4397 * Should be called with balance mutexe held 4398 */ 4399 int btrfs_balance(struct btrfs_fs_info *fs_info, 4400 struct btrfs_balance_control *bctl, 4401 struct btrfs_ioctl_balance_args *bargs) 4402 { 4403 u64 meta_target, data_target; 4404 u64 allowed; 4405 int mixed = 0; 4406 int ret; 4407 u64 num_devices; 4408 unsigned seq; 4409 bool reducing_redundancy; 4410 bool paused = false; 4411 int i; 4412 4413 if (btrfs_fs_closing(fs_info) || 4414 atomic_read(&fs_info->balance_pause_req) || 4415 btrfs_should_cancel_balance(fs_info)) { 4416 ret = -EINVAL; 4417 goto out; 4418 } 4419 4420 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4421 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4422 mixed = 1; 4423 4424 /* 4425 * In case of mixed groups both data and meta should be picked, 4426 * and identical options should be given for both of them. 
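 * For example, a convert has to be requested with the same target profile for both data and metadata.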
4427 */ 4428 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4429 if (mixed && (bctl->flags & allowed)) { 4430 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4431 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4432 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4433 btrfs_err(fs_info, 4434 "balance: mixed groups data and metadata options must be the same"); 4435 ret = -EINVAL; 4436 goto out; 4437 } 4438 } 4439 4440 /* 4441 * rw_devices will not change at the moment, device add/delete/replace 4442 * are exclusive 4443 */ 4444 num_devices = fs_info->fs_devices->rw_devices; 4445 4446 /* 4447 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4448 * special bit for it, to make it easier to distinguish. Thus we need 4449 * to set it manually, or balance would refuse the profile. 4450 */ 4451 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4452 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4453 if (num_devices >= btrfs_raid_array[i].devs_min) 4454 allowed |= btrfs_raid_array[i].bg_flag; 4455 4456 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4457 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4458 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4459 ret = -EINVAL; 4460 goto out; 4461 } 4462 4463 /* 4464 * Allow to reduce metadata or system integrity only if force set for 4465 * profiles with redundancy (copies, parity) 4466 */ 4467 allowed = 0; 4468 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4469 if (btrfs_raid_array[i].ncopies >= 2 || 4470 btrfs_raid_array[i].tolerated_failures >= 1) 4471 allowed |= btrfs_raid_array[i].bg_flag; 4472 } 4473 do { 4474 seq = read_seqbegin(&fs_info->profiles_lock); 4475 4476 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4477 (fs_info->avail_system_alloc_bits & allowed) && 4478 !(bctl->sys.target & allowed)) || 4479 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4480 (fs_info->avail_metadata_alloc_bits & allowed) && 4481 !(bctl->meta.target & allowed))) 4482 reducing_redundancy = true; 4483 else 4484 reducing_redundancy = false; 4485 4486 /* if we're not converting, the target field is uninitialized */ 4487 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4488 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4489 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
4490 bctl->data.target : fs_info->avail_data_alloc_bits; 4491 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4492 4493 if (reducing_redundancy) { 4494 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4495 btrfs_info(fs_info, 4496 "balance: force reducing metadata redundancy"); 4497 } else { 4498 btrfs_err(fs_info, 4499 "balance: reduces metadata redundancy, use --force if you want this"); 4500 ret = -EINVAL; 4501 goto out; 4502 } 4503 } 4504 4505 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4506 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4507 btrfs_warn(fs_info, 4508 "balance: metadata profile %s has lower redundancy than data profile %s", 4509 btrfs_bg_type_to_raid_name(meta_target), 4510 btrfs_bg_type_to_raid_name(data_target)); 4511 } 4512 4513 ret = insert_balance_item(fs_info, bctl); 4514 if (ret && ret != -EEXIST) 4515 goto out; 4516 4517 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4518 BUG_ON(ret == -EEXIST); 4519 BUG_ON(fs_info->balance_ctl); 4520 spin_lock(&fs_info->balance_lock); 4521 fs_info->balance_ctl = bctl; 4522 spin_unlock(&fs_info->balance_lock); 4523 } else { 4524 BUG_ON(ret != -EEXIST); 4525 spin_lock(&fs_info->balance_lock); 4526 update_balance_args(bctl); 4527 spin_unlock(&fs_info->balance_lock); 4528 } 4529 4530 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4531 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4532 describe_balance_start_or_resume(fs_info); 4533 mutex_unlock(&fs_info->balance_mutex); 4534 4535 ret = __btrfs_balance(fs_info); 4536 4537 mutex_lock(&fs_info->balance_mutex); 4538 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4539 btrfs_info(fs_info, "balance: paused"); 4540 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4541 paused = true; 4542 } 4543 /* 4544 * Balance can be canceled by: 4545 * 4546 * - Regular cancel request 4547 * Then ret == -ECANCELED and balance_cancel_req > 0 4548 * 4549 * - Fatal signal to "btrfs" process 4550 * Either the signal caught by wait_reserve_ticket() and callers 4551 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4552 * got -ECANCELED. 4553 * Either way, in this case balance_cancel_req = 0, and 4554 * ret == -EINTR or ret == -ECANCELED. 4555 * 4556 * So here we only check the return value to catch canceled balance. 4557 */ 4558 else if (ret == -ECANCELED || ret == -EINTR) 4559 btrfs_info(fs_info, "balance: canceled"); 4560 else 4561 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4562 4563 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4564 4565 if (bargs) { 4566 memset(bargs, 0, sizeof(*bargs)); 4567 btrfs_update_ioctl_balance_args(fs_info, bargs); 4568 } 4569 4570 /* We didn't pause, we can clean everything up. 
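 * The balance finished, failed or was canceled, so the on-disk balance item and the in-memory control structure are no longer needed.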
*/ 4571 if (!paused) { 4572 reset_balance_state(fs_info); 4573 btrfs_exclop_finish(fs_info); 4574 } 4575 4576 wake_up(&fs_info->balance_wait_q); 4577 4578 return ret; 4579 out: 4580 if (bctl->flags & BTRFS_BALANCE_RESUME) 4581 reset_balance_state(fs_info); 4582 else 4583 kfree(bctl); 4584 btrfs_exclop_finish(fs_info); 4585 4586 return ret; 4587 } 4588 4589 static int balance_kthread(void *data) 4590 { 4591 struct btrfs_fs_info *fs_info = data; 4592 int ret = 0; 4593 4594 sb_start_write(fs_info->sb); 4595 mutex_lock(&fs_info->balance_mutex); 4596 if (fs_info->balance_ctl) 4597 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4598 mutex_unlock(&fs_info->balance_mutex); 4599 sb_end_write(fs_info->sb); 4600 4601 return ret; 4602 } 4603 4604 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4605 { 4606 struct task_struct *tsk; 4607 4608 mutex_lock(&fs_info->balance_mutex); 4609 if (!fs_info->balance_ctl) { 4610 mutex_unlock(&fs_info->balance_mutex); 4611 return 0; 4612 } 4613 mutex_unlock(&fs_info->balance_mutex); 4614 4615 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4616 btrfs_info(fs_info, "balance: resume skipped"); 4617 return 0; 4618 } 4619 4620 spin_lock(&fs_info->super_lock); 4621 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4622 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4623 spin_unlock(&fs_info->super_lock); 4624 /* 4625 * A ro->rw remount sequence should continue with the paused balance 4626 * regardless of who pauses it, system or the user as of now, so set 4627 * the resume flag. 4628 */ 4629 spin_lock(&fs_info->balance_lock); 4630 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4631 spin_unlock(&fs_info->balance_lock); 4632 4633 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4634 return PTR_ERR_OR_ZERO(tsk); 4635 } 4636 4637 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4638 { 4639 struct btrfs_balance_control *bctl; 4640 struct btrfs_balance_item *item; 4641 struct btrfs_disk_balance_args disk_bargs; 4642 struct btrfs_path *path; 4643 struct extent_buffer *leaf; 4644 struct btrfs_key key; 4645 int ret; 4646 4647 path = btrfs_alloc_path(); 4648 if (!path) 4649 return -ENOMEM; 4650 4651 key.objectid = BTRFS_BALANCE_OBJECTID; 4652 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4653 key.offset = 0; 4654 4655 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4656 if (ret < 0) 4657 goto out; 4658 if (ret > 0) { /* ret = -ENOENT; */ 4659 ret = 0; 4660 goto out; 4661 } 4662 4663 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4664 if (!bctl) { 4665 ret = -ENOMEM; 4666 goto out; 4667 } 4668 4669 leaf = path->nodes[0]; 4670 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4671 4672 bctl->flags = btrfs_balance_flags(leaf, item); 4673 bctl->flags |= BTRFS_BALANCE_RESUME; 4674 4675 btrfs_balance_data(leaf, item, &disk_bargs); 4676 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4677 btrfs_balance_meta(leaf, item, &disk_bargs); 4678 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4679 btrfs_balance_sys(leaf, item, &disk_bargs); 4680 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4681 4682 /* 4683 * This should never happen, as the paused balance state is recovered 4684 * during mount without any chance of other exclusive ops to collide. 4685 * 4686 * This gives the exclusive op status to balance and keeps in paused 4687 * state until user intervention (cancel or umount). If the ownership 4688 * cannot be assigned, show a message but do not fail. 
The balance 4689 * is in a paused state and must have fs_info::balance_ctl properly 4690 * set up. 4691 */ 4692 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4693 btrfs_warn(fs_info, 4694 "balance: cannot set exclusive op status, resume manually"); 4695 4696 btrfs_release_path(path); 4697 4698 mutex_lock(&fs_info->balance_mutex); 4699 BUG_ON(fs_info->balance_ctl); 4700 spin_lock(&fs_info->balance_lock); 4701 fs_info->balance_ctl = bctl; 4702 spin_unlock(&fs_info->balance_lock); 4703 mutex_unlock(&fs_info->balance_mutex); 4704 out: 4705 btrfs_free_path(path); 4706 return ret; 4707 } 4708 4709 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4710 { 4711 int ret = 0; 4712 4713 mutex_lock(&fs_info->balance_mutex); 4714 if (!fs_info->balance_ctl) { 4715 mutex_unlock(&fs_info->balance_mutex); 4716 return -ENOTCONN; 4717 } 4718 4719 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4720 atomic_inc(&fs_info->balance_pause_req); 4721 mutex_unlock(&fs_info->balance_mutex); 4722 4723 wait_event(fs_info->balance_wait_q, 4724 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4725 4726 mutex_lock(&fs_info->balance_mutex); 4727 /* we are good with balance_ctl ripped off from under us */ 4728 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4729 atomic_dec(&fs_info->balance_pause_req); 4730 } else { 4731 ret = -ENOTCONN; 4732 } 4733 4734 mutex_unlock(&fs_info->balance_mutex); 4735 return ret; 4736 } 4737 4738 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4739 { 4740 mutex_lock(&fs_info->balance_mutex); 4741 if (!fs_info->balance_ctl) { 4742 mutex_unlock(&fs_info->balance_mutex); 4743 return -ENOTCONN; 4744 } 4745 4746 /* 4747 * A paused balance with the item stored on disk can be resumed at 4748 * mount time if the mount is read-write. Otherwise it's still paused 4749 * and we must not allow cancelling as it deletes the item. 4750 */ 4751 if (sb_rdonly(fs_info->sb)) { 4752 mutex_unlock(&fs_info->balance_mutex); 4753 return -EROFS; 4754 } 4755 4756 atomic_inc(&fs_info->balance_cancel_req); 4757 /* 4758 * if we are running just wait and return, balance item is 4759 * deleted in btrfs_balance in this case 4760 */ 4761 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4762 mutex_unlock(&fs_info->balance_mutex); 4763 wait_event(fs_info->balance_wait_q, 4764 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4765 mutex_lock(&fs_info->balance_mutex); 4766 } else { 4767 mutex_unlock(&fs_info->balance_mutex); 4768 /* 4769 * Lock released to allow other waiters to continue, we'll 4770 * reexamine the status again. 
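 * Another task may have finished or reset the balance in the meantime, hence the balance_ctl check below.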
4771 */ 4772 mutex_lock(&fs_info->balance_mutex); 4773 4774 if (fs_info->balance_ctl) { 4775 reset_balance_state(fs_info); 4776 btrfs_exclop_finish(fs_info); 4777 btrfs_info(fs_info, "balance: canceled"); 4778 } 4779 } 4780 4781 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4782 atomic_dec(&fs_info->balance_cancel_req); 4783 mutex_unlock(&fs_info->balance_mutex); 4784 return 0; 4785 } 4786 4787 int btrfs_uuid_scan_kthread(void *data) 4788 { 4789 struct btrfs_fs_info *fs_info = data; 4790 struct btrfs_root *root = fs_info->tree_root; 4791 struct btrfs_key key; 4792 struct btrfs_path *path = NULL; 4793 int ret = 0; 4794 struct extent_buffer *eb; 4795 int slot; 4796 struct btrfs_root_item root_item; 4797 u32 item_size; 4798 struct btrfs_trans_handle *trans = NULL; 4799 bool closing = false; 4800 4801 path = btrfs_alloc_path(); 4802 if (!path) { 4803 ret = -ENOMEM; 4804 goto out; 4805 } 4806 4807 key.objectid = 0; 4808 key.type = BTRFS_ROOT_ITEM_KEY; 4809 key.offset = 0; 4810 4811 while (1) { 4812 if (btrfs_fs_closing(fs_info)) { 4813 closing = true; 4814 break; 4815 } 4816 ret = btrfs_search_forward(root, &key, path, 4817 BTRFS_OLDEST_GENERATION); 4818 if (ret) { 4819 if (ret > 0) 4820 ret = 0; 4821 break; 4822 } 4823 4824 if (key.type != BTRFS_ROOT_ITEM_KEY || 4825 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4826 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4827 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4828 goto skip; 4829 4830 eb = path->nodes[0]; 4831 slot = path->slots[0]; 4832 item_size = btrfs_item_size(eb, slot); 4833 if (item_size < sizeof(root_item)) 4834 goto skip; 4835 4836 read_extent_buffer(eb, &root_item, 4837 btrfs_item_ptr_offset(eb, slot), 4838 (int)sizeof(root_item)); 4839 if (btrfs_root_refs(&root_item) == 0) 4840 goto skip; 4841 4842 if (!btrfs_is_empty_uuid(root_item.uuid) || 4843 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4844 if (trans) 4845 goto update_tree; 4846 4847 btrfs_release_path(path); 4848 /* 4849 * 1 - subvol uuid item 4850 * 1 - received_subvol uuid item 4851 */ 4852 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4853 if (IS_ERR(trans)) { 4854 ret = PTR_ERR(trans); 4855 break; 4856 } 4857 continue; 4858 } else { 4859 goto skip; 4860 } 4861 update_tree: 4862 btrfs_release_path(path); 4863 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4864 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4865 BTRFS_UUID_KEY_SUBVOL, 4866 key.objectid); 4867 if (ret < 0) { 4868 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4869 ret); 4870 break; 4871 } 4872 } 4873 4874 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4875 ret = btrfs_uuid_tree_add(trans, 4876 root_item.received_uuid, 4877 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4878 key.objectid); 4879 if (ret < 0) { 4880 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4881 ret); 4882 break; 4883 } 4884 } 4885 4886 skip: 4887 btrfs_release_path(path); 4888 if (trans) { 4889 ret = btrfs_end_transaction(trans); 4890 trans = NULL; 4891 if (ret) 4892 break; 4893 } 4894 4895 if (key.offset < (u64)-1) { 4896 key.offset++; 4897 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4898 key.offset = 0; 4899 key.type = BTRFS_ROOT_ITEM_KEY; 4900 } else if (key.objectid < (u64)-1) { 4901 key.offset = 0; 4902 key.type = BTRFS_ROOT_ITEM_KEY; 4903 key.objectid++; 4904 } else { 4905 break; 4906 } 4907 cond_resched(); 4908 } 4909 4910 out: 4911 btrfs_free_path(path); 4912 if (trans && !IS_ERR(trans)) 4913 btrfs_end_transaction(trans); 4914 if (ret) 4915 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4916 else if 
(!closing) 4917 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4918 up(&fs_info->uuid_tree_rescan_sem); 4919 return 0; 4920 } 4921 4922 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4923 { 4924 struct btrfs_trans_handle *trans; 4925 struct btrfs_root *tree_root = fs_info->tree_root; 4926 struct btrfs_root *uuid_root; 4927 struct task_struct *task; 4928 int ret; 4929 4930 /* 4931 * 1 - root node 4932 * 1 - root item 4933 */ 4934 trans = btrfs_start_transaction(tree_root, 2); 4935 if (IS_ERR(trans)) 4936 return PTR_ERR(trans); 4937 4938 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4939 if (IS_ERR(uuid_root)) { 4940 ret = PTR_ERR(uuid_root); 4941 btrfs_abort_transaction(trans, ret); 4942 btrfs_end_transaction(trans); 4943 return ret; 4944 } 4945 4946 fs_info->uuid_root = uuid_root; 4947 4948 ret = btrfs_commit_transaction(trans); 4949 if (ret) 4950 return ret; 4951 4952 down(&fs_info->uuid_tree_rescan_sem); 4953 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4954 if (IS_ERR(task)) { 4955 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4956 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4957 up(&fs_info->uuid_tree_rescan_sem); 4958 return PTR_ERR(task); 4959 } 4960 4961 return 0; 4962 } 4963 4964 /* 4965 * shrinking a device means finding all of the device extents past 4966 * the new size, and then following the back refs to the chunks. 4967 * The chunk relocation code actually frees the device extent 4968 */ 4969 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4970 { 4971 struct btrfs_fs_info *fs_info = device->fs_info; 4972 struct btrfs_root *root = fs_info->dev_root; 4973 struct btrfs_trans_handle *trans; 4974 struct btrfs_dev_extent *dev_extent = NULL; 4975 struct btrfs_path *path; 4976 u64 length; 4977 u64 chunk_offset; 4978 int ret; 4979 int slot; 4980 int failed = 0; 4981 bool retried = false; 4982 struct extent_buffer *l; 4983 struct btrfs_key key; 4984 struct btrfs_super_block *super_copy = fs_info->super_copy; 4985 u64 old_total = btrfs_super_total_bytes(super_copy); 4986 u64 old_size = btrfs_device_get_total_bytes(device); 4987 u64 diff; 4988 u64 start; 4989 u64 free_diff = 0; 4990 4991 new_size = round_down(new_size, fs_info->sectorsize); 4992 start = new_size; 4993 diff = round_down(old_size - new_size, fs_info->sectorsize); 4994 4995 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4996 return -EINVAL; 4997 4998 path = btrfs_alloc_path(); 4999 if (!path) 5000 return -ENOMEM; 5001 5002 path->reada = READA_BACK; 5003 5004 trans = btrfs_start_transaction(root, 0); 5005 if (IS_ERR(trans)) { 5006 btrfs_free_path(path); 5007 return PTR_ERR(trans); 5008 } 5009 5010 mutex_lock(&fs_info->chunk_mutex); 5011 5012 btrfs_device_set_total_bytes(device, new_size); 5013 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5014 device->fs_devices->total_rw_bytes -= diff; 5015 5016 /* 5017 * The new free_chunk_space is new_size - used, so we have to 5018 * subtract the delta of the old free_chunk_space which included 5019 * old_size - used. If used > new_size then just subtract this 5020 * entire device's free space. 
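 * For example, with old_size 100G, new_size 60G and bytes_used 20G the device's free space drops from 80G to 40G, so free_diff is 40G.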
5021 */ 5022 if (device->bytes_used < new_size) 5023 free_diff = (old_size - device->bytes_used) - 5024 (new_size - device->bytes_used); 5025 else 5026 free_diff = old_size - device->bytes_used; 5027 atomic64_sub(free_diff, &fs_info->free_chunk_space); 5028 } 5029 5030 /* 5031 * Once the device's size has been set to the new size, ensure all 5032 * in-memory chunks are synced to disk so that the loop below sees them 5033 * and relocates them accordingly. 5034 */ 5035 if (contains_pending_extent(device, &start, diff)) { 5036 mutex_unlock(&fs_info->chunk_mutex); 5037 ret = btrfs_commit_transaction(trans); 5038 if (ret) 5039 goto done; 5040 } else { 5041 mutex_unlock(&fs_info->chunk_mutex); 5042 btrfs_end_transaction(trans); 5043 } 5044 5045 again: 5046 key.objectid = device->devid; 5047 key.offset = (u64)-1; 5048 key.type = BTRFS_DEV_EXTENT_KEY; 5049 5050 do { 5051 mutex_lock(&fs_info->reclaim_bgs_lock); 5052 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5053 if (ret < 0) { 5054 mutex_unlock(&fs_info->reclaim_bgs_lock); 5055 goto done; 5056 } 5057 5058 ret = btrfs_previous_item(root, path, 0, key.type); 5059 if (ret) { 5060 mutex_unlock(&fs_info->reclaim_bgs_lock); 5061 if (ret < 0) 5062 goto done; 5063 ret = 0; 5064 btrfs_release_path(path); 5065 break; 5066 } 5067 5068 l = path->nodes[0]; 5069 slot = path->slots[0]; 5070 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 5071 5072 if (key.objectid != device->devid) { 5073 mutex_unlock(&fs_info->reclaim_bgs_lock); 5074 btrfs_release_path(path); 5075 break; 5076 } 5077 5078 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 5079 length = btrfs_dev_extent_length(l, dev_extent); 5080 5081 if (key.offset + length <= new_size) { 5082 mutex_unlock(&fs_info->reclaim_bgs_lock); 5083 btrfs_release_path(path); 5084 break; 5085 } 5086 5087 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 5088 btrfs_release_path(path); 5089 5090 /* 5091 * We may be relocating the only data chunk we have, 5092 * which could potentially end up with losing data's 5093 * raid profile, so lets allocate an empty one in 5094 * advance. 5095 */ 5096 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 5097 if (ret < 0) { 5098 mutex_unlock(&fs_info->reclaim_bgs_lock); 5099 goto done; 5100 } 5101 5102 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 5103 mutex_unlock(&fs_info->reclaim_bgs_lock); 5104 if (ret == -ENOSPC) { 5105 failed++; 5106 } else if (ret) { 5107 if (ret == -ETXTBSY) { 5108 btrfs_warn(fs_info, 5109 "could not shrink block group %llu due to active swapfile", 5110 chunk_offset); 5111 } 5112 goto done; 5113 } 5114 } while (key.offset-- > 0); 5115 5116 if (failed && !retried) { 5117 failed = 0; 5118 retried = true; 5119 goto again; 5120 } else if (failed && retried) { 5121 ret = -ENOSPC; 5122 goto done; 5123 } 5124 5125 /* Shrinking succeeded, else we would be at "done". 
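 * Persist the new size below: shrink the device item and the superblock total bytes.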
*/ 5126 trans = btrfs_start_transaction(root, 0); 5127 if (IS_ERR(trans)) { 5128 ret = PTR_ERR(trans); 5129 goto done; 5130 } 5131 5132 mutex_lock(&fs_info->chunk_mutex); 5133 /* Clear all state bits beyond the shrunk device size */ 5134 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 5135 CHUNK_STATE_MASK); 5136 5137 btrfs_device_set_disk_total_bytes(device, new_size); 5138 if (list_empty(&device->post_commit_list)) 5139 list_add_tail(&device->post_commit_list, 5140 &trans->transaction->dev_update_list); 5141 5142 WARN_ON(diff > old_total); 5143 btrfs_set_super_total_bytes(super_copy, 5144 round_down(old_total - diff, fs_info->sectorsize)); 5145 mutex_unlock(&fs_info->chunk_mutex); 5146 5147 btrfs_reserve_chunk_metadata(trans, false); 5148 /* Now btrfs_update_device() will change the on-disk size. */ 5149 ret = btrfs_update_device(trans, device); 5150 btrfs_trans_release_chunk_metadata(trans); 5151 if (ret < 0) { 5152 btrfs_abort_transaction(trans, ret); 5153 btrfs_end_transaction(trans); 5154 } else { 5155 ret = btrfs_commit_transaction(trans); 5156 } 5157 done: 5158 btrfs_free_path(path); 5159 if (ret) { 5160 mutex_lock(&fs_info->chunk_mutex); 5161 btrfs_device_set_total_bytes(device, old_size); 5162 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5163 device->fs_devices->total_rw_bytes += diff; 5164 atomic64_add(free_diff, &fs_info->free_chunk_space); 5165 } 5166 mutex_unlock(&fs_info->chunk_mutex); 5167 } 5168 return ret; 5169 } 5170 5171 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 5172 struct btrfs_key *key, 5173 struct btrfs_chunk *chunk, int item_size) 5174 { 5175 struct btrfs_super_block *super_copy = fs_info->super_copy; 5176 struct btrfs_disk_key disk_key; 5177 u32 array_size; 5178 u8 *ptr; 5179 5180 lockdep_assert_held(&fs_info->chunk_mutex); 5181 5182 array_size = btrfs_super_sys_array_size(super_copy); 5183 if (array_size + item_size + sizeof(disk_key) 5184 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5185 return -EFBIG; 5186 5187 ptr = super_copy->sys_chunk_array + array_size; 5188 btrfs_cpu_key_to_disk(&disk_key, key); 5189 memcpy(ptr, &disk_key, sizeof(disk_key)); 5190 ptr += sizeof(disk_key); 5191 memcpy(ptr, chunk, item_size); 5192 item_size += sizeof(disk_key); 5193 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5194 5195 return 0; 5196 } 5197 5198 /* 5199 * sort the devices in descending order by max_avail, total_avail 5200 */ 5201 static int btrfs_cmp_device_info(const void *a, const void *b) 5202 { 5203 const struct btrfs_device_info *di_a = a; 5204 const struct btrfs_device_info *di_b = b; 5205 5206 if (di_a->max_avail > di_b->max_avail) 5207 return -1; 5208 if (di_a->max_avail < di_b->max_avail) 5209 return 1; 5210 if (di_a->total_avail > di_b->total_avail) 5211 return -1; 5212 if (di_a->total_avail < di_b->total_avail) 5213 return 1; 5214 return 0; 5215 } 5216 5217 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5218 { 5219 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5220 return; 5221 5222 btrfs_set_fs_incompat(info, RAID56); 5223 } 5224 5225 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5226 { 5227 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5228 return; 5229 5230 btrfs_set_fs_incompat(info, RAID1C34); 5231 } 5232 5233 /* 5234 * Structure used internally for btrfs_create_chunk() function. 5235 * Wraps needed parameters. 
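 * The profile-dependent fields are filled in from btrfs_raid_array and the chunk allocation policy before the per-device scan.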
5236 */ 5237 struct alloc_chunk_ctl { 5238 u64 start; 5239 u64 type; 5240 /* Total number of stripes to allocate */ 5241 int num_stripes; 5242 /* sub_stripes info for map */ 5243 int sub_stripes; 5244 /* Stripes per device */ 5245 int dev_stripes; 5246 /* Maximum number of devices to use */ 5247 int devs_max; 5248 /* Minimum number of devices to use */ 5249 int devs_min; 5250 /* ndevs has to be a multiple of this */ 5251 int devs_increment; 5252 /* Number of copies */ 5253 int ncopies; 5254 /* Number of stripes worth of bytes to store parity information */ 5255 int nparity; 5256 u64 max_stripe_size; 5257 u64 max_chunk_size; 5258 u64 dev_extent_min; 5259 u64 stripe_size; 5260 u64 chunk_size; 5261 int ndevs; 5262 }; 5263 5264 static void init_alloc_chunk_ctl_policy_regular( 5265 struct btrfs_fs_devices *fs_devices, 5266 struct alloc_chunk_ctl *ctl) 5267 { 5268 struct btrfs_space_info *space_info; 5269 5270 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); 5271 ASSERT(space_info); 5272 5273 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); 5274 ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); 5275 5276 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) 5277 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); 5278 5279 /* We don't want a chunk larger than 10% of writable space */ 5280 ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), 5281 ctl->max_chunk_size); 5282 ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); 5283 } 5284 5285 static void init_alloc_chunk_ctl_policy_zoned( 5286 struct btrfs_fs_devices *fs_devices, 5287 struct alloc_chunk_ctl *ctl) 5288 { 5289 u64 zone_size = fs_devices->fs_info->zone_size; 5290 u64 limit; 5291 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5292 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5293 u64 min_chunk_size = min_data_stripes * zone_size; 5294 u64 type = ctl->type; 5295 5296 ctl->max_stripe_size = zone_size; 5297 if (type & BTRFS_BLOCK_GROUP_DATA) { 5298 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5299 zone_size); 5300 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5301 ctl->max_chunk_size = ctl->max_stripe_size; 5302 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5303 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5304 ctl->devs_max = min_t(int, ctl->devs_max, 5305 BTRFS_MAX_DEVS_SYS_CHUNK); 5306 } else { 5307 BUG(); 5308 } 5309 5310 /* We don't want a chunk larger than 10% of writable space */ 5311 limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), 5312 zone_size), 5313 min_chunk_size); 5314 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5315 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5316 } 5317 5318 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5319 struct alloc_chunk_ctl *ctl) 5320 { 5321 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5322 5323 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5324 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5325 ctl->devs_max = btrfs_raid_array[index].devs_max; 5326 if (!ctl->devs_max) 5327 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5328 ctl->devs_min = btrfs_raid_array[index].devs_min; 5329 ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5330 ctl->ncopies = btrfs_raid_array[index].ncopies; 5331 ctl->nparity = btrfs_raid_array[index].nparity; 5332 ctl->ndevs = 0; 5333 5334 switch (fs_devices->chunk_alloc_policy) { 5335 case BTRFS_CHUNK_ALLOC_REGULAR: 5336 
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5337 break; 5338 case BTRFS_CHUNK_ALLOC_ZONED: 5339 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5340 break; 5341 default: 5342 BUG(); 5343 } 5344 } 5345 5346 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5347 struct alloc_chunk_ctl *ctl, 5348 struct btrfs_device_info *devices_info) 5349 { 5350 struct btrfs_fs_info *info = fs_devices->fs_info; 5351 struct btrfs_device *device; 5352 u64 total_avail; 5353 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5354 int ret; 5355 int ndevs = 0; 5356 u64 max_avail; 5357 u64 dev_offset; 5358 5359 /* 5360 * in the first pass through the devices list, we gather information 5361 * about the available holes on each device. 5362 */ 5363 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5364 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5365 WARN(1, KERN_ERR 5366 "BTRFS: read-only device in alloc_list\n"); 5367 continue; 5368 } 5369 5370 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5371 &device->dev_state) || 5372 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5373 continue; 5374 5375 if (device->total_bytes > device->bytes_used) 5376 total_avail = device->total_bytes - device->bytes_used; 5377 else 5378 total_avail = 0; 5379 5380 /* If there is no space on this device, skip it. */ 5381 if (total_avail < ctl->dev_extent_min) 5382 continue; 5383 5384 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5385 &max_avail); 5386 if (ret && ret != -ENOSPC) 5387 return ret; 5388 5389 if (ret == 0) 5390 max_avail = dev_extent_want; 5391 5392 if (max_avail < ctl->dev_extent_min) { 5393 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5394 btrfs_debug(info, 5395 "%s: devid %llu has no free space, have=%llu want=%llu", 5396 __func__, device->devid, max_avail, 5397 ctl->dev_extent_min); 5398 continue; 5399 } 5400 5401 if (ndevs == fs_devices->rw_devices) { 5402 WARN(1, "%s: found more than %llu devices\n", 5403 __func__, fs_devices->rw_devices); 5404 break; 5405 } 5406 devices_info[ndevs].dev_offset = dev_offset; 5407 devices_info[ndevs].max_avail = max_avail; 5408 devices_info[ndevs].total_avail = total_avail; 5409 devices_info[ndevs].dev = device; 5410 ++ndevs; 5411 } 5412 ctl->ndevs = ndevs; 5413 5414 /* 5415 * now sort the devices by hole size / available space 5416 */ 5417 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5418 btrfs_cmp_device_info, NULL); 5419 5420 return 0; 5421 } 5422 5423 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5424 struct btrfs_device_info *devices_info) 5425 { 5426 /* Number of stripes that count for block group size */ 5427 int data_stripes; 5428 5429 /* 5430 * The primary goal is to maximize the number of stripes, so use as 5431 * many devices as possible, even if the stripes are not maximum sized. 5432 * 5433 * The DUP profile stores more than one stripe per device, the 5434 * max_avail is the total size so we have to adjust. 5435 */ 5436 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5437 ctl->dev_stripes); 5438 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5439 5440 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5441 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5442 5443 /* 5444 * Use the number of data stripes to figure out how big this chunk is 5445 * really going to be in terms of logical address space, and compare 5446 * that answer with the max chunk size. 
If it's higher, we try to 5447 * reduce stripe_size. 5448 */ 5449 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5450 /* 5451 * Reduce stripe_size, round it up to a 16MB boundary again and 5452 * then use it, unless it ends up being even bigger than the 5453 * previous value we had already. 5454 */ 5455 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5456 data_stripes), SZ_16M), 5457 ctl->stripe_size); 5458 } 5459 5460 /* Stripe size should not go beyond 1G. */ 5461 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); 5462 5463 /* Align to BTRFS_STRIPE_LEN */ 5464 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5465 ctl->chunk_size = ctl->stripe_size * data_stripes; 5466 5467 return 0; 5468 } 5469 5470 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5471 struct btrfs_device_info *devices_info) 5472 { 5473 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5474 /* Number of stripes that count for block group size */ 5475 int data_stripes; 5476 5477 /* 5478 * It should hold because: 5479 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5480 */ 5481 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5482 5483 ctl->stripe_size = zone_size; 5484 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5485 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5486 5487 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */ 5488 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5489 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5490 ctl->stripe_size) + ctl->nparity, 5491 ctl->dev_stripes); 5492 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5493 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5494 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5495 } 5496 5497 ctl->chunk_size = ctl->stripe_size * data_stripes; 5498 5499 return 0; 5500 } 5501 5502 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5503 struct alloc_chunk_ctl *ctl, 5504 struct btrfs_device_info *devices_info) 5505 { 5506 struct btrfs_fs_info *info = fs_devices->fs_info; 5507 5508 /* 5509 * Round down to number of usable stripes, devs_increment can be any 5510 * number so we can't use round_down() that requires power of 2, while 5511 * rounddown is safe.
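 * For example, with devs_increment of 2 an ndevs of 5 is rounded down to 4.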
5512 */ 5513 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5514 5515 if (ctl->ndevs < ctl->devs_min) { 5516 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5517 btrfs_debug(info, 5518 "%s: not enough devices with free space: have=%d minimum required=%d", 5519 __func__, ctl->ndevs, ctl->devs_min); 5520 } 5521 return -ENOSPC; 5522 } 5523 5524 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5525 5526 switch (fs_devices->chunk_alloc_policy) { 5527 case BTRFS_CHUNK_ALLOC_REGULAR: 5528 return decide_stripe_size_regular(ctl, devices_info); 5529 case BTRFS_CHUNK_ALLOC_ZONED: 5530 return decide_stripe_size_zoned(ctl, devices_info); 5531 default: 5532 BUG(); 5533 } 5534 } 5535 5536 static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) 5537 { 5538 for (int i = 0; i < map->num_stripes; i++) { 5539 struct btrfs_io_stripe *stripe = &map->stripes[i]; 5540 struct btrfs_device *device = stripe->dev; 5541 5542 set_extent_bit(&device->alloc_state, stripe->physical, 5543 stripe->physical + map->stripe_size - 1, 5544 bits | EXTENT_NOWAIT, NULL); 5545 } 5546 } 5547 5548 static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) 5549 { 5550 for (int i = 0; i < map->num_stripes; i++) { 5551 struct btrfs_io_stripe *stripe = &map->stripes[i]; 5552 struct btrfs_device *device = stripe->dev; 5553 5554 __clear_extent_bit(&device->alloc_state, stripe->physical, 5555 stripe->physical + map->stripe_size - 1, 5556 bits | EXTENT_NOWAIT, 5557 NULL, NULL); 5558 } 5559 } 5560 5561 void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) 5562 { 5563 write_lock(&fs_info->mapping_tree_lock); 5564 rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); 5565 RB_CLEAR_NODE(&map->rb_node); 5566 chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 5567 write_unlock(&fs_info->mapping_tree_lock); 5568 5569 /* Once for the tree reference. 
*/ 5570 btrfs_free_chunk_map(map); 5571 } 5572 5573 EXPORT_FOR_TESTS 5574 int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) 5575 { 5576 struct rb_node **p; 5577 struct rb_node *parent = NULL; 5578 bool leftmost = true; 5579 5580 write_lock(&fs_info->mapping_tree_lock); 5581 p = &fs_info->mapping_tree.rb_root.rb_node; 5582 while (*p) { 5583 struct btrfs_chunk_map *entry; 5584 5585 parent = *p; 5586 entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); 5587 5588 if (map->start < entry->start) { 5589 p = &(*p)->rb_left; 5590 } else if (map->start > entry->start) { 5591 p = &(*p)->rb_right; 5592 leftmost = false; 5593 } else { 5594 write_unlock(&fs_info->mapping_tree_lock); 5595 return -EEXIST; 5596 } 5597 } 5598 rb_link_node(&map->rb_node, parent, p); 5599 rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); 5600 chunk_map_device_set_bits(map, CHUNK_ALLOCATED); 5601 chunk_map_device_clear_bits(map, CHUNK_TRIMMED); 5602 write_unlock(&fs_info->mapping_tree_lock); 5603 5604 return 0; 5605 } 5606 5607 EXPORT_FOR_TESTS 5608 struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) 5609 { 5610 struct btrfs_chunk_map *map; 5611 5612 map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); 5613 if (!map) 5614 return NULL; 5615 5616 refcount_set(&map->refs, 1); 5617 RB_CLEAR_NODE(&map->rb_node); 5618 5619 return map; 5620 } 5621 5622 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5623 struct alloc_chunk_ctl *ctl, 5624 struct btrfs_device_info *devices_info) 5625 { 5626 struct btrfs_fs_info *info = trans->fs_info; 5627 struct btrfs_chunk_map *map; 5628 struct btrfs_block_group *block_group; 5629 u64 start = ctl->start; 5630 u64 type = ctl->type; 5631 int ret; 5632 5633 map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); 5634 if (!map) 5635 return ERR_PTR(-ENOMEM); 5636 5637 map->start = start; 5638 map->chunk_len = ctl->chunk_size; 5639 map->stripe_size = ctl->stripe_size; 5640 map->type = type; 5641 map->io_align = BTRFS_STRIPE_LEN; 5642 map->io_width = BTRFS_STRIPE_LEN; 5643 map->sub_stripes = ctl->sub_stripes; 5644 map->num_stripes = ctl->num_stripes; 5645 5646 for (int i = 0; i < ctl->ndevs; i++) { 5647 for (int j = 0; j < ctl->dev_stripes; j++) { 5648 int s = i * ctl->dev_stripes + j; 5649 map->stripes[s].dev = devices_info[i].dev; 5650 map->stripes[s].physical = devices_info[i].dev_offset + 5651 j * ctl->stripe_size; 5652 } 5653 } 5654 5655 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5656 5657 ret = btrfs_add_chunk_map(info, map); 5658 if (ret) { 5659 btrfs_free_chunk_map(map); 5660 return ERR_PTR(ret); 5661 } 5662 5663 block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); 5664 if (IS_ERR(block_group)) { 5665 btrfs_remove_chunk_map(info, map); 5666 return block_group; 5667 } 5668 5669 for (int i = 0; i < map->num_stripes; i++) { 5670 struct btrfs_device *dev = map->stripes[i].dev; 5671 5672 btrfs_device_set_bytes_used(dev, 5673 dev->bytes_used + ctl->stripe_size); 5674 if (list_empty(&dev->post_commit_list)) 5675 list_add_tail(&dev->post_commit_list, 5676 &trans->transaction->dev_update_list); 5677 } 5678 5679 atomic64_sub(ctl->stripe_size * map->num_stripes, 5680 &info->free_chunk_space); 5681 5682 check_raid56_incompat_flag(info, type); 5683 check_raid1c34_incompat_flag(info, type); 5684 5685 return block_group; 5686 } 5687 5688 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5689 u64 type) 5690 { 5691 struct 
btrfs_fs_info *info = trans->fs_info; 5692 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5693 struct btrfs_device_info *devices_info = NULL; 5694 struct alloc_chunk_ctl ctl; 5695 struct btrfs_block_group *block_group; 5696 int ret; 5697 5698 lockdep_assert_held(&info->chunk_mutex); 5699 5700 if (!alloc_profile_is_valid(type, 0)) { 5701 ASSERT(0); 5702 return ERR_PTR(-EINVAL); 5703 } 5704 5705 if (list_empty(&fs_devices->alloc_list)) { 5706 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5707 btrfs_debug(info, "%s: no writable device", __func__); 5708 return ERR_PTR(-ENOSPC); 5709 } 5710 5711 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5712 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5713 ASSERT(0); 5714 return ERR_PTR(-EINVAL); 5715 } 5716 5717 ctl.start = find_next_chunk(info); 5718 ctl.type = type; 5719 init_alloc_chunk_ctl(fs_devices, &ctl); 5720 5721 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5722 GFP_NOFS); 5723 if (!devices_info) 5724 return ERR_PTR(-ENOMEM); 5725 5726 ret = gather_device_info(fs_devices, &ctl, devices_info); 5727 if (ret < 0) { 5728 block_group = ERR_PTR(ret); 5729 goto out; 5730 } 5731 5732 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5733 if (ret < 0) { 5734 block_group = ERR_PTR(ret); 5735 goto out; 5736 } 5737 5738 block_group = create_chunk(trans, &ctl, devices_info); 5739 5740 out: 5741 kfree(devices_info); 5742 return block_group; 5743 } 5744 5745 /* 5746 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5747 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5748 * chunks. 5749 * 5750 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5751 * phases. 5752 */ 5753 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5754 struct btrfs_block_group *bg) 5755 { 5756 struct btrfs_fs_info *fs_info = trans->fs_info; 5757 struct btrfs_root *chunk_root = fs_info->chunk_root; 5758 struct btrfs_key key; 5759 struct btrfs_chunk *chunk; 5760 struct btrfs_stripe *stripe; 5761 struct btrfs_chunk_map *map; 5762 size_t item_size; 5763 int i; 5764 int ret; 5765 5766 /* 5767 * We take the chunk_mutex for 2 reasons: 5768 * 5769 * 1) Updates and insertions in the chunk btree must be done while holding 5770 * the chunk_mutex, as well as updating the system chunk array in the 5771 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5772 * details; 5773 * 5774 * 2) To prevent races with the final phase of a device replace operation 5775 * that replaces the device object associated with the map's stripes, 5776 * because the device object's id can change at any time during that 5777 * final phase of the device replace operation 5778 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5779 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5780 * which would cause a failure when updating the device item, which does 5781 * not exists, or persisting a stripe of the chunk item with such ID. 5782 * Here we can't use the device_list_mutex because our caller already 5783 * has locked the chunk_mutex, and the final phase of device replace 5784 * acquires both mutexes - first the device_list_mutex and then the 5785 * chunk_mutex. Using any of those two mutexes protects us from a 5786 * concurrent device replace. 
5787 */ 5788 lockdep_assert_held(&fs_info->chunk_mutex); 5789 5790 map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5791 if (IS_ERR(map)) { 5792 ret = PTR_ERR(map); 5793 btrfs_abort_transaction(trans, ret); 5794 return ret; 5795 } 5796 5797 item_size = btrfs_chunk_item_size(map->num_stripes); 5798 5799 chunk = kzalloc(item_size, GFP_NOFS); 5800 if (!chunk) { 5801 ret = -ENOMEM; 5802 btrfs_abort_transaction(trans, ret); 5803 goto out; 5804 } 5805 5806 for (i = 0; i < map->num_stripes; i++) { 5807 struct btrfs_device *device = map->stripes[i].dev; 5808 5809 ret = btrfs_update_device(trans, device); 5810 if (ret) 5811 goto out; 5812 } 5813 5814 stripe = &chunk->stripe; 5815 for (i = 0; i < map->num_stripes; i++) { 5816 struct btrfs_device *device = map->stripes[i].dev; 5817 const u64 dev_offset = map->stripes[i].physical; 5818 5819 btrfs_set_stack_stripe_devid(stripe, device->devid); 5820 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5821 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5822 stripe++; 5823 } 5824 5825 btrfs_set_stack_chunk_length(chunk, bg->length); 5826 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5827 btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); 5828 btrfs_set_stack_chunk_type(chunk, map->type); 5829 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5830 btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); 5831 btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); 5832 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5833 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5834 5835 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5836 key.type = BTRFS_CHUNK_ITEM_KEY; 5837 key.offset = bg->start; 5838 5839 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5840 if (ret) 5841 goto out; 5842 5843 set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); 5844 5845 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5846 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5847 if (ret) 5848 goto out; 5849 } 5850 5851 out: 5852 kfree(chunk); 5853 btrfs_free_chunk_map(map); 5854 return ret; 5855 } 5856 5857 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5858 { 5859 struct btrfs_fs_info *fs_info = trans->fs_info; 5860 u64 alloc_profile; 5861 struct btrfs_block_group *meta_bg; 5862 struct btrfs_block_group *sys_bg; 5863 5864 /* 5865 * When adding a new device for sprouting, the seed device is read-only 5866 * so we must first allocate a metadata and a system chunk. But before 5867 * adding the block group items to the extent, device and chunk btrees, 5868 * we must first: 5869 * 5870 * 1) Create both chunks without doing any changes to the btrees, as 5871 * otherwise we would get -ENOSPC since the block groups from the 5872 * seed device are read-only; 5873 * 5874 * 2) Add the device item for the new sprout device - finishing the setup 5875 * of a new block group requires updating the device item in the chunk 5876 * btree, so it must exist when we attempt to do it. The previous step 5877 * ensures this does not fail with -ENOSPC. 5878 * 5879 * After that we can add the block group items to their btrees: 5880 * update existing device item in the chunk btree, add a new block group 5881 * item to the extent btree, add a new chunk item to the chunk btree and 5882 * finally add the new device extent items to the devices btree. 
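 *
 * That ordering is why this function only calls btrfs_create_chunk() below:
 * the btree items for these two new block groups are added later, once the
 * new device item exists, following the steps described above.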
5883 */ 5884 5885 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5886 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5887 if (IS_ERR(meta_bg)) 5888 return PTR_ERR(meta_bg); 5889 5890 alloc_profile = btrfs_system_alloc_profile(fs_info); 5891 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5892 if (IS_ERR(sys_bg)) 5893 return PTR_ERR(sys_bg); 5894 5895 return 0; 5896 } 5897 5898 static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) 5899 { 5900 const int index = btrfs_bg_flags_to_raid_index(map->type); 5901 5902 return btrfs_raid_array[index].tolerated_failures; 5903 } 5904 5905 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5906 { 5907 struct btrfs_chunk_map *map; 5908 int miss_ndevs = 0; 5909 int i; 5910 bool ret = true; 5911 5912 map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5913 if (IS_ERR(map)) 5914 return false; 5915 5916 for (i = 0; i < map->num_stripes; i++) { 5917 if (test_bit(BTRFS_DEV_STATE_MISSING, 5918 &map->stripes[i].dev->dev_state)) { 5919 miss_ndevs++; 5920 continue; 5921 } 5922 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5923 &map->stripes[i].dev->dev_state)) { 5924 ret = false; 5925 goto end; 5926 } 5927 } 5928 5929 /* 5930 * If the number of missing devices is larger than max errors, we can 5931 * not write the data into that chunk successfully. 5932 */ 5933 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5934 ret = false; 5935 end: 5936 btrfs_free_chunk_map(map); 5937 return ret; 5938 } 5939 5940 void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) 5941 { 5942 write_lock(&fs_info->mapping_tree_lock); 5943 while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { 5944 struct btrfs_chunk_map *map; 5945 struct rb_node *node; 5946 5947 node = rb_first_cached(&fs_info->mapping_tree); 5948 map = rb_entry(node, struct btrfs_chunk_map, rb_node); 5949 rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); 5950 RB_CLEAR_NODE(&map->rb_node); 5951 chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 5952 /* Once for the tree ref. */ 5953 btrfs_free_chunk_map(map); 5954 cond_resched_rwlock_write(&fs_info->mapping_tree_lock); 5955 } 5956 write_unlock(&fs_info->mapping_tree_lock); 5957 } 5958 5959 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5960 { 5961 struct btrfs_chunk_map *map; 5962 enum btrfs_raid_types index; 5963 int ret = 1; 5964 5965 map = btrfs_get_chunk_map(fs_info, logical, len); 5966 if (IS_ERR(map)) 5967 /* 5968 * We could return errors for these cases, but that could get 5969 * ugly and we'd probably do the same thing which is just not do 5970 * anything else and exit, so return 1 so the callers don't try 5971 * to use other copies. 5972 */ 5973 return 1; 5974 5975 index = btrfs_bg_flags_to_raid_index(map->type); 5976 5977 /* Non-RAID56, use their ncopies from btrfs_raid_array. */ 5978 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5979 ret = btrfs_raid_array[index].ncopies; 5980 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5981 ret = 2; 5982 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5983 /* 5984 * There could be two corrupted data stripes, we need 5985 * to loop retry in order to rebuild the correct data. 5986 * 5987 * Fail a stripe at a time on every retry except the 5988 * stripe under reconstruction. 
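 *
 * E.g. a 4-device RAID6 chunk reports 4 copies here, so read-repair can
 * keep retrying with increasing mirror numbers, each attempt rebuilding
 * with a different stripe treated as failed.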
5989 */ 5990 ret = map->num_stripes; 5991 btrfs_free_chunk_map(map); 5992 return ret; 5993 } 5994 5995 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5996 u64 logical) 5997 { 5998 struct btrfs_chunk_map *map; 5999 unsigned long len = fs_info->sectorsize; 6000 6001 if (!btrfs_fs_incompat(fs_info, RAID56)) 6002 return len; 6003 6004 map = btrfs_get_chunk_map(fs_info, logical, len); 6005 6006 if (!WARN_ON(IS_ERR(map))) { 6007 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 6008 len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 6009 btrfs_free_chunk_map(map); 6010 } 6011 return len; 6012 } 6013 6014 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 6015 { 6016 struct btrfs_chunk_map *map; 6017 int ret = 0; 6018 6019 if (!btrfs_fs_incompat(fs_info, RAID56)) 6020 return 0; 6021 6022 map = btrfs_get_chunk_map(fs_info, logical, len); 6023 6024 if (!WARN_ON(IS_ERR(map))) { 6025 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 6026 ret = 1; 6027 btrfs_free_chunk_map(map); 6028 } 6029 return ret; 6030 } 6031 6032 static int find_live_mirror(struct btrfs_fs_info *fs_info, 6033 struct btrfs_chunk_map *map, int first, 6034 int dev_replace_is_ongoing) 6035 { 6036 const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); 6037 int i; 6038 int num_stripes; 6039 int preferred_mirror; 6040 int tolerance; 6041 struct btrfs_device *srcdev; 6042 6043 ASSERT((map->type & 6044 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 6045 6046 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 6047 num_stripes = map->sub_stripes; 6048 else 6049 num_stripes = map->num_stripes; 6050 6051 switch (policy) { 6052 default: 6053 /* Shouldn't happen, just warn and use pid instead of failing */ 6054 btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid", 6055 policy); 6056 WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID); 6057 fallthrough; 6058 case BTRFS_READ_POLICY_PID: 6059 preferred_mirror = first + (current->pid % num_stripes); 6060 break; 6061 } 6062 6063 if (dev_replace_is_ongoing && 6064 fs_info->dev_replace.cont_reading_from_srcdev_mode == 6065 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 6066 srcdev = fs_info->dev_replace.srcdev; 6067 else 6068 srcdev = NULL; 6069 6070 /* 6071 * try to avoid the drive that is the source drive for a 6072 * dev-replace procedure, only choose it if no other non-missing 6073 * mirror is available 6074 */ 6075 for (tolerance = 0; tolerance < 2; tolerance++) { 6076 if (map->stripes[preferred_mirror].dev->bdev && 6077 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 6078 return preferred_mirror; 6079 for (i = first; i < first + num_stripes; i++) { 6080 if (map->stripes[i].dev->bdev && 6081 (tolerance || map->stripes[i].dev != srcdev)) 6082 return i; 6083 } 6084 } 6085 6086 /* we couldn't find one that doesn't fail. 
Just return something 6087 * and the io error handling code will clean up eventually 6088 */ 6089 return preferred_mirror; 6090 } 6091 6092 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 6093 u64 logical, 6094 u16 total_stripes) 6095 { 6096 struct btrfs_io_context *bioc; 6097 6098 bioc = kzalloc( 6099 /* The size of btrfs_io_context */ 6100 sizeof(struct btrfs_io_context) + 6101 /* Plus the variable array for the stripes */ 6102 sizeof(struct btrfs_io_stripe) * (total_stripes), 6103 GFP_NOFS); 6104 6105 if (!bioc) 6106 return NULL; 6107 6108 refcount_set(&bioc->refs, 1); 6109 6110 bioc->fs_info = fs_info; 6111 bioc->replace_stripe_src = -1; 6112 bioc->full_stripe_logical = (u64)-1; 6113 bioc->logical = logical; 6114 6115 return bioc; 6116 } 6117 6118 void btrfs_get_bioc(struct btrfs_io_context *bioc) 6119 { 6120 WARN_ON(!refcount_read(&bioc->refs)); 6121 refcount_inc(&bioc->refs); 6122 } 6123 6124 void btrfs_put_bioc(struct btrfs_io_context *bioc) 6125 { 6126 if (!bioc) 6127 return; 6128 if (refcount_dec_and_test(&bioc->refs)) 6129 kfree(bioc); 6130 } 6131 6132 /* 6133 * Please note that, discard won't be sent to target device of device 6134 * replace. 6135 */ 6136 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 6137 u64 logical, u64 *length_ret, 6138 u32 *num_stripes) 6139 { 6140 struct btrfs_chunk_map *map; 6141 struct btrfs_discard_stripe *stripes; 6142 u64 length = *length_ret; 6143 u64 offset; 6144 u32 stripe_nr; 6145 u32 stripe_nr_end; 6146 u32 stripe_cnt; 6147 u64 stripe_end_offset; 6148 u64 stripe_offset; 6149 u32 stripe_index; 6150 u32 factor = 0; 6151 u32 sub_stripes = 0; 6152 u32 stripes_per_dev = 0; 6153 u32 remaining_stripes = 0; 6154 u32 last_stripe = 0; 6155 int ret; 6156 int i; 6157 6158 map = btrfs_get_chunk_map(fs_info, logical, length); 6159 if (IS_ERR(map)) 6160 return ERR_CAST(map); 6161 6162 /* we don't discard raid56 yet */ 6163 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6164 ret = -EOPNOTSUPP; 6165 goto out_free_map; 6166 } 6167 6168 offset = logical - map->start; 6169 length = min_t(u64, map->start + map->chunk_len - logical, length); 6170 *length_ret = length; 6171 6172 /* 6173 * stripe_nr counts the total number of stripes we have to stride 6174 * to get to this block 6175 */ 6176 stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; 6177 6178 /* stripe_offset is the offset of this block in its stripe */ 6179 stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); 6180 6181 stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> 6182 BTRFS_STRIPE_LEN_SHIFT; 6183 stripe_cnt = stripe_nr_end - stripe_nr; 6184 stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - 6185 (offset + length); 6186 /* 6187 * after this, stripe_nr is the number of stripes on this 6188 * device we have to walk to find the data, and stripe_index is 6189 * the number of our device in the stripe array 6190 */ 6191 *num_stripes = 1; 6192 stripe_index = 0; 6193 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6194 BTRFS_BLOCK_GROUP_RAID10)) { 6195 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6196 sub_stripes = 1; 6197 else 6198 sub_stripes = map->sub_stripes; 6199 6200 factor = map->num_stripes / sub_stripes; 6201 *num_stripes = min_t(u64, map->num_stripes, 6202 sub_stripes * stripe_cnt); 6203 stripe_index = stripe_nr % factor; 6204 stripe_nr /= factor; 6205 stripe_index *= sub_stripes; 6206 6207 remaining_stripes = stripe_cnt % factor; 6208 stripes_per_dev = stripe_cnt / factor; 6209 last_stripe = ((stripe_nr_end - 1) 
% factor) * sub_stripes; 6210 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6211 BTRFS_BLOCK_GROUP_DUP)) { 6212 *num_stripes = map->num_stripes; 6213 } else { 6214 stripe_index = stripe_nr % map->num_stripes; 6215 stripe_nr /= map->num_stripes; 6216 } 6217 6218 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); 6219 if (!stripes) { 6220 ret = -ENOMEM; 6221 goto out_free_map; 6222 } 6223 6224 for (i = 0; i < *num_stripes; i++) { 6225 stripes[i].physical = 6226 map->stripes[stripe_index].physical + 6227 stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); 6228 stripes[i].dev = map->stripes[stripe_index].dev; 6229 6230 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6231 BTRFS_BLOCK_GROUP_RAID10)) { 6232 stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); 6233 6234 if (i / sub_stripes < remaining_stripes) 6235 stripes[i].length += BTRFS_STRIPE_LEN; 6236 6237 /* 6238 * Special for the first stripe and 6239 * the last stripe: 6240 * 6241 * |-------|...|-------| 6242 * |----------| 6243 * off end_off 6244 */ 6245 if (i < sub_stripes) 6246 stripes[i].length -= stripe_offset; 6247 6248 if (stripe_index >= last_stripe && 6249 stripe_index <= (last_stripe + 6250 sub_stripes - 1)) 6251 stripes[i].length -= stripe_end_offset; 6252 6253 if (i == sub_stripes - 1) 6254 stripe_offset = 0; 6255 } else { 6256 stripes[i].length = length; 6257 } 6258 6259 stripe_index++; 6260 if (stripe_index == map->num_stripes) { 6261 stripe_index = 0; 6262 stripe_nr++; 6263 } 6264 } 6265 6266 btrfs_free_chunk_map(map); 6267 return stripes; 6268 out_free_map: 6269 btrfs_free_chunk_map(map); 6270 return ERR_PTR(ret); 6271 } 6272 6273 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6274 { 6275 struct btrfs_block_group *cache; 6276 bool ret; 6277 6278 /* Non zoned filesystem does not use "to_copy" flag */ 6279 if (!btrfs_is_zoned(fs_info)) 6280 return false; 6281 6282 cache = btrfs_lookup_block_group(fs_info, logical); 6283 6284 ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 6285 6286 btrfs_put_block_group(cache); 6287 return ret; 6288 } 6289 6290 static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, 6291 struct btrfs_dev_replace *dev_replace, 6292 u64 logical, 6293 struct btrfs_io_geometry *io_geom) 6294 { 6295 u64 srcdev_devid = dev_replace->srcdev->devid; 6296 /* 6297 * At this stage, num_stripes is still the real number of stripes, 6298 * excluding the duplicated stripes. 6299 */ 6300 int num_stripes = io_geom->num_stripes; 6301 int max_errors = io_geom->max_errors; 6302 int nr_extra_stripes = 0; 6303 int i; 6304 6305 /* 6306 * A block group which has "to_copy" set will eventually be copied by 6307 * the dev-replace process. We can avoid cloning IO here. 6308 */ 6309 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6310 return; 6311 6312 /* 6313 * Duplicate the write operations while the dev-replace procedure is 6314 * running. Since the copying of the old disk to the new disk takes 6315 * place at run time while the filesystem is mounted writable, the 6316 * regular write operations to the old disk have to be duplicated to go 6317 * to the new disk as well. 6318 * 6319 * Note that device->missing is handled by the caller, and that the 6320 * write to the old disk is already set up in the stripes array. 
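 *
 * E.g. for a DUP chunk whose two stripes both sit on the source device,
 * two extra stripes pointing at the target device are appended below,
 * which is why @nr_extra_stripes can reach 2.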
6321 */ 6322 for (i = 0; i < num_stripes; i++) { 6323 struct btrfs_io_stripe *old = &bioc->stripes[i]; 6324 struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; 6325 6326 if (old->dev->devid != srcdev_devid) 6327 continue; 6328 6329 new->physical = old->physical; 6330 new->dev = dev_replace->tgtdev; 6331 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) 6332 bioc->replace_stripe_src = i; 6333 nr_extra_stripes++; 6334 } 6335 6336 /* We can only have at most 2 extra nr_stripes (for DUP). */ 6337 ASSERT(nr_extra_stripes <= 2); 6338 /* 6339 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for 6340 * replace. 6341 * If we have 2 extra stripes, only choose the one with smaller physical. 6342 */ 6343 if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { 6344 struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; 6345 struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; 6346 6347 /* Only DUP can have two extra stripes. */ 6348 ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); 6349 6350 /* 6351 * Swap the last two stripes and reduce @nr_extra_stripes. 6352 * The extra stripe would still be there, but won't be accessed. 6353 */ 6354 if (first->physical > second->physical) { 6355 swap(second->physical, first->physical); 6356 swap(second->dev, first->dev); 6357 nr_extra_stripes--; 6358 } 6359 } 6360 6361 io_geom->num_stripes = num_stripes + nr_extra_stripes; 6362 io_geom->max_errors = max_errors + nr_extra_stripes; 6363 bioc->replace_nr_stripes = nr_extra_stripes; 6364 } 6365 6366 static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, 6367 struct btrfs_io_geometry *io_geom) 6368 { 6369 /* 6370 * Stripe_nr is the stripe where this block falls. stripe_offset is 6371 * the offset of this block in its stripe. 6372 */ 6373 io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; 6374 io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; 6375 ASSERT(io_geom->stripe_offset < U32_MAX); 6376 6377 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6378 unsigned long full_stripe_len = 6379 btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 6380 6381 /* 6382 * For full stripe start, we use previously calculated 6383 * @stripe_nr. Align it to nr_data_stripes, then multiply by 6384 * STRIPE_LEN. 6385 * 6386 * By this we can avoid u64 division completely. And we have 6387 * to go rounddown(), not round_down(), as nr_data_stripes is 6388 * not ensured to be a power of 2. 6389 */ 6390 io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( 6391 rounddown(io_geom->stripe_nr, nr_data_stripes(map))); 6392 6393 ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); 6394 ASSERT(io_geom->raid56_full_stripe_start <= offset); 6395 /* 6396 * For writes to RAID56, allow writing a full stripe set, but 6397 * no straddling of stripe sets. 6398 */ 6399 if (io_geom->op == BTRFS_MAP_WRITE) 6400 return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); 6401 } 6402 6403 /* 6404 * For other RAID types and for RAID56 reads, allow a single stripe (on 6405 * a single disk).
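 *
 * E.g. with a 64K stripe length, an I/O starting 4K into a stripe gets at
 * most 60K mapped here, keeping a single mapping within one stripe.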
6406 */ 6407 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) 6408 return BTRFS_STRIPE_LEN - io_geom->stripe_offset; 6409 return U64_MAX; 6410 } 6411 6412 static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, 6413 u64 *length, struct btrfs_io_stripe *dst, 6414 struct btrfs_chunk_map *map, 6415 struct btrfs_io_geometry *io_geom) 6416 { 6417 dst->dev = map->stripes[io_geom->stripe_index].dev; 6418 6419 if (io_geom->op == BTRFS_MAP_READ && 6420 btrfs_need_stripe_tree_update(fs_info, map->type)) 6421 return btrfs_get_raid_extent_offset(fs_info, logical, length, 6422 map->type, 6423 io_geom->stripe_index, dst); 6424 6425 dst->physical = map->stripes[io_geom->stripe_index].physical + 6426 io_geom->stripe_offset + 6427 btrfs_stripe_nr_to_offset(io_geom->stripe_nr); 6428 return 0; 6429 } 6430 6431 static bool is_single_device_io(struct btrfs_fs_info *fs_info, 6432 const struct btrfs_io_stripe *smap, 6433 const struct btrfs_chunk_map *map, 6434 int num_alloc_stripes, 6435 enum btrfs_map_op op, int mirror_num) 6436 { 6437 if (!smap) 6438 return false; 6439 6440 if (num_alloc_stripes != 1) 6441 return false; 6442 6443 if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) 6444 return false; 6445 6446 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) 6447 return false; 6448 6449 return true; 6450 } 6451 6452 static void map_blocks_raid0(const struct btrfs_chunk_map *map, 6453 struct btrfs_io_geometry *io_geom) 6454 { 6455 io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; 6456 io_geom->stripe_nr /= map->num_stripes; 6457 if (io_geom->op == BTRFS_MAP_READ) 6458 io_geom->mirror_num = 1; 6459 } 6460 6461 static void map_blocks_raid1(struct btrfs_fs_info *fs_info, 6462 struct btrfs_chunk_map *map, 6463 struct btrfs_io_geometry *io_geom, 6464 bool dev_replace_is_ongoing) 6465 { 6466 if (io_geom->op != BTRFS_MAP_READ) { 6467 io_geom->num_stripes = map->num_stripes; 6468 return; 6469 } 6470 6471 if (io_geom->mirror_num) { 6472 io_geom->stripe_index = io_geom->mirror_num - 1; 6473 return; 6474 } 6475 6476 io_geom->stripe_index = find_live_mirror(fs_info, map, 0, 6477 dev_replace_is_ongoing); 6478 io_geom->mirror_num = io_geom->stripe_index + 1; 6479 } 6480 6481 static void map_blocks_dup(const struct btrfs_chunk_map *map, 6482 struct btrfs_io_geometry *io_geom) 6483 { 6484 if (io_geom->op != BTRFS_MAP_READ) { 6485 io_geom->num_stripes = map->num_stripes; 6486 return; 6487 } 6488 6489 if (io_geom->mirror_num) { 6490 io_geom->stripe_index = io_geom->mirror_num - 1; 6491 return; 6492 } 6493 6494 io_geom->mirror_num = 1; 6495 } 6496 6497 static void map_blocks_raid10(struct btrfs_fs_info *fs_info, 6498 struct btrfs_chunk_map *map, 6499 struct btrfs_io_geometry *io_geom, 6500 bool dev_replace_is_ongoing) 6501 { 6502 u32 factor = map->num_stripes / map->sub_stripes; 6503 int old_stripe_index; 6504 6505 io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; 6506 io_geom->stripe_nr /= factor; 6507 6508 if (io_geom->op != BTRFS_MAP_READ) { 6509 io_geom->num_stripes = map->sub_stripes; 6510 return; 6511 } 6512 6513 if (io_geom->mirror_num) { 6514 io_geom->stripe_index += io_geom->mirror_num - 1; 6515 return; 6516 } 6517 6518 old_stripe_index = io_geom->stripe_index; 6519 io_geom->stripe_index = find_live_mirror(fs_info, map, 6520 io_geom->stripe_index, 6521 dev_replace_is_ongoing); 6522 io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; 6523 } 6524 6525 static void map_blocks_raid56_write(struct btrfs_chunk_map *map, 
6526 struct btrfs_io_geometry *io_geom, 6527 u64 logical, u64 *length) 6528 { 6529 int data_stripes = nr_data_stripes(map); 6530 6531 /* 6532 * Needs full stripe mapping. 6533 * 6534 * Push stripe_nr back to the start of the full stripe. For those cases 6535 * needing a full stripe, @stripe_nr is the full stripe number. 6536 * 6537 * Originally we go raid56_full_stripe_start / full_stripe_len, but 6538 * that can be expensive. Here we just divide @stripe_nr by 6539 * @data_stripes. 6540 */ 6541 io_geom->stripe_nr /= data_stripes; 6542 6543 /* RAID[56] write or recovery. Return all stripes */ 6544 io_geom->num_stripes = map->num_stripes; 6545 io_geom->max_errors = btrfs_chunk_max_errors(map); 6546 6547 /* Return the length to the full stripe end. */ 6548 *length = min(logical + *length, 6549 io_geom->raid56_full_stripe_start + map->start + 6550 btrfs_stripe_nr_to_offset(data_stripes)) - 6551 logical; 6552 io_geom->stripe_index = 0; 6553 io_geom->stripe_offset = 0; 6554 } 6555 6556 static void map_blocks_raid56_read(struct btrfs_chunk_map *map, 6557 struct btrfs_io_geometry *io_geom) 6558 { 6559 int data_stripes = nr_data_stripes(map); 6560 6561 ASSERT(io_geom->mirror_num <= 1); 6562 /* Just grab the data stripe directly. */ 6563 io_geom->stripe_index = io_geom->stripe_nr % data_stripes; 6564 io_geom->stripe_nr /= data_stripes; 6565 6566 /* We distribute the parity blocks across stripes. */ 6567 io_geom->stripe_index = 6568 (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; 6569 6570 if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) 6571 io_geom->mirror_num = 1; 6572 } 6573 6574 static void map_blocks_single(const struct btrfs_chunk_map *map, 6575 struct btrfs_io_geometry *io_geom) 6576 { 6577 io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; 6578 io_geom->stripe_nr /= map->num_stripes; 6579 io_geom->mirror_num = io_geom->stripe_index + 1; 6580 } 6581 6582 /* 6583 * Map one logical range to one or more physical ranges. 6584 * 6585 * @length: (Mandatory) mapped length of this run. 6586 * One logical range can be split into different segments 6587 * due to factors like zones and RAID0/5/6/10 stripe 6588 * boundaries. 6589 * 6590 * @bioc_ret: (Mandatory) returned btrfs_io_context structure, 6591 * which has one or more physical ranges (btrfs_io_stripe) 6592 * recorded inside. 6593 * Caller should call btrfs_put_bioc() to free it after use. 6594 * 6595 * @smap: (Optional) single physical range optimization. 6596 * If the map request can be fulfilled by one single 6597 * physical range, and this parameter is not NULL, 6598 * then @bioc_ret would be NULL, and @smap would be 6599 * updated. 6600 * 6601 * @mirror_num_ret: (Mandatory) returned mirror number if the original 6602 * value is 0. 6603 * 6604 * Mirror number 0 means to choose any live mirrors. 6605 * 6606 * For non-RAID56 profiles, non-zero mirror_num means 6607 * the Nth mirror. (e.g. mirror_num 1 means the first 6608 * copy). 6609 * 6610 * For RAID56 profile, mirror 1 means rebuild from P and 6611 * the remaining data stripes. 6612 * 6613 * For RAID6 profile, mirror > 2 means mark another 6614 * data/P stripe error and rebuild from the remaining 6615 * stripes.
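 *
 * Return: 0 on success, or a negative errno on failure (e.g. -EINVAL for an
 * out of range mirror number, -ENOMEM if the io context can't be allocated).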
6616 */ 6617 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6618 u64 logical, u64 *length, 6619 struct btrfs_io_context **bioc_ret, 6620 struct btrfs_io_stripe *smap, int *mirror_num_ret) 6621 { 6622 struct btrfs_chunk_map *map; 6623 struct btrfs_io_geometry io_geom = { 0 }; 6624 u64 map_offset; 6625 int ret = 0; 6626 int num_copies; 6627 struct btrfs_io_context *bioc = NULL; 6628 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6629 int dev_replace_is_ongoing = 0; 6630 u16 num_alloc_stripes; 6631 u64 max_len; 6632 6633 ASSERT(bioc_ret); 6634 6635 io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); 6636 io_geom.num_stripes = 1; 6637 io_geom.stripe_index = 0; 6638 io_geom.op = op; 6639 6640 num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); 6641 if (io_geom.mirror_num > num_copies) 6642 return -EINVAL; 6643 6644 map = btrfs_get_chunk_map(fs_info, logical, *length); 6645 if (IS_ERR(map)) 6646 return PTR_ERR(map); 6647 6648 map_offset = logical - map->start; 6649 io_geom.raid56_full_stripe_start = (u64)-1; 6650 max_len = btrfs_max_io_len(map, map_offset, &io_geom); 6651 *length = min_t(u64, map->chunk_len - map_offset, max_len); 6652 6653 down_read(&dev_replace->rwsem); 6654 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6655 /* 6656 * Hold the semaphore for read during the whole operation, write is 6657 * requested at commit time but must wait. 6658 */ 6659 if (!dev_replace_is_ongoing) 6660 up_read(&dev_replace->rwsem); 6661 6662 switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 6663 case BTRFS_BLOCK_GROUP_RAID0: 6664 map_blocks_raid0(map, &io_geom); 6665 break; 6666 case BTRFS_BLOCK_GROUP_RAID1: 6667 case BTRFS_BLOCK_GROUP_RAID1C3: 6668 case BTRFS_BLOCK_GROUP_RAID1C4: 6669 map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); 6670 break; 6671 case BTRFS_BLOCK_GROUP_DUP: 6672 map_blocks_dup(map, &io_geom); 6673 break; 6674 case BTRFS_BLOCK_GROUP_RAID10: 6675 map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); 6676 break; 6677 case BTRFS_BLOCK_GROUP_RAID5: 6678 case BTRFS_BLOCK_GROUP_RAID6: 6679 if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) 6680 map_blocks_raid56_write(map, &io_geom, logical, length); 6681 else 6682 map_blocks_raid56_read(map, &io_geom); 6683 break; 6684 default: 6685 /* 6686 * After this, stripe_nr is the number of stripes on this 6687 * device we have to walk to find the data, and stripe_index is 6688 * the number of our device in the stripe array 6689 */ 6690 map_blocks_single(map, &io_geom); 6691 break; 6692 } 6693 if (io_geom.stripe_index >= map->num_stripes) { 6694 btrfs_crit(fs_info, 6695 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6696 io_geom.stripe_index, map->num_stripes); 6697 ret = -EINVAL; 6698 goto out; 6699 } 6700 6701 num_alloc_stripes = io_geom.num_stripes; 6702 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6703 op != BTRFS_MAP_READ) 6704 /* 6705 * For replace case, we need to add extra stripes for extra 6706 * duplicated stripes. 6707 * 6708 * For both WRITE and GET_READ_MIRRORS, we may have at most 6709 * 2 more stripes (DUP types, otherwise 1). 6710 */ 6711 num_alloc_stripes += 2; 6712 6713 /* 6714 * If this I/O maps to a single device, try to return the device and 6715 * physical block information on the stack instead of allocating an 6716 * I/O context structure. 
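 *
 * See is_single_device_io() for the exact conditions. In that case @smap is
 * filled in and *bioc_ret is set to NULL.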
6717 */ 6718 if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, 6719 io_geom.mirror_num)) { 6720 ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); 6721 if (mirror_num_ret) 6722 *mirror_num_ret = io_geom.mirror_num; 6723 *bioc_ret = NULL; 6724 goto out; 6725 } 6726 6727 bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); 6728 if (!bioc) { 6729 ret = -ENOMEM; 6730 goto out; 6731 } 6732 bioc->map_type = map->type; 6733 6734 /* 6735 * For RAID56 full map, we need to make sure the stripes[] follows the 6736 * rule that data stripes are all ordered, then followed with P and Q 6737 * (if we have). 6738 * 6739 * It's still mostly the same as other profiles, just with extra rotation. 6740 */ 6741 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && 6742 (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { 6743 /* 6744 * For RAID56 @stripe_nr is already the number of full stripes 6745 * before us, which is also the rotation value (needs to modulo 6746 * with num_stripes). 6747 * 6748 * In this case, we just add @stripe_nr with @i, then do the 6749 * modulo, to reduce one modulo call. 6750 */ 6751 bioc->full_stripe_logical = map->start + 6752 btrfs_stripe_nr_to_offset(io_geom.stripe_nr * 6753 nr_data_stripes(map)); 6754 for (int i = 0; i < io_geom.num_stripes; i++) { 6755 struct btrfs_io_stripe *dst = &bioc->stripes[i]; 6756 u32 stripe_index; 6757 6758 stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; 6759 dst->dev = map->stripes[stripe_index].dev; 6760 dst->physical = 6761 map->stripes[stripe_index].physical + 6762 io_geom.stripe_offset + 6763 btrfs_stripe_nr_to_offset(io_geom.stripe_nr); 6764 } 6765 } else { 6766 /* 6767 * For all other non-RAID56 profiles, just copy the target 6768 * stripe into the bioc. 
6769 */ 6770 for (int i = 0; i < io_geom.num_stripes; i++) { 6771 ret = set_io_stripe(fs_info, logical, length, 6772 &bioc->stripes[i], map, &io_geom); 6773 if (ret < 0) 6774 break; 6775 io_geom.stripe_index++; 6776 } 6777 } 6778 6779 if (ret) { 6780 *bioc_ret = NULL; 6781 btrfs_put_bioc(bioc); 6782 goto out; 6783 } 6784 6785 if (op != BTRFS_MAP_READ) 6786 io_geom.max_errors = btrfs_chunk_max_errors(map); 6787 6788 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6789 op != BTRFS_MAP_READ) { 6790 handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); 6791 } 6792 6793 *bioc_ret = bioc; 6794 bioc->num_stripes = io_geom.num_stripes; 6795 bioc->max_errors = io_geom.max_errors; 6796 bioc->mirror_num = io_geom.mirror_num; 6797 6798 out: 6799 if (dev_replace_is_ongoing) { 6800 lockdep_assert_held(&dev_replace->rwsem); 6801 /* Unlock and let waiting writers proceed */ 6802 up_read(&dev_replace->rwsem); 6803 } 6804 btrfs_free_chunk_map(map); 6805 return ret; 6806 } 6807 6808 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6809 const struct btrfs_fs_devices *fs_devices) 6810 { 6811 if (args->fsid == NULL) 6812 return true; 6813 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6814 return true; 6815 return false; 6816 } 6817 6818 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6819 const struct btrfs_device *device) 6820 { 6821 if (args->missing) { 6822 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6823 !device->bdev) 6824 return true; 6825 return false; 6826 } 6827 6828 if (device->devid != args->devid) 6829 return false; 6830 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6831 return false; 6832 return true; 6833 } 6834 6835 /* 6836 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6837 * return NULL. 6838 * 6839 * If devid and uuid are both specified, the match must be exact, otherwise 6840 * only devid is used. 6841 */ 6842 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6843 const struct btrfs_dev_lookup_args *args) 6844 { 6845 struct btrfs_device *device; 6846 struct btrfs_fs_devices *seed_devs; 6847 6848 if (dev_args_match_fs_devices(args, fs_devices)) { 6849 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6850 if (dev_args_match_device(args, device)) 6851 return device; 6852 } 6853 } 6854 6855 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6856 if (!dev_args_match_fs_devices(args, seed_devs)) 6857 continue; 6858 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6859 if (dev_args_match_device(args, device)) 6860 return device; 6861 } 6862 } 6863 6864 return NULL; 6865 } 6866 6867 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6868 u64 devid, u8 *dev_uuid) 6869 { 6870 struct btrfs_device *device; 6871 unsigned int nofs_flag; 6872 6873 /* 6874 * We call this under the chunk_mutex, so we want to use NOFS for this 6875 * allocation, however we don't want to change btrfs_alloc_device() to 6876 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6877 * places. 
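 *
 * memalloc_nofs_save() below makes any GFP_KERNEL allocation done in this
 * scope behave as GFP_NOFS, so btrfs_alloc_device() itself can keep using
 * GFP_KERNEL for its other callers.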
6878 */ 6879 6880 nofs_flag = memalloc_nofs_save(); 6881 device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); 6882 memalloc_nofs_restore(nofs_flag); 6883 if (IS_ERR(device)) 6884 return device; 6885 6886 list_add(&device->dev_list, &fs_devices->devices); 6887 device->fs_devices = fs_devices; 6888 fs_devices->num_devices++; 6889 6890 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6891 fs_devices->missing_devices++; 6892 6893 return device; 6894 } 6895 6896 /* 6897 * Allocate new device struct, set up devid and UUID. 6898 * 6899 * @fs_info: used only for generating a new devid, can be NULL if 6900 * devid is provided (i.e. @devid != NULL). 6901 * @devid: a pointer to devid for this device. If NULL a new devid 6902 * is generated. 6903 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6904 * is generated. 6905 * @path: a pointer to device path if available, NULL otherwise. 6906 * 6907 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6908 * on error. Returned struct is not linked onto any lists and must be 6909 * destroyed with btrfs_free_device. 6910 */ 6911 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6912 const u64 *devid, const u8 *uuid, 6913 const char *path) 6914 { 6915 struct btrfs_device *dev; 6916 u64 tmp; 6917 6918 if (WARN_ON(!devid && !fs_info)) 6919 return ERR_PTR(-EINVAL); 6920 6921 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6922 if (!dev) 6923 return ERR_PTR(-ENOMEM); 6924 6925 INIT_LIST_HEAD(&dev->dev_list); 6926 INIT_LIST_HEAD(&dev->dev_alloc_list); 6927 INIT_LIST_HEAD(&dev->post_commit_list); 6928 6929 atomic_set(&dev->dev_stats_ccnt, 0); 6930 btrfs_device_data_ordered_init(dev); 6931 extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); 6932 6933 if (devid) 6934 tmp = *devid; 6935 else { 6936 int ret; 6937 6938 ret = find_next_devid(fs_info, &tmp); 6939 if (ret) { 6940 btrfs_free_device(dev); 6941 return ERR_PTR(ret); 6942 } 6943 } 6944 dev->devid = tmp; 6945 6946 if (uuid) 6947 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6948 else 6949 generate_random_uuid(dev->uuid); 6950 6951 if (path) { 6952 struct rcu_string *name; 6953 6954 name = rcu_string_strdup(path, GFP_KERNEL); 6955 if (!name) { 6956 btrfs_free_device(dev); 6957 return ERR_PTR(-ENOMEM); 6958 } 6959 rcu_assign_pointer(dev->name, name); 6960 } 6961 6962 return dev; 6963 } 6964 6965 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6966 u64 devid, u8 *uuid, bool error) 6967 { 6968 if (error) 6969 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6970 devid, uuid); 6971 else 6972 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6973 devid, uuid); 6974 } 6975 6976 u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) 6977 { 6978 const int data_stripes = calc_data_stripes(map->type, map->num_stripes); 6979 6980 return div_u64(map->chunk_len, data_stripes); 6981 } 6982 6983 #if BITS_PER_LONG == 32 6984 /* 6985 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6986 * can't be accessed on 32bit systems. 6987 * 6988 * This function do mount time check to reject the fs if it already has 6989 * metadata chunk beyond that limit. 
6990 */ 6991 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6992 u64 logical, u64 length, u64 type) 6993 { 6994 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6995 return 0; 6996 6997 if (logical + length < MAX_LFS_FILESIZE) 6998 return 0; 6999 7000 btrfs_err_32bit_limit(fs_info); 7001 return -EOVERFLOW; 7002 } 7003 7004 /* 7005 * This is to give early warning for any metadata chunk reaching 7006 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 7007 * Although we can still access the metadata, it's not going to be possible 7008 * once the limit is reached. 7009 */ 7010 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7011 u64 logical, u64 length, u64 type) 7012 { 7013 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7014 return; 7015 7016 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7017 return; 7018 7019 btrfs_warn_32bit_limit(fs_info); 7020 } 7021 #endif 7022 7023 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7024 u64 devid, u8 *uuid) 7025 { 7026 struct btrfs_device *dev; 7027 7028 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7029 btrfs_report_missing_device(fs_info, devid, uuid, true); 7030 return ERR_PTR(-ENOENT); 7031 } 7032 7033 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7034 if (IS_ERR(dev)) { 7035 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7036 devid, PTR_ERR(dev)); 7037 return dev; 7038 } 7039 btrfs_report_missing_device(fs_info, devid, uuid, false); 7040 7041 return dev; 7042 } 7043 7044 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7045 struct btrfs_chunk *chunk) 7046 { 7047 BTRFS_DEV_LOOKUP_ARGS(args); 7048 struct btrfs_fs_info *fs_info = leaf->fs_info; 7049 struct btrfs_chunk_map *map; 7050 u64 logical; 7051 u64 length; 7052 u64 devid; 7053 u64 type; 7054 u8 uuid[BTRFS_UUID_SIZE]; 7055 int index; 7056 int num_stripes; 7057 int ret; 7058 int i; 7059 7060 logical = key->offset; 7061 length = btrfs_chunk_length(leaf, chunk); 7062 type = btrfs_chunk_type(leaf, chunk); 7063 index = btrfs_bg_flags_to_raid_index(type); 7064 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7065 7066 #if BITS_PER_LONG == 32 7067 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7068 if (ret < 0) 7069 return ret; 7070 warn_32bit_meta_chunk(fs_info, logical, length, type); 7071 #endif 7072 7073 /* 7074 * Only need to verify chunk item if we're reading from sys chunk array, 7075 * as chunk item in tree block is already verified by tree-checker. 7076 */ 7077 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7078 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7079 if (ret) 7080 return ret; 7081 } 7082 7083 map = btrfs_find_chunk_map(fs_info, logical, 1); 7084 7085 /* already mapped? */ 7086 if (map && map->start <= logical && map->start + map->chunk_len > logical) { 7087 btrfs_free_chunk_map(map); 7088 return 0; 7089 } else if (map) { 7090 btrfs_free_chunk_map(map); 7091 } 7092 7093 map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); 7094 if (!map) 7095 return -ENOMEM; 7096 7097 map->start = logical; 7098 map->chunk_len = length; 7099 map->num_stripes = num_stripes; 7100 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7101 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7102 map->type = type; 7103 /* 7104 * We can't use the sub_stripes value, as for profiles other than 7105 * RAID10, they may have 0 as sub_stripes for filesystems created by 7106 * older mkfs (<v5.4). 7107 * In that case, it can cause divide-by-zero errors later. 
7108 * Since currently sub_stripes is fixed for each profile, let's 7109 * use the trusted value instead. 7110 */ 7111 map->sub_stripes = btrfs_raid_array[index].sub_stripes; 7112 map->verified_stripes = 0; 7113 map->stripe_size = btrfs_calc_stripe_length(map); 7114 for (i = 0; i < num_stripes; i++) { 7115 map->stripes[i].physical = 7116 btrfs_stripe_offset_nr(leaf, chunk, i); 7117 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7118 args.devid = devid; 7119 read_extent_buffer(leaf, uuid, (unsigned long) 7120 btrfs_stripe_dev_uuid_nr(chunk, i), 7121 BTRFS_UUID_SIZE); 7122 args.uuid = uuid; 7123 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7124 if (!map->stripes[i].dev) { 7125 map->stripes[i].dev = handle_missing_device(fs_info, 7126 devid, uuid); 7127 if (IS_ERR(map->stripes[i].dev)) { 7128 ret = PTR_ERR(map->stripes[i].dev); 7129 btrfs_free_chunk_map(map); 7130 return ret; 7131 } 7132 } 7133 7134 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7135 &(map->stripes[i].dev->dev_state)); 7136 } 7137 7138 ret = btrfs_add_chunk_map(fs_info, map); 7139 if (ret < 0) { 7140 btrfs_err(fs_info, 7141 "failed to add chunk map, start=%llu len=%llu: %d", 7142 map->start, map->chunk_len, ret); 7143 } 7144 7145 return ret; 7146 } 7147 7148 static void fill_device_from_item(struct extent_buffer *leaf, 7149 struct btrfs_dev_item *dev_item, 7150 struct btrfs_device *device) 7151 { 7152 unsigned long ptr; 7153 7154 device->devid = btrfs_device_id(leaf, dev_item); 7155 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7156 device->total_bytes = device->disk_total_bytes; 7157 device->commit_total_bytes = device->disk_total_bytes; 7158 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7159 device->commit_bytes_used = device->bytes_used; 7160 device->type = btrfs_device_type(leaf, dev_item); 7161 device->io_align = btrfs_device_io_align(leaf, dev_item); 7162 device->io_width = btrfs_device_io_width(leaf, dev_item); 7163 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7164 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7165 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7166 7167 ptr = btrfs_device_uuid(dev_item); 7168 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7169 } 7170 7171 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7172 u8 *fsid) 7173 { 7174 struct btrfs_fs_devices *fs_devices; 7175 int ret; 7176 7177 lockdep_assert_held(&uuid_mutex); 7178 ASSERT(fsid); 7179 7180 /* This will match only for multi-device seed fs */ 7181 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7182 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7183 return fs_devices; 7184 7185 7186 fs_devices = find_fsid(fsid, NULL); 7187 if (!fs_devices) { 7188 if (!btrfs_test_opt(fs_info, DEGRADED)) 7189 return ERR_PTR(-ENOENT); 7190 7191 fs_devices = alloc_fs_devices(fsid); 7192 if (IS_ERR(fs_devices)) 7193 return fs_devices; 7194 7195 fs_devices->seeding = true; 7196 fs_devices->opened = 1; 7197 return fs_devices; 7198 } 7199 7200 /* 7201 * Upon first call for a seed fs fsid, just create a private copy of the 7202 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7203 */ 7204 fs_devices = clone_fs_devices(fs_devices); 7205 if (IS_ERR(fs_devices)) 7206 return fs_devices; 7207 7208 ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); 7209 if (ret) { 7210 free_fs_devices(fs_devices); 7211 return ERR_PTR(ret); 7212 } 7213 7214 if 
(!fs_devices->seeding) { 7215 close_fs_devices(fs_devices); 7216 free_fs_devices(fs_devices); 7217 return ERR_PTR(-EINVAL); 7218 } 7219 7220 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7221 7222 return fs_devices; 7223 } 7224 7225 static int read_one_dev(struct extent_buffer *leaf, 7226 struct btrfs_dev_item *dev_item) 7227 { 7228 BTRFS_DEV_LOOKUP_ARGS(args); 7229 struct btrfs_fs_info *fs_info = leaf->fs_info; 7230 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7231 struct btrfs_device *device; 7232 u64 devid; 7233 int ret; 7234 u8 fs_uuid[BTRFS_FSID_SIZE]; 7235 u8 dev_uuid[BTRFS_UUID_SIZE]; 7236 7237 devid = btrfs_device_id(leaf, dev_item); 7238 args.devid = devid; 7239 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7240 BTRFS_UUID_SIZE); 7241 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7242 BTRFS_FSID_SIZE); 7243 args.uuid = dev_uuid; 7244 args.fsid = fs_uuid; 7245 7246 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7247 fs_devices = open_seed_devices(fs_info, fs_uuid); 7248 if (IS_ERR(fs_devices)) 7249 return PTR_ERR(fs_devices); 7250 } 7251 7252 device = btrfs_find_device(fs_info->fs_devices, &args); 7253 if (!device) { 7254 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7255 btrfs_report_missing_device(fs_info, devid, 7256 dev_uuid, true); 7257 return -ENOENT; 7258 } 7259 7260 device = add_missing_dev(fs_devices, devid, dev_uuid); 7261 if (IS_ERR(device)) { 7262 btrfs_err(fs_info, 7263 "failed to add missing dev %llu: %ld", 7264 devid, PTR_ERR(device)); 7265 return PTR_ERR(device); 7266 } 7267 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7268 } else { 7269 if (!device->bdev) { 7270 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7271 btrfs_report_missing_device(fs_info, 7272 devid, dev_uuid, true); 7273 return -ENOENT; 7274 } 7275 btrfs_report_missing_device(fs_info, devid, 7276 dev_uuid, false); 7277 } 7278 7279 if (!device->bdev && 7280 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7281 /* 7282 * this happens when a device that was properly setup 7283 * in the device info lists suddenly goes bad. 
7284 * device->bdev is NULL, and so we have to set 7285 * the BTRFS_DEV_STATE_MISSING bit here 7286 */ 7287 device->fs_devices->missing_devices++; 7288 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7289 } 7290 7291 /* Move the device to its own fs_devices */ 7292 if (device->fs_devices != fs_devices) { 7293 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7294 &device->dev_state)); 7295 7296 list_move(&device->dev_list, &fs_devices->devices); 7297 device->fs_devices->num_devices--; 7298 fs_devices->num_devices++; 7299 7300 device->fs_devices->missing_devices--; 7301 fs_devices->missing_devices++; 7302 7303 device->fs_devices = fs_devices; 7304 } 7305 } 7306 7307 if (device->fs_devices != fs_info->fs_devices) { 7308 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7309 if (device->generation != 7310 btrfs_device_generation(leaf, dev_item)) 7311 return -EINVAL; 7312 } 7313 7314 fill_device_from_item(leaf, dev_item, device); 7315 if (device->bdev) { 7316 u64 max_total_bytes = bdev_nr_bytes(device->bdev); 7317 7318 if (device->total_bytes > max_total_bytes) { 7319 btrfs_err(fs_info, 7320 "device total_bytes should be at most %llu but found %llu", 7321 max_total_bytes, device->total_bytes); 7322 return -EINVAL; 7323 } 7324 } 7325 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7326 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7327 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7328 device->fs_devices->total_rw_bytes += device->total_bytes; 7329 atomic64_add(device->total_bytes - device->bytes_used, 7330 &fs_info->free_chunk_space); 7331 } 7332 ret = 0; 7333 return ret; 7334 } 7335 7336 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7337 { 7338 struct btrfs_super_block *super_copy = fs_info->super_copy; 7339 struct extent_buffer *sb; 7340 struct btrfs_disk_key *disk_key; 7341 struct btrfs_chunk *chunk; 7342 u8 *array_ptr; 7343 unsigned long sb_array_offset; 7344 int ret = 0; 7345 u32 num_stripes; 7346 u32 array_size; 7347 u32 len = 0; 7348 u32 cur_offset; 7349 u64 type; 7350 struct btrfs_key key; 7351 7352 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7353 7354 /* 7355 * We allocated a dummy extent buffer, just to use extent buffer accessors. 7356 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 7357 * that's fine, we will not go beyond system chunk array anyway.
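 *
 * The ASSERT above guarantees that BTRFS_SUPER_INFO_SIZE fits into one
 * nodesize sized buffer, so the full superblock copy can be written into
 * the dummy extent buffer.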
7358 */ 7359 sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 7360 if (!sb) 7361 return -ENOMEM; 7362 set_extent_buffer_uptodate(sb); 7363 7364 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7365 array_size = btrfs_super_sys_array_size(super_copy); 7366 7367 array_ptr = super_copy->sys_chunk_array; 7368 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7369 cur_offset = 0; 7370 7371 while (cur_offset < array_size) { 7372 disk_key = (struct btrfs_disk_key *)array_ptr; 7373 len = sizeof(*disk_key); 7374 if (cur_offset + len > array_size) 7375 goto out_short_read; 7376 7377 btrfs_disk_key_to_cpu(&key, disk_key); 7378 7379 array_ptr += len; 7380 sb_array_offset += len; 7381 cur_offset += len; 7382 7383 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7384 btrfs_err(fs_info, 7385 "unexpected item type %u in sys_array at offset %u", 7386 (u32)key.type, cur_offset); 7387 ret = -EIO; 7388 break; 7389 } 7390 7391 chunk = (struct btrfs_chunk *)sb_array_offset; 7392 /* 7393 * At least one btrfs_chunk with one stripe must be present, 7394 * exact stripe count check comes afterwards 7395 */ 7396 len = btrfs_chunk_item_size(1); 7397 if (cur_offset + len > array_size) 7398 goto out_short_read; 7399 7400 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7401 if (!num_stripes) { 7402 btrfs_err(fs_info, 7403 "invalid number of stripes %u in sys_array at offset %u", 7404 num_stripes, cur_offset); 7405 ret = -EIO; 7406 break; 7407 } 7408 7409 type = btrfs_chunk_type(sb, chunk); 7410 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7411 btrfs_err(fs_info, 7412 "invalid chunk type %llu in sys_array at offset %u", 7413 type, cur_offset); 7414 ret = -EIO; 7415 break; 7416 } 7417 7418 len = btrfs_chunk_item_size(num_stripes); 7419 if (cur_offset + len > array_size) 7420 goto out_short_read; 7421 7422 ret = read_one_chunk(&key, sb, chunk); 7423 if (ret) 7424 break; 7425 7426 array_ptr += len; 7427 sb_array_offset += len; 7428 cur_offset += len; 7429 } 7430 clear_extent_buffer_uptodate(sb); 7431 free_extent_buffer_stale(sb); 7432 return ret; 7433 7434 out_short_read: 7435 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7436 len, cur_offset); 7437 clear_extent_buffer_uptodate(sb); 7438 free_extent_buffer_stale(sb); 7439 return -EIO; 7440 } 7441 7442 /* 7443 * Check if all chunks in the fs are OK for read-write degraded mount 7444 * 7445 * If the @failing_dev is specified, it's accounted as missing. 7446 * 7447 * Return true if all chunks meet the minimal RW mount requirements. 7448 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7449 */ 7450 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7451 struct btrfs_device *failing_dev) 7452 { 7453 struct btrfs_chunk_map *map; 7454 u64 next_start; 7455 bool ret = true; 7456 7457 map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); 7458 /* No chunk at all? 
Return false anyway */ 7459 if (!map) { 7460 ret = false; 7461 goto out; 7462 } 7463 while (map) { 7464 int missing = 0; 7465 int max_tolerated; 7466 int i; 7467 7468 max_tolerated = 7469 btrfs_get_num_tolerated_disk_barrier_failures( 7470 map->type); 7471 for (i = 0; i < map->num_stripes; i++) { 7472 struct btrfs_device *dev = map->stripes[i].dev; 7473 7474 if (!dev || !dev->bdev || 7475 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7476 dev->last_flush_error) 7477 missing++; 7478 else if (failing_dev && failing_dev == dev) 7479 missing++; 7480 } 7481 if (missing > max_tolerated) { 7482 if (!failing_dev) 7483 btrfs_warn(fs_info, 7484 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7485 map->start, missing, max_tolerated); 7486 btrfs_free_chunk_map(map); 7487 ret = false; 7488 goto out; 7489 } 7490 next_start = map->start + map->chunk_len; 7491 btrfs_free_chunk_map(map); 7492 7493 map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); 7494 } 7495 out: 7496 return ret; 7497 } 7498 7499 static void readahead_tree_node_children(struct extent_buffer *node) 7500 { 7501 int i; 7502 const int nr_items = btrfs_header_nritems(node); 7503 7504 for (i = 0; i < nr_items; i++) 7505 btrfs_readahead_node_child(node, i); 7506 } 7507 7508 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7509 { 7510 struct btrfs_root *root = fs_info->chunk_root; 7511 struct btrfs_path *path; 7512 struct extent_buffer *leaf; 7513 struct btrfs_key key; 7514 struct btrfs_key found_key; 7515 int ret; 7516 int slot; 7517 int iter_ret = 0; 7518 u64 total_dev = 0; 7519 u64 last_ra_node = 0; 7520 7521 path = btrfs_alloc_path(); 7522 if (!path) 7523 return -ENOMEM; 7524 7525 /* 7526 * uuid_mutex is needed only if we are mounting a sprout FS, 7527 * otherwise we don't need it. 7528 */ 7529 mutex_lock(&uuid_mutex); 7530 7531 /* 7532 * It is possible for mount and umount to race in such a way that 7533 * we execute this code path, but open_fs_devices failed to clear 7534 * total_rw_bytes. We certainly want it cleared before reading the 7535 * device items, so clear it here. 7536 */ 7537 fs_info->fs_devices->total_rw_bytes = 0; 7538 7539 /* 7540 * Lockdep complains about possible circular locking dependency between 7541 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 7542 * used for freeze protection of a fs (struct super_block.s_writers), 7543 * which we take when starting a transaction, and extent buffers of the 7544 * chunk tree if we call read_one_dev() while holding a lock on an 7545 * extent buffer of the chunk tree. Since we are mounting the filesystem 7546 * and at this point there can't be any concurrent task modifying the 7547 * chunk tree, to keep it simple, just skip locking on the chunk tree. 7548 */ 7549 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7550 path->skip_locking = 1; 7551 7552 /* 7553 * Read all device items, and then all the chunk items. All 7554 * device items are found before any chunk item (their object id 7555 * is smaller than the lowest possible object id for a chunk 7556 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7557 */ 7558 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7559 key.offset = 0; 7560 key.type = 0; 7561 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 7562 struct extent_buffer *node = path->nodes[1]; 7563 7564 leaf = path->nodes[0]; 7565 slot = path->slots[0]; 7566 7567 if (node) { 7568 if (last_ra_node != node->start) { 7569 readahead_tree_node_children(node); 7570 last_ra_node = node->start; 7571 } 7572 } 7573 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7574 struct btrfs_dev_item *dev_item; 7575 dev_item = btrfs_item_ptr(leaf, slot, 7576 struct btrfs_dev_item); 7577 ret = read_one_dev(leaf, dev_item); 7578 if (ret) 7579 goto error; 7580 total_dev++; 7581 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7582 struct btrfs_chunk *chunk; 7583 7584 /* 7585 * We are only called at mount time, so no need to take 7586 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7587 * we always lock first fs_info->chunk_mutex before 7588 * acquiring any locks on the chunk tree. This is a 7589 * requirement for chunk allocation, see the comment on 7590 * top of btrfs_chunk_alloc() for details. 7591 */ 7592 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7593 ret = read_one_chunk(&found_key, leaf, chunk); 7594 if (ret) 7595 goto error; 7596 } 7597 } 7598 /* Catch error found during iteration */ 7599 if (iter_ret < 0) { 7600 ret = iter_ret; 7601 goto error; 7602 } 7603 7604 /* 7605 * After loading chunk tree, we've got all device information, 7606 * do another round of validation checks. 7607 */ 7608 if (total_dev != fs_info->fs_devices->total_devices) { 7609 btrfs_warn(fs_info, 7610 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", 7611 btrfs_super_num_devices(fs_info->super_copy), 7612 total_dev); 7613 fs_info->fs_devices->total_devices = total_dev; 7614 btrfs_set_super_num_devices(fs_info->super_copy, total_dev); 7615 } 7616 if (btrfs_super_total_bytes(fs_info->super_copy) < 7617 fs_info->fs_devices->total_rw_bytes) { 7618 btrfs_err(fs_info, 7619 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7620 btrfs_super_total_bytes(fs_info->super_copy), 7621 fs_info->fs_devices->total_rw_bytes); 7622 ret = -EINVAL; 7623 goto error; 7624 } 7625 ret = 0; 7626 error: 7627 mutex_unlock(&uuid_mutex); 7628 7629 btrfs_free_path(path); 7630 return ret; 7631 } 7632 7633 int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7634 { 7635 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7636 struct btrfs_device *device; 7637 int ret = 0; 7638 7639 fs_devices->fs_info = fs_info; 7640 7641 mutex_lock(&fs_devices->device_list_mutex); 7642 list_for_each_entry(device, &fs_devices->devices, dev_list) 7643 device->fs_info = fs_info; 7644 7645 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7646 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7647 device->fs_info = fs_info; 7648 ret = btrfs_get_dev_zone_info(device, false); 7649 if (ret) 7650 break; 7651 } 7652 7653 seed_devs->fs_info = fs_info; 7654 } 7655 mutex_unlock(&fs_devices->device_list_mutex); 7656 7657 return ret; 7658 } 7659 7660 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7661 const struct btrfs_dev_stats_item *ptr, 7662 int index) 7663 { 7664 u64 val; 7665 7666 read_extent_buffer(eb, &val, 7667 offsetof(struct btrfs_dev_stats_item, values) + 7668 ((unsigned long)ptr) + (index * sizeof(u64)), 7669 sizeof(val)); 7670 return val; 7671 } 7672 7673 static void 
btrfs_set_dev_stats_value(struct extent_buffer *eb, 7674 struct btrfs_dev_stats_item *ptr, 7675 int index, u64 val) 7676 { 7677 write_extent_buffer(eb, &val, 7678 offsetof(struct btrfs_dev_stats_item, values) + 7679 ((unsigned long)ptr) + (index * sizeof(u64)), 7680 sizeof(val)); 7681 } 7682 7683 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7684 struct btrfs_path *path) 7685 { 7686 struct btrfs_dev_stats_item *ptr; 7687 struct extent_buffer *eb; 7688 struct btrfs_key key; 7689 int item_size; 7690 int i, ret, slot; 7691 7692 if (!device->fs_info->dev_root) 7693 return 0; 7694 7695 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7696 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7697 key.offset = device->devid; 7698 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7699 if (ret) { 7700 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7701 btrfs_dev_stat_set(device, i, 0); 7702 device->dev_stats_valid = 1; 7703 btrfs_release_path(path); 7704 return ret < 0 ? ret : 0; 7705 } 7706 slot = path->slots[0]; 7707 eb = path->nodes[0]; 7708 item_size = btrfs_item_size(eb, slot); 7709 7710 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7711 7712 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7713 if (item_size >= (1 + i) * sizeof(__le64)) 7714 btrfs_dev_stat_set(device, i, 7715 btrfs_dev_stats_value(eb, ptr, i)); 7716 else 7717 btrfs_dev_stat_set(device, i, 0); 7718 } 7719 7720 device->dev_stats_valid = 1; 7721 btrfs_dev_stat_print_on_load(device); 7722 btrfs_release_path(path); 7723 7724 return 0; 7725 } 7726 7727 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7728 { 7729 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7730 struct btrfs_device *device; 7731 struct btrfs_path *path = NULL; 7732 int ret = 0; 7733 7734 path = btrfs_alloc_path(); 7735 if (!path) 7736 return -ENOMEM; 7737 7738 mutex_lock(&fs_devices->device_list_mutex); 7739 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7740 ret = btrfs_device_init_dev_stats(device, path); 7741 if (ret) 7742 goto out; 7743 } 7744 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7745 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7746 ret = btrfs_device_init_dev_stats(device, path); 7747 if (ret) 7748 goto out; 7749 } 7750 } 7751 out: 7752 mutex_unlock(&fs_devices->device_list_mutex); 7753 7754 btrfs_free_path(path); 7755 return ret; 7756 } 7757 7758 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7759 struct btrfs_device *device) 7760 { 7761 struct btrfs_fs_info *fs_info = trans->fs_info; 7762 struct btrfs_root *dev_root = fs_info->dev_root; 7763 struct btrfs_path *path; 7764 struct btrfs_key key; 7765 struct extent_buffer *eb; 7766 struct btrfs_dev_stats_item *ptr; 7767 int ret; 7768 int i; 7769 7770 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7771 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7772 key.offset = device->devid; 7773 7774 path = btrfs_alloc_path(); 7775 if (!path) 7776 return -ENOMEM; 7777 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7778 if (ret < 0) { 7779 btrfs_warn_in_rcu(fs_info, 7780 "error %d while searching for dev_stats item for device %s", 7781 ret, btrfs_dev_name(device)); 7782 goto out; 7783 } 7784 7785 if (ret == 0 && 7786 btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7787 /* need to delete old one and insert a new one */ 7788 ret = btrfs_del_item(trans, dev_root, path); 7789 if (ret != 0) { 7790 btrfs_warn_in_rcu(fs_info, 7791 "delete too small dev_stats 
item for device %s failed %d", 7792 btrfs_dev_name(device), ret); 7793 goto out; 7794 } 7795 ret = 1; 7796 } 7797 7798 if (ret == 1) { 7799 /* need to insert a new item */ 7800 btrfs_release_path(path); 7801 ret = btrfs_insert_empty_item(trans, dev_root, path, 7802 &key, sizeof(*ptr)); 7803 if (ret < 0) { 7804 btrfs_warn_in_rcu(fs_info, 7805 "insert dev_stats item for device %s failed %d", 7806 btrfs_dev_name(device), ret); 7807 goto out; 7808 } 7809 } 7810 7811 eb = path->nodes[0]; 7812 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7813 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7814 btrfs_set_dev_stats_value(eb, ptr, i, 7815 btrfs_dev_stat_read(device, i)); 7816 btrfs_mark_buffer_dirty(trans, eb); 7817 7818 out: 7819 btrfs_free_path(path); 7820 return ret; 7821 } 7822 7823 /* 7824 * Called from commit_transaction. Writes all changed device stats to disk. 7825 */ 7826 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 7827 { 7828 struct btrfs_fs_info *fs_info = trans->fs_info; 7829 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7830 struct btrfs_device *device; 7831 int stats_cnt; 7832 int ret = 0; 7833 7834 mutex_lock(&fs_devices->device_list_mutex); 7835 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7836 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7837 if (!device->dev_stats_valid || stats_cnt == 0) 7838 continue; 7839 7840 7841 /* 7842 * There is a LOAD-LOAD control dependency between the value of 7843 * dev_stats_ccnt and updating the on-disk values which requires 7844 * reading the in-memory counters. Such control dependencies 7845 * require explicit read memory barriers. 7846 * 7847 * This memory barrier pairs with smp_mb__before_atomic in 7848 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7849 * barrier implied by atomic_xchg in 7850 * btrfs_dev_stats_read_and_reset. 7851 */ 7852 smp_rmb(); 7853 7854 ret = update_dev_stat_item(trans, device); 7855 if (!ret) 7856 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7857 } 7858 mutex_unlock(&fs_devices->device_list_mutex); 7859 7860 return ret; 7861 } 7862 7863 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7864 { 7865 btrfs_dev_stat_inc(dev, index); 7866 7867 if (!dev->dev_stats_valid) 7868 return; 7869 btrfs_err_rl_in_rcu(dev->fs_info, 7870 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7871 btrfs_dev_name(dev), 7872 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7873 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7874 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7875 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7876 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7877 } 7878 7879 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7880 { 7881 int i; 7882 7883 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7884 if (btrfs_dev_stat_read(dev, i) != 0) 7885 break; 7886 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7887 return; /* all values == 0, suppress message */ 7888 7889 btrfs_info_in_rcu(dev->fs_info, 7890 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7891 btrfs_dev_name(dev), 7892 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7893 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7894 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7895 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7896 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7897 } 7898 7899 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7900 struct
btrfs_ioctl_get_dev_stats *stats) 7901 { 7902 BTRFS_DEV_LOOKUP_ARGS(args); 7903 struct btrfs_device *dev; 7904 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7905 int i; 7906 7907 mutex_lock(&fs_devices->device_list_mutex); 7908 args.devid = stats->devid; 7909 dev = btrfs_find_device(fs_info->fs_devices, &args); 7910 mutex_unlock(&fs_devices->device_list_mutex); 7911 7912 if (!dev) { 7913 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7914 return -ENODEV; 7915 } else if (!dev->dev_stats_valid) { 7916 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7917 return -ENODEV; 7918 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7919 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7920 if (stats->nr_items > i) 7921 stats->values[i] = 7922 btrfs_dev_stat_read_and_reset(dev, i); 7923 else 7924 btrfs_dev_stat_set(dev, i, 0); 7925 } 7926 btrfs_info(fs_info, "device stats zeroed by %s (%d)", 7927 current->comm, task_pid_nr(current)); 7928 } else { 7929 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7930 if (stats->nr_items > i) 7931 stats->values[i] = btrfs_dev_stat_read(dev, i); 7932 } 7933 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7934 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7935 return 0; 7936 } 7937 7938 /* 7939 * Update the size and bytes used for each device where it changed. This is 7940 * delayed since we would otherwise get errors while writing out the 7941 * superblocks. 7942 * 7943 * Must be invoked during transaction commit. 7944 */ 7945 void btrfs_commit_device_sizes(struct btrfs_transaction *trans) 7946 { 7947 struct btrfs_device *curr, *next; 7948 7949 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 7950 7951 if (list_empty(&trans->dev_update_list)) 7952 return; 7953 7954 /* 7955 * We don't need the device_list_mutex here. This list is owned by the 7956 * transaction and the transaction must complete before the device is 7957 * released. 7958 */ 7959 mutex_lock(&trans->fs_info->chunk_mutex); 7960 list_for_each_entry_safe(curr, next, &trans->dev_update_list, 7961 post_commit_list) { 7962 list_del_init(&curr->post_commit_list); 7963 curr->commit_total_bytes = curr->disk_total_bytes; 7964 curr->commit_bytes_used = curr->bytes_used; 7965 } 7966 mutex_unlock(&trans->fs_info->chunk_mutex); 7967 } 7968 7969 /* 7970 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
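 * The factor is simply the ncopies value from btrfs_raid_array: e.g. 2 for
 * DUP, RAID1 and RAID10, and 1 for SINGLE and RAID0. Parity profiles also
 * report 1 here because parity stripes are not additional copies of the
 * data.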
7971 */ 7972 int btrfs_bg_type_to_factor(u64 flags) 7973 { 7974 const int index = btrfs_bg_flags_to_raid_index(flags); 7975 7976 return btrfs_raid_array[index].ncopies; 7977 } 7978 7979 7980 7981 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7982 u64 chunk_offset, u64 devid, 7983 u64 physical_offset, u64 physical_len) 7984 { 7985 struct btrfs_dev_lookup_args args = { .devid = devid }; 7986 struct btrfs_chunk_map *map; 7987 struct btrfs_device *dev; 7988 u64 stripe_len; 7989 bool found = false; 7990 int ret = 0; 7991 int i; 7992 7993 map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); 7994 if (!map) { 7995 btrfs_err(fs_info, 7996 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7997 physical_offset, devid); 7998 ret = -EUCLEAN; 7999 goto out; 8000 } 8001 8002 stripe_len = btrfs_calc_stripe_length(map); 8003 if (physical_len != stripe_len) { 8004 btrfs_err(fs_info, 8005 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 8006 physical_offset, devid, map->start, physical_len, 8007 stripe_len); 8008 ret = -EUCLEAN; 8009 goto out; 8010 } 8011 8012 /* 8013 * Very old mkfs.btrfs (before v4.1) will not respect the reserved 8014 * space. Although kernel can handle it without problem, better to warn 8015 * the users. 8016 */ 8017 if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) 8018 btrfs_warn(fs_info, 8019 "devid %llu physical %llu len %llu inside the reserved space", 8020 devid, physical_offset, physical_len); 8021 8022 for (i = 0; i < map->num_stripes; i++) { 8023 if (map->stripes[i].dev->devid == devid && 8024 map->stripes[i].physical == physical_offset) { 8025 found = true; 8026 if (map->verified_stripes >= map->num_stripes) { 8027 btrfs_err(fs_info, 8028 "too many dev extents for chunk %llu found", 8029 map->start); 8030 ret = -EUCLEAN; 8031 goto out; 8032 } 8033 map->verified_stripes++; 8034 break; 8035 } 8036 } 8037 if (!found) { 8038 btrfs_err(fs_info, 8039 "dev extent physical offset %llu devid %llu has no corresponding chunk", 8040 physical_offset, devid); 8041 ret = -EUCLEAN; 8042 } 8043 8044 /* Make sure no dev extent is beyond device boundary */ 8045 dev = btrfs_find_device(fs_info->fs_devices, &args); 8046 if (!dev) { 8047 btrfs_err(fs_info, "failed to find devid %llu", devid); 8048 ret = -EUCLEAN; 8049 goto out; 8050 } 8051 8052 if (physical_offset + physical_len > dev->disk_total_bytes) { 8053 btrfs_err(fs_info, 8054 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 8055 devid, physical_offset, physical_len, 8056 dev->disk_total_bytes); 8057 ret = -EUCLEAN; 8058 goto out; 8059 } 8060 8061 if (dev->zone_info) { 8062 u64 zone_size = dev->zone_info->zone_size; 8063 8064 if (!IS_ALIGNED(physical_offset, zone_size) || 8065 !IS_ALIGNED(physical_len, zone_size)) { 8066 btrfs_err(fs_info, 8067 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", 8068 devid, physical_offset, physical_len); 8069 ret = -EUCLEAN; 8070 goto out; 8071 } 8072 } 8073 8074 out: 8075 btrfs_free_chunk_map(map); 8076 return ret; 8077 } 8078 8079 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 8080 { 8081 struct rb_node *node; 8082 int ret = 0; 8083 8084 read_lock(&fs_info->mapping_tree_lock); 8085 for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { 8086 struct btrfs_chunk_map *map; 8087 8088 map = rb_entry(node, struct btrfs_chunk_map, rb_node); 8089 if (map->num_stripes != 
map->verified_stripes) { 8090 btrfs_err(fs_info, 8091 "chunk %llu has missing dev extent, have %d expect %d", 8092 map->start, map->verified_stripes, map->num_stripes); 8093 ret = -EUCLEAN; 8094 goto out; 8095 } 8096 } 8097 out: 8098 read_unlock(&fs_info->mapping_tree_lock); 8099 return ret; 8100 } 8101 8102 /* 8103 * Ensure that all dev extents are mapped to correct chunk, otherwise 8104 * later chunk allocation/free would cause unexpected behavior. 8105 * 8106 * NOTE: This will iterate through the whole device tree, which should be of 8107 * the same size level as the chunk tree. This slightly increases mount time. 8108 */ 8109 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 8110 { 8111 struct btrfs_path *path; 8112 struct btrfs_root *root = fs_info->dev_root; 8113 struct btrfs_key key; 8114 u64 prev_devid = 0; 8115 u64 prev_dev_ext_end = 0; 8116 int ret = 0; 8117 8118 /* 8119 * We don't have a dev_root because we mounted with ignorebadroots and 8120 * failed to load the root, so we want to skip the verification in this 8121 * case for sure. 8122 * 8123 * However if the dev root is fine, but the tree itself is corrupted 8124 * we'd still fail to mount. This verification is only to make sure 8125 * writes can happen safely, so instead just bypass this check 8126 * completely in the case of IGNOREBADROOTS. 8127 */ 8128 if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) 8129 return 0; 8130 8131 key.objectid = 1; 8132 key.type = BTRFS_DEV_EXTENT_KEY; 8133 key.offset = 0; 8134 8135 path = btrfs_alloc_path(); 8136 if (!path) 8137 return -ENOMEM; 8138 8139 path->reada = READA_FORWARD; 8140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8141 if (ret < 0) 8142 goto out; 8143 8144 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 8145 ret = btrfs_next_leaf(root, path); 8146 if (ret < 0) 8147 goto out; 8148 /* No dev extents at all? Not good */ 8149 if (ret > 0) { 8150 ret = -EUCLEAN; 8151 goto out; 8152 } 8153 } 8154 while (1) { 8155 struct extent_buffer *leaf = path->nodes[0]; 8156 struct btrfs_dev_extent *dext; 8157 int slot = path->slots[0]; 8158 u64 chunk_offset; 8159 u64 physical_offset; 8160 u64 physical_len; 8161 u64 devid; 8162 8163 btrfs_item_key_to_cpu(leaf, &key, slot); 8164 if (key.type != BTRFS_DEV_EXTENT_KEY) 8165 break; 8166 devid = key.objectid; 8167 physical_offset = key.offset; 8168 8169 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 8170 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 8171 physical_len = btrfs_dev_extent_length(leaf, dext); 8172 8173 /* Check if this dev extent overlaps with the previous one */ 8174 if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 8175 btrfs_err(fs_info, 8176 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 8177 devid, physical_offset, prev_dev_ext_end); 8178 ret = -EUCLEAN; 8179 goto out; 8180 } 8181 8182 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 8183 physical_offset, physical_len); 8184 if (ret < 0) 8185 goto out; 8186 prev_devid = devid; 8187 prev_dev_ext_end = physical_offset + physical_len; 8188 8189 ret = btrfs_next_item(root, path); 8190 if (ret < 0) 8191 goto out; 8192 if (ret > 0) { 8193 ret = 0; 8194 break; 8195 } 8196 } 8197 8198 /* Ensure all chunks have corresponding dev extents */ 8199 ret = verify_chunk_dev_extent_mapping(fs_info); 8200 out: 8201 btrfs_free_path(path); 8202 return ret; 8203 } 8204 8205 /* 8206 * Check whether the given block group or device is pinned by any inode being 8207 * used as a swapfile. 
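 * The pins are kept in an rb-tree keyed by the raw pointer value, so @ptr
 * may point to either a struct btrfs_block_group or a struct btrfs_device;
 * a true return tells the caller that the object is backing an active
 * swapfile and must not be relocated, removed or resized.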
8208 */ 8209 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 8210 { 8211 struct btrfs_swapfile_pin *sp; 8212 struct rb_node *node; 8213 8214 spin_lock(&fs_info->swapfile_pins_lock); 8215 node = fs_info->swapfile_pins.rb_node; 8216 while (node) { 8217 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 8218 if (ptr < sp->ptr) 8219 node = node->rb_left; 8220 else if (ptr > sp->ptr) 8221 node = node->rb_right; 8222 else 8223 break; 8224 } 8225 spin_unlock(&fs_info->swapfile_pins_lock); 8226 return node != NULL; 8227 } 8228 8229 static int relocating_repair_kthread(void *data) 8230 { 8231 struct btrfs_block_group *cache = data; 8232 struct btrfs_fs_info *fs_info = cache->fs_info; 8233 u64 target; 8234 int ret = 0; 8235 8236 target = cache->start; 8237 btrfs_put_block_group(cache); 8238 8239 sb_start_write(fs_info->sb); 8240 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 8241 btrfs_info(fs_info, 8242 "zoned: skip relocating block group %llu to repair: EBUSY", 8243 target); 8244 sb_end_write(fs_info->sb); 8245 return -EBUSY; 8246 } 8247 8248 mutex_lock(&fs_info->reclaim_bgs_lock); 8249 8250 /* Ensure block group still exists */ 8251 cache = btrfs_lookup_block_group(fs_info, target); 8252 if (!cache) 8253 goto out; 8254 8255 if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) 8256 goto out; 8257 8258 ret = btrfs_may_alloc_data_chunk(fs_info, target); 8259 if (ret < 0) 8260 goto out; 8261 8262 btrfs_info(fs_info, 8263 "zoned: relocating block group %llu to repair IO failure", 8264 target); 8265 ret = btrfs_relocate_chunk(fs_info, target); 8266 8267 out: 8268 if (cache) 8269 btrfs_put_block_group(cache); 8270 mutex_unlock(&fs_info->reclaim_bgs_lock); 8271 btrfs_exclop_finish(fs_info); 8272 sb_end_write(fs_info->sb); 8273 8274 return ret; 8275 } 8276 8277 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) 8278 { 8279 struct btrfs_block_group *cache; 8280 8281 if (!btrfs_is_zoned(fs_info)) 8282 return false; 8283 8284 /* Do not attempt to repair in degraded state */ 8285 if (btrfs_test_opt(fs_info, DEGRADED)) 8286 return true; 8287 8288 cache = btrfs_lookup_block_group(fs_info, logical); 8289 if (!cache) 8290 return true; 8291 8292 if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { 8293 btrfs_put_block_group(cache); 8294 return true; 8295 } 8296 8297 kthread_run(relocating_repair_kthread, cache, 8298 "btrfs-relocating-repair"); 8299 8300 return true; 8301 } 8302 8303 static void map_raid56_repair_block(struct btrfs_io_context *bioc, 8304 struct btrfs_io_stripe *smap, 8305 u64 logical) 8306 { 8307 int data_stripes = nr_bioc_data_stripes(bioc); 8308 int i; 8309 8310 for (i = 0; i < data_stripes; i++) { 8311 u64 stripe_start = bioc->full_stripe_logical + 8312 btrfs_stripe_nr_to_offset(i); 8313 8314 if (logical >= stripe_start && 8315 logical < stripe_start + BTRFS_STRIPE_LEN) 8316 break; 8317 } 8318 ASSERT(i < data_stripes); 8319 smap->dev = bioc->stripes[i].dev; 8320 smap->physical = bioc->stripes[i].physical + 8321 ((logical - bioc->full_stripe_logical) & 8322 BTRFS_STRIPE_LEN_MASK); 8323 } 8324 8325 /* 8326 * Map a repair write into a single device. 8327 * 8328 * A repair write is triggered by read time repair or scrub, which would only 8329 * update the contents of a single device. 8330 * Not update any other mirrors nor go through RMW path. 
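 * For RAID56 the write is redirected to the single data stripe covering
 * @logical inside the full stripe (see map_raid56_repair_block() above),
 * so no parity update is performed here either.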
8331 * 8332 * Callers should ensure: 8333 * 8334 * - Call btrfs_bio_counter_inc_blocked() first 8335 * - The range does not cross stripe boundary 8336 * - Has a valid @mirror_num passed in. 8337 */ 8338 int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, 8339 struct btrfs_io_stripe *smap, u64 logical, 8340 u32 length, int mirror_num) 8341 { 8342 struct btrfs_io_context *bioc = NULL; 8343 u64 map_length = length; 8344 int mirror_ret = mirror_num; 8345 int ret; 8346 8347 ASSERT(mirror_num > 0); 8348 8349 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, 8350 &bioc, smap, &mirror_ret); 8351 if (ret < 0) 8352 return ret; 8353 8354 /* The map range should not cross stripe boundary. */ 8355 ASSERT(map_length >= length); 8356 8357 /* Already mapped to single stripe. */ 8358 if (!bioc) 8359 goto out; 8360 8361 /* Map the RAID56 multi-stripe writes to a single one. */ 8362 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 8363 map_raid56_repair_block(bioc, smap, logical); 8364 goto out; 8365 } 8366 8367 ASSERT(mirror_num <= bioc->num_stripes); 8368 smap->dev = bioc->stripes[mirror_num - 1].dev; 8369 smap->physical = bioc->stripes[mirror_num - 1].physical; 8370 out: 8371 btrfs_put_bioc(bioc); 8372 ASSERT(smap->dev); 8373 return 0; 8374 } 8375